From 66d617e3437e62a6650ffcb85b3190669d37a468 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 7 Aug 2024 17:12:05 +0800 Subject: [PATCH 01/18] [Frontend] Gracefully handle missing chat template and fix CI failure (#7238) Co-authored-by: Roger Wang --- tests/async_engine/test_chat_template.py | 21 ++--- tests/async_engine/test_openapi_server_ray.py | 10 ++- .../openai/test_oot_registration.py | 87 ++++++++++++------- tests/utils.py | 4 +- vllm/entrypoints/chat_utils.py | 37 +++++++- vllm/entrypoints/openai/protocol.py | 5 +- vllm/entrypoints/openai/serving_chat.py | 8 +- .../openai/serving_tokenization.py | 14 +-- vllm/transformers_utils/tokenizer.py | 8 +- 9 files changed, 125 insertions(+), 69 deletions(-) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index aea8a7fed6e33..4df6c02973284 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -1,22 +1,16 @@ -import os -import pathlib - import pytest -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.transformers_utils.tokenizer import get_tokenizer -chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( - __file__))).parent.parent / "examples/template_chatml.jinja" +from ..utils import VLLM_PATH + +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() # Define models, templates, and their corresponding expected outputs MODEL_TEMPLATE_GENERATON_OUTPUT = [ - ("facebook/opt-125m", None, True, - "HelloHi there!What is the capital of"), - ("facebook/opt-125m", None, False, - "HelloHi there!What is the capital of"), ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user Hello<|im_end|> <|im_start|>assistant @@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=add_generation_prompt) # Call the function and get the result - result = tokenizer.apply_chat_template( + result = apply_chat_template( + tokenizer, conversation=mock_request.messages, - tokenize=False, + chat_template=mock_request.chat_template or template_content, add_generation_prompt=mock_request.add_generation_prompt, - chat_template=mock_request.chat_template or template_content) + ) # Test assertion assert result == expected_output, ( diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 5ecd770ede836..0d53b39e7ce1c 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -1,10 +1,12 @@ import openai # use the official client for correctness check import pytest -from ..utils import RemoteOpenAIServer +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" +assert chatml_jinja_path.exists() @pytest.fixture(scope="module") @@ -16,7 +18,9 @@ def server(): "--max-model-len", "2048", "--enforce-eager", - "--engine-use-ray" + "--engine-use-ray", + "--chat-template", + str(chatml_jinja_path), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -83,7 +87,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI): choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == 
openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=13, total_tokens=23) + completion_tokens=10, prompt_tokens=55, total_tokens=65) message = choice.message assert message.content is not None and len(message.content) >= 10 diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index 5272ac4065f1d..9f9a4cd972c51 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -9,6 +9,11 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.utils import get_open_port +from ...utils import VLLM_PATH, RemoteOpenAIServer + +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" +assert chatml_jinja_path.exists() + class MyOPTForCausalLM(OPTForCausalLM): @@ -21,12 +26,25 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits -def server_function(port): +def server_function(port: int): # register our dummy model ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) - sys.argv = ["placeholder.py"] + \ - ("--model facebook/opt-125m --gpu-memory-utilization 0.10 " - f"--dtype float32 --api-key token-abc123 --port {port}").split() + + sys.argv = ["placeholder.py"] + [ + "--model", + "facebook/opt-125m", + "--gpu-memory-utilization", + "0.10", + "--dtype", + "float32", + "--api-key", + "token-abc123", + "--port", + str(port), + "--chat-template", + str(chatml_jinja_path), + ] + import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') @@ -36,35 +54,40 @@ def test_oot_registration_for_api_server(): ctx = torch.multiprocessing.get_context() server = ctx.Process(target=server_function, args=(port, )) server.start() - MAX_SERVER_START_WAIT_S = 60 - client = OpenAI( - base_url=f"http://localhost:{port}/v1", - api_key="token-abc123", - ) - now = time.time() - while True: - try: - completion = client.chat.completions.create( - model="facebook/opt-125m", - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Hello!" - }], - temperature=0, - ) - break - except OpenAIError as e: - if "Connection error" in str(e): - time.sleep(3) - if time.time() - now > MAX_SERVER_START_WAIT_S: - raise RuntimeError("Server did not start in time") from e - else: - raise e - server.kill() + + try: + client = OpenAI( + base_url=f"http://localhost:{port}/v1", + api_key="token-abc123", + ) + now = time.time() + while True: + try: + completion = client.chat.completions.create( + model="facebook/opt-125m", + messages=[{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Hello!" 
+ }], + temperature=0, + ) + break + except OpenAIError as e: + if "Connection error" in str(e): + time.sleep(3) + if time.time() - now > RemoteOpenAIServer.MAX_START_WAIT_S: + msg = "Server did not start in time" + raise RuntimeError(msg) from e + else: + raise e + finally: + server.terminate() + generated_text = completion.choices[0].message.content + assert generated_text is not None # make sure only the first token is generated rest = generated_text.replace("", "") assert rest == "" diff --git a/tests/utils.py b/tests/utils.py index bd431b85d2663..e3d04cc638a95 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -50,7 +50,7 @@ def _nvml(): class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key - MAX_SERVER_START_WAIT_S = 120 # wait for server to start for 120 seconds + MAX_START_WAIT_S = 120 # wait for server to start for 120 seconds def __init__( self, @@ -85,7 +85,7 @@ def __init__( stdout=sys.stdout, stderr=sys.stderr) self._wait_for_server(url=self.url_for("health"), - timeout=self.MAX_SERVER_START_WAIT_S) + timeout=self.MAX_START_WAIT_S) def __enter__(self): return self diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 072450a6146ee..12634c3261856 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,8 +1,9 @@ import codecs from dataclasses import dataclass from functools import lru_cache -from typing import (Awaitable, Iterable, List, Optional, Tuple, Union, cast, - final) +from pathlib import Path +from typing import (Any, Awaitable, Iterable, List, Optional, Tuple, Union, + cast, final) # yapf conflicts with isort for this block # yapf: disable @@ -22,6 +23,7 @@ from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import async_get_and_parse_image +from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) @@ -69,13 +71,17 @@ class ChatMessageParseResult: mm_futures: List[Awaitable[MultiModalDataDict]] -def load_chat_template(chat_template: Optional[str]) -> Optional[str]: +def load_chat_template( + chat_template: Optional[Union[Path, str]]) -> Optional[str]: if chat_template is None: return None try: with open(chat_template, "r") as f: resolved_chat_template = f.read() except OSError as e: + if isinstance(chat_template, Path): + raise + JINJA_CHARS = "{}\n" if not any(c in chat_template for c in JINJA_CHARS): msg = (f"The supplied chat template ({chat_template}) " @@ -208,3 +214,28 @@ def parse_chat_messages( mm_futures.extend(parse_result.mm_futures) return conversation, mm_futures + + +def apply_chat_template( + tokenizer: AnyTokenizer, + conversation: List[ConversationMessage], + chat_template: Optional[str], + *, + tokenize: bool = False, # Different from HF's default + **kwargs: Any, +) -> str: + if chat_template is None and tokenizer.chat_template is None: + raise ValueError( + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one.") + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + chat_template=chat_template, + tokenize=tokenize, + **kwargs, + ) + assert isinstance(prompt, str) + + return prompt diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 76318a1271229..70467bd879690 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -190,8 +190,9 @@ class 
ChatCompletionRequest(OpenAIBaseModel): default=None, description=( "A Jinja template to use for this conversion. " - "If this is not passed, the model's default chat template will be " - "used instead."), + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), ) chat_template_kwargs: Optional[Dict[str, Any]] = Field( default=None, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index add1ce8acc95e..2167b967b14b5 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,6 +10,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import AsyncEngineClient from vllm.entrypoints.chat_utils import (ConversationMessage, + apply_chat_template, load_chat_template, parse_chat_messages) from vllm.entrypoints.logger import RequestLogger @@ -99,16 +100,15 @@ async def create_chat_completion( tool.model_dump() for tool in request.tools ] - prompt = tokenizer.apply_chat_template( + prompt = apply_chat_template( + tokenizer, conversation=conversation, - tokenize=False, + chat_template=request.chat_template or self.chat_template, add_generation_prompt=request.add_generation_prompt, tools=tool_dicts, documents=request.documents, - chat_template=request.chat_template or self.chat_template, **(request.chat_template_kwargs or {}), ) - assert isinstance(prompt, str) except Exception as e: logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 5b6b979b9b9e7..1aeabb7a7d729 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,7 +2,9 @@ from vllm.config import ModelConfig from vllm.engine.protocol import AsyncEngineClient -from vllm.entrypoints.chat_utils import load_chat_template, parse_chat_messages +from vllm.entrypoints.chat_utils import (apply_chat_template, + load_chat_template, + parse_chat_messages) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -70,12 +72,12 @@ async def create_tokenize( logger.warning( "Multi-modal inputs are ignored during tokenization") - prompt = tokenizer.apply_chat_template( - add_generation_prompt=request.add_generation_prompt, + prompt = apply_chat_template( + tokenizer, conversation=conversation, - tokenize=False, - chat_template=self.chat_template) - assert isinstance(prompt, str) + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + ) else: prompt = request.prompt diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index bf26d889d1388..25e4c41592c68 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -12,12 +12,12 @@ from vllm.transformers_utils.tokenizers import BaichuanTokenizer from vllm.utils import make_async +from .tokenizer_group import AnyTokenizer + logger = init_logger(__name__) -def get_cached_tokenizer( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: """Get tokenizer with cached properties. This will patch the tokenizer object in place. 
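
For reference, the behaviour the first patch introduces can be exercised directly through the new apply_chat_template helper. The sketch below assumes a vLLM checkout that includes this change (run from the repo root) and network access to fetch the facebook/opt-125m tokenizer; it illustrates the intended behaviour and is not code taken from the patch itself:

    from vllm.entrypoints.chat_utils import (apply_chat_template,
                                             load_chat_template)
    from vllm.transformers_utils.tokenizer import get_tokenizer

    tokenizer = get_tokenizer("facebook/opt-125m")
    conversation = [{"role": "user", "content": "Hello"}]

    # OPT's tokenizer defines no chat template, so omitting one now raises
    # ValueError instead of silently relying on a transformers default.
    try:
        apply_chat_template(tokenizer, conversation, chat_template=None)
    except ValueError as err:
        print(err)

    # Supplying an explicit template (here the repo's ChatML example, path
    # assumed relative to the vLLM source tree) works as before.
    template = load_chat_template("examples/template_chatml.jinja")
    prompt = apply_chat_template(tokenizer,
                                 conversation,
                                 chat_template=template,
                                 add_generation_prompt=True)
    # Expected to render roughly:
    # "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"
    print(prompt)
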
@@ -63,7 +63,7 @@ def get_tokenizer( revision: Optional[str] = None, download_dir: Optional[str] = None, **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +) -> AnyTokenizer: """Gets a tokenizer for the given model name via HuggingFace or ModelScope. """ if VLLM_USE_MODELSCOPE: From 639159b2a665173bc7a81625887b8e76f85e2e32 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 7 Aug 2024 08:54:52 -0700 Subject: [PATCH 02/18] [distributed][misc] add specialized method for cuda platform (#7249) --- .../device_communicators/custom_all_reduce.py | 8 ++- vllm/platforms/cuda.py | 37 +++++++++++++- vllm/utils.py | 50 ------------------- 3 files changed, 42 insertions(+), 53 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index a4f30808d32e1..479dc95a8b667 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -11,7 +11,8 @@ gpu_p2p_access_check) from vllm.distributed.parallel_state import in_the_same_node_as from vllm.logger import init_logger -from vllm.utils import cuda_device_count_stateless, is_full_nvlink +from vllm.platforms import current_platform +from vllm.utils import cuda_device_count_stateless try: assert ops.is_custom_op_supported("_C_custom_ar::meta_size") @@ -113,7 +114,10 @@ def __init__(self, # test nvlink first, this will filter out most of the cases # where custom allreduce is not supported # this checks hardware and driver support for NVLink - full_nvlink = is_full_nvlink(physical_device_ids) + assert current_platform.is_cuda() + from vllm.platforms.cuda import CudaPlatform + cuda_platform: CudaPlatform = current_platform + full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids) if world_size > 2 and not full_nvlink: logger.warning( "Custom allreduce is disabled because it's not supported on" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 02ba227460e3f..a7e760cc16408 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,12 +4,21 @@ import os from functools import lru_cache, wraps -from typing import Tuple +from typing import List, Tuple import pynvml +from vllm.logger import init_logger + from .interface import Platform, PlatformEnum +logger = init_logger(__name__) + +# NVML utils +# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, +# all the related functions work on real physical device ids. +# the major benefit of using NVML is that it will not initialize CUDA + def with_nvml_context(fn): @@ -47,3 +56,29 @@ class CudaPlatform(Platform): def get_device_capability(device_id: int = 0) -> Tuple[int, int]: physical_device_id = device_id_to_physical_device_id(device_id) return get_physical_device_capability(physical_device_id) + + @staticmethod + @with_nvml_context + def is_full_nvlink(physical_device_ids: List[int]) -> bool: + """ + query if the set of gpus are fully connected by nvlink (1 hop) + """ + handles = [ + pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids + ] + for i, handle in enumerate(handles): + for j, peer_handle in enumerate(handles): + if i < j: + try: + p2p_status = pynvml.nvmlDeviceGetP2PStatus( + handle, peer_handle, + pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + if p2p_status != pynvml.NVML_P2P_STATUS_OK: + return False + except pynvml.NVMLError as error: + logger.error( + "NVLink detection failed. 
This is normal if your" + " machine has no NVLink equipped.", + exc_info=error) + return False + return True diff --git a/vllm/utils.py b/vllm/utils.py index 61e3bb0bfc333..08aa889b5e447 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1034,56 +1034,6 @@ def cuda_device_count_stateless() -> int: return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) -# NVML utils -# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, -# all the related functions work on real physical device ids. -# the major benefit of using NVML is that it will not initialize CUDA - -try: - import pynvml -except ImportError: - # For non-NV devices - pynvml = None - - -def with_nvml_context(fn): - - @wraps(fn) - def wrapper(*args, **kwargs): - if pynvml is not None: - pynvml.nvmlInit() - try: - return fn(*args, **kwargs) - finally: - if pynvml is not None: - pynvml.nvmlShutdown() - - return wrapper - - -@with_nvml_context -def is_full_nvlink(device_ids: List[int]) -> bool: - """ - query if the set of gpus are fully connected by nvlink (1 hop) - """ - handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids] - for i, handle in enumerate(handles): - for j, peer_handle in enumerate(handles): - if i < j: - try: - p2p_status = pynvml.nvmlDeviceGetP2PStatus( - handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK) - if p2p_status != pynvml.NVML_P2P_STATUS_OK: - return False - except pynvml.NVMLError as error: - logger.error( - "NVLink detection failed. This is normal if your" - " machine has no NVLink equipped.", - exc_info=error) - return False - return True - - #From: https://stackoverflow.com/a/4104188/2749989 def run_once(f): From 0f7052bc7e7c3301588705abf7c7fadf3db293a6 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 7 Aug 2024 12:17:58 -0400 Subject: [PATCH 03/18] [Misc] Refactor linear layer weight loading; introduce `BasevLLMParameter` and `weight_loader_v2` (#5874) --- tests/quantization/test_compressed_tensors.py | 20 +- vllm/model_executor/__init__.py | 4 + vllm/model_executor/layers/linear.py | 153 +++++++++- .../compressed_tensors/compressed_tensors.py | 21 +- .../schemes/compressed_tensors_unquantized.py | 20 +- .../schemes/compressed_tensors_w4a16_24.py | 112 ++++--- .../schemes/compressed_tensors_w8a16_fp8.py | 51 ++-- .../schemes/compressed_tensors_w8a8_fp8.py | 51 ++-- .../schemes/compressed_tensors_w8a8_int8.py | 47 +-- .../schemes/compressed_tensors_wNa16.py | 102 +++---- vllm/model_executor/parameter.py | 277 ++++++++++++++++++ 11 files changed, 655 insertions(+), 203 deletions(-) create mode 100644 vllm/model_executor/parameter.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index bd79da84a7764..2ea340779b819 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -9,7 +9,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsWNA16) + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationType) @@ -109,7 +109,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.weight_packed.pack_factor == pack_factor + assert qkv_proj.scheme.pack_factor == pack_factor 
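
As an aside on the assertion just above: pack_factor is plain bit-packing arithmetic, a 32-bit storage word divided by the weight's bit width, matching how the WNA16 scheme later in this patch computes it (pack_factor = 32 // num_bits). A self-contained illustration, not taken from the patch:

    import torch

    def pack_factor(num_bits: int, storage_bits: int = 32) -> int:
        # Mirrors `pack_factor = 32 // num_bits` in the compressed-tensors schemes.
        return storage_bits // num_bits

    assert pack_factor(4) == 8  # eight 4-bit weights per int32 word
    assert pack_factor(8) == 4  # four 8-bit weights per int32 word

    # The packed weight shrinks its packed (input) dimension by that factor,
    # which is why the test sees an int32 weight_packed tensor.
    out_features, in_features, num_bits = 128, 256, 4
    packed = torch.empty(out_features, in_features // pack_factor(num_bits),
                         dtype=torch.int32)
    assert packed.shape == (128, 32)
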
output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -140,13 +140,17 @@ def test_compressed_tensors_fp8(vllm_runner): qkv_proj = layer.self_attn.qkv_proj assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8) - assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + assert qkv_proj.input_scale.dtype is torch.float32 - assert qkv_proj.weight_scale.dtype is torch.float32 - # should be scalars after processing - assert len(qkv_proj.input_scale.shape) == 0 - assert len(qkv_proj.weight_scale.shape) == 0 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index fb98f4a6b46f4..5c767e22de4d0 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,7 +1,11 @@ +from vllm.model_executor.parameter import (BasevLLMParameter, + PackedvLLMParameter) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed __all__ = [ "SamplingMetadata", "set_random_seed", + "BasevLLMParameter", + "PackedvLLMParameter", ] diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cd53c2b916211..646839ff303ee 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,10 +13,14 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.parameter import (BasevLLMParameter, + PackedvLLMParameter) from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) +WEIGHT_LOADER_V2_SUPPORTED = ["CompressedTensorsLinearMethod"] + def adjust_marlin_shard(param, shard_size, shard_offset): marlin_tile_size = getattr(param, "marlin_tile_size", None) @@ -288,6 +292,7 @@ def __init__(self, if output_sizes is None: output_sizes = [output_size] + self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size, @@ -295,7 +300,9 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - weight_loader=self.weight_loader, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), prefix=prefix) if bias: self.bias = Parameter( @@ -337,6 +344,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): + param.load_column_parallel_weight(loaded_weight=loaded_weight) + def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -527,6 +537,62 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + """ + Handle special case for models where MLP layers are already + fused on disk. 
In this case, we have no shard id. This function + determmines the shard id by splitting these layers and then calls + the weight loader using the shard id. + + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + + current_shard_offset = 0 + shard_offsets: List[Tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if isinstance(param, PackedvLLMParameter + ) and param.packed_dim == param.output_dim: + param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset) + + loaded_weight_shard = loaded_weight.narrow(param.output_dim, + shard_offset, + shard_size) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + param_data = param.data + if loaded_shard_id is None: + if param.output_dim is None: + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id < len(self.output_sizes) + + tp_size = get_tensor_model_parallel_world_size() + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size + shard_size = self.output_sizes[loaded_shard_id] // tp_size + + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size) + class QKVParallelLinear(ColumnParallelLinear): """Linear layers for the attention's QKV transformation. @@ -598,6 +664,82 @@ def __init__(self, quant_config=quant_config, prefix=prefix) + def _get_shard_offset_mapping(self, loaded_shard_id: str): + shard_offset_mapping = { + "q": 0, + "k": self.num_heads * self.head_size, + "v": (self.num_heads + self.num_kv_heads) * self.head_size, + "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size + } + return shard_offset_mapping.get(loaded_shard_id) + + def _get_shard_size_mapping(self, loaded_shard_id: str): + shard_size_mapping = { + "q": self.num_heads * self.head_size, + "k": self.num_kv_heads * self.head_size, + "v": self.num_kv_heads * self.head_size, + } + return shard_size_mapping.get(loaded_shard_id) + + def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + """ + Handle special case for models where QKV layers are already + fused on disk. In this case, we have no shard id. This function + determmines the shard id by splitting these layers and then calls + the weight loader using the shard id. + + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + shard_offsets = [ + # (shard_id, shard_offset, shard_size) + ("q", 0, self.total_num_heads * self.head_size), + ("k", self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + ("v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size), + ] + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. 
+ if isinstance(param, PackedvLLMParameter + ) and param.packed_dim == param.output_dim: + param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset) + + loaded_weight_shard = loaded_weight.narrow(param.output_dim, + shard_offset, + shard_size) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + param_data = param.data + if loaded_shard_id is None: # special case for certain models + if param.output_dim is None: + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id in ["q", "k", "v"] + + shard_offset = self._get_shard_offset_mapping(loaded_shard_id) + shard_size = self._get_shard_size_mapping(loaded_shard_id) + + param.load_qkv_weight(loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size) + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, @@ -798,6 +940,7 @@ def __init__(self, self.tp_size = get_tensor_model_parallel_world_size() self.input_size_per_partition = divide(input_size, self.tp_size) assert self.quant_method is not None + self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size_per_partition, @@ -805,7 +948,9 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - weight_loader=self.weight_loader, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), prefix=prefix) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " @@ -850,6 +995,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def weight_loader_v2(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + param.load_row_parallel_weight(loaded_weight=loaded_weight) + def forward(self, input_): if self.input_is_parallel: input_parallel = input_ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 39d00bd5733ff..ae75781927381 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -19,6 +19,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform +__all__ = ["CompressedTensorsLinearMethod"] + class CompressedTensorsConfig(QuantizationConfig): @@ -146,18 +148,15 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, if weight_quant is None or input_quant is None: return False - # Confirm we have floating points. - if not (weight_quant.type == QuantizationType.FLOAT - and input_quant.type == QuantizationType.FLOAT): - return False - # Confirm weight scheme is supported. 
+ is_floating_point = (weight_quant.type == QuantizationType.FLOAT + and input_quant.type == QuantizationType.FLOAT) is_symmetric_weight = weight_quant.symmetric is_static_weight = not weight_quant.dynamic is_per_tensor_or_channel_weight = (weight_quant.strategy in [ QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL ]) - if not (is_symmetric_weight and is_static_weight + if not (is_floating_point and is_symmetric_weight and is_static_weight and is_per_tensor_or_channel_weight): return False @@ -169,11 +168,7 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, is_symmetric_activation = input_quant.symmetric is_per_tensor_activation = ( input_quant.strategy == QuantizationStrategy.TENSOR) - if not (is_symmetric_activation and is_per_tensor_activation): - return False - - # All conditions satisfied. - return True + return is_symmetric_activation and is_per_tensor_activation def _is_fp8_w8a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -230,6 +225,7 @@ def _get_scheme_from_parts( group_size=weight_quant.group_size) # Detect If Activation Quantization. + # TODO @dsikka: clean-up conditions if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( @@ -237,7 +233,8 @@ def _get_scheme_from_parts( if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(not input_quant.dynamic)) + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) else: return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index b7ba29ddc9840..2e8d520eacc81 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -2,11 +2,10 @@ import torch import torch.nn.functional as F -from torch.nn import Parameter from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import ModelWeightParameter __all__ = ["CompressedTensorsUnquantized"] @@ -24,7 +23,9 @@ def get_min_capability(cls) -> int: return 70 def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - pass + # required by torch.compile to be torch.nn.Parameter + layer.weight = torch.nn.Parameter(layer.weight.data, + requires_grad=False) def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -32,14 +33,15 @@ def create_weights(self, layer: torch.nn.Module, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"weight_loader": weight_loader}) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> 
torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index c1adfdb2980b6..9ad61a64e406c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -8,7 +8,10 @@ CompressedTensorsScheme) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter) from vllm.scalar_type import scalar_types __all__ = ["CompressedTensorsW4A16Sparse24"] @@ -45,7 +48,12 @@ def get_min_capability(cls) -> int: return 80 def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - pass + # required by torch.compile to be torch.nn.Parameter + layer.weight_packed = Parameter(layer.weight_packed.data, + requires_grad=False) + layer.scale_packed = Parameter(layer.scale_packed.data, + requires_grad=False) + layer.meta = Parameter(layer.meta.data, requires_grad=False) def create_weights(self, layer: torch.nn.Module, input_size: int, output_partition_sizes: List[int], @@ -56,79 +64,65 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, pack_factor = 32 // self.quant_type.size_bits output_size_per_partition = sum(output_partition_sizes) - qweight = Parameter( - torch.empty( - input_size_per_partition // self.tile_size // 2, - output_size_per_partition * self.tile_size // pack_factor, - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qweight, - { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": pack_factor, - "marlin_tile_size": self.tile_size, - "weight_loader": weight_loader - }, - ) - - layer.register_parameter("weight_packed", qweight) + qweight = PackedvLLMParameter(data=torch.empty( + input_size_per_partition // self.tile_size // 2, + output_size_per_partition * self.tile_size // pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=pack_factor, + marlin_tile_size=self.tile_size, + weight_loader=weight_loader) input_groups = (1 if self.group_size is None else input_size_per_partition // self.group_size) - scales = Parameter( + weight_scale_args = { + "data": torch.empty( input_groups, output_size_per_partition, dtype=params_dtype, ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "output_dim": 1, - "input_dim": None if input_groups == 1 else 0, - "weight_loader": weight_loader - }, - ) - layer.register_parameter("scale_packed", scales) - - weight_shape = Parameter(torch.empty(2, dtype=torch.int64), - requires_grad=False) + "weight_loader": + weight_loader + } + + if self.group_size is not None: + scales = GroupQuantScaleParameter(output_dim=1, + input_dim=0, + **weight_scale_args) + else: + scales = ChannelQuantScaleParameter(output_dim=1, + **weight_scale_args) + + weight_shape = BasevLLMParameter(data=torch.empty(2, + dtype=torch.int64), + weight_loader=weight_loader) + + meta = PackedvLLMParameter(data=torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + dtype=torch.int16, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=1, + 
marlin_tile_size=2, + weight_loader=weight_loader) + layer.register_parameter("weight_packed", qweight) layer.register_parameter("weight_shape", weight_shape) - set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) - - meta = Parameter( - torch.empty( - input_size_per_partition // 8 // 2 // 2, - output_size_per_partition * 2, - dtype=torch.int16, - ), - requires_grad=False, - ) - set_weight_attrs( - meta, - { - "input_dim": 0, - "packed_dim": 1, - "pack_factor": 1, - "output_dim": 1, - "marlin_tile_size": 2, - "weight_loader": weight_loader - }, - ) + layer.register_parameter("scale_packed", scales) layer.register_parameter("meta", meta) max_workspace_size = ( output_size_per_partition // GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL + workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int), requires_grad=False) layer.workspace = workspace diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index eeb7c042e1d1f..3d55d55cc390d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -9,9 +9,10 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - convert_to_channelwise, create_per_channel_scale_param, - create_per_tensor_scale_param) -from vllm.model_executor.utils import set_weight_attrs + convert_to_channelwise) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) __all__ = ["CompressedTensorsW8A16Fp8"] @@ -40,11 +41,19 @@ def process_weights_after_loading(self, layer) -> None: layer.logical_widths) layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False) + else: + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, + requires_grad=False) # Weights must be transposed for marlin layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False) + if self.is_static_input_scheme: + # required by torch.compile to be torch.nn.Parameter + layer.input_scale = torch.nn.Parameter(layer.input_scale.data, + requires_grad=False) prepare_fp8_layer_for_marlin(layer, strategy="channel") def create_weights(self, layer: torch.nn.Module, input_size: int, @@ -60,35 +69,39 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, layer.orig_dtype = params_dtype # WEIGHT - weight = torch.nn.Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - }) # WEIGHT SCALE - layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = create_per_channel_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = ChannelQuantScaleParameter( + 
data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) elif self.strategy == QuantizationStrategy.TENSOR: - weight_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) else: raise ValueError( f"Unsupported weight strategy={self.strategy}, " f"supported strategies are {SUPPORTED_STRATEGIES}") + + weight_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE (to deal with converted checkpoints) if self.is_static_input_scheme: - input_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + input_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index cc9d71db140c2..8a3d24e2fd258 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,10 +8,10 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationStrategy) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear, create_per_channel_scale_param, - create_per_tensor_scale_param, cutlass_fp8_supported, - requantize_with_max_scale) -from vllm.model_executor.utils import set_weight_attrs + apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) __all__ = ["CompressedTensorsW8A8Fp8"] @@ -46,6 +46,9 @@ def process_weights_after_loading(self, layer) -> None: elif self.strategy == QuantizationStrategy.CHANNEL: weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = Parameter(layer.weight_scale.data, + requires_grad=False) else: raise ValueError(f"Unknown quantization strategy {self.strategy}") @@ -66,32 +69,40 @@ def create_weights(self, layer: torch.nn.Module, layer.logical_widths = output_partition_sizes # WEIGHT - weight = torch.nn.Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - }) # WEIGHT SCALE - layer_kwargs = {"weight_loader": weight_loader} + # TODO: update create_xxx_parameter functions to return + # the newly added parameters if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = create_per_channel_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + 
dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) else: assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + + # min requirement for fp8 kernels + weight_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - input_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + input_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + input_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 3a80863d3abbe..078380f159291 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -8,9 +8,11 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationStrategy) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_int8_linear, convert_to_channelwise, create_per_channel_scale_param, - create_per_tensor_scale_param) -from vllm.model_executor.utils import set_weight_attrs + apply_int8_linear, convert_to_channelwise) +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) class CompressedTensorsW8A8Int8(CompressedTensorsScheme): @@ -39,7 +41,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ws_channelwise = convert_to_channelwise(layer.weight_scale, self.logical_widths) layer.weight_scale = Parameter(ws_channelwise, requires_grad=False) - + else: + layer.weight_scale = Parameter(layer.weight_scale.data, + requires_grad=False) # INPUT SCALE if self.is_static_input_scheme: layer.input_scale = Parameter(layer.input_scale.max(), @@ -55,32 +59,35 @@ def create_weights(self, layer: torch.nn.Module, self.logical_widths = output_partition_sizes # WEIGHT - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - }) # WEIGHT SCALE - layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = create_per_channel_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) else: assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = 
PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - input_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + input_scale = BasevLLMParameter(data=torch.empty( + 1, dtype=torch.float32), + weight_loader=weight_loader) layer.register_parameter("input_scale", input_scale) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index b8880f7ac136f..94699c27d5cee 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,7 +1,6 @@ from typing import Callable, List, Optional import torch -from torch.nn import Parameter from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( @@ -10,7 +9,10 @@ apply_gptq_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, replace_tensor, verify_marlin_supported, verify_marlin_supports_shape) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter) from vllm.scalar_type import scalar_types __all__ = ["CompressedTensorsWNA16"] @@ -30,17 +32,12 @@ def __init__(self, self.pack_factor = 32 // num_bits self.strategy = strategy + self.group_size = -1 if group_size is None else group_size - self.group_size: int - if group_size is None: - if self.strategy != "channel": - raise ValueError( - "Marlin kernels require group quantization or " - "channelwise quantization, but found no group " - "size and strategy is not channelwise.") - self.group_size = -1 - else: - self.group_size = group_size + if self.group_size == -1 and self.strategy != "channel": + raise ValueError("Marlin kernels require group quantization or " + "channelwise quantization, but found no group " + "size and strategy is not channelwise.") if num_bits not in WNA16_SUPPORTED_TYPES_MAP: raise ValueError( @@ -63,11 +60,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): + output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. channelwise = (self.group_size == -1) - group_size = input_size if channelwise else self.group_size + group_size = self.group_size if self.group_size != -1 else input_size row_parallel = (input_size != input_size_per_partition) # In the case of channelwise quantization, we need to replicate the # scales across all gpus. 
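
For orientation before the next hunk: the group-size handling above boils down to simple arithmetic. Channelwise quantization (group_size == -1) treats the whole input dimension as one group, so a single scale survives per output channel, while grouped quantization keeps input_size // group_size scale entries per output channel. A small standalone illustration, not part of the patch:

    def num_scale_groups(input_size: int, group_size: int) -> int:
        # group_size == -1 denotes channelwise quantization, as in the scheme above.
        effective_group = input_size if group_size == -1 else group_size
        assert input_size % effective_group == 0
        return input_size // effective_group

    assert num_scale_groups(4096, -1) == 1     # channelwise: one scale per output channel
    assert num_scale_groups(4096, 128) == 32   # grouped: 32 scale entries per output channel
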
@@ -79,60 +77,51 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, input_size=input_size, group_size=group_size) - weight_scale_dim = None scales_and_zp_size = input_size // group_size if partition_scales: assert input_size_per_partition % group_size == 0 - weight_scale_dim = 1 scales_and_zp_size = input_size_per_partition // group_size - weight = Parameter( - torch.empty( - output_size_per_partition, - input_size_per_partition // self.pack_factor, - dtype=torch.int32, - ), - requires_grad=False, - ) - - set_weight_attrs( - weight, { - "input_dim": 1, - "output_dim": 0, - "packed_dim": 1, - "pack_factor": self.pack_factor, - "weight_loader": weight_loader - }) - layer.register_parameter("weight_packed", weight) - - weight_scale = Parameter( + weight = PackedvLLMParameter(input_dim=1, + output_dim=0, + weight_loader=weight_loader, + packed_factor=self.pack_factor, + packed_dim=1, + data=torch.empty( + output_size_per_partition, + input_size_per_partition // + self.pack_factor, + dtype=torch.int32, + )) + + weight_scale_args = { + "weight_loader": + weight_loader, + "data": torch.empty( output_size_per_partition, scales_and_zp_size, dtype=params_dtype, - ), - requires_grad=False, - ) - - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "input_dim": weight_scale_dim, - "output_dim": 0 - }) - layer.register_parameter("weight_scale", weight_scale) + ) + } + if self.group_size == -1: + weight_scale = ChannelQuantScaleParameter(output_dim=0, + **weight_scale_args) + else: + weight_scale = GroupQuantScaleParameter(output_dim=0, + input_dim=1, + **weight_scale_args) # A 2D array defining the original shape of the weights # before packing - weight_shape = Parameter(torch.empty(2, dtype=torch.int64), - requires_grad=False) + weight_shape = BasevLLMParameter(data=torch.empty(2, + dtype=torch.int64), + weight_loader=weight_loader) + layer.register_parameter("weight_packed", weight) + layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) - set_weight_attrs(weight_shape, { - "weight_loader": weight_loader, - "ignore_warning": True, - }) layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition @@ -154,10 +143,15 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # No zero-point layer.weight_zp = marlin_make_empty_g_idx(device) + # Update for kernel + layer.weight_packed = torch.nn.Parameter( + layer.weight_packed.t().contiguous(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.squeeze().t().contiguous(), requires_grad=False) # Repack weights from compressed-tensors format to marlin format. marlin_qweight = ops.gptq_marlin_repack( - layer.weight_packed.t().contiguous(), + layer.weight_packed, perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, @@ -166,7 +160,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Permute scales from compressed-tensors format to marlin format. 
marlin_scales = marlin_permute_scales( - layer.weight_scale.squeeze().t().contiguous(), + layer.weight_scale, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, group_size=layer.group_size) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py new file mode 100644 index 0000000000000..10239843b3222 --- /dev/null +++ b/vllm/model_executor/parameter.py @@ -0,0 +1,277 @@ +from typing import Callable, Optional, Union + +import torch +from torch.nn import Parameter + +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.logger import init_logger + +__all__ = [ + "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter", + "ModelWeightParameter", "ChannelQuantScaleParameter", + "GroupQuantScaleParameter" +] + +logger = init_logger(__name__) + + +class BasevLLMParameter(Parameter): + """ + Base parameter for vLLM linear layers. Extends the torch.nn.parameter + by taking in a linear weight loader. Will copy the loaded weight + into the parameter when the provided weight loader is called. + """ + + def __new__(cls, data: torch.Tensor, **kwargs): + + return super().__new__(cls, data=data, requires_grad=False) + + def __init__(self, data: torch.Tensor, weight_loader: Callable): + """ + Initialize the BasevLLMParameter + + :param data: torch tensor with the parameter data + :param weight_loader: weight loader callable + + :returns: a torch.nn.parameter + """ + + self._weight_loader = weight_loader + + @property + def weight_loader(self): + return self._weight_loader + + def _assert_and_load(self, loaded_weight: torch.Tensor): + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + +class _ColumnvLLMParameter(BasevLLMParameter): + """ + Private class defining weight loading functionality + (load_merged_column_weight, load_qkv_weight) + for parameters being loaded into linear layers with column + parallelism. This includes QKV and MLP layers which are + not already fused on disk. Requires an output dimension + to be defined. Called within the weight loader of + each of the column parallel linear layers. 
+ """ + + def __init__(self, output_dim: int, **kwargs): + self._output_dim = output_dim + super().__init__(**kwargs) + + @property + def output_dim(self): + return self._output_dim + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow(self.output_dim, + tp_rank * shard_size, shard_size) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + if isinstance( + self, + PackedvLLMParameter) and self.packed_dim == self.output_dim: + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size) + + param_data = self.data + + tp_rank = get_tensor_model_parallel_rank() + param_data = param_data.narrow(self.output_dim, shard_offset, + shard_size) + loaded_weight = loaded_weight.narrow(self.output_dim, + tp_rank * shard_size, shard_size) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + shard_id = kwargs.get("shard_id") + num_heads = kwargs.get("num_heads") + + if isinstance( + self, + PackedvLLMParameter) and self.output_dim == self.packed_dim: + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size) + + param_data = self.data + tp_rank = get_tensor_model_parallel_rank() + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, + shard_size) + loaded_weight = loaded_weight.narrow(self.output_dim, + shard_id * shard_size, shard_size) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class ModelWeightParameter(_ColumnvLLMParameter): + """ + Parameter class for linear layer weights. Extends the + _ColumnvLLMParameter by adding loading functionality + for linear layers with row parallel functionality. + Requires an input dimension to be defined. + """ + + def __init__(self, input_dim: int, **kwargs): + self._input_dim = input_dim + super().__init__(**kwargs) + + @property + def input_dim(self): + return self._input_dim + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.input_dim] + loaded_weight = loaded_weight.narrow(self.input_dim, + tp_rank * shard_size, shard_size) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + +class GroupQuantScaleParameter(ModelWeightParameter): + """ + Parameter class for weight scales loaded for weights with + grouped quantization. Equivalent to ModelWeightParameter. + """ + pass + + +class ChannelQuantScaleParameter(_ColumnvLLMParameter): + """ + Parameter class for weight scales loaded for weights with + channel-wise quantization. Equivalent to _ColumnvLLMParameter. + """ + pass + + +class PerTensorScaleParameter(BasevLLMParameter): + """ + Parameter class for scales where the number of scales is + equivalent to the number of logical matrices in fused linear + layers (e.g. 
for QKV, there are 3 scales loaded from disk). + This is relevant to weights with per-tensor quantization. + Adds functionality to map the scalers to a shard during + weight loading. + + Note: additional parameter manipulation may be handled + for each quantization config specifically, within + process_weights_after_loading + """ + + def __init__(self, **kwargs): + self.qkv_idxs = {"q": 0, "k": 1, "v": 2} + super().__init__(**kwargs) + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + assert shard_id in self.qkv_idxs + return self.qkv_idxs[shard_id] + + def load_merged_column_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_qkv_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_column_parallel_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def _load_into_shard_id(self, loaded_weight: torch.Tensor, + shard_id: Union[str, int], **kwargs): + """ + Slice the parameter data based on the shard id for + loading. + """ + + param_data = self.data + shard_id = self._shard_id_as_int(shard_id) + + # AutoFP8 scales do not have a shape + # compressed-tensors scales do have a shape + if len(loaded_weight.shape) != 0: + assert loaded_weight.shape[0] == 1 + loaded_weight = loaded_weight[0] + + param_data = param_data[shard_id] + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class PackedvLLMParameter(ModelWeightParameter): + """ + Parameter for model weights which are packed on disk. + Example: GPTQ Marlin weights are int4 or int8, packed into int32. + Extends the ModelWeightParameter to take in the + packed factor, the packed dimension, and optionally, marlin + tile size for marlin kernels. Adjusts the shard_size and + shard_offset for fused linear layers model weight loading + by accounting for packing and optionally, marlin tile size. 
+ """ + + def __init__(self, + packed_factor: int, + packed_dim: int, + marlin_tile_size: Optional[int] = None, + **kwargs): + self._packed_factor = packed_factor + self._packed_dim = packed_dim + self._marlin_tile = marlin_tile_size + super().__init__(**kwargs) + + @property + def packed_dim(self): + return self._packed_dim + + @property + def packed_factor(self): + return self._packed_factor + + @property + def marlin_tile(self): + return self._marlin_tile + + def _adjust_shard_indexes_for_marlin(self, shard_size, shard_offset): + return shard_size * self.marlin_tile, shard_offset * self.marlin_tile + + def adjust_shard_indexes_for_packing(self, shard_size, shard_offset): + shard_size = shard_size // self.packed_factor + shard_offset = shard_offset // self.packed_factor + if self.marlin_tile is not None: + return self._adjust_shard_indexes_for_marlin( + shard_size, shard_offset) + return shard_size, shard_offset From 564985729abc267af281de11f737cfb29b5c0abb Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:24:56 -0400 Subject: [PATCH 04/18] [ BugFix ] Move `zmq` frontend to IPC instead of TCP (#7222) --- vllm/entrypoints/openai/api_server.py | 12 ++++++++---- vllm/entrypoints/openai/rpc/client.py | 6 +++--- vllm/entrypoints/openai/rpc/server.py | 10 ++++------ vllm/envs.py | 11 ++++++----- vllm/utils.py | 12 ++++++++---- 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 88f0bd4ee4dbe..48aa904d4721d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -43,7 +43,7 @@ OpenAIServingTokenization) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_port +from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -106,16 +106,20 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: + # Select random path for IPC. + rpc_path = get_open_zmq_ipc_path() + logger.info("Multiprocessing frontend to use %s for RPC Path.", + rpc_path) + # Start RPCServer in separate process (holds the AsyncLLMEngine). - port = get_open_port(envs.VLLM_RPC_PORT) rpc_server_process = Process(target=run_rpc_server, args=(engine_args, UsageContext.OPENAI_API_SERVER, - port)) + rpc_path)) rpc_server_process.start() # Build RPCClient, which conforms to AsyncEngineClient Protocol. - async_engine_client = AsyncEngineRPCClient(port) + async_engine_client = AsyncEngineRPCClient(rpc_path) await async_engine_client.setup() try: diff --git a/vllm/entrypoints/openai/rpc/client.py b/vllm/entrypoints/openai/rpc/client.py index 043649131560c..8552c286eeeea 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -21,9 +21,9 @@ class AsyncEngineRPCClient: - def __init__(self, port: int): + def __init__(self, rpc_path: str): self.context = zmq.asyncio.Context() - self.path = f"tcp://localhost:{port}" + self.rpc_path = rpc_path async def setup(self): """Setup the client before it starts sending server requests.""" @@ -58,7 +58,7 @@ def socket(self): # to enable streaming. 
socket = self.context.socket(zmq.constants.DEALER) try: - socket.connect(self.path) + socket.connect(self.rpc_path) yield socket finally: socket.close() diff --git a/vllm/entrypoints/openai/rpc/server.py b/vllm/entrypoints/openai/rpc/server.py index 60bb23b9bde05..617c9b7070e2c 100644 --- a/vllm/entrypoints/openai/rpc/server.py +++ b/vllm/entrypoints/openai/rpc/server.py @@ -20,7 +20,7 @@ class AsyncEngineRPCServer: def __init__(self, async_engine_args: AsyncEngineArgs, - usage_context: UsageContext, port: int): + usage_context: UsageContext, rpc_path: str): # Initialize engine first. self.engine = AsyncLLMEngine.from_engine_args(async_engine_args, usage_context) @@ -30,9 +30,7 @@ def __init__(self, async_engine_args: AsyncEngineArgs, # Init socket for readiness state. self.socket = self.context.socket(zmq.constants.ROUTER) - # Note numeric form of localhost should be used for zmq bind(), - # see https://stackoverflow.com/a/8958414 - self.socket.bind(f"tcp://127.0.0.1:{port}") + self.socket.bind(rpc_path) def cleanup(self): """Cleanup all resources.""" @@ -213,6 +211,6 @@ def signal_handler() -> None: def run_rpc_server(async_engine_args: AsyncEngineArgs, - usage_context: UsageContext, port: int): - server = AsyncEngineRPCServer(async_engine_args, usage_context, port) + usage_context: UsageContext, rpc_path: str): + server = AsyncEngineRPCServer(async_engine_args, usage_context, rpc_path) asyncio.run(run_server(server)) diff --git a/vllm/envs.py b/vllm/envs.py index 81d2d80e65e46..df4c994359dbd 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,10 +1,11 @@ import os +import tempfile from typing import TYPE_CHECKING, Any, Callable, Dict, Optional if TYPE_CHECKING: VLLM_HOST_IP: str = "" VLLM_PORT: Optional[int] = None - VLLM_RPC_PORT: int = 5570 + VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() VLLM_USE_MODELSCOPE: bool = False VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_INSTANCE_ID: Optional[str] = None @@ -142,10 +143,10 @@ def get_default_config_root(): lambda: int(os.getenv('VLLM_PORT', '0')) if 'VLLM_PORT' in os.environ else None, - # used when the frontend api server is running in multi-processing mode, - # to communicate with the backend engine process over ZMQ. - 'VLLM_RPC_PORT': - lambda: int(os.getenv('VLLM_RPC_PORT', '5570')), + # path used for ipc when the frontend api server is running in + # multi-processing mode to communicate with the backend engine process. + 'VLLM_RPC_BASE_PATH': + lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()), # If true, will load models from ModelScope instead of Hugging Face Hub. 
# note that the value is true or false, not numbers diff --git a/vllm/utils.py b/vllm/utils.py index 08aa889b5e447..1fd395c04ca24 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -19,6 +19,7 @@ from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic, Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, Union, overload) +from uuid import uuid4 import numpy as np import numpy.typing as npt @@ -484,10 +485,13 @@ def get_distributed_init_method(ip: str, port: int) -> str: return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" -def get_open_port(port: Optional[int] = None) -> int: - if port is None: - # Default behavior here is to return a port for multi-gpu communication - port = envs.VLLM_PORT +def get_open_zmq_ipc_path() -> str: + base_rpc_path = envs.VLLM_RPC_BASE_PATH + return f"ipc://{base_rpc_path}/{uuid4()}" + + +def get_open_port() -> int: + port = envs.VLLM_PORT if port is not None: while True: try: From ab0f5e2823d31c150c01fc30c146b6ea801d6bdd Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 7 Aug 2024 12:29:27 -0400 Subject: [PATCH 05/18] Fixes typo in function name (#7275) Signed-off-by: Rafael Vasquez --- vllm/scripts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/scripts.py b/vllm/scripts.py index 403b22239aed0..f45bfe06047de 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -14,7 +14,7 @@ from vllm.utils import FlexibleArgumentParser -def registrer_signal_handlers(): +def register_signal_handlers(): def signal_handler(sig, frame): sys.exit(0) @@ -31,7 +31,7 @@ def serve(args: argparse.Namespace) -> None: def interactive_cli(args: argparse.Namespace) -> None: - registrer_signal_handlers() + register_signal_handlers() base_url = args.url api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY") From b764547616e6ae1517929985400b1dc62cdbc3a3 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 8 Aug 2024 00:32:07 +0800 Subject: [PATCH 06/18] [Bugfix] Fix input processor for InternVL2 model (#7164) Co-authored-by: Cyrus Leung --- tests/models/test_internvl.py | 23 +++++-- vllm/model_executor/models/internvl.py | 84 +++++++++++++++++--------- 2 files changed, 73 insertions(+), 34 deletions(-) diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index 66cb8dda248db..6aa0189648d72 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -5,6 +5,7 @@ import torch from huggingface_hub import snapshot_download from PIL.Image import Image +from transformers import AutoConfig from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, IMG_START, @@ -26,10 +27,15 @@ # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner +DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] models = [ - snapshot_download("OpenGVLab/InternVL2-1B"), - snapshot_download("OpenGVLab/InternVL2-2B"), - # snapshot_download("OpenGVLab/InternVL2-4B"), # broken + snapshot_download("OpenGVLab/InternVL2-1B", + allow_patterns=DOWNLOAD_PATTERN), + snapshot_download("OpenGVLab/InternVL2-2B", + allow_patterns=DOWNLOAD_PATTERN), + # Broken due to outdated implementation of Phi-3 + # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 + # snapshot_download("OpenGVLab/InternVL2-4B"), ] @@ -41,8 +47,17 @@ def __init__(self, hf_runner: HfRunner): self.tokenizer = hf_runner.tokenizer self.dtype = hf_runner.model.dtype + self.config = 
AutoConfig.from_pretrained(hf_runner.model_name) + self.vision_config = self.config.vision_config + self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + self.image_size = self.vision_config.image_size + def __call__(self, text: str, images: Image, **kwargs): - pixel_values = image_to_pixel_values(images).to(self.dtype) + pixel_values = image_to_pixel_values(images, self.image_size, + self.min_num, self.max_num, + self.use_thumbnail).to(self.dtype) num_patches_list = [pixel_values.shape[0]] for num_patches in num_patches_list: context_tokens = IMG_CONTEXT * self.num_image_token * num_patches diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8850fd7c6763b..49f9a4c85f2d0 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -38,9 +38,6 @@ IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) -MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000 -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500 - class InternVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -84,11 +81,9 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, return best_ratio -def calculate_num_blocks(orig_width: int, - orig_height: int, - min_num=1, - max_num=6, - image_size=448): +def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, + max_num: int, + image_size: int) -> Tuple[int, int, int]: aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio @@ -110,11 +105,9 @@ def calculate_num_blocks(orig_width: int, # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def dynamic_preprocess(image, - min_num=1, - max_num=6, - image_size=448, - use_thumbnail=False): +def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, + image_size: int, + use_thumbnail: int) -> List[Image.Image]: orig_width, orig_height = image.size blocks, target_width, target_height = calculate_num_blocks( @@ -138,12 +131,14 @@ def dynamic_preprocess(image, # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6): +def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int, + max_num: int, use_thumbnail: bool) -> torch.Tensor: transform = build_transform(input_size=input_size) images = dynamic_preprocess(image, + min_num=min_num, + max_num=max_num, image_size=input_size, - use_thumbnail=True, - max_num=max_num) + use_thumbnail=use_thumbnail) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) return pixel_values @@ -159,12 +154,18 @@ def get_internvl_num_patches(image_size: int, patch_size: int, def get_max_internvl_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(PretrainedConfig) vision_config = hf_config.vision_config + + use_thumbnail = hf_config.use_thumbnail + max_dynamic_patch = hf_config.max_dynamic_patch + if use_thumbnail: + max_dynamic_patch += 1 + downsample_ratio = hf_config.downsample_ratio + image_size = vision_config.image_size patch_size = vision_config.patch_size - downsample_ratio = hf_config.downsample_ratio num_patches = get_internvl_num_patches(image_size, patch_size, downsample_ratio) - return num_patches * 7 + return num_patches * max_dynamic_patch def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): @@ -176,21 +177,27 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: 
LLMInputs): hf_config = ctx.get_hf_config(PretrainedConfig) vision_config = hf_config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + image_data = multi_modal_data["image"] if isinstance(image_data, Image.Image): width, height = image_data.size - num_blocks, _, _ = calculate_num_blocks(width, height) + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + num_blocks, _, _ = calculate_num_blocks(width, height, min_num, + max_num, image_size) + # add thumbnail image if num_blocks > 1 + if hf_config.use_thumbnail and num_blocks > 1: + num_blocks += 1 elif isinstance(image_data, torch.Tensor): raise NotImplementedError("Embeddings input is not supported yet") else: raise TypeError(f"Invalid image type: {type(image_data)}") - image_size = vision_config.image_size - patch_size = vision_config.patch_size - downsample_ratio = hf_config.downsample_ratio - num_patches = get_internvl_num_patches(image_size, patch_size, - downsample_ratio) - tokenizer = cached_get_tokenizer(model_config.tokenizer, trust_remote_code=True) @@ -198,8 +205,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): prompt_token_ids = llm_inputs["prompt_token_ids"] if prompt is None: prompt = tokenizer.decode(prompt_token_ids) - image_prompt = IMG_START + IMG_CONTEXT * (num_blocks + - 1) * num_patches + IMG_END + image_prompt = IMG_START + IMG_CONTEXT * num_blocks * num_patches + IMG_END new_prompt = prompt.replace('', image_prompt, 1) new_prompt_token_ids = tokenizer.encode(new_prompt) @@ -209,8 +215,19 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): def input_mapper_for_internvl(ctx: InputContext, data: object): + hf_config = ctx.get_hf_config(PretrainedConfig) + + use_thumbnail = hf_config.use_thumbnail + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + image_size = hf_config.vision_config.image_size + if isinstance(data, Image.Image): - data = image_to_pixel_values(data) + data = image_to_pixel_values(data, + image_size, + min_num, + max_num, + use_thumbnail=use_thumbnail) model_config = ctx.model_config tokenizer = cached_get_tokenizer(model_config.tokenizer, trust_remote_code=True) @@ -240,10 +257,17 @@ def dummy_data_for_internvl(ctx: InputContext, seq_len: int): add_special_tokens=False)[0], image_feature_size_override=image_feature_size, ) + + image_size = vision_config.image_size + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + max_image_width = max_num * image_size + max_image_height = min_num * image_size + mm_data = dummy_image_for_clip( vision_config, - image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width_override=max_image_width, + image_height_override=max_image_height, ) return seq_data, mm_data From 80cbe10c59a3354d0fc7c841fdc6422fc64899aa Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 7 Aug 2024 20:49:10 +0400 Subject: [PATCH 07/18] [OpenVINO] migrate to latest dependencies versions (#7251) --- Dockerfile.openvino | 2 +- .../getting_started/openvino-installation.rst | 2 +- requirements-openvino.txt | 33 ++----------------- setup.py | 2 +- 4 files changed, 6 insertions(+), 33 deletions(-) diff --git a/Dockerfile.openvino b/Dockerfile.openvino index c84dea419e58a..06ca4638dfeb9 100644 --- a/Dockerfile.openvino +++ 
b/Dockerfile.openvino @@ -21,7 +21,7 @@ COPY setup.py /workspace/vllm/ # install build requirements RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt # build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ COPY examples/ /workspace/vllm/examples COPY benchmarks/ /workspace/vllm/benchmarks diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst index 62256df091a44..d8f27c4328a58 100644 --- a/docs/source/getting_started/openvino-installation.rst +++ b/docs/source/getting_started/openvino-installation.rst @@ -57,7 +57,7 @@ Install from source .. code-block:: console - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . .. _openvino_backend_performance_tips: diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 2dd971d6400be..419294aa75626 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,34 +1,7 @@ # Common dependencies -# -r requirements-common.txt -# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2 -cmake >= 3.21 -ninja # For faster builds. -psutil -sentencepiece # Required for LLaMA tokenizer. -numpy < 2.0.0 -requests -tqdm -py-cpuinfo -transformers < 4.43 -tokenizers >= 0.19.1 # Required for Llama 3. -fastapi -aiohttp -openai -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. 
-pillow # Required for image processing -prometheus_client >= 0.18.0 -prometheus-fastapi-instrumentator >= 7.0.0 -tiktoken >= 0.6.0 # Required for DBRX tokenizer -lm-format-enforcer == 0.10.3 -outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 -typing_extensions -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 -pyzmq -gguf == 0.9.1 +-r requirements-common.txt # OpenVINO dependencies torch >= 2.1.2 -openvino ~= 2024.3.0.dev -openvino-tokenizers[transformers] ~= 2024.3.0.0.dev -optimum-intel[openvino] >= 1.18.1 +openvino ~= 2024.3.0 +optimum-intel[openvino] >= 1.18.2 diff --git a/setup.py b/setup.py index b146299f8269d..f6e005879aeff 100644 --- a/setup.py +++ b/setup.py @@ -272,7 +272,7 @@ def _build_custom_ops() -> bool: def _build_core_ext() -> bool: - return not _is_neuron() and not _is_tpu() + return not _is_neuron() and not _is_tpu() and not _is_openvino() def get_hipcc_rocm_version(): From 0e12cd67a8f84047e6d28084bfe94a8278e10218 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Aug 2024 09:58:02 -0700 Subject: [PATCH 08/18] [Doc] add online speculative decoding example (#7243) --- docs/source/models/spec_decode.rst | 66 +++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst index be901fa881b12..d3c196faff25d 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/models/spec_decode.rst @@ -14,17 +14,17 @@ Speculative decoding is a technique which improves inter-token latency in memory Speculating with a draft model ------------------------------ -The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time. +The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. .. code-block:: python from vllm import LLM, SamplingParams - + prompts = [ "The future of AI is", ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - + llm = LLM( model="facebook/opt-6.7b", tensor_parallel_size=1, @@ -33,12 +33,56 @@ The following code configures vLLM to use speculative decoding with a draft mode use_v2_block_manager=True, ) outputs = llm.generate(prompts, sampling_params) - + for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +To perform the same with an online mode launch the server: + +.. code-block:: bash + + python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 + + Then use a client: + +.. code-block:: python + + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. 
+ openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, + ) + + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + Speculating by matching n-grams in the prompt --------------------------------------------- @@ -48,12 +92,12 @@ matching n-grams in the prompt. For more information read `this thread. `_ or +For more information see `this blog `_ or `this technical report `_. .. code-block:: python @@ -100,9 +144,9 @@ For more information see `this blog Date: Wed, 7 Aug 2024 15:09:36 -0300 Subject: [PATCH 09/18] [BugFix] Fix frontend multiprocessing hang (#7217) Signed-off-by: Max de Bayser Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- tests/entrypoints/openai/test_mp_crash.py | 35 +++++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 11 ++++++- vllm/entrypoints/openai/rpc/client.py | 26 ++++++++++++++--- 3 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 tests/entrypoints/openai/test_mp_crash.py diff --git a/tests/entrypoints/openai/test_mp_crash.py b/tests/entrypoints/openai/test_mp_crash.py new file mode 100644 index 0000000000000..7dc595a7be351 --- /dev/null +++ b/tests/entrypoints/openai/test_mp_crash.py @@ -0,0 +1,35 @@ +from typing import Any + +import pytest + +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.api_server import build_async_engine_client +from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.utils import FlexibleArgumentParser + + +def crashing_from_engine_args( + cls, + engine_args: Any = None, + start_engine_loop: Any = None, + usage_context: Any = None, + stat_loggers: Any = None, +) -> "AsyncLLMEngine": + raise Exception("foo") + + +@pytest.mark.asyncio +async def test_mp_crash_detection(monkeypatch): + + with pytest.raises(RuntimeError) as excinfo, monkeypatch.context() as m: + m.setattr(AsyncLLMEngine, "from_engine_args", + crashing_from_engine_args) + parser = FlexibleArgumentParser( + description="vLLM's remote OpenAI server.") + parser = make_arg_parser(parser) + args = parser.parse_args([]) + + async with build_async_engine_client(args): + pass + assert "The server process died before responding to the readiness probe"\ + in str(excinfo.value) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 48aa904d4721d..d44604b12fb69 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -120,9 +120,18 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Build RPCClient, which conforms to AsyncEngineClient Protocol. 
async_engine_client = AsyncEngineRPCClient(rpc_path) - await async_engine_client.setup() try: + while True: + try: + await async_engine_client.setup() + break + except TimeoutError as e: + if not rpc_server_process.is_alive(): + raise RuntimeError( + "The server process died before " + "responding to the readiness probe") from e + yield async_engine_client finally: # Ensure rpc server process was terminated diff --git a/vllm/entrypoints/openai/rpc/client.py b/vllm/entrypoints/openai/rpc/client.py index 8552c286eeeea..d69b202e2d1bb 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -18,6 +18,9 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +# Time to wait before checking it the server process is alive. +SERVER_START_TIMEOUT_MS = 1000 + class AsyncEngineRPCClient: @@ -61,7 +64,16 @@ def socket(self): socket.connect(self.rpc_path) yield socket finally: - socket.close() + # linger == 0 means discard unsent messages + # when the socket is closed. This is necessary + # because otherwise self.context.destroy() will + # wait for 30 seconds until unsent messages are + # received, which is impossible if the server + # crashed. In the absence of a server crash we + # always expect a response before closing the + # socket anyway. + # Reference: http://api.zeromq.org/4-2:zmq-setsockopt#toc24 + socket.close(linger=0) async def _send_get_data_rpc_request(self, request: RPCUtilityRequest, expected_type: Any, @@ -85,14 +97,19 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest, return data - async def _send_one_way_rpc_request(self, request: RPC_REQUEST_TYPE, - error_message: str): + async def _send_one_way_rpc_request(self, + request: RPC_REQUEST_TYPE, + error_message: str, + timeout: Optional[int] = None): """Send one-way RPC request to trigger an action.""" with self.socket() as socket: # Ping RPC Server with request. await socket.send(cloudpickle.dumps(request)) # Await acknowledgement from RPCServer. 
+ if timeout is not None and await socket.poll(timeout=timeout) == 0: + raise TimeoutError(f"server didn't reply within {timeout} ms") + response = cloudpickle.loads(await socket.recv()) if not isinstance(response, str) or response != VLLM_RPC_SUCCESS_STR: @@ -117,7 +134,8 @@ async def wait_for_server(self): await self._send_one_way_rpc_request( request=RPCUtilityRequest.IS_SERVER_READY, - error_message="Unable to start RPC Server.") + error_message="Unable to start RPC Server.", + timeout=SERVER_START_TIMEOUT_MS) async def _get_model_config_rpc(self) -> ModelConfig: """Get the ModelConfig object from the RPC Server""" From 5223199e03ac3729eb60043a1ef57156c8af1bc9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 7 Aug 2024 14:23:12 -0400 Subject: [PATCH 10/18] [Bugfix][FP8] Fix dynamic FP8 Marlin quantization (#7219) --- tests/quantization/test_fp8.py | 19 +++++++++++++++---- vllm/envs.py | 8 ++++++++ .../model_executor/layers/quantization/fp8.py | 11 ++++++++++- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a020f7bf37262..ebb06ed20f249 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -9,6 +9,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod, Fp8LinearMethod) +from vllm.platforms import current_platform MODELS = [ "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", @@ -20,7 +21,12 @@ @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", MODELS) -def test_model_load_and_run(vllm_runner, model_id: str): +@pytest.mark.parametrize("force_marlin", [False, True]) +def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, + monkeypatch) -> None: + if force_marlin: + monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") + with vllm_runner(model_id) as llm: # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -61,7 +67,12 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: +@pytest.mark.parametrize("force_marlin", [False, True]) +def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, + monkeypatch) -> None: + if force_marlin: + monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") + with vllm_runner("facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: @@ -75,9 +86,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: assert attn._k_scale == 1.0 assert attn._v_scale == 1.0 - capability = torch.cuda.get_device_capability() + capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - if capability >= 89: + if capability >= 89 and not force_marlin: # For GPUs with hardware support, we keep weights in fp8 assert fc1.weight.dtype == torch.float8_e4m3fn else: diff --git a/vllm/envs.py b/vllm/envs.py index df4c994359dbd..81f30b1d42a13 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -52,6 +52,7 @@ CMAKE_BUILD_TYPE: Optional[str] = None VERBOSE: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False + VLLM_TEST_FORCE_FP8_MARLIN: bool = False def get_default_cache_root(): @@ -342,6 +343,13 @@ def 
get_default_config_root(): lambda: (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in ("1", "true")), + + # If set, forces FP8 Marlin to be used for FP8 quantization regardless + # of the hardware support for FP8 compute. + "VLLM_TEST_FORCE_FP8_MARLIN": + lambda: + (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in + ("1", "true")), } # end-env-vars-definition diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c829cb836ee4c..cdd2413f5b2c4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,6 +4,7 @@ from torch.nn import Module from torch.nn.parameter import Parameter +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase @@ -118,7 +119,7 @@ def __init__(self, quant_config: Fp8Config): # kernel for fast weight-only FP8 quantization capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 + self.use_marlin = capability < 89 or envs.VLLM_TEST_FORCE_FP8_MARLIN def create_weights( self, @@ -174,6 +175,14 @@ def process_weights_after_loading(self, layer: Module) -> None: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + # If using marlin (w8a16), kernel uses channelwise weights, + # so extend the weight scales to be channelwise. + if self.use_marlin: + assert weight_scale.numel() == 1 + weight_scale = convert_to_channelwise( + weight_scale.expand(len(layer.logical_widths)), + layer.logical_widths) + # Update the layer with the new values. layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) From 469b3bc538bcbdc1d9b7d30c415bdb0361853871 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 7 Aug 2024 11:34:25 -0700 Subject: [PATCH 11/18] [ci] Make building wheels per commit optional (#7278) Signed-off-by: kevin --- .buildkite/release-pipeline.yaml | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 5be9a553dddd4..416fe344a36ea 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,9 +1,27 @@ steps: - - label: "Build wheel - CUDA {{matrix.cuda_version}}" + - label: "Build wheel - CUDA 12.1" agents: queue: cpu_queue commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + # rename the files to change linux -> manylinux1 + - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + env: + DOCKER_BUILDKIT: "1" + + - block: "Build CUDA 11.8 wheel" + key: block-build-cu118-wheel + + - label: "Build wheel - CUDA 11.8" + depends_on: block-build-cu118-wheel + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" # rename the files to change linux -> manylinux1 @@ -12,8 +30,3 @@ steps: - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" env: DOCKER_BUILDKIT: "1" - matrix: - setup: - cuda_version: - - "11.8.0" - - "12.1.0" From 311f743831f164f83361295b562f6cdb88ad3418 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 7 Aug 2024 16:05:37 -0400 Subject: [PATCH 12/18] [Bugfix] Fix gptq failure on T4s (#7264) --- .../layers/quantization/awq_marlin.py | 3 +-- .../layers/quantization/gptq_marlin.py | 3 +-- .../layers/quantization/utils/marlin_utils.py | 23 ++++++++++--------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 2cc080608c7a9..1b0453c2bd6f8 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -126,8 +126,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): return check_marlin_supported(quant_type=cls.TYPE_MAP[num_bits], group_size=group_size, - has_zp=has_zp, - min_capability=cls.get_min_capability()) + has_zp=has_zp) class AWQMarlinLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 066102f3a01c0..b92697531c299 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -136,8 +136,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): return False return check_marlin_supported(quant_type=cls.TYPE_MAP[(num_bits, sym)], - group_size=group_size, - min_capability=cls.get_min_capability()) + group_size=group_size) class GPTQMarlinLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 6e84d36219361..0ec68ac5b0f21 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -26,12 +26,13 @@ # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ. 
# TODO: we may want to move this into the C++ so its closer to the actual impl def query_marlin_supported_quant_types(has_zp: bool, - min_capability: Optional[int] = None): - if min_capability is None: + device_capability: Optional[int] = None + ): + if device_capability is None: major, minor = current_platform.get_device_capability() - min_capability = major * 10 + minor + device_capability = major * 10 + minor - if min_capability < 80: + if device_capability < 80: return [] if has_zp: @@ -48,20 +49,20 @@ def _check_marlin_supported( quant_type: ScalarType, group_size: Optional[int], has_zp: bool, - min_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]: + device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]: - if min_capability is None: + if device_capability is None: major, minor = current_platform.get_device_capability() - min_capability = major * 10 + minor + device_capability = major * 10 + minor supported_types = query_marlin_supported_quant_types( - has_zp, min_capability) + has_zp, device_capability) if quant_type not in supported_types: return (False, f"Marlin does not support weight_bits = {quant_type}. " f"Only types = {supported_types} " f"are supported (for group_size = {group_size}, " - f"min_capability = {min_capability}, zp = {has_zp}).") + f"device_capability = {device_capability}, zp = {has_zp}).") if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES): return (False, f"Marlin does not support group_size = {group_size}. " f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} " @@ -73,9 +74,9 @@ def _check_marlin_supported( def check_marlin_supported(quant_type: ScalarType, group_size: int, has_zp: bool = False, - min_capability: Optional[int] = None) -> bool: + device_capability: Optional[int] = None) -> bool: cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, - min_capability) + device_capability) return cond From fc1493a01edf0ae6f50299daae15679e4c08d74c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Aug 2024 13:35:14 -0700 Subject: [PATCH 13/18] [FrontEnd] Make `merge_async_iterators` `is_cancelled` arg optional (#7282) --- vllm/utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 1fd395c04ca24..4137aaec8a93c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -405,7 +405,7 @@ async def iterate_with_cancellation( async def merge_async_iterators( *iterators: AsyncGenerator[T, None], - is_cancelled: Callable[[], Awaitable[bool]], + is_cancelled: Optional[Callable[[], Awaitable[bool]]] = None, ) -> AsyncGenerator[Tuple[int, T], None]: """Merge multiple asynchronous iterators into a single iterator. @@ -413,8 +413,8 @@ async def merge_async_iterators( When it yields, it yields a tuple (i, item) where i is the index of the iterator that yields the item. - It also polls the provided function at least once per second to check - for client cancellation. + It also optionally polls a provided function at least once per second + to check for client cancellation. 
""" # Can use anext() in python >= 3.10 @@ -422,12 +422,13 @@ async def merge_async_iterators( ensure_future(pair[1].__anext__()): pair for pair in enumerate(iterators) } + timeout = None if is_cancelled is None else 1 try: while awaits: done, pending = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED, - timeout=1) - if await is_cancelled(): + timeout=timeout) + if is_cancelled is not None and await is_cancelled(): raise asyncio.CancelledError("client cancelled") for d in done: pair = awaits.pop(d) From 6d94420246f2d88137c04f357502a6a8c38a2e8e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 7 Aug 2024 17:21:50 -0400 Subject: [PATCH 14/18] [Doc] Update supported_hardware.rst (#7276) --- .../quantization/supported_hardware.rst | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index ecc330d866dbd..bb41bfed342c6 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -5,18 +5,20 @@ Supported Hardware for Quantization Kernels The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== -Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== -AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -FP8 ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin (GPTQ/AWQ/FP8) ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +INT8 (W8A8) ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 (W8A8) ❌ ❌ ❌ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GGUF ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== Notes: ^^^^^^ @@ -27,4 +29,4 @@ Notes: Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. 
From e53dfd3eafd9bd5cc217c3796c7d1911a88ec893 Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Wed, 7 Aug 2024 16:26:52 -0700 Subject: [PATCH 15/18] [Kernel] Fix Flashinfer Correctness (#7284) --- vllm/attention/backends/flashinfer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 03188164a9637..64b1f4a89f23c 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -127,6 +127,7 @@ def __post_init__(self): raise ValueError( f"Only {supported_head_sizes} are supported for head_dim,", f"received {self.head_dim}.") + self.is_profile_run = is_block_tables_empty(self.block_tables) def begin_forward(self): if self.num_prefill_tokens > 0: @@ -140,11 +141,14 @@ def begin_forward(self): assert self.paged_kv_last_page_len is not None batch_size = self.query_start_loc.shape[0] - 1 assert batch_size >= 0 - # The prefill stage does not read kv cache. + # The profile run does not read kv cache. # Both paged_kv_indices and paged_kv_last_page_len are empty. # paged_kv_indptr is a zero tensor with size batch_size + 1. - self.paged_kv_indptr = torch.zeros(batch_size + 1, - device=self.device) + if self.is_profile_run: + self.paged_kv_indptr = torch.zeros(batch_size + 1, + device=self.device) + else: + self.paged_kv_indptr = self.paged_kv_indptr.to(self.device) self.paged_kv_last_page_len = self.paged_kv_last_page_len.to( self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) From 746709642c81aa22926765aef67e086a15aef076 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:06:01 -0700 Subject: [PATCH 16/18] [Misc] Fix typos in scheduler.py (#7285) Signed-off-by: Rui Qiao --- vllm/core/scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f60463107be44..950abfccba4c3 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -678,7 +678,7 @@ def _schedule_prefills( all tokens. Returns: - SchedulerSwappedInOutputs. + SchedulerPrefillOutputs. """ ignored_seq_groups: List[SequenceGroup] = [] seq_groups: List[SequenceGroup] = [] @@ -851,7 +851,7 @@ def _schedule_default(self) -> SchedulerOutputs: preempted=preempted, ) - def _schedule_chunked_prefill(self): + def _schedule_chunked_prefill(self) -> SchedulerOutputs: """Schedule queued requests. Chunked prefill allows to chunk prefill requests, batch them together @@ -862,7 +862,7 @@ def _schedule_chunked_prefill(self): The policy can sustain the high GPU utilization because it can put prefill and decodes requests to the same batch, while it improves - inter token latency because decodes requests don't need to blocked + inter token latency because decodes requests don't need to be blocked by prefill requests. 
""" budget = SchedulingBudget( From 48abee9e5492924a69551d859d66d98874d72d60 Mon Sep 17 00:00:00 2001 From: Cherilyn Buren <88433283+NiuBlibing@users.noreply.github.com> Date: Thu, 8 Aug 2024 14:17:29 +0800 Subject: [PATCH 17/18] [Frontend] remove max_num_batched_tokens limit for lora (#7288) --- vllm/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index ec6d587e7925b..2ac31657979f2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1377,11 +1377,6 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - if scheduler_config.max_num_batched_tokens > 65528: - raise ValueError( - "Due to limitations of the custom LoRA CUDA kernel, " - "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled.") if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") From 6dffa4b0a6120159ef2fe44d695a46817aff65bc Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Thu, 8 Aug 2024 00:02:27 -0700 Subject: [PATCH 18/18] [Bugfix] Fix LoRA with PP (#7292) --- vllm/lora/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 279477562a940..bc4cab1470f44 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -25,6 +25,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA +from vllm.model_executor.models.utils import PPMissingLayer from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -432,6 +433,8 @@ def remove_all_adapters(self): def _create_lora_modules(self): for module_name, module in self.model.named_modules( remove_duplicate=False): + if isinstance(module, PPMissingLayer): + continue if not self._match_target_modules(module_name): continue parts = module_name.split(".")[-1]