From cbb9dfbd251e46aa1052b2af6412ea5161340228 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 13:01:41 -0700 Subject: [PATCH 01/37] fix --- vllm/model_executor/models/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 3f52eb44edfff..8c68ab1459d70 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -117,6 +117,7 @@ # Architecture -> type. # out of tree models _OOT_MODELS: Dict[str, Type[nn.Module]] = {} +_OOT_MULTIMODAL_MODELS: Dict[str, Type[nn.Module]] = {} # Models not supported by ROCm. _ROCM_UNSUPPORTED_MODELS: List[str] = [] @@ -189,12 +190,21 @@ def get_supported_archs() -> List[str]: return list(_MODELS.keys()) + list(_OOT_MODELS.keys()) @staticmethod - def register_model(model_arch: str, model_cls: Type[nn.Module]): + def register_model(model_arch: str, + model_cls: Type[nn.Module], + is_multimodal: bool = False): if model_arch in _MODELS: logger.warning( "Model architecture %s is already registered, and will be " "overwritten by the new model class %s.", model_arch, model_cls.__name__) + + # NOTE: This is needed to store the information if the OOT model is + # an multimodal model. + if is_multimodal: + global _OOT_MULTIMODAL_MODELS + _OOT_MULTIMODAL_MODELS[model_arch] = model_cls + global _OOT_MODELS _OOT_MODELS[model_arch] = model_cls @@ -209,7 +219,8 @@ def is_multimodal_model(model_arch: str) -> bool: # use `supports_multimodal` to determine if a model is multimodal # model_cls = ModelRegistry._try_load_model_cls(model_arch) # from vllm.model_executor.models.interfaces import supports_multimodal - return model_arch in _MULTIMODAL_MODELS + return (model_arch in _MULTIMODAL_MODELS + or model_arch in _OOT_MULTIMODAL_MODELS) __all__ = [ From 7ae3e071b4373d0056e745955167263a27c5be66 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 13:09:01 -0700 Subject: [PATCH 02/37] update doc --- docs/source/models/adding_model.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 5cffb58cafd96..b3459244b6334 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -114,6 +114,14 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +If your model is a multimodal model, then indicate so by setting the `is_multimodal` flag. + +.. code-block:: python + + from vllm import ModelRegistry + from your_code import YourModelForCausalLM + ModelRegistry.register_model("YourModelForConditionalGeneration", YourModelForConditionalGeneration, is_multimodal=True) + If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: .. code-block:: python From b67ed869ba75b85c89b4d9b5ca4be46070ced347 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 13:09:29 -0700 Subject: [PATCH 03/37] iterate --- docs/source/models/adding_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index b3459244b6334..61c33ee5c02a3 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -119,7 +119,7 @@ If your model is a multimodal model, then indicate so by setting the `is_multimo .. 
code-block:: python from vllm import ModelRegistry - from your_code import YourModelForCausalLM + from your_code import YourModelForConditionalGeneration ModelRegistry.register_model("YourModelForConditionalGeneration", YourModelForConditionalGeneration, is_multimodal=True) If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: From a9f3d3fbc34bfae677abfb667084b2f01ee88022 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 13:18:21 -0700 Subject: [PATCH 04/37] typo --- vllm/model_executor/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 8c68ab1459d70..b943096b6b0c4 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -200,7 +200,7 @@ def register_model(model_arch: str, model_cls.__name__) # NOTE: This is needed to store the information if the OOT model is - # an multimodal model. + # a multimodal model. if is_multimodal: global _OOT_MULTIMODAL_MODELS _OOT_MULTIMODAL_MODELS[model_arch] = model_cls From 1d174d5b8a64bf12a27febdc16345a47ad43e97b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 14:44:27 -0700 Subject: [PATCH 05/37] update --- docs/source/models/adding_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 61c33ee5c02a3..2861634665912 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If your model is a multimodal model, then indicate so by setting the `is_multimodal` flag. +If your model is a multimodal model, then indicate so by setting the :code:`is_multimodal` flag. .. 
code-block:: python From 4ec5b75f29c3efe9e795a02c4fe0719b0f83e430 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 15:17:44 -0700 Subject: [PATCH 06/37] add test --- tests/conftest.py | 19 ++++++++++++++++++ tests/models/test_oot_registration.py | 20 +++++++++++++++++++ .../vllm_add_dummy_model/__init__.py | 20 +++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index c2616bcf7091c..c2e2b53e956b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -878,3 +878,22 @@ def dummy_opt_path(): with open(json_path, "w") as f: json.dump(config, f) return _dummy_path + + +@pytest.fixture +def dummy_phi3v_path(): + json_path = os.path.join(_dummy_path, "config.json") + if not os.path.exists(_dummy_path): + snapshot_download(repo_id="microsoft/Phi-3-vision-128k-instruct", + local_dir=_dummy_path, + ignore_patterns=[ + "*.bin", "*.bin.index.json", "*.pt", "*.h5", + "*.msgpack" + ]) + assert os.path.exists(json_path) + with open(json_path, "r") as f: + config = json.load(f) + config["architectures"] = ["MyPhi3VForCausalLM"] + with open(json_path, "w") as f: + json.dump(config, f) + return _dummy_path diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 5cb82a5ac4c7d..873c0f261680b 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -29,3 +29,23 @@ def test_oot_registration(dummy_opt_path): # make sure only the first token is generated rest = generated_text.replace(first_token, "") assert rest == "" + + +@fork_new_process_for_each_test +def test_oot_mutlimodal_registration(dummy_phi3v_path): + os.environ["VLLM_PLUGINS"] = "register_dummy_model" + prompts = ["Hello, my name is", "The text does not matter"] + sampling_params = SamplingParams(temperature=0) + llm = LLM(model=dummy_phi3v_path, + load_format="dummy", + max_num_seqs=1, + trust_remote_code=True, + limit_mm_per_prompt={"image": 2}) + first_token = llm.get_tokenizer().decode(0) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + generated_text = output.outputs[0].text + # make sure only the first token is generated + rest = generated_text.replace(first_token, "") + assert rest == "" diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index dcc0305e657ab..dd04c193932c4 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -4,6 +4,7 @@ from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM +from vllm.model_executor.models.phi3v import Phi3VForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -20,7 +21,26 @@ def compute_logits( return logits +class MyPhi3VForCausalLM(Phi3VForCausalLM): + + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + # this dummy model always predicts the first token + logits = super().compute_logits(hidden_states, sampling_metadata) + if logits is not None: + logits.zero_() + logits[:, 0] += 1.0 + return logits + + def register(): # register our dummy model if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) + + # register our dummy multimodal model + if "MyPhi3VForCausalLM" not in ModelRegistry.get_supported_archs(): + 
ModelRegistry.register_model("MyPhi3VForCausalLM", + MyPhi3VForCausalLM, + is_multimodal=True) From 0ce8165ca968667384ad38482289ad053be54f7c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 18:12:35 -0700 Subject: [PATCH 07/37] update conftest --- tests/conftest.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c2e2b53e956b9..86db96322f7c5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -858,15 +858,16 @@ def num_gpus_available(): temp_dir = tempfile.gettempdir() -_dummy_path = os.path.join(temp_dir, "dummy_opt") +_dummy_opt_path = os.path.join(temp_dir, "dummy_opt") +_dummy_phi3v_path = os.path.join(temp_dir, "dummy_phi3v") @pytest.fixture def dummy_opt_path(): - json_path = os.path.join(_dummy_path, "config.json") - if not os.path.exists(_dummy_path): + json_path = os.path.join(_dummy_opt_path, "config.json") + if not os.path.exists(_dummy_opt_path): snapshot_download(repo_id="facebook/opt-125m", - local_dir=_dummy_path, + local_dir=_dummy_opt_path, ignore_patterns=[ "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack" @@ -877,15 +878,15 @@ def dummy_opt_path(): config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: json.dump(config, f) - return _dummy_path + return _dummy_opt_path @pytest.fixture def dummy_phi3v_path(): - json_path = os.path.join(_dummy_path, "config.json") - if not os.path.exists(_dummy_path): + json_path = os.path.join(_dummy_phi3v_path, "config.json") + if not os.path.exists(_dummy_phi3v_path): snapshot_download(repo_id="microsoft/Phi-3-vision-128k-instruct", - local_dir=_dummy_path, + local_dir=_dummy_phi3v_path, ignore_patterns=[ "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack" @@ -896,4 +897,4 @@ def dummy_phi3v_path(): config["architectures"] = ["MyPhi3VForCausalLM"] with open(json_path, "w") as f: json.dump(config, f) - return _dummy_path + return _dummy_phi3v_path From 84094a46a31b1ad516157288959a4ff57ff3cb02 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 18:32:37 -0700 Subject: [PATCH 08/37] add plugin loading to model config --- vllm/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 7a15606836dcc..17fd4e96615c9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -151,6 +151,10 @@ def __init__(self, use_async_output_proc: bool = True, override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO) -> None: + + from vllm.plugins import load_general_plugins + load_general_plugins() + self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode From 0c36bb181de0a9046f5f23b8a054642d5c8b1109 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 19:08:04 -0700 Subject: [PATCH 09/37] fix and add test --- tests/models/test_oot_registration.py | 17 ++++++++++++++++- .../vllm_add_dummy_model/__init__.py | 11 ++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 873c0f261680b..fbe3f142f7015 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -3,6 +3,7 @@ import pytest from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset from ..utils import fork_new_process_for_each_test @@ -31,10 +32,24 @@ def test_oot_registration(dummy_opt_path): assert rest == "" +image = ImageAsset("cherry_blossom").pil_image.convert("RGB") 
+ + @fork_new_process_for_each_test def test_oot_mutlimodal_registration(dummy_phi3v_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" - prompts = ["Hello, my name is", "The text does not matter"] + prompts = [{ + "prompt": "What's in the image?<|image_1|>", + "multi_modal_data": { + "image": image + }, + }, { + "prompt": "Describe the image<|image_1|>", + "multi_modal_data": { + "image": image + }, + }] + sampling_params = SamplingParams(temperature=0) llm = LLM(model=dummy_phi3v_path, load_format="dummy", diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index dd04c193932c4..cc0af9a791376 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -3,9 +3,14 @@ import torch from vllm import ModelRegistry +from vllm.inputs import INPUT_REGISTRY from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.models.phi3v import Phi3VForCausalLM +from vllm.model_executor.models.phi3v import (Phi3VForCausalLM, + dummy_data_for_phi3v, + get_max_phi3v_image_tokens, + input_processor_for_phi3v) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY class MyOPTForCausalLM(OPTForCausalLM): @@ -21,6 +26,10 @@ def compute_logits( return logits +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) +@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) class MyPhi3VForCausalLM(Phi3VForCausalLM): def compute_logits( From d2035930bd30c0839025d59a8fae5b9d7af11a49 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 20:47:04 -0700 Subject: [PATCH 10/37] move plugin loading --- vllm/config.py | 4 ---- vllm/engine/arg_utils.py | 2 ++ vllm/engine/llm_engine.py | 3 --- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 17fd4e96615c9..7a15606836dcc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -151,10 +151,6 @@ def __init__(self, use_async_output_proc: bool = True, override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO) -> None: - - from vllm.plugins import load_general_plugins - load_general_plugins() - self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4139eca9c1832..1920f988522af 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -179,6 +179,8 @@ class EngineArgs: def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model + from vllm.plugins import load_general_plugins + load_general_plugins() @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 39409757d3812..4319455ac07b3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -269,9 +269,6 @@ def __init__( cache_config.enable_prefix_caching, model_config.use_async_output_proc, ) - # TODO(woosuk): Print more configs in debug mode. 
- from vllm.plugins import load_general_plugins - load_general_plugins() self.model_config = model_config self.cache_config = cache_config From a020de6911cfd02f1bd658700237312c00c493a6 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 21:27:57 -0700 Subject: [PATCH 11/37] infer multimodality --- vllm/model_executor/models/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index b943096b6b0c4..80487b05eca4b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -5,6 +5,7 @@ import torch.nn as nn from vllm.logger import init_logger +from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.utils import is_hip logger = init_logger(__name__) @@ -190,15 +191,15 @@ def get_supported_archs() -> List[str]: return list(_MODELS.keys()) + list(_OOT_MODELS.keys()) @staticmethod - def register_model(model_arch: str, - model_cls: Type[nn.Module], - is_multimodal: bool = False): + def register_model(model_arch: str, model_cls: Type[nn.Module]): if model_arch in _MODELS: logger.warning( "Model architecture %s is already registered, and will be " "overwritten by the new model class %s.", model_arch, model_cls.__name__) + is_multimodal: bool = issubclass(model_cls, SupportsMultiModal) + # NOTE: This is needed to store the information if the OOT model is # a multimodal model. if is_multimodal: From 51c961a9f60fadcb6ad8375ecc7ac90aaa35e4bf Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 21:31:50 -0700 Subject: [PATCH 12/37] update doc --- docs/source/models/adding_model.rst | 9 +++------ .../vllm_add_dummy_model/__init__.py | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 2861634665912..d472a3894bbd5 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -114,13 +114,10 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If your model is a multimodal model, then indicate so by setting the :code:`is_multimodal` flag. +.. important:: + If your model is a multimodal model, make sure the model class is implemented with + the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -.. 
code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForConditionalGeneration - ModelRegistry.register_model("YourModelForConditionalGeneration", YourModelForConditionalGeneration, is_multimodal=True) If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index cc0af9a791376..20ee8c484b35b 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -51,5 +51,4 @@ def register(): # register our dummy multimodal model if "MyPhi3VForCausalLM" not in ModelRegistry.get_supported_archs(): ModelRegistry.register_model("MyPhi3VForCausalLM", - MyPhi3VForCausalLM, - is_multimodal=True) + MyPhi3VForCausalLM) From 81629f8889377456732ef34cecc298dcfff369bc Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 21:32:01 -0700 Subject: [PATCH 13/37] format --- .../vllm_add_dummy_model/vllm_add_dummy_model/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 20ee8c484b35b..fd30b85e93b6f 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -50,5 +50,4 @@ def register(): # register our dummy multimodal model if "MyPhi3VForCausalLM" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyPhi3VForCausalLM", - MyPhi3VForCausalLM) + ModelRegistry.register_model("MyPhi3VForCausalLM", MyPhi3VForCausalLM) From ec204df8a7876f797a265cda2c63d4b74e34eb68 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 21:43:12 -0700 Subject: [PATCH 14/37] more robust check --- vllm/model_executor/models/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 80487b05eca4b..6c4193835f11b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -5,7 +5,6 @@ import torch.nn as nn from vllm.logger import init_logger -from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.utils import is_hip logger = init_logger(__name__) @@ -198,7 +197,9 @@ def register_model(model_arch: str, model_cls: Type[nn.Module]): "overwritten by the new model class %s.", model_arch, model_cls.__name__) - is_multimodal: bool = issubclass(model_cls, SupportsMultiModal) + # Avoid circular import + from vllm.model_executor.models.interfaces import supports_multimodal + is_multimodal: bool = supports_multimodal(model_cls) # NOTE: This is needed to store the information if the OOT model is # a multimodal model. 
From adbb0631ee5b1ff02f2dc709b524efd3ff9d72e2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 21:48:26 -0700 Subject: [PATCH 15/37] add back the TODO for woosuk --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4319455ac07b3..5e65f3968f1ce 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -269,7 +269,7 @@ def __init__( cache_config.enable_prefix_caching, model_config.use_async_output_proc, ) - + # TODO(woosuk): Print more configs in debug mode. self.model_config = model_config self.cache_config = cache_config self.lora_config = lora_config From 273ce7e5d34cb49ed202b5ccfe973ac5732cd69f Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 21:50:06 -0700 Subject: [PATCH 16/37] update --- vllm/model_executor/models/__init__.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 6c4193835f11b..b7e5bec4d38b2 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -199,11 +199,9 @@ def register_model(model_arch: str, model_cls: Type[nn.Module]): # Avoid circular import from vllm.model_executor.models.interfaces import supports_multimodal - is_multimodal: bool = supports_multimodal(model_cls) - - # NOTE: This is needed to store the information if the OOT model is - # a multimodal model. - if is_multimodal: + if supports_multimodal(model_cls): + # NOTE: This map is needed to store the information if the OOT model + # is a multimodal model. global _OOT_MULTIMODAL_MODELS _OOT_MULTIMODAL_MODELS[model_arch] = model_cls From 19c31d9a956812a2370f166899c048f6abc3042e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 22 Sep 2024 23:32:50 -0700 Subject: [PATCH 17/37] try better config --- tests/models/test_oot_registration.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index fbe3f142f7015..62ea76213ddc2 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -55,7 +55,10 @@ def test_oot_mutlimodal_registration(dummy_phi3v_path): load_format="dummy", max_num_seqs=1, trust_remote_code=True, - limit_mm_per_prompt={"image": 2}) + gpu_memory_utilization=0.98, + max_model_len=4096, + enforce_eager=True, + limit_mm_per_prompt={"image": 1}) first_token = llm.get_tokenizer().decode(0) outputs = llm.generate(prompts, sampling_params) From dbd198d602c2741398463f268a79a3a40f0d77b0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 23 Sep 2024 12:44:32 +0000 Subject: [PATCH 18/37] Fix CUDA re-initialization error --- .../vllm_add_dummy_model/__init__.py | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index fd30b85e93b6f..693e6351c5245 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -4,46 +4,46 @@ from vllm import ModelRegistry from vllm.inputs import INPUT_REGISTRY -from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.models.phi3v import (Phi3VForCausalLM, - dummy_data_for_phi3v, - get_max_phi3v_image_tokens, - input_processor_for_phi3v) from 
vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -class MyOPTForCausalLM(OPTForCausalLM): - - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) - if logits is not None: - logits.zero_() - logits[:, 0] += 1.0 - return logits - - -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) -@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) -class MyPhi3VForCausalLM(Phi3VForCausalLM): - - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) - if logits is not None: - logits.zero_() - logits[:, 0] += 1.0 - return logits - - def register(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.opt import OPTForCausalLM + from vllm.model_executor.models.phi3v import (Phi3VForCausalLM, + dummy_data_for_phi3v, + get_max_phi3v_image_tokens, + input_processor_for_phi3v) + + class MyOPTForCausalLM(OPTForCausalLM): + + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + # this dummy model always predicts the first token + logits = super().compute_logits(hidden_states, sampling_metadata) + if logits is not None: + logits.zero_() + logits[:, 0] += 1.0 + return logits + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) + @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) + @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) + class MyPhi3VForCausalLM(Phi3VForCausalLM): + + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + # this dummy model always predicts the first token + logits = super().compute_logits(hidden_states, sampling_metadata) + if logits is not None: + logits.zero_() + logits[:, 0] += 1.0 + return logits + # register our dummy model if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) From 263a4e7706532bfa8bafc6a6a9e6cb38702bd75f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 23 Sep 2024 14:46:03 +0000 Subject: [PATCH 19/37] Revert "Fix CUDA re-initialization error" This reverts commit dbd198d602c2741398463f268a79a3a40f0d77b0. 
--- .../vllm_add_dummy_model/__init__.py | 70 +++++++++---------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 693e6351c5245..fd30b85e93b6f 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -4,46 +4,46 @@ from vllm import ModelRegistry from vllm.inputs import INPUT_REGISTRY +from vllm.model_executor.models.opt import OPTForCausalLM +from vllm.model_executor.models.phi3v import (Phi3VForCausalLM, + dummy_data_for_phi3v, + get_max_phi3v_image_tokens, + input_processor_for_phi3v) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -def register(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.opt import OPTForCausalLM - from vllm.model_executor.models.phi3v import (Phi3VForCausalLM, - dummy_data_for_phi3v, - get_max_phi3v_image_tokens, - input_processor_for_phi3v) - - class MyOPTForCausalLM(OPTForCausalLM): - - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) - if logits is not None: - logits.zero_() - logits[:, 0] += 1.0 - return logits - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) - @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) - @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) - class MyPhi3VForCausalLM(Phi3VForCausalLM): - - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: - # this dummy model always predicts the first token - logits = super().compute_logits(hidden_states, sampling_metadata) - if logits is not None: - logits.zero_() - logits[:, 0] += 1.0 - return logits +class MyOPTForCausalLM(OPTForCausalLM): + + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + # this dummy model always predicts the first token + logits = super().compute_logits(hidden_states, sampling_metadata) + if logits is not None: + logits.zero_() + logits[:, 0] += 1.0 + return logits + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) +@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) +class MyPhi3VForCausalLM(Phi3VForCausalLM): + def compute_logits( + self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]: + # this dummy model always predicts the first token + logits = super().compute_logits(hidden_states, sampling_metadata) + if logits is not None: + logits.zero_() + logits[:, 0] += 1.0 + return logits + + +def register(): # register our dummy model if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs(): ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) From b8e6e8d0ec7563980c97ba41c083f92098cecea3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 23 Sep 2024 16:17:38 -0700 Subject: [PATCH 20/37] try llava --- tests/conftest.py | 16 +++++++-------- 
tests/models/test_oot_registration.py | 8 ++++---- .../vllm_add_dummy_model/__init__.py | 20 +++++++++---------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 86db96322f7c5..e5019ceeffada 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -859,7 +859,7 @@ def num_gpus_available(): temp_dir = tempfile.gettempdir() _dummy_opt_path = os.path.join(temp_dir, "dummy_opt") -_dummy_phi3v_path = os.path.join(temp_dir, "dummy_phi3v") +_dummy_llava_path = os.path.join(temp_dir, "dummy_llava") @pytest.fixture @@ -882,11 +882,11 @@ def dummy_opt_path(): @pytest.fixture -def dummy_phi3v_path(): - json_path = os.path.join(_dummy_phi3v_path, "config.json") - if not os.path.exists(_dummy_phi3v_path): - snapshot_download(repo_id="microsoft/Phi-3-vision-128k-instruct", - local_dir=_dummy_phi3v_path, +def dummy_llava_path(): + json_path = os.path.join(_dummy_llava_path, "config.json") + if not os.path.exists(_dummy_llava_path): + snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf", + local_dir=_dummy_llava_path, ignore_patterns=[ "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack" @@ -894,7 +894,7 @@ def dummy_phi3v_path(): assert os.path.exists(json_path) with open(json_path, "r") as f: config = json.load(f) - config["architectures"] = ["MyPhi3VForCausalLM"] + config["architectures"] = ["MyLlava"] with open(json_path, "w") as f: json.dump(config, f) - return _dummy_phi3v_path + return _dummy_llava_path diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 62ea76213ddc2..53b78dfcd0138 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -36,22 +36,22 @@ def test_oot_registration(dummy_opt_path): @fork_new_process_for_each_test -def test_oot_mutlimodal_registration(dummy_phi3v_path): +def test_oot_mutlimodal_registration(dummy_llava_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = [{ - "prompt": "What's in the image?<|image_1|>", + "prompt": "What's in the image?", "multi_modal_data": { "image": image }, }, { - "prompt": "Describe the image<|image_1|>", + "prompt": "Describe the image", "multi_modal_data": { "image": image }, }] sampling_params = SamplingParams(temperature=0) - llm = LLM(model=dummy_phi3v_path, + llm = LLM(model=dummy_llava_path, load_format="dummy", max_num_seqs=1, trust_remote_code=True, diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index fd30b85e93b6f..7c55b51337da7 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -4,11 +4,11 @@ from vllm import ModelRegistry from vllm.inputs import INPUT_REGISTRY +from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, + dummy_data_for_llava, + get_max_llava_image_tokens, + input_processor_for_llava) from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.models.phi3v import (Phi3VForCausalLM, - dummy_data_for_phi3v, - get_max_phi3v_image_tokens, - input_processor_for_phi3v) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -27,10 +27,10 @@ def compute_logits( @MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) 
-@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) -class MyPhi3VForCausalLM(Phi3VForCausalLM): +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava) +@INPUT_REGISTRY.register_input_processor(input_processor_for_llava) +class MyLlava(LlavaForConditionalGeneration): def compute_logits( self, hidden_states: torch.Tensor, @@ -49,5 +49,5 @@ def register(): ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM) # register our dummy multimodal model - if "MyPhi3VForCausalLM" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyPhi3VForCausalLM", MyPhi3VForCausalLM) + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", MyLlava) From 85cedeb4b949570d0aed71a2450f29bb9215378b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 24 Sep 2024 03:21:06 +0000 Subject: [PATCH 21/37] Add debug script --- find_cuda_init.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 find_cuda_init.py diff --git a/find_cuda_init.py b/find_cuda_init.py new file mode 100644 index 0000000000000..d5088c05446ce --- /dev/null +++ b/find_cuda_init.py @@ -0,0 +1,35 @@ +import importlib +import traceback +from functools import wraps +from typing import Callable, TypeVar +from unittest.mock import patch + +from typing_extensions import ParamSpec + + +_P, _R_co = ParamSpec("_P"), TypeVar("_R_co", covariant=True) + + +def print_stack(f: Callable[_P, _R_co]) -> Callable[_P, _R_co]: + @wraps(f) + def wrapper(*args: _P.args, **kwargs: _P.kwargs): + traceback.print_stack() + return f(*args, **kwargs) + + return wrapper + + +def find_cuda_init(fn: Callable[[], object]) -> None: + """ + Helper function to debug CUDA re-initialization errors. + + If `fn` initializes CUDA, prints the stack trace of how this happens. 
+ """ + from torch.cuda import _lazy_init + + with patch("torch.cuda._lazy_init", print_stack(_lazy_init)): + fn() + + +if __name__ == "__main__": + find_cuda_init(lambda: importlib.import_module("vllm.model_executor.models.llava")) # noqa: E501 From 8952494e763c45c84fc43a8d91ca865f3870ec7f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 24 Sep 2024 03:22:57 +0000 Subject: [PATCH 22/37] format --- find_cuda_init.py | 1 - 1 file changed, 1 deletion(-) diff --git a/find_cuda_init.py b/find_cuda_init.py index d5088c05446ce..4181dfc8c19f4 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -6,7 +6,6 @@ from typing_extensions import ParamSpec - _P, _R_co = ParamSpec("_P"), TypeVar("_R_co", covariant=True) From 989fb166b9fceea330ed7093f5681503f65bc477 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 24 Sep 2024 03:23:22 +0000 Subject: [PATCH 23/37] format --- find_cuda_init.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/find_cuda_init.py b/find_cuda_init.py index 4181dfc8c19f4..fb4a0d7fcd31c 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -10,6 +10,7 @@ def print_stack(f: Callable[_P, _R_co]) -> Callable[_P, _R_co]: + @wraps(f) def wrapper(*args: _P.args, **kwargs: _P.kwargs): traceback.print_stack() @@ -31,4 +32,5 @@ def find_cuda_init(fn: Callable[[], object]) -> None: if __name__ == "__main__": - find_cuda_init(lambda: importlib.import_module("vllm.model_executor.models.llava")) # noqa: E501 + find_cuda_init( + lambda: importlib.import_module("vllm.model_executor.models.llava")) From 732d4628589eac8b167424694e70c41353dbb0a3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 24 Sep 2024 07:33:01 +0000 Subject: [PATCH 24/37] Avoid CUDA reinitialization error --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 379a67c4c8cf8..3b64f44b864ca 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,7 +92,8 @@ steps: - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - - pytest -v -s entrypoints/openai + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -360,7 +361,7 @@ steps: - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py From bf369e538a298683eb8cd7539540d507f5024461 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 24 Sep 2024 07:38:27 +0000 Subject: [PATCH 25/37] Improve debug script --- find_cuda_init.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/find_cuda_init.py 
b/find_cuda_init.py index fb4a0d7fcd31c..51db23102f9ac 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -1,23 +1,8 @@ import importlib import traceback -from functools import wraps -from typing import Callable, TypeVar +from typing import Callable from unittest.mock import patch -from typing_extensions import ParamSpec - -_P, _R_co = ParamSpec("_P"), TypeVar("_R_co", covariant=True) - - -def print_stack(f: Callable[_P, _R_co]) -> Callable[_P, _R_co]: - - @wraps(f) - def wrapper(*args: _P.args, **kwargs: _P.kwargs): - traceback.print_stack() - return f(*args, **kwargs) - - return wrapper - def find_cuda_init(fn: Callable[[], object]) -> None: """ @@ -27,9 +12,21 @@ def find_cuda_init(fn: Callable[[], object]) -> None: """ from torch.cuda import _lazy_init - with patch("torch.cuda._lazy_init", print_stack(_lazy_init)): + stack = None + + def wrapper(): + nonlocal stack + stack = traceback.extract_stack() + return _lazy_init() + + with patch("torch.cuda._lazy_init", wrapper): fn() + if stack is not None: + print("==== CUDA Initialized ====") + print("".join(traceback.format_list(stack)).strip()) + print("==========================") + if __name__ == "__main__": find_cuda_init( From 571eda99a07414d934d65c820c5a810fc2a56201 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 24 Sep 2024 14:41:19 -0700 Subject: [PATCH 26/37] patch --- .buildkite/test-pipeline.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3b64f44b864ca..5e595d3404d36 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,6 +92,7 @@ steps: - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - export VLLM_AUDIO_FETCH_TIMEOUT=10 - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py @@ -361,7 +362,8 @@ steps: - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s distributed/test_distributed_oot.py + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py From 52b600b97f36860b3ffbe49eeaca819610f4c8c2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 24 Sep 2024 21:59:34 -0700 Subject: [PATCH 27/37] switch --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5e595d3404d36..717434b310062 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -88,12 +88,12 @@ steps: source_file_dependencies: - vllm/ commands: - - pip install -e ./plugins/vllm_add_dummy_model - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm 
--ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - export VLLM_AUDIO_FETCH_TIMEOUT=10 - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests From 45fb02be17098c41ba983a247b21255cdc1fb7ec Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Sep 2024 08:06:45 +0000 Subject: [PATCH 28/37] Try instead reducing model memory --- .buildkite/test-pipeline.yaml | 3 +-- tests/entrypoints/openai/test_audio.py | 4 +++- tests/entrypoints/openai/test_vision.py | 13 ++++++++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 717434b310062..62eda2cb10a91 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -88,12 +88,11 @@ steps: source_file_dependencies: - vllm/ commands: + - pip install -e ./plugins/vllm_add_dummy_model - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - - export VLLM_AUDIO_FETCH_TIMEOUT=10 - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index a9a0ac012c8ff..df8a140283fbb 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -21,7 +21,9 @@ def server(): "--dtype", "bfloat16", "--max-model-len", - "4096", + "2048", + "--max-num-seqs", + "5", "--enforce-eager", ] diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index f61fa127b7d06..81d79601124a7 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,9 +23,16 @@ @pytest.fixture(scope="module") def server(): args = [ - "--dtype", "bfloat16", "--max-model-len", "4096", "--max-num-seqs", - "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", - f"image={MAXIMUM_IMAGES}" + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: From 7c987e957fb4fcc3bf2f5ccf0773dbb790dfb84d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Sep 2024 11:01:57 +0000 Subject: [PATCH 29/37] Reorder the tests --- .buildkite/test-pipeline.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 62eda2cb10a91..dfa0f72255599 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -88,14 +88,14 @@ steps: source_file_dependencies: - vllm/ commands: - - pip install -e 
./plugins/vllm_add_dummy_model + - pytest -v -s entrypoints/test_chat_utils.py + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" From 45a6fa880e62958c08999387c902419150450f69 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Sep 2024 16:04:09 +0000 Subject: [PATCH 30/37] Iterate --- .buildkite/test-pipeline.yaml | 2 +- tests/entrypoints/openai/test_audio.py | 4 ++-- tests/entrypoints/openai/test_vision.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dfa0f72255599..ddb8a9d3da377 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -90,9 +90,9 @@ steps: commands: - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api] - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index df8a140283fbb..90105f853a38b 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -18,12 +18,12 @@ @pytest.fixture(scope="module") def server(): args = [ + "--gpu-memory-utilization", + "0.6", "--dtype", "bfloat16", "--max-model-len", "2048", - "--max-num-seqs", - "5", "--enforce-eager", ] diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 81d79601124a7..cdd624a42f300 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,12 +23,12 @@ @pytest.fixture(scope="module") def server(): args = [ + "--gpu-memory-utilization", + "0.6", "--dtype", "bfloat16", "--max-model-len", "2048", - "--max-num-seqs", - "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", From 83e86e4b88df16d4eec1c949521934aff72838f7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 29 Sep 2024 01:21:24 +0000 Subject: [PATCH 31/37] Try limit `max_num_seqs` --- tests/entrypoints/openai/test_audio.py | 4 ++-- tests/entrypoints/openai/test_vision.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py 
b/tests/entrypoints/openai/test_audio.py index 90105f853a38b..df8a140283fbb 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -18,12 +18,12 @@ @pytest.fixture(scope="module") def server(): args = [ - "--gpu-memory-utilization", - "0.6", "--dtype", "bfloat16", "--max-model-len", "2048", + "--max-num-seqs", + "5", "--enforce-eager", ] diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index cdd624a42f300..81d79601124a7 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,12 +23,12 @@ @pytest.fixture(scope="module") def server(): args = [ - "--gpu-memory-utilization", - "0.6", "--dtype", "bfloat16", "--max-model-len", "2048", + "--max-num-seqs", + "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", From 8f9f7b56a32d2d5b538dd980c77e3a4d968dbbe7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 29 Sep 2024 01:23:48 +0000 Subject: [PATCH 32/37] No need to set this anymore --- .buildkite/test-pipeline.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 11b91178b63e5..bb42b5f29a725 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -392,7 +392,6 @@ steps: - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py From 113d3f05df5c7ac630478ebeb4cecd065d0adfb8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 29 Sep 2024 01:28:25 +0000 Subject: [PATCH 33/37] Remove the need for deferred imports --- vllm/model_executor/models/__init__.py | 8 ++------ vllm/model_executor/models/interfaces.py | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 0a09c8a872000..0c9c49118619f 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -7,6 +7,8 @@ from vllm.logger import init_logger from vllm.utils import is_hip +from .interfaces import supports_multimodal + logger = init_logger(__name__) _GENERATION_MODELS = { @@ -199,15 +201,11 @@ def register_model(model_arch: str, model_cls: Type[nn.Module]): "overwritten by the new model class %s.", model_arch, model_cls.__name__) - # Avoid circular import - from vllm.model_executor.models.interfaces import supports_multimodal if supports_multimodal(model_cls): # NOTE: This map is needed to store the information if the OOT model # is a multimodal model. 
- global _OOT_MULTIMODAL_MODELS _OOT_MULTIMODAL_MODELS[model_arch] = model_cls - global _OOT_MODELS _OOT_MODELS[model_arch] = model_cls @staticmethod @@ -216,11 +214,9 @@ def is_embedding_model(model_arch: str) -> bool: @staticmethod def is_multimodal_model(model_arch: str) -> bool: - # TODO: find a way to avoid initializing CUDA prematurely to # use `supports_multimodal` to determine if a model is multimodal # model_cls = ModelRegistry._try_load_model_cls(model_arch) - # from vllm.model_executor.models.interfaces import supports_multimodal return (model_arch in _MULTIMODAL_MODELS or model_arch in _OOT_MULTIMODAL_MODELS) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 069948f812253..a5b94a317e8f4 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,11 +1,13 @@ -from typing import (ClassVar, Dict, List, Literal, Optional, Protocol, Type, - Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, + Protocol, Type, Union, overload, runtime_checkable) from typing_extensions import TypeIs -from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.logger import init_logger +if TYPE_CHECKING: + from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig + logger = init_logger(__name__) @@ -22,7 +24,7 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ - def __init__(self, *, multimodal_config: MultiModalConfig) -> None: + def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: ... @@ -32,7 +34,7 @@ def __init__(self, *, multimodal_config: MultiModalConfig) -> None: class _SupportsMultiModalType(Protocol): supports_multimodal: Literal[True] - def __call__(self, *, multimodal_config: MultiModalConfig) -> None: + def __call__(self, *, multimodal_config: "MultiModalConfig") -> None: ... @@ -75,7 +77,7 @@ class SupportsLoRA(Protocol): embedding_padding_modules: ClassVar[List[str]] # lora_config is None when LoRA is not enabled - def __init__(self, *, lora_config: Optional[LoRAConfig] = None) -> None: + def __init__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: ... @@ -90,7 +92,7 @@ class _SupportsLoRAType(Protocol): embedding_modules: Dict[str, str] embedding_padding_modules: List[str] - def __call__(self, *, lora_config: Optional[LoRAConfig] = None) -> None: + def __call__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: ... @@ -158,7 +160,7 @@ class HasInnerState(Protocol): def __init__(self, *, - scheduler_config: Optional[SchedulerConfig] = None) -> None: + scheduler_config: Optional["SchedulerConfig"] = None) -> None: ... @@ -168,7 +170,7 @@ class _HasInnerStateType(Protocol): def __init__(self, *, - scheduler_config: Optional[SchedulerConfig] = None) -> None: + scheduler_config: Optional["SchedulerConfig"] = None) -> None: ... 
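Note on the patch above: guarding the config imports behind typing.TYPE_CHECKING and quoting the annotations keeps vllm.config out of interfaces.py's runtime import path, which is what removes the need for the deferred import in the registry. A minimal sketch of the idiom, using placeholder module and class names rather than real vLLM ones:

    from typing import TYPE_CHECKING, Optional

    if TYPE_CHECKING:
        # Evaluated only by static type checkers, never at runtime, so this
        # import cannot drag in heavyweight (e.g. CUDA-initializing) modules.
        from heavy_pkg.config import ExampleConfig  # hypothetical module

    class ExampleModel:
        # The quoted annotation is resolved lazily by the type checker,
        # so no runtime import of ExampleConfig is required.
        def __init__(self, *, config: Optional["ExampleConfig"] = None) -> None:
            self.config = config

At runtime the TYPE_CHECKING branch is skipped entirely, so importing the module stays side-effect free while tools like mypy still see the full types.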
From 2066ff36323d8b147bcafb71eef3517a8f24d86b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 29 Sep 2024 09:56:17 +0000 Subject: [PATCH 34/37] Try separating out `test_accuracy.py` and `test_audio.py` --- .buildkite/test-pipeline.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bb42b5f29a725..c1e1bed68b388 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -96,7 +96,9 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_accuracy.py --ignore=entrypoints/openai/test_audio.py --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai/test_accuracy.py # it needs a clean process + - pytest -v -s entrypoints/openai/test_audio.py # it needs a clean process - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests From e39907920a2ba0edaab723148fe45887ad9a80a6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 11:02:06 +0000 Subject: [PATCH 35/37] Enable lazy import --- tests/models/test_oot_registration.py | 2 +- .../vllm_add_dummy_model/__init__.py | 51 ++-------------- .../vllm_add_dummy_model/my_llava.py | 28 +++++++++ .../vllm_add_dummy_model/my_opt.py | 19 ++++++ vllm/model_executor/models/registry.py | 60 ++++++++++++++----- 5 files changed, 100 insertions(+), 60 deletions(-) create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py create mode 100644 tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 53b78dfcd0138..ee3f8911f318c 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -36,7 +36,7 @@ def test_oot_registration(dummy_opt_path): @fork_new_process_for_each_test -def test_oot_mutlimodal_registration(dummy_llava_path): +def test_oot_multimodal_registration(dummy_llava_path): os.environ["VLLM_PLUGINS"] = "register_dummy_model" prompts = [{ "prompt": "What's in the image?", diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 7c55b51337da7..022ba66e38cc3 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -1,53 +1,14 @@ -from typing import Optional - -import torch - from vllm import ModelRegistry -from vllm.inputs import INPUT_REGISTRY -from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - dummy_data_for_llava, - get_max_llava_image_tokens, - input_processor_for_llava) -from vllm.model_executor.models.opt import OPTForCausalLM -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY - - -class MyOPTForCausalLM(OPTForCausalLM): - - def compute_logits( - self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) 
-        # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
-        if logits is not None:
-            logits.zero_()
-            logits[:, 0] += 1.0
-        return logits
-
-
-@MULTIMODAL_REGISTRY.register_image_input_mapper()
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
-class MyLlava(LlavaForConditionalGeneration):
-
-    def compute_logits(
-            self, hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
-        # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
-        if logits is not None:
-            logits.zero_()
-            logits[:, 0] += 1.0
-        return logits
 
 
 def register():
-    # register our dummy model
+    # Test directly passing the model
+    from .my_opt import MyOPTForCausalLM
+
     if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs():
         ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
-    # register our dummy multimodal model
+
+    # Test passing lazy model
     if "MyLlava" not in ModelRegistry.get_supported_archs():
-        ModelRegistry.register_model("MyLlava", MyLlava)
+        ModelRegistry.register_model("MyLlava",
+                                     "vllm_add_dummy_model.my_llava:MyLlava")
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
new file mode 100644
index 0000000000000..3ebd7864b8fc8
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -0,0 +1,28 @@
+from typing import Optional
+
+import torch
+
+from vllm.inputs import INPUT_REGISTRY
+from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
+                                              dummy_data_for_llava,
+                                              get_max_llava_image_tokens,
+                                              input_processor_for_llava)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper()
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
+class MyLlava(LlavaForConditionalGeneration):
+
+    def compute_logits(
+            self, hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+        # this dummy model always predicts the first token
+        logits = super().compute_logits(hidden_states, sampling_metadata)
+        if logits is not None:
+            logits.zero_()
+            logits[:, 0] += 1.0
+        return logits
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
new file mode 100644
index 0000000000000..569ef216c9f0a
--- /dev/null
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@@ -0,0 +1,19 @@
+from typing import Optional
+
+import torch
+
+from vllm.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+
+
+class MyOPTForCausalLM(OPTForCausalLM):
+
+    def compute_logits(
+            self, hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+        # this dummy model always predicts the first token
+        logits = super().compute_logits(hidden_states, sampling_metadata)
+        if logits is not None:
+            logits.zero_()
+            logits[:, 0] += 1.0
+        return logits
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index aa5736e7cd517..a72b9e8909db2 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -125,9 +125,10 @@
     **_CONDITIONAL_GENERATION_MODELS,
 }
 
-# Architecture -> type.
+# Architecture -> type or (module, class).
 # out of tree models
 _OOT_MODELS: Dict[str, Type[nn.Module]] = {}
+_OOT_MODELS_LAZY: Dict[str, Tuple[str, str]] = {}
 
 # Models not supported by ROCm.
 _ROCM_UNSUPPORTED_MODELS: List[str] = []
@@ -159,17 +160,24 @@ class ModelRegistry:
 
     @staticmethod
     def _get_module_cls_name(model_arch: str) -> Tuple[str, str]:
-        module_relname, cls_name = _MODELS[model_arch]
-        return f"vllm.model_executor.models.{module_relname}", cls_name
+        if model_arch in _MODELS:
+            module_relname, cls_name = _MODELS[model_arch]
+            return f"vllm.model_executor.models.{module_relname}", cls_name
+
+        if model_arch in _OOT_MODELS_LAZY:
+            return _OOT_MODELS_LAZY[model_arch]
+
+        raise KeyError(model_arch)
 
     @staticmethod
     @lru_cache(maxsize=128)
     def _try_get_model_stateful(model_arch: str) -> Optional[Type[nn.Module]]:
-        if model_arch not in _MODELS:
+        try:
+            mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch)
+        except KeyError:
             return None
 
-        module_name, cls_name = ModelRegistry._get_module_cls_name(model_arch)
-        module = importlib.import_module(module_name)
+        module = importlib.import_module(mod_name)
         return getattr(module, cls_name, None)
 
     @staticmethod
@@ -219,14 +227,35 @@ def get_supported_archs() -> List[str]:
         return list(_MODELS.keys()) + list(_OOT_MODELS.keys())
 
     @staticmethod
-    def register_model(model_arch: str, model_cls: Type[nn.Module]):
+    def register_model(model_arch: str, model_cls: Union[Type[nn.Module],
+                                                          str]):
+        """
+        Register an external model to be used in vLLM.
+
+        :code:`model_cls` can be either:
+
+        - A :class:`torch.nn.Module` class directly referencing the model.
+        - A string in the format :code:`<module>:<class>` which can be used to
+          lazily import the model. This is useful to avoid initializing CUDA
+          when importing the model and thus the related error
+          :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
+ """ if model_arch in _MODELS: logger.warning( "Model architecture %s is already registered, and will be " "overwritten by the new model class %s.", model_arch, - model_cls.__name__) + model_cls) + + if isinstance(model_cls, str): + split_str = model_cls.split(":") + if len(split_str) != 2: + msg = "Expected a string in the format `:`" + raise ValueError(msg) - _OOT_MODELS[model_arch] = model_cls + module_name, cls_name = split_str + _OOT_MODELS_LAZY[model_arch] = module_name, cls_name + else: + _OOT_MODELS[model_arch] = model_cls @staticmethod @lru_cache(maxsize=128) @@ -248,13 +277,16 @@ def _check_stateless( if model is not None: return func(model) - if model_arch not in _MODELS and default is not None: - return default + try: + mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) + except KeyError: + if default is not None: + return default - module_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) + raise valid_name_characters = string.ascii_letters + string.digits + "._" - if any(s not in valid_name_characters for s in module_name): + if any(s not in valid_name_characters for s in mod_name): raise ValueError(f"Unsafe module name detected for {model_arch}") if any(s not in valid_name_characters for s in cls_name): raise ValueError(f"Unsafe class name detected for {model_arch}") @@ -266,7 +298,7 @@ def _check_stateless( err_id = uuid.uuid4() stmts = ";".join([ - f"from {module_name} import {cls_name}", + f"from {mod_name} import {cls_name}", f"from {func.__module__} import {func.__name__}", f"assert {func.__name__}({cls_name}), '{err_id}'", ]) From cf980b4c2161273d4d4c783cbdfc91502a751609 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 12:47:48 +0000 Subject: [PATCH 36/37] Revert test pipeline --- .buildkite/test-pipeline.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3a6f487ab5acb..427dc14513d45 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -104,9 +104,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_accuracy.py --ignore=entrypoints/openai/test_audio.py --ignore=entrypoints/openai/test_oot_registration.py - - pytest -v -s entrypoints/openai/test_accuracy.py # it needs a clean process - - pytest -v -s entrypoints/openai/test_audio.py # it needs a clean process + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests From dada11df448f84b6138be95dc8cc958cd7ca4c3a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 12:56:41 +0000 Subject: [PATCH 37/37] Update docs --- docs/source/models/adding_model.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index 371fd0ea1a914..fa1003874033e 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -85,16 +85,16 @@ When it comes to the linear layers, we provide the following options to 
From cf980b4c2161273d4d4c783cbdfc91502a751609 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 4 Oct 2024 12:47:48 +0000
Subject: [PATCH 36/37] Revert test pipeline

---
 .buildkite/test-pipeline.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 3a6f487ab5acb..427dc14513d45 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -104,9 +104,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_accuracy.py --ignore=entrypoints/openai/test_audio.py --ignore=entrypoints/openai/test_oot_registration.py
-  - pytest -v -s entrypoints/openai/test_accuracy.py # it needs a clean process
-  - pytest -v -s entrypoints/openai/test_audio.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

From dada11df448f84b6138be95dc8cc958cd7ca4c3a Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 4 Oct 2024 12:56:41 +0000
Subject: [PATCH 37/37] Update docs

---
 docs/source/models/adding_model.rst | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index 371fd0ea1a914..fa1003874033e 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -85,16 +85,16 @@ When it comes to the linear layers, we provide the following options to parallel
 * :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
 * :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer.
 * :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer.
-* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
+* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices.
 * :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices.
 
-Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
+Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization.
 
 4. Implement the weight loading logic
 -------------------------------------
 
 You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class.
-This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
+This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately.
 
 5. Register your model
 ----------------------
@@ -114,10 +114,17 @@ Just add the following lines in your code:
 
     from your_code import YourModelForCausalLM
     ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
 
-.. important::
-    If your model is a multimodal model, make sure the model class is implemented with
-    the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
+
+.. code-block:: python
+
+    from vllm import ModelRegistry
+
+    ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
+
+.. important::
+    If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
+    Read more about that :ref:`here `.
 
 If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code:
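
Putting the documentation above together, an out-of-tree plugin can mix eager and lazy registration along the following lines; the package layout and class names here are placeholders, not a real plugin:

    # your_code/__init__.py (hypothetical plugin package)
    from vllm import ModelRegistry


    def register():
        # Eager: the class is imported right away, which may touch CUDA.
        from your_code.modeling import YourModelForCausalLM
        if "YourModelForCausalLM" not in ModelRegistry.get_supported_archs():
            ModelRegistry.register_model("YourModelForCausalLM",
                                         YourModelForCausalLM)

        # Lazy: only the "<module>:<class>" string is stored; the class is
        # imported on first use, so loading the plugin stays CUDA-free.
        if "YourModelForConditionalGeneration" not in ModelRegistry.get_supported_archs():
            ModelRegistry.register_model(
                "YourModelForConditionalGeneration",
                "your_code.modeling_mm:YourModelForConditionalGeneration")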