Separate text-only and vision variants of the same model architecture
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 committed Feb 12, 2025
1 parent f4d97e4 commit 3f5c41c
Showing 8 changed files with 1,547 additions and 1,488 deletions.
13 changes: 5 additions & 8 deletions docs/source/models/supported_models.md
@@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generative models.
  *
  * ✅︎
  * ✅︎
-- * `DeepseekVLV2ForCausalLM`
+- * `DeepseekVLV2ForCausalLM`\^
  * DeepSeek-VL2
  * T + I<sup>+</sup>
-  * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+  * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.
  *
  * ✅︎
  * ✅︎
@@ -713,7 +713,7 @@ See [this page](#generative-models) for more information on how to use generative models.
  *
  * ✅︎
  * ✅︎
-- * `ChatGLMModel`
+- * `GLM4VForCausalLM`\^
  * GLM-4V
  * T + I
  * `THUDM/glm-4v-9b` etc.
@@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generative models.
  *
  * ✅︎
  * ✅︎
-- * `QWenLMHeadModel`
+- * `QwenVLForConditionalGeneration`\^
  * Qwen-VL
  * T + I<sup>E+</sup>
  * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
@@ -862,13 +862,10 @@ See [this page](#generative-models) for more information on how to use generative models.
  * ✅︎
:::

+\^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM. For example, to use DeepSeek-VL2 series models, use `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.

-:::{note}
-To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
-:::
-
:::{note}
H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
:::
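The `\^` footnote added above replaces the old per-model note with a single convention. For illustration, the same override can also be applied programmatically; a minimal offline-inference sketch, assuming vLLM's `LLM` constructor accepts an `hf_overrides` keyword mirroring the `--hf_overrides` CLI flag (prompt and sampling settings are placeholders):

```python
from vllm import LLM, SamplingParams

# The DeepSeek-VL2 checkpoint config does not report the architecture name
# that vLLM registers, so override it explicitly (same JSON as the CLI flag).
llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)

# Text-only smoke test; real multimodal use would also pass image inputs.
outputs = llm.generate(["Describe DeepSeek-VL2 in one sentence."],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```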
20 changes: 10 additions & 10 deletions tests/models/registry.py
@@ -104,7 +104,8 @@ def check_available_online(
                                         trust_remote_code=True),
    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"),
    "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"),
-    # ChatGLMModel supports multimodal
+    "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
+                                    trust_remote_code=True),
    "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
                                         trust_remote_code=True),
    "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024",  # noqa: E501
@@ -167,7 +168,8 @@ def check_available_online(
                                         trust_remote_code=True),
    "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                         trust_remote_code=True),
-    # QWenLMHeadModel supports multimodal
+    "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
+                                       trust_remote_code=True),
    "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"),
    "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
@@ -232,14 +234,12 @@ def check_available_online(
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
extras={"text_only": "THUDM/chatglm3-6b"},
trust_remote_code=True),
"ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
is_available_online=False),
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
trust_remote_code=True),
@@ -264,9 +264,9 @@ def check_available_online(
                                         trust_remote_code=True),
    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                       tokenizer_mode="mistral"),
-    "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat",
-                                       extras={"text_only": "Qwen/Qwen-7B-Chat"},  # noqa: E501
-                                       trust_remote_code=True),
+    "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL-Chat",
+                                                      trust_remote_code=True,
+                                                      hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}),  # noqa: E501
    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"),  # noqa: E501
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
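Since each split pair now registers under two distinct names, a quick way to verify the result is to query the model registry. A hedged sketch, assuming `vllm.ModelRegistry` exposes a `get_supported_archs()` helper:

```python
from vllm import ModelRegistry

# Text-only and vision variants touched by this commit should all resolve
# as separate architectures after the split.
expected = [
    "ChatGLMModel",                    # text-only GLM
    "GLM4VForCausalLM",                # vision GLM-4V
    "QWenLMHeadModel",                 # text-only Qwen
    "QwenVLForConditionalGeneration",  # vision Qwen-VL
    "DeepseekVLV2ForCausalLM",
]
supported = set(ModelRegistry.get_supported_archs())
for arch in expected:
    print(f"{arch}: {'registered' if arch in supported else 'MISSING'}")
```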
