Commit
[VLM] Merged multi-modal processor for InternVL-based models (vllm-project#12553)

Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
2 people authored and ShangmingCai committed Feb 10, 2025
1 parent 8152ff7 commit 2c55c72
Showing 34 changed files with 1,434 additions and 986 deletions.
6 changes: 5 additions & 1 deletion docs/source/contributing/model/multimodal.md
@@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int:
And thus, we can override the method as:

```python
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_max_image_tokens()}
```

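The docs hunk above updates the example override to take the per-modality item counts as a second argument. Below is a minimal, self-contained sketch of a class implementing that signature; the class name and the 12 × 256 token budget are invented for illustration, and only the method signature and return shape come from the diff above.

```python
from collections.abc import Mapping


class MyImageProcessingInfo:  # hypothetical class, for illustration only
    def get_max_image_tokens(self) -> int:
        # Assume at most 12 image patches, each contributing 256 tokens.
        return 12 * 256

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        # seq_len and mm_counts are available for models whose budget depends
        # on them; here the bound is simply a fixed per-image maximum.
        return {"image": self.get_max_image_tokens()}


info = MyImageProcessingInfo()
print(info.get_mm_max_tokens_per_item(seq_len=8192, mm_counts={"image": 2}))
# {'image': 3072}
```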
10 changes: 7 additions & 3 deletions docs/source/models/supported_models.md
@@ -726,7 +726,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
*
* ✅︎
*
* \*
- * `Idefics3ForConditionalGeneration`
* Idefics3
* T + I
@@ -799,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
- * `NVLM_D_Model`
* NVLM-D 1.0
* T + I<sup>E+</sup>
* T + I<sup>+</sup>
* `nvidia/NVLM-D-72B`, etc.
*
* ✅︎
@@ -859,7 +859,11 @@ See [this page](#generative-models) for more information on how to use generativ
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.

:::{note}
To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
:::

:::{note}
H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
:::

:::{note}
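The DeepSeek-VL2 note above shows the CLI form of the architecture override. A rough offline-inference equivalent is sketched below, assuming the `LLM` constructor accepts an `hf_overrides` mapping that mirrors the `--hf_overrides` JSON flag (verify against your vLLM version); the model name is taken from the test list later in this commit.

```python
from vllm import LLM

# Assumption: `hf_overrides` is the Python-side counterpart of the
# `--hf_overrides` CLI flag quoted in the note above.
llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
```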
131 changes: 0 additions & 131 deletions tests/models/decoder_only/vision_language/test_h2ovl.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/models/decoder_only/vision_language/test_models.py
@@ -250,6 +250,7 @@
max_model_len=8192,
dtype="bfloat16",
use_tokenizer_eos=True,
num_logprobs=10,
patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
),
"idefics3": VLMTestInfo(
@@ -282,7 +283,6 @@
dtype="bfloat16",
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
"llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
37 changes: 21 additions & 16 deletions tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -334,12 +334,12 @@ class H2OVLProcessor:
def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype

self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.use_msac = self.config.use_msac
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
@@ -348,18 +348,19 @@ def __call__(self, text: str, images: Union[Image, List[Image]],
**kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)

# yapf: enable
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values(image,
self.image_size,
self.min_num,
self.max_num,
self.use_thumbnail,
use_MSAC=self.config.use_msac).to(
self.dtype) for image in images
image_to_pixel_values_h2ovl(
image,
input_size=self.image_size,
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
use_msac=self.use_msac,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
@@ -394,7 +395,6 @@ class InternVLProcessor:
def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype

self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
@@ -407,13 +407,17 @@ def __init__(self, hf_runner: HfRunner):
def __call__(self, text: str, images: Union[Image, List[Image]],
**kwargs):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl)
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values(image, self.image_size, self.min_num,
self.max_num,
self.use_thumbnail).to(self.dtype)
for image in images
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
@@ -448,7 +452,8 @@ def _internvl_generate(
) -> torch.LongTensor:
"""Generate method for InternVL2 model without fixed use_cache."""
assert self.img_context_token_id is not None
vit_embeds = self.extract_feature(pixel_values)
target_dtype = next(self.parameters()).dtype
vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
input_embeds = self.language_model.get_input_embeddings()(input_ids)
B, N, C = input_embeds.shape
input_embeds = input_embeds.reshape(B * N, C)
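The last hunk above replaces the cached `self.dtype` with a dtype read from the model parameters at call time, so the pixel values always match whatever precision the weights currently use. A standalone illustration of that pattern follows; the helper function and the tiny `Linear` module are stand-ins, not vLLM code.

```python
import torch


def cast_like_model(pixel_values: torch.Tensor,
                    model: torch.nn.Module) -> torch.Tensor:
    # Read the dtype from the weights instead of caching it up front, so the
    # inputs track the model even if it was cast after construction.
    target_dtype = next(model.parameters()).dtype
    return pixel_values.to(target_dtype)


model = torch.nn.Linear(4, 4).to(torch.bfloat16)
images = torch.rand(2, 4)                    # float32 by default
print(cast_like_model(images, model).dtype)  # torch.bfloat16
```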
4 changes: 3 additions & 1 deletion tests/models/multimodal/processing/test_common.py
@@ -141,13 +141,14 @@ def _test_processing_correctness(


# yapf: disable
# True if the model supports multiple data items of the modality per request
@pytest.mark.parametrize("model_id", [
"rhymes-ai/Aria",
"Salesforce/blip2-opt-2.7b",
"facebook/chameleon-7b",
"deepseek-ai/deepseek-vl2-tiny",
"adept/fuyu-8b",
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -156,6 +157,7 @@ def _test_processing_correctness(
"mistral-community/pixtral-12b",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"nvidia/NVLM-D-72B",
"Qwen/Qwen-VL-Chat",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
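To run the processing-correctness check for just one of the newly listed models, a keyword filter along the lines of `pytest tests/models/multimodal/processing/test_common.py -k h2ovl` should select the H2O-VL parametrization, though the exact test IDs may differ.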
Remaining changed files not shown.
