From fda97adc85ce56a63deff1e057cf6b0b20e3b6c0 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 17 Feb 2025 06:24:46 +0000
Subject: [PATCH 1/3] Check required fields before initializing field config

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/minicpmo.py | 13 ++++++++-----
 vllm/model_executor/models/minicpmv.py | 18 ++++++++++++------
 vllm/model_executor/models/qwen2_vl.py |  4 ++--
 vllm/multimodal/parse.py               | 18 +++++++++++-------
 4 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index 473881f955465..aa8c193ed6a52 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -23,8 +23,8 @@
 # limitations under the License.
 """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
 from functools import partial
-from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
-                    Tuple, TypedDict, Union)
+from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
+                    Optional, Set, Tuple, TypedDict, Union)
 
 import torch
 from torch import nn
@@ -122,13 +122,16 @@ class MiniCPMOAudioEmbeddingItems(DictEmbeddingItems):
     def __init__(
         self,
         data: Mapping[str, torch.Tensor],
-        fields_config: Mapping[str, MultiModalFieldConfig],
+        fields_factory: Callable[
+            [Mapping[str, torch.Tensor]],
+            Mapping[str, MultiModalFieldConfig],
+        ],
     ) -> None:
         super().__init__(
             data,
             modality="image",
-            fields_config=fields_config,
             required_fields={"audio_embeds"},
+            fields_factory=fields_factory,
         )
 
 
@@ -141,7 +144,7 @@ def _parse_audio_data(
         if isinstance(data, dict):
             return MiniCPMOAudioEmbeddingItems(
                 data,
-                fields_config=_minicpmo_field_config(data),
+                fields_factory=_minicpmo_field_config,
             )
 
         return super()._parse_audio_data(data)
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 77ac9eb467be6..2083e7dc0b83b 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -255,13 +255,16 @@ class MiniCPMVImageEmbeddingItems(DictEmbeddingItems):
     def __init__(
         self,
         data: Mapping[str, torch.Tensor],
-        fields_config: Mapping[str, MultiModalFieldConfig],
+        fields_factory: Callable[
+            [Mapping[str, torch.Tensor]],
+            Mapping[str, MultiModalFieldConfig],
+        ],
     ) -> None:
         super().__init__(
             data,
             modality="image",
-            fields_config=fields_config,
             required_fields={"image_embeds", "image_sizes"},
+            fields_factory=fields_factory,
         )
 
     def get_image_size(self, index: int) -> ImageSize:
@@ -274,13 +277,16 @@ class MiniCPMVVideoEmbeddingItems(DictEmbeddingItems):
     def __init__(
         self,
         data: Mapping[str, torch.Tensor],
-        fields_config: Mapping[str, MultiModalFieldConfig],
+        fields_factory: Callable[
+            [Mapping[str, torch.Tensor]],
+            Mapping[str, MultiModalFieldConfig],
+        ],
     ) -> None:
         super().__init__(
             data,
             modality="video",
-            fields_config=fields_config,
             required_fields={"video_embeds", "video_image_sizes"},
+            fields_factory=fields_factory,
         )
 
     def get_frame_size(self, index: int) -> ImageSize:
@@ -300,7 +306,7 @@ def _parse_image_data(
         if isinstance(data, dict):
             return MiniCPMVImageEmbeddingItems(
                 data,
-                fields_config=_minicpmv_field_config(data),
+                fields_factory=_minicpmv_field_config,
             )
 
         return super()._parse_image_data(data)
@@ -312,7 +318,7 @@ def _parse_video_data(
         if isinstance(data, dict):
            return MiniCPMVVideoEmbeddingItems(
                 data,
-                fields_config=_minicpmv_field_config(data),
+                fields_factory=_minicpmv_field_config,
             )
 
         return super()._parse_video_data(data)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 3821f8d55bed1..68340ace18ddd 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -691,8 +691,8 @@ def _parse_image_data(
             return DictEmbeddingItems(
                 data,
                 modality="image",
-                fields_config=_qwen2vl_field_config(data),
                 required_fields={"image_embeds", "image_grid_thw"},
+                fields_factory=_qwen2vl_field_config,
             )
 
         return super()._parse_image_data(data)
@@ -705,8 +705,8 @@ def _parse_video_data(
             return DictEmbeddingItems(
                 data,
                 modality="video",
-                fields_config=_qwen2vl_field_config(data),
                 required_fields={"video_embeds", "video_grid_thw"},
+                fields_factory=_qwen2vl_field_config,
             )
 
         return super()._parse_video_data(data)
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index fb07c5c6a25d6..fd6f3283316ad 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -125,17 +125,14 @@ def __init__(
         self,
         data: Mapping[str, torch.Tensor],
         modality: str,
-        fields_config: Mapping[str, MultiModalFieldConfig],
         required_fields: set[str],
+        fields_factory: Callable[
+            [Mapping[str, torch.Tensor]],
+            Mapping[str, MultiModalFieldConfig],
+        ],
     ) -> None:
         super().__init__(data, modality)
 
-        missing_required_fields = required_fields - fields_config.keys()
-        if missing_required_fields:
-            fields = set(fields_config.keys())
-            msg = f"{required_fields=} should be a subset of {fields=}"
-            raise ValueError(msg)
-
         missing_required_data_keys = required_fields - data.keys()
         if missing_required_data_keys:
             data_keys = set(data.keys())
@@ -143,6 +140,13 @@
                    f"but only found the following keys: {data_keys}")
             raise ValueError(msg)
 
+        fields_config = fields_factory(data)
+        missing_required_fields = required_fields - fields_config.keys()
+        if missing_required_fields:
+            fields = set(fields_factory.keys())
+            msg = f"{required_fields=} should be a subset of {fields=}"
+            raise ValueError(msg)
+
         self.fields_config = fields_config
         self.required_fields = required_fields
 
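Note on patch 1: the substantive change is in DictEmbeddingItems.__init__
(vllm/multimodal/parse.py). Callers now pass a `fields_factory` callable
instead of a pre-built `fields_config` mapping, and the factory only runs
after the required data keys are known to exist, so incomplete input fails
with a clear ValueError instead of an opaque error raised from inside the
factory. A minimal sketch of that ordering, using simplified stand-ins for
the real vLLM types (DictEmbeddingItems, MultiModalFieldConfig) and an
illustrative error message:

    # Sketch only: validate required keys first, then build the field config.
    from collections.abc import Callable, Mapping
    from typing import Any

    FieldsFactory = Callable[[Mapping[str, Any]], Mapping[str, Any]]

    def validate_and_build(
        data: Mapping[str, Any],
        required_fields: set[str],
        fields_factory: FieldsFactory,
    ) -> Mapping[str, Any]:
        missing = required_fields - data.keys()
        if missing:
            # Fail fast on the raw data before the factory ever sees it.
            raise ValueError(f"missing required keys: {missing}, "
                             f"found: {set(data.keys())}")
        # Only now is it safe to derive the field config from the data.
        return fields_factory(data)

    # e.g. validate_and_build({"audio_embeds": [[0.0]]}, {"audio_embeds"},
    #                         lambda d: {k: "config" for k in d})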
From 51052545ee9fd7aecdf673fd5019d8679644c7cd Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 17 Feb 2025 06:28:42 +0000
Subject: [PATCH 2/3] Update docs

Signed-off-by: DarkLight1337
---
 docs/source/serving/multimodal_inputs.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index ade59e3773839..5cec5548ba183 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -184,8 +184,8 @@ llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={
 mm_data = {
     "image": {
         "image_embeds": image_embeds,
-        # image_size_list is needed to calculate details of the sliced image.
-        "image_size_list": [image.size for image in images], # list of image sizes
+        # image_sizes is needed to calculate details of the sliced image.
+        "image_sizes": [image.size for image in images], # list of image sizes
     }
 }
 
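Note on patch 2: the renamed key matters because, after patch 1, MiniCPM-V's
parser checks the exact field names (required_fields={"image_embeds",
"image_sizes"}) up front, so the stale `image_size_list` spelling from the
old docs would now be rejected before any processing. A runnable
approximation of that check, independent of vLLM:

    # Approximation of the required-field check for MiniCPM-V image embeddings.
    required_fields = {"image_embeds", "image_sizes"}

    stale = {"image_embeds": [[0.0]], "image_size_list": [(448, 448)]}  # old key
    fixed = {"image_embeds": [[0.0]], "image_sizes": [(448, 448)]}      # new key

    for name, data in (("stale", stale), ("fixed", fixed)):
        missing = required_fields - data.keys()
        print(name, "-> missing:", missing or "none")
    # stale -> missing: {'image_sizes'}
    # fixed -> missing: none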
+ "image_sizes": [image.size for image in images], # list of image sizes } } From 8801698f8a160d517e2e9d2f968f290a01496170 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 17 Feb 2025 06:33:21 +0000 Subject: [PATCH 3/3] Fix Signed-off-by: DarkLight1337 --- vllm/multimodal/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index fd6f3283316ad..4e3e5b2088640 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -143,7 +143,7 @@ def __init__( fields_config = fields_factory(data) missing_required_fields = required_fields - fields_config.keys() if missing_required_fields: - fields = set(fields_factory.keys()) + fields = set(fields_config.keys()) msg = f"{required_fields=} should be a subset of {fields=}" raise ValueError(msg)