diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 55b3f52356cd0..67ed57baf8ea3 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ -- * `DeepseekVLV2ForCausalLM` +- * `DeepseekVLV2ForCausalLM`\^ * DeepSeek-VL2 * T + I+ - * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. * * ✅︎ * ✅︎ @@ -713,7 +713,7 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ -- * `ChatGLMModel` +- * `GLM4VForCausalLM`\^ * GLM-4V * T + I * `THUDM/glm-4v-9b` etc. @@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ -- * `QWenLMHeadModel` +- * `QwenVLForConditionalGeneration`\^ * Qwen-VL * T + IE+ * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. @@ -862,13 +862,10 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ ::: +\^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM. For example, to use DeepSeek-VL2 series models, use `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -:::{note} -To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -::: - :::{note} H2O-VL series models will be available in V1 once we support backends other than FlashAttention. 
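For reference, the new `\^` footnote and the example-script changes further down rely on the same mechanism: overriding the `architectures` field so vLLM selects its renamed model class rather than the name stored in the checkpoint's `config.json`. A rough usage sketch of both forms follows, using the model IDs and override values that appear in this diff; whether your installed vLLM version exposes `hf_overrides` / `--hf-overrides` exactly this way should be checked against that version.

```python
# Sketch only: selecting the vLLM-specific architecture name for a checkpoint
# whose config.json still reports the old one. Model IDs and architecture
# names are taken from the diff; this is not part of the diff itself.
from vllm import LLM

# Offline API: pass the vLLM architecture name via `hf_overrides`, as the
# updated examples/offline_inference scripts now do for GLM-4V and Qwen-VL.
llm = LLM(
    model="THUDM/glm-4v-9b",
    trust_remote_code=True,
    hf_overrides={"architectures": ["GLM4VForCausalLM"]},
)

# Server/CLI equivalent (shell), matching the footnote in supported_models.md:
#   vllm serve deepseek-ai/deepseek-vl2-tiny \
#       --hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'
```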
::: diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 9a4183106cff9..b9963669a0de1 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str): max_num_seqs=2, trust_remote_code=True, enforce_eager=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ {question}<|assistant|>" @@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str): trust_remote_code=True, max_model_len=1024, max_num_seqs=2, + hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 8d2172a606f8d..1a5ea0c70bccd 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ) -def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: +def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "h2oai/h2ovl-mississippi-2b" llm = LLM( @@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str, trust_remote_code=True, max_model_len=1024, max_num_seqs=2, + hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, limit_mm_per_prompt={"image": len(image_urls)}, ) placeholders = "".join(f"Picture {i}: \n" @@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, "deepseek_vl_v2": load_deepseek_vl2, - "h2ovl_chat": load_h2onvl, + "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, "internvl_chat": load_internvl, "mllama": load_mllama, diff --git a/tests/models/registry.py b/tests/models/registry.py index 7b1db55494fe4..3eb7ab83cb6a4 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -104,7 +104,8 @@ def check_available_online( trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"), "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), - # ChatGLMModel supports multimodal + "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b", + trust_remote_code=True), "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", trust_remote_code=True), "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501 @@ -167,7 +168,8 @@ def check_available_online( trust_remote_code=True), "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), - # QWenLMHeadModel supports multimodal + "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", + trust_remote_code=True), "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", @@ -232,14 +234,12 @@ def check_available_online( "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 - "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", - 
extras={"text_only": "THUDM/chatglm3-6b"}, - trust_remote_code=True), - "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", - is_available_online=False), "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), + "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", + trust_remote_code=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", trust_remote_code=True), @@ -264,9 +264,9 @@ def check_available_online( trust_remote_code=True), "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 tokenizer_mode="mistral"), - "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", - extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 - trust_remote_code=True), + "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL-Chat", + trust_remote_code=True, + hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501 "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 153c85cfb2141..26b4a95c530e8 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,20 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 - # Adapted from -# https://github.com/THUDM/CogAgent -"""Inference-only CogAgent model compatible with THUDM weights.""" -from argparse import Namespace -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +# https://github.com/THUDM/ChatGLM2-6B +"""Inference-only ChatGLM model compatible with THUDM weights.""" +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn from torch.nn import LayerNorm -from torchvision import transforms -from torchvision.transforms import InterpolationMode -from transformers import PreTrainedTokenizer, TensorType -from transformers.image_utils import ImageInput -from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -31,204 +23,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel -from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, BatchFeature, - MultiModalFieldConfig, - PromptReplacement) -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig -from 
.interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) - - -class GLMImagePixelInputs(TypedDict): - pixel_values: torch.Tensor - """Shape: `(batch_size, num_channels, height, width)`""" - - -class GLM4VProcessor: - """ - This model doesn't define its own HF processor, - so we implement our own one here. - - """ - - def __init__( - self, - config: ChatGLMConfig, - tokenizer: PreTrainedTokenizer, - ) -> None: - super().__init__() - - self.config = config - self.tokenizer = tokenizer - - if vision_config := getattr(config, "vision_config", None): - image_size = vision_config["image_size"] - - self.image_transform = transforms.Compose([ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ]) - else: - self.image_transform = None - - def __call__( - self, - text: Optional[Union[TextInput, list[TextInput]]] = None, - images: Optional[Union[ImageInput, list[ImageInput]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchFeature: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - text_inputs = self.tokenizer(text) - if len(images) == 0: - image_inputs = {} - else: - if self.image_transform is None: - raise ValueError("This model does not support image inputs") - - pixel_values = [self.image_transform(image) for image in images] - image_inputs = {"pixel_values": torch.stack(pixel_values)} - - return BatchFeature( - { - **text_inputs, - **image_inputs, - }, - tensor_type=return_tensors, - ) - - -class GLM4VProcessingInfo(BaseProcessingInfo): - - def get_tokenizer(self): - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - return tokenizer - - def get_hf_config(self): - return self.ctx.get_hf_config(ChatGLMConfig) - - def get_hf_processor(self) -> GLM4VProcessor: - return GLM4VProcessor( - self.get_hf_config(), - self.get_tokenizer(), - ) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_mm_max_tokens_per_item( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> Mapping[str, int]: - return {"image": self.get_num_image_feature_tokens()} - - def get_num_image_tokens(self) -> int: - hf_config = self.get_hf_config() - if not (vision_config := getattr(hf_config, "vision_config", None)): - return 0 - - image_size = vision_config["image_size"] - patch_size = vision_config["patch_size"] - grid_length = image_size // patch_size // 2 - return grid_length * grid_length - - def get_num_image_feature_tokens(self) -> int: - # EVA2CLIPModel has embeddings for boi and eoi tokens as well - return self.get_num_image_tokens() + 2 - - -class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): - - def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.info.get_hf_config() - if not (vision_config := getattr(hf_config, "vision_config", None)): - return ProcessorInputs(prompt_text="", mm_data={}) - - target_width = target_height = vision_config["image_size"] - num_images 
= mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>" - - return ProcessorInputs( - prompt_text=base_text * num_images, - mm_data=mm_data, - ) - - -class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - if not hasattr(hf_config, "vision_config"): - return [] - - boi_token_id = hf_config.boi_token_id - image_token_id = hf_config.pad_token_id - eoi_token_id = hf_config.eoi_token_id - - def get_replacement(item_idx: int): - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [image_token_id] * num_image_tokens - - return [boi_token_id] + image_tokens + [eoi_token_id] - - return [ - PromptReplacement( - modality="image", - target=[boi_token_id, image_token_id, eoi_token_id], - replacement=get_replacement, - ), - ] + maybe_prefix) class GLMAttention(nn.Module): @@ -489,7 +291,7 @@ def forward( position_ids: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, IntermediateTensors]: for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states = layer( @@ -498,8 +300,12 @@ def forward( kv_cache=kv_caches[i - self.start_layer], attn_metadata=attn_metadata, ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({"hidden_states": hidden_states}) + # Final layer norm. 
- if get_pp_group().is_last_rank and self.post_layer_norm: + if self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) return hidden_states @@ -534,61 +340,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, prefix=f"{prefix}.output_layer") - vision_config_flag = getattr(config, 'vision_config', None) - if vision_config_flag is not None: - self.vision_config = Namespace(**config.vision_config) - self.vision = EVA2CLIPModel(self.config, - quant_config, - prefix=f"{prefix}.vision") - else: - self.vision = None - self.make_empty_intermediate_tensors = ( self.encoder.make_empty_intermediate_tensors) - def _parse_and_validate_image_input( - self, **kwargs: object) -> GLMImagePixelInputs: - - pixel_values = kwargs.pop("pixel_values", None) - if pixel_values is not None and self.vision is not None: - if isinstance(pixel_values, torch.Tensor): - if pixel_values.ndim > 2: - pixel_values = torch.concat(list(pixel_values)) - elif isinstance(pixel_values, list): - return torch.concat(pixel_values) - else: - raise TypeError("""pixel_values must be a torch.Tensor - or a list of torch.Tensor - """) - return GLMImagePixelInputs(pixel_values=pixel_values) - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input["pixel_values"] is None: - return None - pixel_values = image_input["pixel_values"].to( - dtype=self.config.torch_dtype) - vision_embeddings = self.vision(pixel_values) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.embedding(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=[ - self.config.boi_token_id, - self.config.pad_token_id, - self.config.eoi_token_id, - ], - ) - return inputs_embeds + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embedding(input_ids) def forward( self, @@ -599,26 +355,24 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - if intermediate_tensors is not None: - inputs_embeds = intermediate_tensors["hidden_states"] - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) # Run encoder. 
hidden_states = self.encoder( - hidden_states=inputs_embeds, + hidden_states=hidden_states, position_ids=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, ) - if not get_pp_group().is_last_rank: - return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states def load_weights(self, weights: Iterable[Tuple[str, @@ -660,12 +414,18 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): +class ChatGLMBaseModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={".word_embeddings": ""}, ) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[ChatGLMModel] = ChatGLMModel, + ) -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -678,27 +438,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config self.max_position_embeddings = getattr(config, "max_sequence_length", 8192) - self.transformer = ChatGLMModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "transformer")) + self.transformer = transformer_type(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.transformer.output_layer.weight = ( self.transformer.embedding.weight) self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) self.sampler = get_sampler() - - def forward(self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - **kwargs) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - **kwargs) - return hidden_states + self.make_empty_intermediate_tensors = ( + self.transformer.make_empty_intermediate_tensors) def compute_logits( self, @@ -722,7 +472,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -class ChatGLM(ChatGLMBaseModel): +class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP): packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"] @@ -738,82 +488,28 @@ class ChatGLM(ChatGLMBaseModel): embedding_modules = {} embedding_padding_modules = [] + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + if hasattr(config, "vision_config"): + hf_overrides = {"architectures": ["GLM4VForCausalLM"]} + raise RuntimeError( + "The configuration of this model indicates that it supports " + "vision inputs, but you instantiated the text-only version " + "of this model. 
Please use the vision model by setting " + f"`--hf-overrides {hf_overrides!r}`") -class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): - - packed_modules_mapping = { - "query_key_value": ["query_key_value"], - "dense_h_to_4h": ["dense_h_to_4h"], - "merged_proj": ["gate_proj", "dense_h_to_4h"] - } - # LoRA specific attributes - supported_lora_modules = [ - "query_key_value", - "dense", - "dense_h_to_4h", - "dense_4h_to_h", - # vision - "fc1", - "fc2", - "merged_proj", - "linear_proj" - ] - - embedding_modules = {} - embedding_padding_modules = [] - - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="transformer.encoder", - connector="transformer.vision.linear_proj", - tower_model="transformer.vision.transformer") - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - return self.transformer.get_multimodal_embeddings(**kwargs) + super().__init__(vllm_config=vllm_config, prefix=prefix) - def get_input_embeddings( + def forward( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - return self.transformer.get_input_embeddings(input_ids, - multimodal_embeddings) - - -@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, - info=GLM4VProcessingInfo, - dummy_inputs=GLM4VDummyInputsBuilder) -class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, - SupportsMultiModal): - # Ensure that the LoRA support check passes when the class is not - # initialized, but set all these attributes to empty. - # These will be updated when an instance class is selected - packed_modules_mapping = {} - supported_lora_modules = [] - embedding_modules = {} - embedding_padding_modules = [] - - def __new__( - cls, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: - config = vllm_config.model_config.hf_config - - # Initialize VL - if hasattr(config, "vision_config"): # noqa: SIM108 - instance_cls = ChatGLMV - # Initialize LLM - else: - instance_cls = ChatGLM - - # quant_config references base class members, - # so update values before init is called - cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) - cls.supported_lora_modules += instance_cls.supported_lora_modules - cls.embedding_modules.update(instance_cls.embedding_modules) - cls.embedding_padding_modules += instance_cls.embedding_padding_modules - return instance_cls(vllm_config=vllm_config, prefix=prefix) + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py deleted file mode 100644 index 2facd1353aef1..0000000000000 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ /dev/null @@ -1,312 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from -# https://github.com/THUDM/GLM-4 -"""Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" -from argparse import Namespace -from typing import Optional - -import torch -from torch import nn -from torch.nn import LayerNorm - -from vllm.attention.layer import MultiHeadAttention -from 
vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - - -class PatchEmbedding(nn.Module): - - def __init__(self, config): - super().__init__() - self.proj = nn.Conv2d(config.in_channels, - config.hidden_size, - kernel_size=config.patch_size, - stride=config.patch_size) - self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) - self.position_embedding = nn.Embedding(config.num_positions, - config.hidden_size) - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """ - Parameters: - images : torch.Tensor - Input image tensor with shape (B, C, H, W) - - Returns: - torch.Tensor - Transformed tensor with shape (B, L, D) - """ - images = images.to(device=self.proj.weight.device, - dtype=self.proj.weight.dtype) - x = self.proj(images) - x = x.flatten(2).transpose(1, 2) - cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) - x = torch.cat((cls_token, x), dim=1) - x += self.position_embedding.weight.unsqueeze(0) - return x - - -class Attention(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.hidden_size = config.hidden_size - self.tp_size = get_tensor_model_parallel_world_size() - self.num_heads_per_rank = config.num_heads // self.tp_size - self.head_dim = config.hidden_size // config.num_heads - self.scale = self.head_dim**-0.5 - - self.query_key_value = QKVParallelLinear( - config.hidden_size, - self.head_dim, - config.num_heads, - quant_config=quant_config, - prefix=f"{prefix}.query_key_value", - ) - self.dense = RowParallelLinear( - config.hidden_size, - config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.dense", - ) - - self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, - self.scale) - self.output_dropout = torch.nn.Dropout(config.dropout_prob) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - qkv, _ = self.query_key_value(x) # B, L, 3 * H * D - q, k, v = qkv.chunk(3, dim=-1) - - out = self.attn(q, k, v) - output, _ = self.dense(out) - output = self.output_dropout(output) - return output - - -class MLP(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.config = config - self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear( - config.hidden_size, - config.intermediate_size, - quant_config=quant_config, - prefix=f"{prefix}.fc1", - ) - self.fc2 = RowParallelLinear( - config.intermediate_size, - config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.fc2", - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x, _ = self.fc1(x) - x = self.activation_fn(x) - x, _ = self.fc2(x) - return x - - -class TransformerLayer(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.input_layernorm = LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.attention = Attention(config, - quant_config=quant_config, - prefix=f"{prefix}.attention") - self.mlp = MLP(config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - 
self.post_attention_layernorm = LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - def forward(self, hidden_states): - attention_input = hidden_states - attention_output = self.input_layernorm( - self.attention(attention_input)) - hidden_states = attention_input + attention_output - mlp_input = hidden_states - mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) - output = mlp_input + mlp_output - return output - - -class Transformer(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.layers = nn.ModuleList([ - TransformerLayer(config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.num_hidden_layers) - ]) - - def forward(self, hidden_states): - for layer_module in self.layers: - hidden_states = layer_module(hidden_states) - return hidden_states - - -class GLU(nn.Module): - - def __init__( - self, - config, - in_features, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - """ - The original implementation is the same as: - ```python - self.dense_h_to_4h = ColumnParallelLinear( - config.hidden_size, - config.ffn_hidden_size, - bias=False, - quant_config=quant_config - ) - - self.gate_proj = ColumnParallelLinear( - config.hidden_size, - config.ffn_hidden_size, - bias=False, - quant_config=quant_config - ) - ``` - ``` - gate_proj_output, _ = self.gate_proj(x) - dense_h_to_4h_output, _ = self.dense_h_to_4h(x) - x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) - ``` - - We merge two ColumnParallelLinear into one MergedColumnParallelLinear: - ``` - self.merged_proj = MergedColumnParallelLinear( - config.hidden_size, - [config.ffn_hidden_size] * 2, - bias=False, - quant_config=quant_config - ) - ``` - ``` - x, _ = self.merged_proj(x) - ``` - """ - super().__init__() - self.linear_proj = ReplicatedLinear(in_features, - config.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.linear_proj") - self.norm1 = nn.LayerNorm(config.hidden_size) - self.act1 = nn.GELU() - self.act2 = SiluAndMul() - - self.merged_proj = MergedColumnParallelLinear( - config.hidden_size, [config.ffn_hidden_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.merged_proj") - - self.dense_4h_to_h = RowParallelLinear( - config.ffn_hidden_size, - config.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.dense_4h_to_h") - - def forward(self, x): - x, _ = self.linear_proj(x) - x = self.act1(self.norm1(x)) - x, _ = self.merged_proj(x) - x = self.act2(x) - x, _ = self.dense_4h_to_h(x) - return x - - -class EVA2CLIPModel(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - vision_config = Namespace(**config.vision_config) - self.patch_embedding = PatchEmbedding(vision_config) - self.transformer = Transformer(vision_config, - quant_config=quant_config, - prefix=f"{prefix}.transformer") - self.linear_proj = GLU(config, - in_features=config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.linear_proj") - self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, - out_channels=config.hidden_size, - kernel_size=2, - stride=2) - self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.scaling_factor = vision_config.scaling_factor - - def forward(self, images: 
torch.Tensor) -> torch.Tensor: - """ - Parameters: - images : torch.Tensor - Input image tensor with shape (B, C, H, W) - - Returns: - torch.Tensor - Transformed tensor with shape (B, L, D) - """ - x = self.patch_embedding(images) - x = self.transformer(x) - x = x[:, 1:] - - b, s, h = x.shape - grid_size = int(s**0.5) - x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) - x = self.conv(x) - - x = x.flatten(2).transpose(1, 2) - x = self.linear_proj(x) - boi = self.boi.expand(x.shape[0], -1, -1) - eoi = self.eoi.expand(x.shape[0], -1, -1) - x = torch.cat((boi, x, eoi), dim=1) - x = x / self.scaling_factor - return x diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py new file mode 100644 index 0000000000000..67f19841f4aa7 --- /dev/null +++ b/vllm/model_executor/models/glm4v.py @@ -0,0 +1,662 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/THUDM/CogAgent +"""Inference-only CogAgent model compatible with THUDM weights.""" +from argparse import Namespace +from typing import List, Literal, Mapping, Optional, TypedDict, Union + +import torch +from torch import nn +from torch.nn import LayerNorm +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import PreTrainedTokenizer, TensorType +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput + +from vllm.attention import AttentionMetadata +from vllm.attention.layer import MultiHeadAttention +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, BatchFeature, + MultiModalFieldConfig, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import ChatGLMConfig + +from .chatglm import ChatGLMBaseModel, ChatGLMModel +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .utils import flatten_bn, merge_multimodal_embeddings + + +class GLMVImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: `(batch_size, num_channels, height, width)`""" + + +class EVA2CLIPPatchEmbedding(nn.Module): + + def __init__(self, config): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, + config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(config.num_positions, + config.hidden_size) + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + images = images.to(device=self.proj.weight.device, + 
dtype=self.proj.weight.dtype) + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class EVA2CLIPAttention(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.hidden_size = config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_rank = config.num_heads // self.tp_size + self.head_dim = config.hidden_size // config.num_heads + self.scale = self.head_dim**-0.5 + + self.query_key_value = QKVParallelLinear( + config.hidden_size, + self.head_dim, + config.num_heads, + quant_config=quant_config, + prefix=f"{prefix}.query_key_value", + ) + self.dense = RowParallelLinear( + config.hidden_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.dense", + ) + + self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, + self.scale) + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + qkv, _ = self.query_key_value(x) # B, L, 3 * H * D + q, k, v = qkv.chunk(3, dim=-1) + + out = self.attn(q, k, v) + output, _ = self.dense(out) + output = self.output_dropout(output) + return output + + +class EVA2CLIPMLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.activation_fn(x) + x, _ = self.fc2(x) + return x + + +class EVA2CLIPTransformerLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.input_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.attention = EVA2CLIPAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.attention") + self.mlp = EVA2CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.post_attention_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + attention_output = self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class EVA2CLIPTransformer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.layers = nn.ModuleList([ + EVA2CLIPTransformerLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class EVA2CLIPGLU(nn.Module): + + def __init__( + self, + config, + 
in_features, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + """ + The original implementation is the same as: + ```python + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + + self.gate_proj = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + ``` + ``` + gate_proj_output, _ = self.gate_proj(x) + dense_h_to_4h_output, _ = self.dense_h_to_4h(x) + x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) + ``` + + We merge two ColumnParallelLinear into one MergedColumnParallelLinear: + ``` + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, + [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config + ) + ``` + ``` + x, _ = self.merged_proj(x) + ``` + """ + super().__init__() + self.linear_proj = ReplicatedLinear(in_features, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = SiluAndMul() + + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.merged_proj") + + self.dense_4h_to_h = RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h") + + def forward(self, x): + x, _ = self.linear_proj(x) + x = self.act1(self.norm1(x)) + x, _ = self.merged_proj(x) + x = self.act2(x) + x, _ = self.dense_4h_to_h(x) + return x + + +class EVA2CLIPModel(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + vision_config = Namespace(**config.vision_config) + self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config) + self.transformer = EVA2CLIPTransformer(vision_config, + quant_config=quant_config, + prefix=f"{prefix}.transformer") + self.linear_proj = EVA2CLIPGLU(config, + in_features=config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") + self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, + out_channels=config.hidden_size, + kernel_size=2, + stride=2) + self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.scaling_factor = vision_config.scaling_factor + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + x = self.patch_embedding(images) + x = self.transformer(x) + x = x[:, 1:] + + b, s, h = x.shape + grid_size = int(s**0.5) + x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) + x = self.conv(x) + + x = x.flatten(2).transpose(1, 2) + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + x = x / self.scaling_factor + return x + + +class GLM4VModel(ChatGLMModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + quant_config = vllm_config.quant_config + + self.vision = EVA2CLIPModel(self.config, + quant_config, + prefix=f"{prefix}.vision") + + +class GLM4VProcessor: + """ + This model doesn't 
define its own HF processor, + so we implement our own one here. + """ + + def __init__( + self, + config: ChatGLMConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + vision_config = config.vision_config + image_size = vision_config["image_size"] + + self.image_transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ), + ]) + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class GLM4VProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self): + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + return tokenizer + + def get_hf_config(self): + return self.ctx.get_hf_config(ChatGLMConfig) + + def get_hf_processor(self) -> GLM4VProcessor: + return GLM4VProcessor( + self.get_hf_config(), + self.get_tokenizer(), + ) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_num_image_feature_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + + image_size = vision_config["image_size"] + patch_size = vision_config["patch_size"] + grid_length = image_size // patch_size // 2 + return grid_length * grid_length + + def get_num_image_feature_tokens(self) -> int: + # EVA2CLIPModel has embeddings for boi and eoi tokens as well + return self.get_num_image_tokens() + 2 + + +class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + + target_width = target_height = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>" + + return ProcessorInputs( + prompt_text=base_text * num_images, + mm_data=mm_data, + ) + + +class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: 
MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + + boi_token_id = hf_config.boi_token_id + image_token_id = hf_config.pad_token_id + eoi_token_id = hf_config.eoi_token_id + + def get_replacement(item_idx: int): + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens + + return [boi_token_id] + image_tokens + [eoi_token_id] + + return [ + PromptReplacement( + modality="image", + target=[boi_token_id, image_token_id, eoi_token_id], + replacement=get_replacement, + ), + ] + + +@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, + info=GLM4VProcessingInfo, + dummy_inputs=GLM4VDummyInputsBuilder) +class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, + SupportsMultiModal): + + packed_modules_mapping = { + "query_key_value": ["query_key_value"], + "dense_h_to_4h": ["dense_h_to_4h"], + "merged_proj": ["gate_proj", "dense_h_to_4h"] + } + # LoRA specific attributes + supported_lora_modules = [ + "query_key_value", + "dense", + "dense_h_to_4h", + "dense_4h_to_h", + # vision + "fc1", + "fc2", + "merged_proj", + "linear_proj" + ] + + embedding_modules = {} + embedding_padding_modules = [] + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="transformer.encoder", + connector="transformer.vision.linear_proj", + tower_model="transformer.vision.transformer") + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[GLM4VModel] = GLM4VModel, + ) -> None: + super().__init__( + vllm_config=vllm_config, + prefix=prefix, + transformer_type=transformer_type, + ) + + self.transformer: GLM4VModel + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config["image_size"] + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[GLMVImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + return GLMVImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + return None + + def _process_image_input( + self, image_input: GLMVImagePixelInputs) -> torch.Tensor: + pixel_values = image_input["data"].to(dtype=self.config.torch_dtype) + + return self.transformer.vision(pixel_values) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.transformer.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=[ + self.config.boi_token_id, + self.config.pad_token_id, + self.config.eoi_token_id, + ], + ) + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + + return hidden_states diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 4b8aeaddbdd37..a45e9463ab67b 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,381 +6,35 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" -import copy -import math -import re -import unicodedata -from functools import lru_cache, partial -from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable, - List, Literal, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn -from torchvision import transforms -from torchvision.transforms import InterpolationMode -from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer, - TensorType) -from transformers.image_utils import ImageInput -from transformers.tokenization_utils_base import TextInput +from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.logger import init_logger -from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.activation import SiluAndMul from 
vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement, - PromptReplacementDetails) -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (flatten_bn, is_pp_missing_parameter, +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) - -logger = init_logger(__name__) - - -class QwenImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor - """ - Shape: `(batch_size * num_images, 3, image_size, image_size)` - - Note that image_size is the value in the vision config to which we resize - the image to in the normalization transform. Currently multi-image support - can only be leveraged by passing image embeddings directly. - """ - - -class QwenImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, 256, hidden_size)` - - `hidden_size` must match the hidden size of the language model backbone - and is stored in the visual config of the model if we have one. - """ - - -QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] - - -class VisualAttention(nn.Module): - """self-attention layer class. - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - kdim: Optional[int] = None, - vdim: Optional[int] = None, - ): - super().__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = self.kdim == embed_dim \ - and self.vdim == embed_dim - - self.num_heads = num_heads - - # Per attention head and per partition values. - assert embed_dim % num_heads == 0 - self.hidden_size_per_attention_head = embed_dim // num_heads - self.num_attention_heads_per_partition = num_heads - self.hidden_size_per_partition = embed_dim - - # Strided linear layer. 
- assert self._qkv_same_embed_dim, \ - 'Visual Attention implementation only supports self-attention' - self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim) - self.out_proj = ReplicatedLinear(embed_dim, embed_dim) - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - - def forward( - self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - # query/key/value: [sq, b, h] - sq, b, _ = x.size() - mixed_x_layer, _ = self.in_proj(x) - - # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - query_layer, key_layer, value_layer = mixed_x_layer.split( - self.hidden_size_per_attention_head, dim=-1) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view( - sq, b * self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head).transpose(0, 1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view( - sq, b * self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head).transpose(0, 1) - - q_scaled = query_layer / self.norm_factor - if attn_mask is not None: - attention_probs = torch.baddbmm(attn_mask, q_scaled, - key_layer.transpose(-2, -1)) - else: - attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1)) - attention_probs = attention_probs.softmax(dim=-1) - - value_layer = value_layer.view( - sq, b * self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head).transpose(0, 1) - - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer) - - # change view [b, np, sq, hn] - context_layer = context_layer.view( - b, self.num_attention_heads_per_partition, sq, - self.hidden_size_per_attention_head) - - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - output, _ = self.out_proj(context_layer) - - return output - - -class QwenVMLP(nn.Module): - """MLP for the visual component of the Qwen model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - self.c_fc = ColumnParallelLinear(hidden_size, - intermediate_size, - bias=True, - quant_config=quant_config) - self.act_fn = get_act_fn("gelu") - self.c_proj = RowParallelLinear( - intermediate_size, - hidden_size, - bias=True, - quant_config=quant_config, - ) - - def forward(self, x): - x, _ = self.c_fc(x) - x = self.act_fn(x) - x, _ = self.c_proj(x) - return x - - -class VisualAttentionBlock(nn.Module): - - def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float = 4.0, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - - self.ln_1 = norm_layer(d_model) - self.ln_2 = norm_layer(d_model) - mlp_width = int(d_model * mlp_ratio) - self.attn = VisualAttention(d_model, n_head) - self.mlp = QwenVMLP( - hidden_size=d_model, - intermediate_size=mlp_width, - quant_config=quant_config, - ) - - def attention( - self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - attn_mask = 
attn_mask.to(x.dtype) if attn_mask is not None else None - return self.attn(x, attn_mask=attn_mask) - - def forward( - self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) - x = x + self.mlp(self.ln_2(x)) - return x - - -class TransformerBlock(nn.Module): - - def __init__( - self, - width: int, - layers: int, - heads: int, - mlp_ratio: float = 4.0, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - self.width = width - self.layers = layers - - self.resblocks = nn.ModuleList([ - VisualAttentionBlock(width, - heads, - mlp_ratio, - norm_layer=norm_layer, - quant_config=quant_config) - for _ in range(layers) - ]) - - def get_cast_dtype(self) -> torch.dtype: - return self.resblocks[0].mlp.c_fc.weight.dtype - - def get_cast_device(self) -> torch.device: - return self.resblocks[0].mlp.c_fc.weight.device - - def forward(self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: - for r in self.resblocks: - x = r(x, attn_mask=attn_mask) - return x - - -class VisionTransformer(nn.Module): - - def __init__(self, - image_size: int, - patch_size: int, - width: int, - layers: int, - heads: int, - mlp_ratio: float, - n_queries: int = 256, - output_dim: int = 512, - image_start_id: int = 151857, - quant_config: Optional[QuantizationConfig] = None, - **kwargs): - super().__init__() - image_height, image_width = self.image_size = (image_size, image_size) - patch_height, patch_width = self.patch_size = (patch_size, patch_size) - self.grid_size = (image_height // patch_height, - image_width // patch_width) - self.output_dim = output_dim - self.conv1 = nn.Conv2d(in_channels=3, - out_channels=width, - kernel_size=patch_size, - stride=patch_size, - bias=False) - - # class embeddings and positional embeddings - scale = width**-0.5 - self.positional_embedding = nn.Parameter(scale * - torch.randn(256, width)) - - norm_layer = partial(nn.LayerNorm, eps=1e-6) - - self.ln_pre = norm_layer(width) - self.transformer = TransformerBlock(width, - layers, - heads, - mlp_ratio, - norm_layer=norm_layer, - quant_config=quant_config) - - self.attn_pool = Resampler2( - grid_size=int(math.sqrt(n_queries)), - embed_dim=output_dim, - num_heads=output_dim // 128, - kv_dim=width, - norm_layer=norm_layer, - adaptive=False, - do_post_projection=False, - ).to( - device=self.positional_embedding.device, - dtype=self.positional_embedding.dtype, - ) - - self.ln_post = norm_layer(output_dim) - self.proj = nn.Parameter( - (output_dim**-0.5) * torch.randn(output_dim, output_dim)) - - self.image_start_id = image_start_id - self.image_end_id = image_start_id + 1 - self.image_pad_id = image_start_id + 2 - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x.to( - dtype=self.transformer.get_cast_dtype(), - device=self.transformer.get_cast_device(), - ) - - # to patches - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], - -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - - x = x + get_abs_pos(self.positional_embedding, int(math.sqrt( - x.size(1)))) - - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - x = self.attn_pool(x) - x = self.ln_post(x) - x = x @ self.proj - - return x + maybe_prefix) class QWenMLP(nn.Module): @@ -564,12 +218,6 @@ def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - if (vision_config := getattr(config, "visual", None)): - self.visual = VisionTransformer(**vision_config, - quant_config=quant_config) - else: - self.visual = None - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.wte(input_ids) @@ -592,6 +240,7 @@ def forward( assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): layer = self.h[i] hidden_states, residual = layer( @@ -610,302 +259,25 @@ def forward( return hidden_states -@lru_cache(maxsize=1) -def _get_tokenizer_without_image_pad( - tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: - """ - The logic of adding image pad tokens should only be applied in - :class:`QWenVLProcessor`, so they are patched out here. - - The definition of the wrapped tokenizer can be found here: - https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py - """ - new_tokenizer = copy.deepcopy(tokenizer) - - class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore - - def tokenize( - self, - text: str, - allowed_special: Union[AbstractSet[str], str] = "all", - disallowed_special: Union[Collection[str], str] = (), - **kwargs, - ) -> list[Union[bytes, str]]: - text = unicodedata.normalize("NFC", text) - - return [ - self.decoder[t] for t in self.tokenizer.encode( - text, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ] - - def _decode( - self, - token_ids: Union[int, List[int]], - skip_special_tokens: bool = False, - errors: Optional[str] = None, - **kwargs, - ) -> str: - if isinstance(token_ids, int): - token_ids = [token_ids] - - return self.tokenizer.decode( - token_ids, - errors=errors or self.errors, - ) - - TokenizerWithoutImagePad.__name__ = \ - f"{tokenizer.__class__.__name__}WithoutImagePad" - - new_tokenizer.__class__ = TokenizerWithoutImagePad - return new_tokenizer - - -class QWenVLProcessor: - """ - This model doesn't define its own HF processor, - so we implement our own one here. 
- - We call the wrapped tokenizer to automatically insert image pad tokens: - https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 - - The image processor is defined here: - https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 - """ +class QWenBaseModel(nn.Module): def __init__( self, - config: PretrainedConfig, - tokenizer: PreTrainedTokenizer, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[QWenModel] = QWenModel, ) -> None: super().__init__() - - self.config = config - self.tokenizer = tokenizer - - if vision_config := getattr(self.config, "visual", None): - image_size = vision_config["image_size"] - - self.image_transform = transforms.Compose([ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ]) - else: - self.image_transform = None - - @property - def image_start_tag(self) -> str: - return self.tokenizer.image_start_tag # type: ignore - - @property - def image_end_tag(self) -> str: - return self.tokenizer.image_end_tag # type: ignore - - @property - def image_pad_tag(self) -> str: - return self.tokenizer.image_pad_tag # type: ignore - - def __call__( - self, - text: Optional[Union[TextInput, list[TextInput]]] = None, - images: Optional[Union[ImageInput, list[ImageInput]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchFeature: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - - text_inputs = self.tokenizer(text) - - if len(images) == 0: - image_inputs = {} - else: - if self.image_transform is None: - raise ValueError("This model does not support image inputs") - - pixel_values = [self.image_transform(image) for image in images] - image_inputs = {"pixel_values": torch.stack(pixel_values)} - - return BatchFeature( - { - **text_inputs, - **image_inputs, - }, - tensor_type=return_tensors, - ) - - -class QWenVLProcessingInfo(BaseProcessingInfo): - - def get_tokenizer(self) -> PreTrainedTokenizer: - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - - return _get_tokenizer_without_image_pad(tokenizer) - - def get_hf_processor(self) -> QWenVLProcessor: - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - - return QWenVLProcessor(self.get_hf_config(), tokenizer) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_mm_max_tokens_per_item( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> Mapping[str, int]: - return {"image": self.get_num_image_tokens()} - - def get_num_image_tokens(self) -> int: - hf_config = self.get_hf_config() - if not (vision_config := getattr(hf_config, "visual", None)): - return 0 - - image_size = vision_config["image_size"] - patch_size = vision_config["patch_size"] - grid_length = image_size // patch_size // 2 - return grid_length * grid_length - - -class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): - - def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.info.get_hf_config() - if not (vision_config := getattr(hf_config, "visual", None)): - return ProcessorInputs(prompt_text="", mm_data={}) - - processor = self.info.get_hf_processor() - 
img_start = processor.image_start_tag - img_end = processor.image_end_tag - - target_width = target_height = vision_config["image_size"] - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n" - for i in range(1, num_images + 1)), - mm_data=mm_data, - ) - - -class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]): - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - ) -> BatchFeature: - # Drops anything between / tags; encoding with the tokenizer - # will automatically add the image pads for the context. - prompt, num_matched_images = re.subn( - r"(Picture \d*: ).*?(<\/img>\n)", - r"\1\2", - prompt, - ) - - image_data = mm_data.get("images") - if image_data is not None: - assert isinstance(image_data, list) - - num_images = len(image_data) - if num_matched_images != num_images: - logger.warning( - "Number of matched image placeholders %s doesn't match " - "the number of expected images %s; check your placeholder " - "formatting.", num_matched_images, num_images) - - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - if not hasattr(hf_config, "visual"): - return [] - - tokenizer = self.info.get_tokenizer() - special_tokens: dict[str, - int] = tokenizer.special_tokens # type: ignore - - processor = self.info.get_hf_processor() - img_start_id = special_tokens[processor.image_start_tag] - img_end_id = special_tokens[processor.image_end_tag] - img_pad_id = special_tokens[processor.image_pad_tag] - - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [img_pad_id] * num_image_tokens - - return [ - PromptReplacement( - modality="image", - target=[img_start_id, img_end_id], - replacement=PromptReplacementDetails( - full=[img_start_id] + image_tokens + [img_end_id], - features=image_tokens, - ), - ) - ] - - -class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.quant_config = quant_config - self.transformer = QWenModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "transformer")) + self.transformer = transformer_type(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -916,104 +288,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> 
torch.Tensor: - h = w = self.config.visual["image_size"] - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[QwenImageInputs]: - pixel_values = kwargs.pop("pixel_values", None) - image_embeds = kwargs.pop("image_embeds", None) - - if pixel_values is not None: - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - return QwenImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), - ) - - if image_embeds is not None: - if not isinstance(image_embeds, torch.Tensor): - raise ValueError("Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}") - - return QwenImageEmbeddingInputs( - type="image_embeds", - data=flatten_bn(image_embeds), - ) - - return None - - def _process_image_input(self, - image_input: QwenImageInputs) -> torch.Tensor: - if image_input["type"] == "image_embeds": - return image_input["data"] - - assert self.transformer.visual is not None - return self.transformer.visual(image_input["data"]) - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.transformer.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - assert self.transformer.visual is not None - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.transformer.visual.image_pad_id) - - return inputs_embeds - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: object, - ) -> Union[torch.Tensor, IntermediateTensors]: - if intermediate_tensors is not None: - inputs_embeds = None - - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) - input_ids = None - - hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - return hidden_states - def compute_logits( self, hidden_states: torch.Tensor, @@ -1072,26 +346,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -class QWenLLM(QWenBaseModel): - packed_modules_mapping = { - "c_attn": ["c_attn"], - "gate_up_proj": [ - "w2", - "w1", - ], - } - # LoRA specific attributes - supported_lora_modules = [ - "c_attn", - "gate_up_proj", - "c_proj", - ] - - embedding_modules = {} - embedding_padding_modules = [] - - -class QWenVL(QWenBaseModel, SupportsMultiModal): +class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA): packed_modules_mapping = { "c_attn": ["c_attn"], "gate_up_proj": [ @@ -1104,62 +359,35 @@ class QWenVL(QWenBaseModel, SupportsMultiModal): "c_attn", "gate_up_proj", "c_proj", - # visual module - "out_proj", - "in_proj", - "c_fc", - # resampler - "kv_proj", ] embedding_modules = {} embedding_padding_modules = [] - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="transformer.h", - connector="transformer.visual.attn_pool", - tower_model="transformer.visual.transformer") - - -@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor, - info=QWenVLProcessingInfo, - dummy_inputs=QWenVLDummyInputsBuilder) -class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): - """ - QWenLMHeadModel is not only applicable to LLM but also to VL, which is not - conducive to the current integration logic of LoRA in vLLM. Therefore, it - is necessary to separate them. - """ - # Ensure that the LoRA support check passes when the class is not - # initialized, but set all these attributes to empty. - # These will be updated when an instance class is selected - packed_modules_mapping = {} - supported_lora_modules = [] - embedding_modules = {} - embedding_padding_modules = [] - - def __new__( - cls, - vllm_config: VllmConfig, - prefix: str = "", - ) -> QWenBaseModel: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config + if hasattr(config, "visual"): + hf_overrides = { + "architectures": ["QwenVLForConditionalGeneration"] + } + raise RuntimeError( + "The configuration of this model indicates that it supports " + "vision inputs, but you instantiated the text-only version " + "of this model. 
Please use the vision model by setting " + f"`--hf-overrides {hf_overrides!r}`") + + super().__init__(vllm_config=vllm_config, prefix=prefix) - # Initialize VL - if hasattr(config, "visual"): # noqa: SIM108 - instance_cls = QWenVL - # Initialize LLM - else: - instance_cls = QWenLLM - - # quant_config references base class members, - # so update values before init is called - cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) - cls.supported_lora_modules += instance_cls.supported_lora_modules - cls.embedding_modules.update(instance_cls.embedding_modules) - cls.embedding_padding_modules += instance_cls.embedding_padding_modules - return instance_cls(vllm_config=vllm_config, prefix=prefix) + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py new file mode 100644 index 0000000000000..5316eb7e002bc --- /dev/null +++ b/vllm/model_executor/models/qwen_vl.py @@ -0,0 +1,794 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://huggingface.co/Qwen/Qwen-VL/blob/main/modeling_qwen.py +# Copyright (c) Alibaba Cloud. +"""Inference-only Qwen-VL model compatible with HuggingFace weights.""" + +import copy +import math +import re +import unicodedata +from functools import lru_cache, partial +from typing import (AbstractSet, Callable, Collection, List, Literal, Mapping, + Optional, TypedDict, Union) + +import torch +from torch import nn +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer, + TensorType) +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .qwen import QWenBaseModel, QWenModel +from .utils import flatten_bn, merge_multimodal_embeddings + + +class QwenImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, 3, image_size, image_size)` + + Note that image_size is the value in the vision config to which we resize + the image to in the normalization 
transform. Currently multi-image support + can only be leveraged by passing image embeddings directly. + """ + + +class QwenImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """Shape: `(batch_size * num_images, 256, hidden_size)` + + `hidden_size` must match the hidden size of the language model backbone + and is stored in the visual config of the model if we have one. + """ + + +QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] + + +class VisualAttention(nn.Module): + """self-attention layer class. + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + kdim: Optional[int] = None, + vdim: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim \ + and self.vdim == embed_dim + + self.num_heads = num_heads + + # Per attention head and per partition values. + assert embed_dim % num_heads == 0 + self.hidden_size_per_attention_head = embed_dim // num_heads + self.num_attention_heads_per_partition = num_heads + self.hidden_size_per_partition = embed_dim + + # Strided linear layer. + assert self._qkv_same_embed_dim, \ + 'Visual Attention implementation only supports self-attention' + self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim) + self.out_proj = ReplicatedLinear(embed_dim, embed_dim) + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # query/key/value: [sq, b, h] + sq, b, _ = x.size() + mixed_x_layer, _ = self.in_proj(x) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + query_layer, key_layer, value_layer = mixed_x_layer.split( + self.hidden_size_per_attention_head, dim=-1) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + q_scaled = query_layer / self.norm_factor + if attn_mask is not None: + attention_probs = torch.baddbmm(attn_mask, q_scaled, + key_layer.transpose(-2, -1)) + else: + attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1)) + attention_probs = attention_probs.softmax(dim=-1) + + value_layer = value_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) + + # change view [b, np, sq, hn] + context_layer = context_layer.view( + b, self.num_attention_heads_per_partition, sq, + self.hidden_size_per_attention_head) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = 
context_layer.view(*new_context_layer_shape) + + output, _ = self.out_proj(context_layer) + + return output + + +class QwenVLMLP(nn.Module): + """MLP for the visual component of the Qwen model.""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.c_fc = ColumnParallelLinear(hidden_size, + intermediate_size, + bias=True, + quant_config=quant_config) + self.act_fn = get_act_fn("gelu") + self.c_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=True, + quant_config=quant_config, + ) + + def forward(self, x): + x, _ = self.c_fc(x) + x = self.act_fn(x) + x, _ = self.c_proj(x) + return x + + +class VisualAttentionBlock(nn.Module): + + def __init__( + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + + self.ln_1 = norm_layer(d_model) + self.ln_2 = norm_layer(d_model) + mlp_width = int(d_model * mlp_ratio) + self.attn = VisualAttention(d_model, n_head) + self.mlp = QwenVLMLP( + hidden_size=d_model, + intermediate_size=mlp_width, + quant_config=quant_config, + ) + + def attention( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None + return self.attn(x, attn_mask=attn_mask) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +class TransformerBlock(nn.Module): + + def __init__( + self, + width: int, + layers: int, + heads: int, + mlp_ratio: float = 4.0, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.width = width + self.layers = layers + + self.resblocks = nn.ModuleList([ + VisualAttentionBlock(width, + heads, + mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config) + for _ in range(layers) + ]) + + def get_cast_dtype(self) -> torch.dtype: + return self.resblocks[0].mlp.c_fc.weight.dtype + + def get_cast_device(self) -> torch.device: + return self.resblocks[0].mlp.c_fc.weight.device + + def forward(self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + for r in self.resblocks: + x = r(x, attn_mask=attn_mask) + return x + + +class VisionTransformer(nn.Module): + + def __init__(self, + image_size: int, + patch_size: int, + width: int, + layers: int, + heads: int, + mlp_ratio: float, + n_queries: int = 256, + output_dim: int = 512, + image_start_id: int = 151857, + quant_config: Optional[QuantizationConfig] = None, + **kwargs): + super().__init__() + image_height, image_width = self.image_size = (image_size, image_size) + patch_height, patch_width = self.patch_size = (patch_size, patch_size) + self.grid_size = (image_height // patch_height, + image_width // patch_width) + self.output_dim = output_dim + self.conv1 = nn.Conv2d(in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + # class embeddings and positional embeddings + scale = width**-0.5 + self.positional_embedding = nn.Parameter(scale * + torch.randn(256, width)) + + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.ln_pre = norm_layer(width) + self.transformer = TransformerBlock(width, + layers, + heads, + 
mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config) + + self.attn_pool = Resampler2( + grid_size=int(math.sqrt(n_queries)), + embed_dim=output_dim, + num_heads=output_dim // 128, + kv_dim=width, + norm_layer=norm_layer, + adaptive=False, + do_post_projection=False, + ).to( + device=self.positional_embedding.device, + dtype=self.positional_embedding.dtype, + ) + + self.ln_post = norm_layer(output_dim) + self.proj = nn.Parameter( + (output_dim**-0.5) * torch.randn(output_dim, output_dim)) + + self.image_start_id = image_start_id + self.image_end_id = image_start_id + 1 + self.image_pad_id = image_start_id + 2 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.to( + dtype=self.transformer.get_cast_dtype(), + device=self.transformer.get_cast_device(), + ) + + # to patches + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + x = x + get_abs_pos(self.positional_embedding, int(math.sqrt( + x.size(1)))) + + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.attn_pool(x) + x = self.ln_post(x) + x = x @ self.proj + + return x + + +class QwenVLModel(QWenModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.visual = VisionTransformer(**config.visual, + quant_config=quant_config) + + +@lru_cache(maxsize=1) +def _get_tokenizer_without_image_pad( + tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: + """ + The logic of adding image pad tokens should only be applied in + :class:`QwenVLProcessor`, so they are patched out here. + + The definition of the wrapped tokenizer can be found here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py + """ + new_tokenizer = copy.deepcopy(tokenizer) + + class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore + + def tokenize( + self, + text: str, + allowed_special: Union[AbstractSet[str], str] = "all", + disallowed_special: Union[Collection[str], str] = (), + **kwargs, + ) -> list[Union[bytes, str]]: + text = unicodedata.normalize("NFC", text) + + return [ + self.decoder[t] for t in self.tokenizer.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ] + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: Optional[str] = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, + errors=errors or self.errors, + ) + + TokenizerWithoutImagePad.__name__ = \ + f"{tokenizer.__class__.__name__}WithoutImagePad" + + new_tokenizer.__class__ = TokenizerWithoutImagePad + return new_tokenizer + + +class QwenVLProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. 
+ + We call the wrapped tokenizer to automatically insert image pad tokens: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 + + The image processor is defined here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + vision_config = config.visual + image_size = vision_config["image_size"] + + self.image_transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ), + ]) + + @property + def image_start_tag(self) -> str: + return self.tokenizer.image_start_tag # type: ignore + + @property + def image_end_tag(self) -> str: + return self.tokenizer.image_end_tag # type: ignore + + @property + def image_pad_tag(self) -> str: + return self.tokenizer.image_pad_tag # type: ignore + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class QwenVLProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self) -> PreTrainedTokenizer: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return _get_tokenizer_without_image_pad(tokenizer) + + def get_hf_processor(self) -> QwenVLProcessor: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return QwenVLProcessor(self.get_hf_config(), tokenizer) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.visual + + image_size = vision_config["image_size"] + patch_size = vision_config["patch_size"] + grid_length = image_size // patch_size // 2 + return grid_length * grid_length + + +class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + vision_config = hf_config.visual + + processor = self.info.get_hf_processor() + img_start = processor.image_start_tag + img_end = processor.image_end_tag + + target_width = target_height = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n" + for i in range(1, 
num_images + 1)), + mm_data=mm_data, + ) + + +class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + # Drops anything between / tags; encoding with the tokenizer + # will automatically add the image pads for the context. + prompt, num_matched_images = re.subn( + r"(Picture \d*: ).*?(<\/img>\n)", + r"\1\2", + prompt, + ) + + image_data = mm_data.get("images") + if image_data is not None: + assert isinstance(image_data, list) + + num_images = len(image_data) + assert num_matched_images == num_images + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + + processor = self.info.get_hf_processor() + img_start_id = special_tokens[processor.image_start_tag] + img_end_id = special_tokens[processor.image_end_tag] + img_pad_id = special_tokens[processor.image_pad_tag] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [img_pad_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[img_start_id, img_end_id], + replacement=PromptReplacementDetails( + full=[img_start_id] + image_tokens + [img_end_id], + features=image_tokens, + ), + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor(QwenVLMultiModalProcessor, + info=QwenVLProcessingInfo, + dummy_inputs=QwenVLDummyInputsBuilder) +class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, + SupportsMultiModal): + packed_modules_mapping = { + "c_attn": ["c_attn"], + "gate_up_proj": [ + "w2", + "w1", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + "c_attn", + "gate_up_proj", + "c_proj", + # visual module + "out_proj", + "in_proj", + "c_fc", + # resampler + "kv_proj", + ] + + embedding_modules = {} + embedding_padding_modules = [] + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="transformer.h", + connector="transformer.visual.attn_pool", + tower_model="transformer.visual.transformer") + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[QwenVLModel] = QwenVLModel, + ) -> None: + super().__init__( + vllm_config=vllm_config, + prefix=prefix, + transformer_type=transformer_type, + ) + + self.transformer: QwenVLModel + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.visual["image_size"] + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. 
" + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[QwenImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return QwenImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return QwenImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) + + return None + + def _process_image_input(self, + image_input: QwenImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + return self.transformer.visual(image_input["data"]) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.transformer.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.transformer.visual.image_pad_id) + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 198b6d134718f..256718a0670df 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -39,7 +39,7 @@ "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), "BambaForCausalLM": ("bamba", "BambaForCausalLM"), "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - # ChatGLMModel supports multimodal + "ChatGLMForCausalLM": ("chatglm", "ChatGLMForCausalLM"), "CohereForCausalLM": ("commandr", "CohereForCausalLM"), "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), @@ -90,7 +90,7 @@ "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), - # QWenLMHeadModel supports multimodal + "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), @@ -156,10 +156,9 @@ "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"), "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 - "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), - "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), + "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), @@ -175,7 +174,7 @@ "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 - "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), + "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501 "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
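A minimal offline-inference sketch of the new `QwenVLForConditionalGeneration` path, assuming the `Qwen/Qwen-VL-Chat` checkpoint, a local image opened with PIL, and the `Picture 1: <img></img>\n` placeholder format produced by `QwenVLDummyInputsBuilder` above; the `hf_overrides` entry mirrors the override named in the `QWenLMHeadModel` error message:

# Sketch only: run Qwen-VL through the new QwenVLForConditionalGeneration entry point.
# The architecture override is needed because the HF checkpoint config still reports
# the old text-only architecture name.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen-VL-Chat",  # assumed checkpoint; other Qwen-VL variants should behave the same
    trust_remote_code=True,
    hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
)

image = Image.open("example.jpg")  # hypothetical local image path
prompt = "Picture 1: <img></img>\nWhat is shown in this picture?"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)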