From b7abed2e14460ed403863c233c2b3e86fb4656da Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 04:06:20 +0000 Subject: [PATCH 01/17] Enable `require_post_norm` in vision encoders --- vllm/model_executor/models/blip.py | 34 ++++++++++++++--------- vllm/model_executor/models/clip.py | 41 +++++++++++++++++++--------- vllm/model_executor/models/siglip.py | 22 +++++++++++---- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 7c8e76461dd67..d90f2089db065 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -122,7 +122,7 @@ def input_processor_for_blip( # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): - def __init__(self, config: BlipVisionConfig): + def __init__(self, config: Union[BlipVisionConfig, Blip2VisionConfig]): super().__init__() self.config = config @@ -167,7 +167,7 @@ class BlipParallelAttention(nn.Module): def __init__( self, - config: BlipVisionConfig, + config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -236,7 +236,7 @@ def forward( class BlipMLP(nn.Module): def __init__(self, - config: BlipVisionConfig, + config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None): super().__init__() @@ -263,7 +263,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class BlipEncoderLayer(nn.Module): def __init__(self, - config: BlipVisionConfig, + config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None): super().__init__() @@ -308,7 +308,7 @@ class BlipEncoder(nn.Module): """ def __init__(self, - config: BlipVisionConfig, + config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None, num_hidden_layers_override: Optional[int] = None): super().__init__() @@ -337,10 +337,14 @@ class BlipVisionModel(nn.Module): config_class = BlipVisionConfig main_input_name = "pixel_values" - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: Union[BlipVisionConfig, Blip2VisionConfig], + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + ) -> None: super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -356,17 +360,21 @@ def __init__(self, num_hidden_layers_override=num_hidden_layers_override, ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." 
) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index edfb0c2b5e19b..42389e85a6f6a 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -354,11 +354,16 @@ def forward(self, inputs_embeds: torch.Tensor): class CLIPVisionTransformer(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + ) -> None: super().__init__() + self.config = config embed_dim = config.hidden_size @@ -372,17 +377,21 @@ def __init__(self, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." ) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None def forward( @@ -405,10 +414,14 @@ class CLIPVisionModel(nn.Module): config_class = CLIPVisionConfig main_input_name = "pixel_values" - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + ) -> None: super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -418,7 +431,9 @@ def __init__(self, self.vision_model = CLIPVisionTransformer( config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + require_post_norm=require_post_norm, + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.vision_model(pixel_values) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 743a81f8f9e95..19312187ae048 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -454,9 +454,12 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, num_hidden_layers_override: Optional[int] = 
None, - ): + require_post_norm: Optional[bool] = None, + ) -> None: super().__init__() + self.config = config embed_dim = config.hidden_size @@ -467,17 +470,21 @@ def __init__( num_hidden_layers_override=num_hidden_layers_override, ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." ) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None self.use_head = (True if not hasattr(config, "vision_use_head") else @@ -517,8 +524,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, num_hidden_layers_override: Optional[int] = None, - ): + require_post_norm: Optional[bool] = None, + ) -> None: super().__init__() num_heads = config.num_attention_heads @@ -529,6 +538,7 @@ def __init__( config, quant_config, num_hidden_layers_override=num_hidden_layers_override, + require_post_norm=require_post_norm, ) def get_input_embeddings(self) -> nn.Module: From dd4e42f5474c4fdbf8c3a42fc43964cf3e7f4898 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 04:06:43 +0000 Subject: [PATCH 02/17] Fix `quant_config` not passed to vision tower --- vllm/model_executor/models/blip2.py | 2 +- vllm/model_executor/models/internvl.py | 5 +++- vllm/model_executor/models/llava.py | 25 ++++++++++++---- vllm/model_executor/models/llava_next.py | 30 ++----------------- .../model_executor/models/llava_next_video.py | 29 ++---------------- vllm/model_executor/models/llava_onevision.py | 29 ++---------------- vllm/model_executor/models/nvlm_d.py | 5 ++++ vllm/model_executor/models/paligemma.py | 3 +- 8 files changed, 38 insertions(+), 90 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 3ab235754a404..4bcf3ce8d6ec8 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -489,7 +489,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_model = BlipVisionModel(config.vision_config) + self.vision_model = BlipVisionModel(config.vision_config, quant_config) self.query_tokens = nn.Parameter( torch.zeros(1, config.num_query_tokens, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 9024831df543c..4e91bac7cae33 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -430,7 +430,8 @@ def __init__(self, + vision_feature_layer + 1 else: num_hidden_layers = vision_feature_layer + 1 - self.vision_model = self._init_vision_model(config, num_hidden_layers) + self.vision_model = self._init_vision_model(config, quant_config, + num_hidden_layers) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) @@ -449,8 +450,10 @@ def sampler(self): return Sampler() def _init_vision_model(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], num_hidden_layers: int): return InternVisionModel(config.vision_config, + quant_config=quant_config, num_hidden_layers_override=num_hidden_layers) def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a3acb93dc3c11..28aee61e4ef1f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,11 +1,12 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, + Tuple, TypedDict, Union) import torch import torch.nn as nn from PIL import Image -from transformers import CLIPVisionConfig, LlavaConfig, SiglipVisionConfig +from transformers import (CLIPVisionConfig, LlavaConfig, PretrainedConfig, + SiglipVisionConfig) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -168,7 +169,17 @@ def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs): raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaConfig): +class LlavaLikeConfig(Protocol): + vision_config: PretrainedConfig + vision_feature_layer: int + + +def init_vision_tower_for_llava( + hf_config: LlavaLikeConfig, + quant_config: Optional[QuantizationConfig], + *, + require_post_norm: Optional[bool] = None, +): vision_config = hf_config.vision_config # Initialize the vision tower only up to the required feature layer @@ -182,12 +193,16 @@ def _init_vision_tower(hf_config: LlavaConfig): if isinstance(vision_config, CLIPVisionConfig): return CLIPVisionModel( vision_config, + quant_config, num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) elif isinstance(vision_config, SiglipVisionConfig): return SiglipVisionModel( vision_config, + quant_config, num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) msg = f"Unsupported vision config: {type(vision_config)}" @@ -211,7 +226,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 766f6a4cc83fa..b0094e3a51e73 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -24,7 +24,7 @@ dummy_seq_data_for_clip, get_clip_image_feature_size, get_clip_patch_grid_length, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP -from .llava import LlavaMultiModalProjector +from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) @@ -256,32 +256,6 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs): raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaNextConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - @MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next) @@ -300,7 +274,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index e10c1f9e6e04b..e371f6929e775 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -25,6 +25,7 @@ from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP +from .llava import init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip) from .utils import (AutoWeightsLoader, init_vllm_registered_model, @@ -178,32 +179,6 @@ def input_processor_for_llava_next_video(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaNextVideoConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): @@ -280,7 +255,7 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 46e97e78d482b..e8ede7241e939 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -31,6 +31,7 @@ dummy_video_for_clip, get_clip_image_feature_size, get_clip_patch_grid_length, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP +from .llava import init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, dummy_video_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) @@ -359,32 +360,6 @@ def input_processor_for_llava_onevision(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaOnevisionConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return 
CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - class LlavaOnevisionMultiModalProjector(nn.Module): def __init__(self, config: LlavaOnevisionConfig): @@ -427,7 +402,7 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index a52e3cb6039be..3e3c3b05879fb 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,10 +4,13 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- +from typing import Optional + import torch.nn as nn from transformers import PretrainedConfig from vllm.inputs import INPUT_REGISTRY +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from .intern_vit import InternVisionModel @@ -56,9 +59,11 @@ def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: ) def _init_vision_model(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], num_hidden_layers: int): # We added additional dummy heads to the original num of heads to make # the number of heads divisible by 8. 
return InternVisionModel(config.vision_config, + quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, num_dummy_heads=7) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 99d000ea13a2c..a1c1d23183a74 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -140,7 +140,8 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config - self.vision_tower = SiglipVisionModel(config.vision_config) + self.vision_tower = SiglipVisionModel(config.vision_config, + quant_config) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, projection_dim=config.vision_config.projection_dim) From b9ae0d99954c3c0636c64bfbc319d80c352f9f8d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 04:15:07 +0000 Subject: [PATCH 03/17] Remove redundant `load_weights` --- vllm/model_executor/models/phi3v.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 00a04dac88789..a2d11b79b5d9e 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -289,10 +289,6 @@ def add_image_newline(self, image_features_hd): dim=2).reshape(num_images, -1, hid_dim) return image_features_hd_newline - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader(self) - loader.load_weights(weights) - # Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57 def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336): From 4ed2425d699bca55a9da476cf3d942e37399b71e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 08:42:59 +0000 Subject: [PATCH 04/17] Add explicit note that quantization not supported --- vllm/model_executor/models/blip.py | 4 ++++ vllm/model_executor/models/clip.py | 6 +++++- vllm/model_executor/models/intern_vit.py | 6 +++++- vllm/model_executor/models/phi3v.py | 15 ++++++++++----- vllm/model_executor/models/siglip.py | 4 ++++ 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index d90f2089db065..ac265c8c26c23 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -345,6 +345,10 @@ def __init__( num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, ) -> None: + # NOTE: Vision tower is not quantized by any of the supported methods + if quant_config is not None: + quant_config = None + super().__init__() tp_size = get_tensor_model_parallel_world_size() diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 42389e85a6f6a..0d612bd46d6c8 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -422,8 +422,12 @@ def __init__( num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, ) -> None: - super().__init__() + # NOTE: Vision tower is not quantized by any of the supported methods + if quant_config is not None: + quant_config = None + super().__init__() + tp_size = get_tensor_model_parallel_world_size() num_heads = config.num_attention_heads self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 
35be1cec3d434..3626a542ad794 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -381,7 +381,11 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, - ): + ) -> None: + # NOTE: Vision tower is not quantized by any of the supported methods + if quant_config is not None: + quant_config = None + super().__init__() self.config = config diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a2d11b79b5d9e..b059d69280ea1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -65,7 +65,8 @@ projection_dim=768) -def _init_img_processor(hf_config: PretrainedConfig): +def _init_img_processor(hf_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG layer_idx = hf_config.img_processor.get('layer_idx', -2) @@ -77,7 +78,10 @@ def _init_img_processor(hf_config: PretrainedConfig): num_hidden_layers = layer_idx + 1 img_processor = CLIPVisionModel( - clip_config, num_hidden_layers_override=num_hidden_layers) + clip_config, + quant_config, + num_hidden_layers_override=num_hidden_layers, + ) return img_processor @@ -143,14 +147,15 @@ def get_img_features(self, class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): """Phi3 Image embedding with HD transform.""" - def __init__(self, config: PretrainedConfig) -> None: + def __init__(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]) -> None: super().__init__() # n_embed or hidden_size hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - self.img_processor = _init_img_processor(config) + self.img_processor = _init_img_processor(config, quant_config) image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -517,7 +522,7 @@ def __init__(self, self.image_token_id = _IMAGE_TOKEN_ID # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_embed_tokens = Phi3HDImageEmbedding(config) + self.vision_embed_tokens = Phi3HDImageEmbedding(config, quant_config) self.language_model = LlamaForCausalLM(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 19312187ae048..b5123b73267aa 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -528,6 +528,10 @@ def __init__( num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, ) -> None: + # NOTE: Vision tower is not quantized by any of the supported methods + if quant_config is not None: + quant_config = None + super().__init__() num_heads = config.num_attention_heads From d16768564095e5dc50b120b8a0dc3c95e5120cd7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 08:44:21 +0000 Subject: [PATCH 05/17] Format --- vllm/model_executor/models/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 0d612bd46d6c8..8d0a397298d26 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -427,7 +427,7 @@ def __init__( quant_config = None super().__init__() - + tp_size = get_tensor_model_parallel_world_size() num_heads = config.num_attention_heads self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 From a839727c62f69d1815b676ac20ba31affeb033e1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 08:44:57 +0000 Subject: [PATCH 06/17] Add note --- vllm/model_executor/models/mllama.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 45d6ad3c0efa5..ee04017ce0122 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -440,8 +440,14 @@ def forward( class MllamaVisionModel(nn.Module): - def __init__(self, config: config_mllama.MllamaVisionConfig): + def __init__(self, config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig]): + # NOTE: Vision tower is not quantized by any of the supported methods + if quant_config is not None: + quant_config = None + super().__init__() + self.image_size = config.image_size self.patch_size = config.patch_size self.max_num_tiles = config.max_num_tiles @@ -908,7 +914,7 @@ def __init__(self, config.pad_token_id if config.pad_token_id is not None else -1 self.image_size = config.vision_config.image_size - self.vision_model = MllamaVisionModel(config.vision_config) + self.vision_model = MllamaVisionModel(config.vision_config, quant_config) self.language_model = MllamaForCausalLM( config.text_config, cache_config=cache_config, From 90deb9dac2365eb122f8b26b0d2215947a7db20b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 08:48:21 +0000 Subject: [PATCH 07/17] format --- vllm/model_executor/models/mllama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index ee04017ce0122..23283bdcdc0f7 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -914,7 +914,8 @@ def __init__(self, config.pad_token_id if config.pad_token_id is not None else -1 self.image_size = config.vision_config.image_size - self.vision_model = MllamaVisionModel(config.vision_config, quant_config) + self.vision_model = 
MllamaVisionModel(config.vision_config, + quant_config) self.language_model = MllamaForCausalLM( config.text_config, cache_config=cache_config, From 143ccc0008640cd60d36bb658f96e4b58bbc8b64 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 17:08:26 +0000 Subject: [PATCH 08/17] Add prefix --- vllm/model_executor/models/blip.py | 54 ++++++++---- vllm/model_executor/models/clip.py | 56 ++++++++---- vllm/model_executor/models/intern_vit.py | 35 ++++++-- vllm/model_executor/models/mllama.py | 108 +++++++++++++++++------ vllm/model_executor/models/siglip.py | 45 +++++++--- 5 files changed, 215 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index ac265c8c26c23..f015c34add349 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -169,7 +169,8 @@ def __init__( self, config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -189,11 +190,13 @@ def __init__( self.num_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv", ) self.projection = RowParallelLinear( self.embed_dim, self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.projection", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -235,9 +238,12 @@ def forward( class BlipMLP(nn.Module): - def __init__(self, - config: Union[BlipVisionConfig, Blip2VisionConfig], - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -246,11 +252,13 @@ def __init__(self, self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -262,9 +270,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class BlipEncoderLayer(nn.Module): - def __init__(self, - config: Union[BlipVisionConfig, Blip2VisionConfig], - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() # fallback to sdpa attention if tp unavailable @@ -272,14 +283,15 @@ def __init__(self, tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: self.self_attn = BlipParallelAttention(config, - quant_config=quant_config) + quant_config=quant_config, + prefix=prefix) else: # Blip doesn't have SDPA attention implemented in transformers # use eager attention instead for cpu backend self.self_attn = BlipAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = BlipMLP(config, quant_config=quant_config) + self.mlp = BlipMLP(config, quant_config=quant_config, prefix=prefix) self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -307,10 +319,13 @@ class BlipEncoder(nn.Module): config: BlipConfig """ - def __init__(self, - config: 
Union[BlipVisionConfig, Blip2VisionConfig], - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -321,8 +336,9 @@ def __init__(self, num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - BlipEncoderLayer(config=config, quant_config=quant_config) - for _ in range(num_hidden_layers) + BlipEncoderLayer(config=config, + quant_config=quant_config, + prefix=prefix) for _ in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -339,11 +355,12 @@ class BlipVisionModel(nn.Module): def __init__( self, - config: Union[BlipVisionConfig, Blip2VisionConfig], + config: BlipVisionConfig, quant_config: Optional[QuantizationConfig] = None, *, num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, + prefix: str = "", ) -> None: # NOTE: Vision tower is not quantized by any of the supported methods if quant_config is not None: @@ -362,6 +379,7 @@ def __init__( config=config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, + prefix=prefix, ) num_hidden_layers = config.num_hidden_layers diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 8d0a397298d26..ce5a57cef9b34 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -192,6 +192,7 @@ def __init__( self, config: CLIPVisionConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -211,12 +212,14 @@ def __init__( head_size=self.head_dim, total_num_heads=self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -259,20 +262,25 @@ def forward( class CLIPMLP(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -284,21 +292,25 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class CLIPEncoderLayer(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: self.self_attn = CLIPParallelAttention(config, - 
quant_config=quant_config) + quant_config=quant_config, + prefix=prefix) else: self.self_attn = CLIPSdpaAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = CLIPMLP(config, quant_config=quant_config) + self.mlp = CLIPMLP(config, quant_config=quant_config, prefix=prefix) self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -327,11 +339,15 @@ class CLIPEncoder(nn.Module): config: CLIPConfig """ - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config if num_hidden_layers_override is None: @@ -339,8 +355,9 @@ def __init__(self, else: num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - CLIPEncoderLayer(config=config, quant_config=quant_config) - for _ in range(num_hidden_layers) + CLIPEncoderLayer(config=config, + quant_config=quant_config, + prefix=prefix) for _ in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -361,6 +378,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, + prefix: str = "", ) -> None: super().__init__() @@ -375,7 +393,9 @@ def __init__( self.encoder = CLIPEncoder( config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + prefix=prefix, + ) num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: @@ -421,6 +441,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, + prefix: str = "", ) -> None: # NOTE: Vision tower is not quantized by any of the supported methods if quant_config is not None: @@ -437,6 +458,7 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, + prefix=prefix, ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 3626a542ad794..c3197047392a7 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -106,6 +106,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, *, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: super().__init__() @@ -134,6 +135,7 @@ def __init__( num_dummy_heads + self.num_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv", ) self.qk_normalization = config.qk_normalization @@ -150,6 +152,7 @@ def __init__( self.dummy_dim, self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.proj", ) def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): @@ -253,20 +256,26 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class InternMLP(nn.Module): - def __init__(self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = 
ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -284,6 +293,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, *, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: super().__init__() @@ -293,9 +303,10 @@ def __init__( self.attn = self._init_attn(config, quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=prefix) - self.mlp = InternMLP(config, quant_config=quant_config) + self.mlp = InternMLP(config, quant_config=quant_config, prefix=prefix) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) self.norm2 = NORM2FN[self.norm_type](self.embed_dim, @@ -312,6 +323,7 @@ def _init_attn( quant_config: Optional[QuantizationConfig], *, num_dummy_heads: int, + prefix: str = "", ): # fallback to sdpa attention if tp unavailable tp_size = get_tensor_model_parallel_world_size() @@ -320,7 +332,8 @@ def _init_attn( if USE_XFORMERS_OPS and (num_heads + num_dummy_heads) % tp_size == 0: return InternParallelAttention(config, quant_config=quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=prefix) return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) @@ -346,6 +359,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, + prefix: str = "", ): super().__init__() @@ -359,7 +373,8 @@ def __init__( self.layers = nn.ModuleList([ InternVisionEncoderLayer(config, quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=prefix) for _ in range(num_hidden_layers) ]) @@ -381,6 +396,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: # NOTE: Vision tower is not quantized by any of the supported methods if quant_config is not None: @@ -396,6 +412,7 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, num_dummy_heads=num_dummy_heads, + prefix=prefix, ) def get_input_embeddings(self): diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 23283bdcdc0f7..981a229350feb 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -356,9 +356,13 @@ def forward( class MllamaVisionEncoderLayer(nn.Module): - def __init__(self, - config: config_mllama.MllamaVisionConfig, - is_gated: bool = False): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + is_gated: bool = False, + ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -367,7 +371,7 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.self_attn = MllamaVisionSdpaAttention(config) - self.mlp = CLIPMLP(config) + self.mlp = CLIPMLP(config, quant_config=quant_config, prefix=prefix) self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps) @@ -404,16 +408,22 @@ def forward( class MllamaVisionEncoder(nn.Module): - def __init__(self, - config: config_mllama.MllamaVisionConfig, - num_layers=32, - is_gated=False, - 
output_hidden_states=None): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + num_layers: int = 32, + is_gated: bool = False, + output_hidden_states=None, + prefix: str = "", + ) -> None: super().__init__() self.config = config self.layers = nn.ModuleList([ - MllamaVisionEncoderLayer(config, is_gated) - for _ in range(num_layers) + MllamaVisionEncoderLayer(config, + quant_config=quant_config, + is_gated=is_gated, + prefix=prefix) for _ in range(num_layers) ]) self.output_hidden_states = output_hidden_states or [] @@ -440,8 +450,12 @@ def forward( class MllamaVisionModel(nn.Module): - def __init__(self, config: config_mllama.MllamaVisionConfig, - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: # NOTE: Vision tower is not quantized by any of the supported methods if quant_config is not None: quant_config = None @@ -483,12 +497,19 @@ def __init__(self, config: config_mllama.MllamaVisionConfig, # encoders self.transformer = MllamaVisionEncoder( config, + quant_config, config.num_hidden_layers, is_gated=False, - output_hidden_states=config.intermediate_layers_indices) - self.global_transformer = MllamaVisionEncoder(config, - config.num_global_layers, - is_gated=True) + output_hidden_states=config.intermediate_layers_indices, + prefix=prefix, + ) + self.global_transformer = MllamaVisionEncoder( + config, + quant_config, + config.num_global_layers, + is_gated=True, + prefix=prefix, + ) def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: @@ -631,6 +652,7 @@ def __init__( config: Optional[config_mllama.MllamaTextConfig] = None, layer_idx: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -656,6 +678,7 @@ def __init__( self.num_key_value_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.num_heads * self.head_dim, @@ -663,6 +686,7 @@ def __init__( bias=False, input_is_parallel=True, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) # vllm.model_executor.layers.layernorm.RMSNorm has precision issue, # use huggingface's instead @@ -675,6 +699,7 @@ def __init__( self.head_dim, self.scaling, self.num_local_key_value_heads, + prefix=f"{prefix}.attn", ) def forward( @@ -717,15 +742,21 @@ class MllamaCrossAttentionDecoderLayer(torch.nn.Module): """Cross-attention transformer block with tanh-gated attention and feedforward.""" - def __init__(self, config: config_mllama.MllamaTextConfig, layer_idx: int, - quant_config: Optional[QuantizationConfig]) \ - -> None: + def __init__( + self, + config: config_mllama.MllamaTextConfig, + layer_idx: int, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.layer_idx = layer_idx self.cross_attn = MllamaTextCrossAttention( config=config, layer_idx=layer_idx, quant_config=quant_config, + prefix=prefix, ) self.input_layernorm = RMSNorm(config.hidden_size, @@ -737,6 +768,7 @@ def __init__(self, config: config_mllama.MllamaTextConfig, layer_idx: int, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=prefix, ) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -778,10 +810,15 @@ class MllamaTextModel(nn.Module): config_class = 
config_mllama.MllamaTextConfig base_model_prefix = "model" - def __init__(self, config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaTextConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, @@ -793,13 +830,18 @@ def __init__(self, config: config_mllama.MllamaTextConfig, if layer_idx in self.cross_attention_layers: layers.append( MllamaCrossAttentionDecoderLayer( - config, layer_idx, quant_config=quant_config)) + config, + layer_idx, + quant_config=quant_config, + prefix=prefix, + )) else: # TODO: force LlamaDecoderLayer to config.attention_bias=False layers.append( LlamaDecoderLayer(config, cache_config=cache_config, - quant_config=quant_config)) + quant_config=quant_config, + prefix=prefix)) self.layers = nn.ModuleList(layers) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -854,12 +896,19 @@ class MllamaForCausalLM(nn.Module): "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" ] - def __init__(self, config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaTextConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() self.vocab_size = config.vocab_size - self.model = MllamaTextModel(config, cache_config, quant_config) + self.model = MllamaTextModel(config, + cache_config, + quant_config, + prefix=f"{prefix}.model") self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, @@ -920,6 +969,7 @@ def __init__(self, config.text_config, cache_config=cache_config, quant_config=quant_config, + prefix="language_model", ) self.multi_modal_projector = nn.Linear( config.vision_config.vision_output_dim, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index b5123b73267aa..bc0241c3ee1c0 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -248,8 +248,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -266,12 +268,14 @@ def __init__( head_size=self.head_dim, total_num_heads=self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -314,8 +318,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.activation_fn = get_act_fn(config.hidden_act) @@ -326,11 +332,13 @@ def __init__( config.hidden_size, config.intermediate_size, quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, quant_config=quant_config if quantizable else None, + 
prefix=f"{prefix}.fc2", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -346,15 +354,18 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.embed_dim = config.hidden_size num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: self.self_attn = SiglipParallelAttention(config, - quant_config=quant_config) + quant_config=quant_config, + prefix=prefix) else: self.self_attn = SiglipSdpaAttention(config) @@ -363,6 +374,7 @@ def __init__( self.mlp = SiglipMLP( config, quant_config=quant_config, + prefix=prefix, ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -392,8 +404,10 @@ def __init__( config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, num_hidden_layers_override: Optional[int] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config if num_hidden_layers_override is None: @@ -402,8 +416,9 @@ def __init__( num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - SiglipEncoderLayer(config, quant_config=quant_config) - for _ in range(num_hidden_layers) + SiglipEncoderLayer(config, + quant_config=quant_config, + prefix=prefix) for _ in range(num_hidden_layers) ]) def forward( @@ -424,7 +439,8 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) @@ -433,7 +449,9 @@ def __init__( config.hidden_size, config.num_attention_heads, batch_first=True) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config=config, quant_config=quant_config) + self.mlp = SiglipMLP(config=config, + quant_config=quant_config, + prefix=prefix) def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: batch_size = hidden_state.shape[0] @@ -457,6 +475,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, + prefix: str = "", ) -> None: super().__init__() @@ -468,6 +487,7 @@ def __init__( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, + prefix=prefix, ) num_hidden_layers = config.num_hidden_layers @@ -491,7 +511,10 @@ def __init__( config.vision_use_head) if self.use_head: self.head = SiglipMultiheadAttentionPoolingHead( - config=config, quant_config=quant_config) + config=config, + quant_config=quant_config, + prefix=prefix, + ) def forward( self, @@ -527,6 +550,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, require_post_norm: Optional[bool] = None, + prefix: str = "", ) -> None: # NOTE: Vision tower is not quantized by any of the supported methods if quant_config is not None: @@ -543,6 +567,7 @@ def __init__( quant_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, + prefix=prefix, ) def get_input_embeddings(self) -> nn.Module: From a3ca5fda111b88e5bf281bae18e8045134f479b2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 11 Oct 2024 16:33:33 +0000 Subject: [PATCH 09/17] Fix prefixes --- vllm/model_executor/models/blip.py | 11 +++++++---- vllm/model_executor/models/clip.py | 13 ++++++++----- vllm/model_executor/models/intern_vit.py | 12 +++++++----- vllm/model_executor/models/mllama.py | 20 
++++++++++++-------- vllm/model_executor/models/siglip.py | 15 ++++++++------- 5 files changed, 42 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f015c34add349..57da351f28c3a 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -284,14 +284,16 @@ def __init__( if USE_XFORMERS_OPS and num_heads % tp_size == 0: self.self_attn = BlipParallelAttention(config, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.self_attn") else: # Blip doesn't have SDPA attention implemented in transformers # use eager attention instead for cpu backend self.self_attn = BlipAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = BlipMLP(config, quant_config=quant_config, prefix=prefix) + self.mlp = BlipMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -338,7 +340,8 @@ def __init__( self.layers = nn.ModuleList([ BlipEncoderLayer(config=config, quant_config=quant_config, - prefix=prefix) for _ in range(num_hidden_layers) + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -379,7 +382,7 @@ def __init__( config=config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, - prefix=prefix, + prefix=f"{prefix}.encoder", ) num_hidden_layers = config.num_hidden_layers diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index ce5a57cef9b34..0be141e990053 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -305,12 +305,14 @@ def __init__( if USE_XFORMERS_OPS and num_heads % tp_size == 0: self.self_attn = CLIPParallelAttention(config, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.self_attn") else: self.self_attn = CLIPSdpaAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = CLIPMLP(config, quant_config=quant_config, prefix=prefix) + self.mlp = CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -357,7 +359,8 @@ def __init__( self.layers = nn.ModuleList([ CLIPEncoderLayer(config=config, quant_config=quant_config, - prefix=prefix) for _ in range(num_hidden_layers) + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -394,7 +397,7 @@ def __init__( config=config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, - prefix=prefix, + prefix=f"{prefix}.encoder", ) num_hidden_layers = config.num_hidden_layers @@ -458,7 +461,7 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, - prefix=prefix, + prefix=f"{prefix}.vision_model", ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index c3197047392a7..55b350b9064ca 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -304,9 +304,11 @@ def __init__( self.attn = self._init_attn(config, quant_config, num_dummy_heads=num_dummy_heads, - prefix=prefix) + prefix=f"{prefix}.attn") - self.mlp = 
InternMLP(config, quant_config=quant_config, prefix=prefix) + self.mlp = InternMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) self.norm2 = NORM2FN[self.norm_type](self.embed_dim, @@ -374,8 +376,8 @@ def __init__( InternVisionEncoderLayer(config, quant_config, num_dummy_heads=num_dummy_heads, - prefix=prefix) - for _ in range(num_hidden_layers) + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -412,7 +414,7 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, num_dummy_heads=num_dummy_heads, - prefix=prefix, + prefix=f"{prefix}.encoder", ) def get_input_embeddings(self): diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 981a229350feb..e0b37bc6261c4 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -371,7 +371,9 @@ def __init__( self.intermediate_size = config.intermediate_size self.self_attn = MllamaVisionSdpaAttention(config) - self.mlp = CLIPMLP(config, quant_config=quant_config, prefix=prefix) + self.mlp = CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps) @@ -423,7 +425,8 @@ def __init__( MllamaVisionEncoderLayer(config, quant_config=quant_config, is_gated=is_gated, - prefix=prefix) for _ in range(num_layers) + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_layers) ]) self.output_hidden_states = output_hidden_states or [] @@ -501,14 +504,14 @@ def __init__( config.num_hidden_layers, is_gated=False, output_hidden_states=config.intermediate_layers_indices, - prefix=prefix, + prefix=f"{prefix}.transformer", ) self.global_transformer = MllamaVisionEncoder( config, quant_config, config.num_global_layers, is_gated=True, - prefix=prefix, + prefix=f"{prefix}.global_transformer", ) def apply_class_embedding(self, @@ -756,7 +759,7 @@ def __init__( config=config, layer_idx=layer_idx, quant_config=quant_config, - prefix=prefix, + prefix=f"{prefix}.cross_attn", ) self.input_layernorm = RMSNorm(config.hidden_size, @@ -768,7 +771,7 @@ def __init__( intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - prefix=prefix, + prefix=f"{prefix}.mlp", ) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -833,7 +836,7 @@ def __init__( config, layer_idx, quant_config=quant_config, - prefix=prefix, + prefix=f"{prefix}.layers.{layer_idx}", )) else: # TODO: force LlamaDecoderLayer to config.attention_bias=False @@ -841,7 +844,8 @@ def __init__( LlamaDecoderLayer(config, cache_config=cache_config, quant_config=quant_config, - prefix=prefix)) + prefix=f"{prefix}.layers.{layer_idx}", + )) self.layers = nn.ModuleList(layers) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index bc0241c3ee1c0..c68c4fe975555 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -365,7 +365,7 @@ def __init__( if USE_XFORMERS_OPS and num_heads % tp_size == 0: self.self_attn = SiglipParallelAttention(config, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.self_attn") else: self.self_attn = SiglipSdpaAttention(config) @@ -374,7 +374,7 @@ def __init__( self.mlp = 
SiglipMLP( config, quant_config=quant_config, - prefix=prefix, + prefix=f"{prefix}.mlp", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -418,7 +418,8 @@ def __init__( self.layers = nn.ModuleList([ SiglipEncoderLayer(config, quant_config=quant_config, - prefix=prefix) for _ in range(num_hidden_layers) + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward( @@ -451,7 +452,7 @@ def __init__( eps=config.layer_norm_eps) self.mlp = SiglipMLP(config=config, quant_config=quant_config, - prefix=prefix) + prefix=f"{prefix}.mlp") def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: batch_size = hidden_state.shape[0] @@ -487,7 +488,7 @@ def __init__( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, - prefix=prefix, + prefix=f"{prefix}.encoder", ) num_hidden_layers = config.num_hidden_layers @@ -513,7 +514,7 @@ def __init__( self.head = SiglipMultiheadAttentionPoolingHead( config=config, quant_config=quant_config, - prefix=prefix, + prefix=f"{prefix}.head", ) def forward( @@ -567,7 +568,7 @@ def __init__( quant_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, - prefix=prefix, + prefix=f"{prefix}.vision_model", ) def get_input_embeddings(self) -> nn.Module: From 527bf39b60731556fd152cc8f72f2303d310ee47 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 11 Oct 2024 16:39:18 +0000 Subject: [PATCH 10/17] Update idefics --- .../models/idefics2_vision_model.py | 51 +++++++++++++++---- vllm/model_executor/models/minicpmv.py | 33 +++++++++--- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 3b0b6febaa48c..43f4f29814e6d 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -113,7 +113,8 @@ def __init__( self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -130,12 +131,14 @@ def __init__( self.head_dim, self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( self.embed_dim, self.embed_dim, bias=True, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) @@ -178,7 +181,8 @@ def __init__( self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) @@ -187,12 +191,14 @@ def __init__( config.intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -204,13 +210,22 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics2EncoderLayer(nn.Module): - def __init__(self, config: Idefics2Config): + def __init__( + self, + config: Idefics2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.embed_dim = config.hidden_size - self.self_attn = 
Idefics2VisionAttention(config) + self.self_attn = Idefics2VisionAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = Idefics2VisionMLP(config) + self.mlp = Idefics2VisionMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -245,12 +260,20 @@ class Idefics2Encoder(nn.Module): config: Idefics2Config """ - def __init__(self, config: Idefics2Config): + def __init__( + self, + config: Idefics2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.layers = nn.ModuleList([ - Idefics2EncoderLayer(config) - for _ in range(config.num_hidden_layers) + Idefics2EncoderLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) ]) def forward( @@ -275,12 +298,20 @@ def forward( class Idefics2VisionTransformer(nn.Module): - def __init__(self, config: Idefics2VisionConfig): + def __init__( + self, + config: Idefics2VisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + embed_dim = config.hidden_size self.config = config self.embeddings = Idefics2VisionEmbeddings(config) - self.encoder = Idefics2Encoder(config) + self.encoder = Idefics2Encoder(config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 6d0fa34f299ad..27cb4c05ed57f 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -360,7 +360,7 @@ def __init__( self.version = get_version_by_config(self.config) self.llm = self.init_llm(config, cache_config, quant_config) - self.vpm = self.init_vision_module() + self.vpm = self.init_vision_module(config, quant_config) param_dtype = torch.get_default_dtype() self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else @@ -596,7 +596,11 @@ def init_llm( ) -> nn.Module: raise NotImplementedError - def init_vision_module(self) -> nn.Module: + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: raise NotImplementedError def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: @@ -642,7 +646,11 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: # TODO :refactor this vision model try: import timm @@ -766,8 +774,13 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: - model = Idefics2VisionTransformer(self.config.vision_config) + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -878,9 +891,13 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: - - model = 
Idefics2VisionTransformer(self.config.vision_config) + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model From a35f49ccb930948e83446d60a727e8203488218d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 11 Oct 2024 16:40:09 +0000 Subject: [PATCH 11/17] format --- vllm/model_executor/models/blip.py | 8 +++++--- vllm/model_executor/models/clip.py | 8 +++++--- vllm/model_executor/models/mllama.py | 9 +++++---- vllm/model_executor/models/siglip.py | 8 +++++--- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 57da351f28c3a..74cfecfa4e059 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -282,9 +282,11 @@ def __init__( num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = BlipParallelAttention(config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn") + self.self_attn = BlipParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: # Blip doesn't have SDPA attention implemented in transformers # use eager attention instead for cpu backend diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 0be141e990053..371dfabf6d93c 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -303,9 +303,11 @@ def __init__( num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = CLIPParallelAttention(config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn") + self.self_attn = CLIPParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: self.self_attn = CLIPSdpaAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e0b37bc6261c4..abd320ae46089 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -841,10 +841,11 @@ def __init__( else: # TODO: force LlamaDecoderLayer to config.attention_bias=False layers.append( - LlamaDecoderLayer(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}", + LlamaDecoderLayer( + config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", )) self.layers = nn.ModuleList(layers) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index c68c4fe975555..fbc8664397321 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -363,9 +363,11 @@ def __init__( num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = SiglipParallelAttention(config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn") + self.self_attn = SiglipParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: self.self_attn = SiglipSdpaAttention(config) From 
f69c008843e8cada3a13f488e0289efc30b01c74 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 15:28:49 +0000 Subject: [PATCH 12/17] Add `num_hidden_layers_override` --- vllm/model_executor/models/llava.py | 3 ++- vllm/model_executor/models/pixtral.py | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index def659810c183..83e869efa4712 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -236,10 +236,11 @@ def init_vision_tower_for_llava( require_post_norm=require_post_norm, ) elif isinstance(vision_config, PixtralVisionConfig): - # TODO: allow layer override? return PixtralHFVisionModel( vision_config, quant_config, + num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) msg = f"Unsupported vision config: {type(vision_config)}" diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d1fbe8bdfa31d..18dbee94e10b0 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -896,15 +896,21 @@ def __init__( config: PixtralVisionConfig, quant_config: Optional[QuantizationConfig] = None, *, + num_hidden_layers_override: Optional[int] = None, prefix: str = "", ) -> None: super().__init__() + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ PixtralHFTransformerBlock(config=config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.num_hidden_layers) + for layer_idx in range(num_hidden_layers) ]) def forward( @@ -925,6 +931,8 @@ def __init__( config: PixtralVisionConfig, quant_config: Optional[QuantizationConfig] = None, *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, prefix: str = "", ) -> None: super().__init__() @@ -941,8 +949,21 @@ def __init__( self.transformer = PixtralHFTransformer( config, quant_config, + num_hidden_layers_override=num_hidden_layers_override, prefix=f"{prefix}.transformer", ) + + num_hidden_layers = config.num_hidden_layers + if len(self.transformer.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {num_hidden_layers} " + f"layers, but you requested {len(self.transformer.layers)} " + "layers.") + + if require_post_norm is True: + msg = "PixtralHFVisionModel does not have post-layernorm" + raise ValueError(msg) + self.dtype = next(self.parameters()).dtype self.device = next(self.parameters()).device self.patch_positional_embedding = PixtralRotaryEmbedding( From 6e8670f33c2f9671cb944a7666c8caf591e1db82 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 16:57:02 +0000 Subject: [PATCH 13/17] Try use `quant_config` --- vllm/model_executor/models/blip.py | 4 ---- vllm/model_executor/models/clip.py | 4 ---- vllm/model_executor/models/intern_vit.py | 4 ---- vllm/model_executor/models/mllama.py | 4 ---- vllm/model_executor/models/siglip.py | 4 ---- 5 files changed, 20 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index c88914cea7303..1f2d7384076ed 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -367,10 +367,6 @@ def __init__( require_post_norm: Optional[bool] = None, prefix: str = "", ) -> None: - # NOTE: Vision tower is not quantized 
by any of the supported methods - if quant_config is not None: - quant_config = None - super().__init__() tp_size = get_tensor_model_parallel_world_size() diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 258a5789592da..6b45cb384d4a0 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -448,10 +448,6 @@ def __init__( require_post_norm: Optional[bool] = None, prefix: str = "", ) -> None: - # NOTE: Vision tower is not quantized by any of the supported methods - if quant_config is not None: - quant_config = None - super().__init__() tp_size = get_tensor_model_parallel_world_size() diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 39e54e718c5d0..9761635d2a6c2 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -431,10 +431,6 @@ def __init__( num_dummy_heads: int = 0, prefix: str = "", ) -> None: - # NOTE: Vision tower is not quantized by any of the supported methods - if quant_config is not None: - quant_config = None - super().__init__() self.config = config diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e9ed7d2d01a53..23e2b520e5b40 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -482,10 +482,6 @@ def __init__( quant_config: Optional[QuantizationConfig], prefix: str = "", ) -> None: - # NOTE: Vision tower is not quantized by any of the supported methods - if quant_config is not None: - quant_config = None - super().__init__() self.image_size = config.image_size diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 29da1b7323d99..91277b0ccd145 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -555,10 +555,6 @@ def __init__( require_post_norm: Optional[bool] = None, prefix: str = "", ) -> None: - # NOTE: Vision tower is not quantized by any of the supported methods - if quant_config is not None: - quant_config = None - super().__init__() num_heads = config.num_attention_heads From e676b721a8b2bd64b7081a4571cc6edc83f13ed4 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 23 Oct 2024 15:32:20 +0800 Subject: [PATCH 14/17] patch internvl awq config --- vllm/model_executor/layers/quantization/awq.py | 18 +++++++++++++++--- vllm/model_executor/models/internvl.py | 17 ++++++++++++++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 410b3cb5321cb..a285f9f3dab4a 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import (GroupQuantScaleParameter, @@ -21,10 +22,12 @@ def __init__( weight_bits: int, group_size: int, zero_point: bool, + modules_to_not_convert: Optional[List[str]] = None, ) -> None: self.weight_bits = weight_bits self.group_size = group_size self.zero_point = zero_point + self.modules_to_not_convert = modules_to_not_convert or [] if self.weight_bits != 4: raise ValueError( 
@@ -35,7 +38,8 @@ def __init__( def __repr__(self) -> str: return (f"AWQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " - f"zero_point={self.zero_point})") + f"zero_point={self.zero_point}), " + f"modules_to_not_convert={self.modules_to_not_convert}") def get_name(self) -> str: return "awq" @@ -61,11 +65,15 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) - return cls(weight_bits, group_size, zero_point) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) + return cls(weight_bits, group_size, zero_point, modules_to_not_convert) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["AWQLinearMethod"]: if isinstance(layer, LinearBase): + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() return AWQLinearMethod(self) return None @@ -73,6 +81,10 @@ def get_scaled_act_names(self) -> List[str]: return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) + + class AWQLinearMethod(LinearMethodBase): """Linear method for AWQ. diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index f9f26668a79f8..218660ccc5631 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -19,7 +19,8 @@ from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, token_inputs) -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization import (AWQConfig, + QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) @@ -418,6 +419,7 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config + self._patch_quant_config(config, quant_config) image_size = config.force_image_size or config.vision_config.image_size patch_size = config.vision_config.patch_size @@ -444,6 +446,18 @@ def __init__(self, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + def _patch_quant_config(self, config: PretrainedConfig, + quant_config: QuantizationConfig): + # the awq models from OpenGVLab missing `modules_to_not_convert` + # patch the quant_config to add `modules_to_not_convert` back + if isinstance(quant_config, AWQConfig): + text_config = config.text_config + llm_quant_config = getattr(text_config, "quantization_config", + None) + if (not quant_config.modules_to_not_convert) and \ + (llm_quant_config is not None): + quant_config.modules_to_not_convert.append("vision_model") + @cached_property def sampler(self): if hasattr(self.language_model, "sampler"): @@ -470,6 +484,7 @@ def _init_vision_model( config.vision_config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, + prefix="vision_model", ) else: return InternVisionPatchModel(config.vision_config) From 989c4debecf6e30be59f0081da157789aacc6ca6 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 23 Oct 2024 15:38:38 +0800 Subject: [PATCH 15/17] make mypy happy --- 
vllm/model_executor/layers/quantization/awq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index a285f9f3dab4a..66a2e7c38a192 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -70,7 +70,7 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": return cls(weight_bits, group_size, zero_point, modules_to_not_convert) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AWQLinearMethod"]: + prefix: str) -> Optional["LinearMethodBase"]: if isinstance(layer, LinearBase): if is_layer_skipped_awq(prefix, self.modules_to_not_convert): return UnquantizedLinearMethod() From 0224b755b90420a939d11d6aac3f110120f1f69c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 07:52:49 +0000 Subject: [PATCH 16/17] Fix typo --- vllm/model_executor/layers/quantization/awq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 66a2e7c38a192..38dd1f2e10fcd 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -38,8 +38,8 @@ def __init__( def __repr__(self) -> str: return (f"AWQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " - f"zero_point={self.zero_point}), " - f"modules_to_not_convert={self.modules_to_not_convert}") + f"zero_point={self.zero_point}, " + f"modules_to_not_convert={self.modules_to_not_convert})") def get_name(self) -> str: return "awq" From ef4c253a6020992fed2c3505a41fbf30b19fbdf8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 07:56:41 +0000 Subject: [PATCH 17/17] Move prefix to be closer to variable declaration --- vllm/model_executor/models/internvl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 218660ccc5631..3ae37d9fe5d85 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -435,6 +435,7 @@ def __init__(self, config, quant_config=quant_config, is_mono=self.is_mono, + prefix="vision_model", ) self.language_model = init_vllm_registered_model( @@ -471,6 +472,7 @@ def _init_vision_model( quant_config: Optional[QuantizationConfig], *, is_mono: bool, + prefix: str, ): if not is_mono: vision_feature_layer = config.select_layer @@ -484,7 +486,7 @@ def _init_vision_model( config.vision_config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, - prefix="vision_model", + prefix=prefix, ) else: return InternVisionPatchModel(config.vision_config)
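
A minimal usage sketch of the keyword arguments introduced in this series (`num_hidden_layers_override`, `require_post_norm`, and `prefix`), assuming it runs inside vLLM's normal model-initialization context (tensor-parallel state already set up). The `MyVLM` wrapper, the "vision_tower" prefix, and the choice to drop exactly one layer are illustrative assumptions; the class names, argument names, and import paths come from the patches above.

    from typing import Optional

    import torch.nn as nn
    from transformers import CLIPVisionConfig

    from vllm.model_executor.layers.quantization import QuantizationConfig
    from vllm.model_executor.models.clip import CLIPVisionModel


    class MyVLM(nn.Module):
        """Hypothetical multimodal model; not part of the patches above."""

        def __init__(
            self,
            vision_config: CLIPVisionConfig,
            quant_config: Optional[QuantizationConfig] = None,
        ) -> None:
            super().__init__()

            self.vision_tower = CLIPVisionModel(
                vision_config,
                quant_config=quant_config,
                # Build one encoder layer fewer than the checkpoint defines,
                # similar in effect to MiniCPM-V's `drop_vision_last_layer`.
                num_hidden_layers_override=vision_config.num_hidden_layers - 1,
                # post_layernorm is unused when extracting intermediate
                # features, so skip it to conserve memory.
                require_post_norm=False,
                # Weight names become "vision_tower.vision_model.encoder...",
                # so a quantization config can match them by prefix.
                prefix="vision_tower",
            )

With an AWQ checkpoint whose `modules_to_not_convert` lists the vision tower (the InternVL patch appends "vision_model" for exactly this reason), `is_layer_skipped_awq` matches these prefixed names and `get_quant_method` falls back to `UnquantizedLinearMethod`, so the vision layers stay unquantized while the language model keeps its AWQ weights.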