From 81e77ccab536a40511da283dc873ae5da11c186b Mon Sep 17 00:00:00 2001
From: litianjian
Date: Thu, 24 Oct 2024 09:32:35 +0000
Subject: [PATCH 1/2] disable post_norm for llava-ov

---
 vllm/model_executor/models/llava_onevision.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 10aa8049a2347..47e62409072e5 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -400,7 +400,8 @@ def __init__(self,
         self.multimodal_config = multimodal_config
 
         # Initialize the vision tower only up to the required feature layer
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
         self.language_model = init_vllm_registered_model(
             config.text_config, cache_config, quant_config)

From 9a6a3600cc8b2e4ae3aa84bd43c4189c0592ef53 Mon Sep 17 00:00:00 2001
From: litianjian
Date: Thu, 24 Oct 2024 10:06:13 +0000
Subject: [PATCH 2/2] disable post_norm for llava models

---
 vllm/model_executor/models/llava.py            | 3 ++-
 vllm/model_executor/models/llava_next.py       | 3 ++-
 vllm/model_executor/models/llava_next_video.py | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 83e869efa4712..b005d83c17f90 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -273,7 +273,8 @@ def __init__(self,
         config.projector_hidden_act = "gelu"
 
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index d33d4ac5bfaed..9466e72ecc639 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -277,7 +277,8 @@ def __init__(self,
         self.multimodal_config = multimodal_config
 
         # TODO: Optionally initializes this for supporting embeddings.
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
         self.multi_modal_projector = LlavaMultiModalProjector(
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index d02cf9044dfc0..43eec43d56643 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -256,7 +256,8 @@ def __init__(self,
         self.multimodal_config = multimodal_config
 
         # Initialize the vision tower only up to the required feature layer
-        self.vision_tower = init_vision_tower_for_llava(config, quant_config)
+        self.vision_tower = init_vision_tower_for_llava(
+            config, quant_config, require_post_norm=False)
        self.vision_resampler = LlavaNextVideoPooler(config)
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
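
Context for the require_post_norm=False flag: LLaVA-family models read vision
features from an intermediate layer of the vision tower (see the "only up to
the required feature layer" comments above), so the tower's final
post-layernorm output is never consumed. Passing require_post_norm=False
presumably lets init_vision_tower_for_llava skip constructing that module, so
its weights no longer need to be present or loaded. Below is a minimal,
self-contained sketch of the idea only; SimpleVisionTower and its fields are
hypothetical stand-ins, not vLLM code.

import torch
import torch.nn as nn


class SimpleVisionTower(nn.Module):
    """Toy tower illustrating an optional final post-layernorm."""

    def __init__(self, hidden_size: int, num_layers: int,
                 require_post_norm: bool = True) -> None:
        super().__init__()
        self.layers = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size) for _ in range(num_layers))
        # Only materialize post_layernorm when its output is actually needed,
        # mirroring the intent of require_post_norm=False in the patches.
        self.post_layernorm = (nn.LayerNorm(hidden_size)
                               if require_post_norm else None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.layers:
            x = layer(x)
        # When features are taken from an intermediate layer, the final norm
        # was never created, so there is nothing to apply (or to load).
        if self.post_layernorm is not None:
            x = self.post_layernorm(x)
        return x


tower = SimpleVisionTower(hidden_size=16, num_layers=2,
                          require_post_norm=False)
print(tower(torch.randn(1, 16)).shape)  # torch.Size([1, 16])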