diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 222de4352c436..e769f69a6fd30 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -75,10 +75,10 @@ def _get_num_frame_tokens(
 
     def _get_max_frame_tokens(self) -> int:
         hf_config = self._get_hf_config()
-        vision_encoder_info = self._vision_encoder_info
+        spatial_pool_stride = hf_config.spatial_pool_stride
 
-        patch_grid_length = vision_encoder_info.get_patch_grid_length()
-        pooled_grid_length = patch_grid_length / hf_config.spatial_pool_stride
+        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
+        pooled_grid_length = patch_grid_length / spatial_pool_stride
 
         return int(pooled_grid_length * pooled_grid_length)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index fb11ea87165e2..2aa2207300631 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -156,11 +156,9 @@ def _get_num_frame_tokens(
 
     def _get_max_frame_tokens(self) -> int:
         hf_config = self._get_hf_config()
-        vision_encoder_info = self._vision_encoder_info
-
-        patch_grid_length = vision_encoder_info.get_patch_grid_length()
-        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = patch_grid_length / spatial_pool_stride
 
         return math.ceil(pooled_grid_length) * math.ceil(pooled_grid_length)
 
@@ -185,9 +183,16 @@ def _get_max_video_frames(
         num_images: int = 0,
         num_videos: int = 1,
     ) -> int:
+        hf_config = self._get_hf_config()
+        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+
+        # How many tokens is one image worth relative to one video frame
+        i2f = spatial_pool_stride * spatial_pool_stride
+
         max_total_tokens = self.ctx.model_config.max_model_len
         max_total_frames = int(max_total_tokens / self._get_max_frame_tokens())
-        return (max_total_frames - num_images) // max(num_videos, 1)
+
+        return (max_total_frames - num_images * i2f) // max(num_videos, 1)
 
     def _get_max_video_tokens(self) -> int:
         return self._get_max_frame_tokens() * self._get_max_video_frames()
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 0b2fd4d8b159e..a3b47d06e4467 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -772,9 +772,15 @@ def _get_max_video_frames(
         num_images: int = 0,
         num_videos: int = 1,
     ) -> int:
+        hf_config = self.ctx.get_hf_config(Qwen2VLConfig)
+        temporal_patch_size = hf_config.vision_config.temporal_patch_size
+
+        # How many tokens is one image worth relative to one video frame
+        i2f = temporal_patch_size * temporal_patch_size
+
         max_total_tokens = self.ctx.model_config.max_model_len
         max_total_frames = int(max_total_tokens / self._get_max_image_tokens())
-        return (max_total_frames - num_images) // max(num_videos, 1)
+        return (max_total_frames - num_images * i2f) // max(num_videos, 1)
 
     def _get_max_video_tokens(self) -> int:
         return self._get_max_image_tokens() * self._get_max_video_frames()
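
For context on the first two hunks: _get_max_frame_tokens turns the vision tower's patch grid into a per-frame token count by dividing the grid length by the pooling stride and squaring. The sketch below is illustrative only; the grid size (24, as for a 336px tower with 14px patches) and the stride of 2 (the getattr default above) are assumed example values, not read from any config:

    import math

    # Assumed example values: a 24x24 patch grid (e.g. 336px input with
    # 14px patches) and the spatial_pool_stride default of 2 used above.
    patch_grid_length = 24
    spatial_pool_stride = 2

    pooled_grid_length = patch_grid_length / spatial_pool_stride  # 12.0

    # llava_next_video truncates the squared length; llava_onevision
    # rounds each axis up before multiplying.
    tokens_next_video = int(pooled_grid_length * pooled_grid_length)  # 144
    tokens_onevision = (math.ceil(pooled_grid_length)
                        * math.ceil(pooled_grid_length))  # 144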
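
The _get_max_video_frames hunks change the budget arithmetic: previously each image was charged as one pooled frame, while per the new i2f comment it costs roughly stride**2 pooled frames' worth of tokens. Below is a standalone sketch of that logic with assumed numbers (a 32768-token context, 144 tokens per pooled frame, stride 2); it mirrors the arithmetic rather than calling the vLLM API:

    # Assumed numbers for illustration only.
    max_model_len = 32768    # total token budget
    max_frame_tokens = 144   # tokens per pooled video frame
    spatial_pool_stride = 2
    num_images, num_videos = 2, 1

    # An unpooled image costs about stride**2 pooled frames' worth of tokens.
    i2f = spatial_pool_stride * spatial_pool_stride  # 4

    max_total_frames = max_model_len // max_frame_tokens  # 227

    before = (max_total_frames - num_images) // max(num_videos, 1)       # 225
    after = (max_total_frames - num_images * i2f) // max(num_videos, 1)  # 219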