diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 222de4352c436..e769f69a6fd30 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -75,10 +75,10 @@ def _get_num_frame_tokens(
 
     def _get_max_frame_tokens(self) -> int:
         hf_config = self._get_hf_config()
-        vision_encoder_info = self._vision_encoder_info
+        spatial_pool_stride = hf_config.spatial_pool_stride
 
-        patch_grid_length = vision_encoder_info.get_patch_grid_length()
-        pooled_grid_length = patch_grid_length / hf_config.spatial_pool_stride
+        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
+        pooled_grid_length = patch_grid_length / spatial_pool_stride
 
         return int(pooled_grid_length * pooled_grid_length)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index fb11ea87165e2..2aa2207300631 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -156,11 +156,9 @@ def _get_num_frame_tokens(
 
     def _get_max_frame_tokens(self) -> int:
         hf_config = self._get_hf_config()
-        vision_encoder_info = self._vision_encoder_info
-
-        patch_grid_length = vision_encoder_info.get_patch_grid_length()
-        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+        patch_grid_length = self._vision_encoder_info.get_patch_grid_length()
         pooled_grid_length = patch_grid_length / spatial_pool_stride
 
         return math.ceil(pooled_grid_length) * math.ceil(pooled_grid_length)
 
@@ -185,9 +183,16 @@ def _get_max_video_frames(
         num_images: int = 0,
         num_videos: int = 1,
     ) -> int:
+        hf_config = self._get_hf_config()
+        spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2)
+
+        # How many tokens is one image worth relative to one video frame
+        i2f = spatial_pool_stride * spatial_pool_stride
+
         max_total_tokens = self.ctx.model_config.max_model_len
         max_total_frames = int(max_total_tokens / self._get_max_frame_tokens())
-        return (max_total_frames - num_images) // max(num_videos, 1)
+
+        return (max_total_frames - num_images * i2f) // max(num_videos, 1)
 
     def _get_max_video_tokens(self) -> int:
         return self._get_max_frame_tokens() * self._get_max_video_frames()
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 0b2fd4d8b159e..a3b47d06e4467 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -772,9 +772,15 @@ def _get_max_video_frames(
         num_images: int = 0,
         num_videos: int = 1,
     ) -> int:
+        hf_config = self.ctx.get_hf_config(Qwen2VLConfig)
+        temporal_patch_size = hf_config.vision_config.temporal_patch_size
+
+        # How many tokens is one image worth relative to one video frame
+        i2f = temporal_patch_size * temporal_patch_size
+
         max_total_tokens = self.ctx.model_config.max_model_len
         max_total_frames = int(max_total_tokens / self._get_max_image_tokens())
-        return (max_total_frames - num_images) // max(num_videos, 1)
+        return (max_total_frames - num_images * i2f) // max(num_videos, 1)
 
     def _get_max_video_tokens(self) -> int:
         return self._get_max_image_tokens() * self._get_max_video_frames()
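
For context on the first two hunks: _get_max_frame_tokens turns the vision tower's patch grid into a per-frame token count by dividing the grid length by the pooling stride and squaring. The sketch below is illustrative only; the grid size (24, as for a 336px tower with 14px patches) and the stride of 2 (the getattr default above) are assumed example values, not read from any config:

    import math

    # Assumed example values: a 24x24 patch grid (e.g. 336px input with
    # 14px patches) and the spatial_pool_stride default of 2 used above.
    patch_grid_length = 24
    spatial_pool_stride = 2

    pooled_grid_length = patch_grid_length / spatial_pool_stride  # 12.0

    # llava_next_video truncates the squared length; llava_onevision
    # rounds each axis up before multiplying.
    tokens_next_video = int(pooled_grid_length * pooled_grid_length)  # 144
    tokens_onevision = (math.ceil(pooled_grid_length)
                        * math.ceil(pooled_grid_length))  # 144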
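
The _get_max_video_frames hunks change the budget arithmetic: previously each image was charged as one pooled frame, while per the new i2f comment it costs roughly stride**2 pooled frames' worth of tokens. Below is a standalone sketch of that logic with assumed numbers (a 32768-token context, 144 tokens per pooled frame, stride 2); it mirrors the arithmetic rather than calling the vLLM API:

    # Assumed numbers for illustration only.
    max_model_len = 32768    # total token budget
    max_frame_tokens = 144   # tokens per pooled video frame
    spatial_pool_stride = 2
    num_images, num_videos = 2, 1

    # An unpooled image costs about stride**2 pooled frames' worth of tokens.
    i2f = spatial_pool_stride * spatial_pool_stride  # 4

    max_total_frames = max_model_len // max_frame_tokens  # 227

    before = (max_total_frames - num_images) // max(num_videos, 1)       # 225
    after = (max_total_frames - num_images * i2f) // max(num_videos, 1)  # 219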