diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index ddb4e36d849..a8e1e8c1593 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -450,7 +450,7 @@ def get_position_ids( width //= self.spatial_merge_size # calculate the length of the text and image tokens - text_length = next_image_pos - current_pos + text_length = next_image_pos start_idx = ( llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ) @@ -480,7 +480,7 @@ def get_position_ids( ) llm_pos_ids_list.append(image_pos_ids) - current_pos = next_image_pos + time_steps * height * width + current_pos += next_image_pos + time_steps * height * width image_index += 1 if current_pos < batch_input_ids.size(1): diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index aa0fe1078d3..81b4369b986 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -68,7 +68,8 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str elif config.model_type == "paligemma": return "" * config.text_config.num_image_tokens elif config.model_type == "qwen2_vl": - num_pads = image_input.pixel_values.shape[0] // 4 + grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id] + num_pads = grid_t * grid_h * grid_w // 4 padding = "<|image_pad|>" * num_pads return f"<|vision_start|>{padding}<|vision_end|>" else: