From 6d8cc83813831c9076da7bace007f2ea4710812e Mon Sep 17 00:00:00 2001 From: Janne Alatalo Date: Fri, 13 Dec 2024 11:58:07 +0200 Subject: [PATCH 1/2] Fix runtime error when Qwen2-VL was prompted with multiple images Fix a runtime error that occurred when the Qwen2-VL model was prompted with a prompt containing more than one image. The runtime error was: File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 459, in get_position_ids text_pos_ids = torch.arange(text_length, device=d) RuntimeError: upper bound and larger bound inconsistent with step sign The error was caused by the text_length variable becoming negative when multiple images caused multiple iterations of the get_position_ids function's main loop. The error is a simple logic mistake where next_image_pos is initialized as a relative offset from current_pos, but was used as if it were an absolute position from zero. --- .../text_generation_server/models/custom_modeling/qwen2_vl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index ddb4e36d849..a8e1e8c1593 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -450,7 +450,7 @@ def get_position_ids( width //= self.spatial_merge_size # calculate the length of the text and image tokens - text_length = next_image_pos - current_pos + text_length = next_image_pos start_idx = ( llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 ) @@ -480,7 +480,7 @@ ) llm_pos_ids_list.append(image_pos_ids) - current_pos = next_image_pos + time_steps * height * width + current_pos += next_image_pos + time_steps * height * width image_index += 1 if current_pos < batch_input_ids.size(1): From 233cbb80e00efe6402e06de57b781e1d36b95119 Mon Sep 17 00:00:00 2001 From: Janne Alatalo Date: Fri, 13 Dec 
2024 12:29:39 +0200 Subject: [PATCH 2/2] Fix runtime error when Qwen2-VL was prompted with multiple images Fix a runtime error that occurred when the Qwen2-VL model was prompted with a prompt containing more than one image. The runtime error was: File "text-generation-inference/server/text_generation_server/models/custom_modeling/qwen2_vl.py", line 534, in forward inputs_embeds[input_ids == self.image_token_id] = image_embeds RuntimeError: shape mismatch: value tensor of shape [512, 3584] cannot be broadcast to indexing result of shape [1024, 3584] (The shape numbers in the error message can differ depending on the input image resolutions.) The error was caused by adding the wrong number of <|image_pad|> tokens to the tokenized input in the image_text_replacement function. The error is a simple logical mistake where the number of image pad tokens was taken from the length of the pixel_value_shape tensor's first dimension. However, pixel_value_shape contains the patches from all of the images. Therefore the code added the total number of image pad tokens required for the whole input at each of the image locations. This resulted in extra image pad tokens being present in the tokenized input. The fix is to take the number of required tokens from the image_grid_thw tensor. The tensor includes grid_t, grid_h, and grid_w values for each image. grid_t * grid_h * grid_w gives the total number of patches for the image [1]. The number of required image pad tokens is number_of_patches // 4. 
[1] https://github.com/huggingface/transformers/blob/31f9a289a6207be6cae746e009d8e0db523be203/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L311 --- server/text_generation_server/models/vlm_causal_lm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index aa0fe1078d3..81b4369b986 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -68,7 +68,8 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str elif config.model_type == "paligemma": return "" * config.text_config.num_image_tokens elif config.model_type == "qwen2_vl": - num_pads = image_input.pixel_values.shape[0] // 4 + grid_t, grid_h, grid_w = image_input["image_grid_thw"][image_id] + num_pads = grid_t * grid_h * grid_w // 4 padding = "<|image_pad|>" * num_pads return f"<|vision_start|>{padding}<|vision_end|>" else: