From 785b5cf444d8e6b5334c774cf550f50f2234e9c2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 20 Jan 2025 16:20:06 +0100
Subject: [PATCH] v4.48.1

---
 setup.py                                      |   2 +-
 src/transformers/__init__.py                  |   2 +-
 .../models/emu3/convert_emu3_weights_to_hf.py | 448 ------------------
 .../moonshine/convert_usefulsensors_to_hf.py  | 169 -------
 .../models/qwen2_vl/modeling_qwen2_vl.py      |  25 +-
 5 files changed, 24 insertions(+), 622 deletions(-)
 delete mode 100644 src/transformers/models/emu3/convert_emu3_weights_to_hf.py
 delete mode 100644 src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
diff --git a/setup.py b/setup.py
index c6d12f87b78c..d1483f414859 100644
--- a/setup.py
+++ b/setup.py
@@ -437,7 +437,7 @@ def run(self):
 
 setup(
     name="transformers",
-    version="4.48.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.48.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
     author_email="transformers@huggingface.co",
     description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index de34fea356db..017776936b94 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).
 
-__version__ = "4.48.0"
+__version__ = "4.48.1"
 
 from typing import TYPE_CHECKING
 
diff --git a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py b/src/transformers/models/emu3/convert_emu3_weights_to_hf.py
deleted file mode 100644
index 8ac8db7e4290..000000000000
--- a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py
+++ /dev/null
@@ -1,448 +0,0 @@
-# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import json
-import os
-import re
-from typing import Dict, Optional
-
-import requests
-import torch
-from accelerate import init_empty_weights
-from PIL import Image
-
-from transformers import (
-    AutoModel,
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    Emu3Config,
-    Emu3ForConditionalGeneration,
-    Emu3ImageProcessor,
-    Emu3Processor,
-    Emu3TextConfig,
-    GenerationConfig,
-)
-from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
-
-
-"""
-Sample usage:
-
-```
-python src/transformers/models/emu3/convert_emu3_weights_to_hf.py \
-    --vq_model_id BAAI/Emu3-VisionTokenizer --llm_model_id BAAI/Emu3-Chat --output_dir /output/path
-```
-
-Thereafter, models can be loaded via:
-
-```py
-from transformers import Emu3ForConditionalGeneration, Emu3Processor
-
-model = Emu3ForConditionalGeneration.from_pretrained("/output/path")
-processor = Emu3Processor.from_pretrained("/output/path")
-```
-
-"""
-
-
-byte_encoder = bytes_to_unicode()
-CHAT_TEMPLATE = "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"
-
-
-# Tiktoken to HF conversion, thanks for Xenova
-def token_bytes_to_string(b):
-    return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
-
-
-# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960
-def bpe(mergeable_ranks: Dict[bytes, int], token: bytes, max_rank: Optional[int] = None):
-    parts = [bytes([b]) for b in token]
-    while True:
-        min_idx = None
-        min_rank = None
-        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
-            rank = mergeable_ranks.get(pair[0] + pair[1])
-            if rank is not None and (min_rank is None or rank < min_rank):
-                min_idx = i
-                min_rank = rank
-        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
-            break
-        assert min_idx is not None
-        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]
-    return parts
-
-
-def generate_vocab_and_merges(encoder):
-    mergeable_ranks = encoder._mergeable_ranks
-
-    merges = []
-    vocab = {}
-    for token, rank in mergeable_ranks.items():
-        vocab[token_bytes_to_string(token)] = rank
-
-        if len(token) == 1:
-            continue
-        merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))
-        assert len(merged) == 2
-        merges.append(" ".join(map(token_bytes_to_string, merged)))
-
-    # Also add special tokens
-    vocab.update(encoder._special_tokens)
-    return vocab, merges
-
-
-def convert_tiktoken(tokenizer, output_dir):
-    encoder = tokenizer.tokenizer
-    vocab, merges = generate_vocab_and_merges(encoder)
-    added_tokens = [
-        {
-            "id": id,
-            "content": content,
-            "single_word": False,
-            "lstrip": False,
-            "rstrip": False,
-            "normalized": False,
-            "special": True,
-        }
-        for content, id in encoder._special_tokens.items()
-        if content != "<|extra_0|>"
-    ]
-
-    # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json
-    tokenizer_config_template = {
-        "add_prefix_space": False,
-        "bos_token": "<|extra_203|>",
-        "clean_up_tokenization_spaces": False,
-        "eos_token": "<|extra_204|>",
-        "pad_token": "<|endoftext|>",
-    }
-    tokenizer_config_template.update({"tokenizer_class": "GPT2Tokenizer"})
-    tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0]))
-
-    # add placeholder image token by taking one of the reserved tokens
-    reserved_token_id = vocab["<|extra_0|>"]
-    vocab["<image>"] = reserved_token_id
-    del vocab["<|extra_0|>"]
-    added_tokens.append(
-        {
-            "id": reserved_token_id,
-            "content": "<image>",
-            "single_word": False,
-            "lstrip": False,
-            "rstrip": False,
-            "normalized": False,
-            "special": True,
-        }
-    )
-
-    os.makedirs(output_dir, exist_ok=True)
-
-    pre_tokenizer = {
-        "type": "ByteLevel",
-        "add_prefix_space": False,
-        "trim_offsets": True,
-        "use_regex": True,
-    }
-
-    # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json
-    tokenizer_template = {
-        "version": "1.0",
-        "truncation": None,
-        "padding": None,
-        "added_tokens": added_tokens,
-        "normalizer": None,
-        "pre_tokenizer": pre_tokenizer,
-        "post_processor": None,
-        "decoder": {
-            "type": "ByteLevel",
-            "add_prefix_space": True,
-            "trim_offsets": True,
-            "use_regex": True,
-        },
-        "model": {
-            "type": "BPE",
-            "dropout": None,
-            "unk_token": None,
-            "continuing_subword_prefix": "",
-            "end_of_word_suffix": "",
-            "fuse_unk": False,
-            "byte_fallback": False,
-            "vocab": vocab,
-            "merges": merges,
-        },
-    }
-
-    # Save to files
-    with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as fp:
-        json.dump(vocab, fp, indent=2, ensure_ascii=False)
-
-    with open(os.path.join(output_dir, "tokenizer.json"), "w", encoding="utf-8") as fp:
-        json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False)
-
-    with open(os.path.join(output_dir, "tokenizer_config.json"), "w", encoding="utf-8") as fp:
-        json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False)
-
-    with open(os.path.join(output_dir, "special_tokens_map.json"), "w", encoding="utf-8") as fp:
-        json.dump(
-            {
-                "bos_token": "<|extra_203|>",
-                "eos_token": "<|extra_204|>",
-                "pad_token": "<|endoftext|>",
-            },
-            fp,
-            indent=2,
-            ensure_ascii=False,
-        )
-
-    with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as fp:
-        fp.write("#version: 0.2\n")
-        fp.write("\n".join(merges))
-
-
-KEYS_TO_MODIFY_MAPPING = {
-    "^encoder": "model.vqmodel.encoder",
-    "^decoder": "model.vqmodel.decoder",
-    "^post_quant_conv": "model.vqmodel.post_quant_conv",
-    "^quant_conv": "model.vqmodel.quant_conv",
-    "^quantize": "model.vqmodel.quantize",
-    "^model": "text_model.model",
-    r"lm_head\.weight": "text_model.lm_head.weight",
-    r"^text_model\.model\.vqmodel": "vqmodel",
-    # rename QKV proj for the VQ-VAE model because we use SiglipAttention
-    r"\.q\.": ".q_proj.",
-    r"\.k\.": ".k_proj.",
-    r"\.v\.": ".v_proj.",
-    r"\.proj_out\.": ".out_proj.",
-    # move the attention norms outside of attention modules
-    r"mid\.attn_1\.norm\.": "mid.attn_norm.",
-    r"attn\.0\.norm\.": "attn_norms.0.",
-    r"attn\.1\.norm\.": "attn_norms.1.",
-    r"attn\.2\.norm\.": "attn_norms.2.",
-    r"attn\.3\.norm\.": "attn_norms.3.",
-    # isolate down/mid/up into separate classes for readability
-    r"\.down\.": ".down_block.down.",
-    r"\.up\.": ".up_block.up.",
-    r"\.mid\.": ".middle_block.",
-}
-
-
-def convert_state_dict_to_hf(old_state_dict, new_state_dict):
-    for key, value in old_state_dict.items():
-        # convert conv layers in attn to linear
-        if (
-            any(key.endswith(name) for name in ["q.weight", "k.weight", "v.weight", "proj_out.weight"])
-            and value.ndim == 4
-        ):
-            value = value.squeeze()
-
-        for old_pattern, new_pattern in KEYS_TO_MODIFY_MAPPING.items():
-            key = re.sub(old_pattern, new_pattern, key)
-
-        new_state_dict[key] = value
-    return new_state_dict
-
-
-def convert_model(vq_model_id, llm_model_id, output_dir, hub_model_id=None, test_inference=False):
-    os.makedirs(output_dir, exist_ok=True)
-
-    # Convert and save processor
-    tokenizer_tiktoken = AutoTokenizer.from_pretrained(llm_model_id, trust_remote_code=True)
-    convert_tiktoken(tokenizer_tiktoken, output_dir)
-    extra_special_tokens = extra_special_tokens = {
-        "image_token": "<image>",
-        "boi_token": "<|image start|>",
-        "eoi_token": "<|image end|>",
-        "image_wrapper_token": "<|image token|>",
-        "eof_token": "<|extra_201|>",
-    }
-    tokenizer_converted = AutoTokenizer.from_pretrained(output_dir, extra_special_tokens=extra_special_tokens)
-    tokenizer_converted.padding_side = "left"
-
-    image_processor = Emu3ImageProcessor.from_pretrained(vq_model_id)
-    processor = Emu3Processor(image_processor, tokenizer_converted, chat_template=CHAT_TEMPLATE)
-    processor.save_pretrained(output_dir)
-
-    # load models
-    model_llm = AutoModelForCausalLM.from_pretrained(
-        llm_model_id,
-        trust_remote_code=True,
-    )
-    model_vqgan = AutoModel.from_pretrained(vq_model_id, trust_remote_code=True)
-    with open(f"{output_dir}/tokenizer.json", "r") as file:
-        tokenizer_config = json.load(file)
-    vocabulary_map = tokenizer_config["model"]["vocab"]
-
-    text_config = Emu3TextConfig(
-        max_position_embeddings=model_llm.config.max_position_embeddings,
-        rope_scaling={"rope_type": "default"},
-    )
-    config = Emu3Config(text_config=text_config, vocabulary_map=vocabulary_map)
-
-    with init_empty_weights():
-        model = Emu3ForConditionalGeneration(config=config)
-        model.generation_config = GenerationConfig(
-            do_sample=True,
-            top_k=2048,
-            max_new_tokens=50_000,
-            pad_token_id=processor.tokenizer.pad_token_id,
-            eos_token_id=processor.tokenizer.eos_token_id,
-        )
-
-    state_dict = {}
-    state_dict = convert_state_dict_to_hf(model_llm.state_dict(), state_dict)
-    state_dict = convert_state_dict_to_hf(model_vqgan.state_dict(), state_dict)
-
-    model.load_state_dict(state_dict, assign=True, strict=True)
-    model.save_pretrained(output_dir, safe_serialization=True)
-
-    if hub_model_id is not None:
-        model.push_to_hub(hub_model_id)
-        processor.push_to_hub(hub_model_id)
-
-    if test_inference and llm_model_id.endswith("Chat"):
-        # Short inference on a few examples to check if generation makes sense
-        print("Loading the checkpoint in a Emu3 model...")
-        print("*" * 100)
-        model = Emu3ForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto")
-        processor = Emu3Processor.from_pretrained(output_dir)
-
-        conversation = [
-            {
-                "role": "system",
-                "content": [
-                    {"type": "text", "text": "You are a helpful assistant."},
-                ],
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Please tell me about this art work and its artist."},
-                    {"type": "image"},
-                ],
-            },
-        ]
-        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-        image = Image.open(
-            requests.get(
-                "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True
-            ).raw
-        )
-        inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)
-        length = inputs.input_ids.shape[1]
-
-        out = model.generate(**inputs, max_new_tokens=40, do_sample=False)
-        generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0]
-
-        print(f"Generation for single-image: {generated_text}")
-        print("*" * 100)
-    elif test_inference and llm_model_id.endswith("Gen"):
-        processor = Emu3Processor.from_pretrained(output_dir)
-        model = Emu3ForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto")
-
-        inputs = processor(
-            text=[
-                "a portrait of young girl. masterpiece, film grained, best quality.",
-                "a dog running under the rain",
-            ],
-            padding=True,
-            return_tensors="pt",
-            return_for_image_generation=True,
-        )
-        inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16)
-
-        neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry."
-        neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0")
-
-        image_sizes = inputs.pop("image_sizes")
-        HEIGHT, WIDTH = image_sizes[0]
-        VISUAL_TOKENS = model.vocabulary_mapping.image_tokens
-
-        def prefix_allowed_tokens_fn(batch_id, input_ids):
-            height, width = HEIGHT, WIDTH
-            visual_tokens = VISUAL_TOKENS
-            image_token_id = processor.tokenizer.encode("<|image token|>", return_tensors="pt")[0].to(model.device)
-            eoi_token_id = processor.tokenizer.encode("<|image end|>", return_tensors="pt")[0]
-            eos_token_id = processor.tokenizer.encode("<|extra_204|>", return_tensors="pt")[0]
-            pad_token_id = processor.tokenizer.encode("<|endoftext|>", return_tensors="pt")[0]
-            eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0]
-            eof_token_id = processor.tokenizer.encode("<|extra_201|>", return_tensors="pt")[0]
-
-            position = torch.nonzero(input_ids == image_token_id, as_tuple=True)[0][0]
-            offset = input_ids.shape[0] - position
-            if offset % (width + 1) == 0:
-                return (eol_token_id,)
-            elif offset == (width + 1) * height + 1:
-                return (eof_token_id,)
-            elif offset == (width + 1) * height + 2:
-                return (eoi_token_id,)
-            elif offset == (width + 1) * height + 3:
-                return (eos_token_id,)
-            elif offset > (width + 1) * height + 3:
-                return (pad_token_id,)
-            else:
-                return visual_tokens
-
-        out = model.generate(
-            **inputs,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            negative_prompt_ids=neg_inputs.input_ids,
-            negative_prompt_attention_mask=neg_inputs.attention_mask,
-        )
-
-        image = model.decode_image_tokens(out[:, inputs.input_ids.shape[1] :], height=HEIGHT, width=WIDTH)
-        images = processor.postprocess(
-            list(image.float()), return_tensors="PIL.Image.Image"
-        )  # internally we convert to np but it's not supported in bf16 precision
-        for i, image in enumerate(images["pixel_values"]):
-            image.save(f"result_{i}.png")
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--vq_model_id",
-        help="Model ID of Emu3 VQ-VAE on the hub",
-        default="BAAI/Emu3-VisionTokenizer",
-    )
-    parser.add_argument(
-        "--llm_model_id",
-        help="Model ID of Emu3 bacbone LLM on the hub",
-        default="BAAI/Emu3-Chat",
-    )
-    parser.add_argument(
-        "--output_dir",
-        help="Location to write HF model",
-    )
-    parser.add_argument(
-        "--hub_model_id",
-        help="Model ID in the hub where to push the model.",
-    )
-    parser.add_argument(
-        "--test_inference",
-        action="store_true",
-        help="Whether to load the model for generation to test it's converted correctly.",
-    )
-    args = parser.parse_args()
-    convert_model(
-        vq_model_id=args.vq_model_id,
-        llm_model_id=args.llm_model_id,
-        output_dir=args.output_dir,
-        hub_model_id=args.hub_model_id,
-        test_inference=args.test_inference,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py b/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
deleted file mode 100644
index fa80f2b70964..000000000000
--- a/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
+++ /dev/null
@@ -1,169 +0,0 @@
-# Copyright 2025 Useful Sensors and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import re
-
-import h5py
-import numpy as np
-import torch
-from huggingface_hub import hf_hub_download
-
-from transformers.models.moonshine.modeling_moonshine import MoonshineConfig, MoonshineForConditionalGeneration
-
-
-# Copied from https://github.com/usefulsensors/moonshine/blob/a1d77cc573b0471ac4602b86f67b3f48d67df1a9/moonshine/model.py
-def _get_weights(model_name):
-    repo = "UsefulSensors/moonshine"
-
-    return (
-        hf_hub_download(repo, f"{x}.weights.h5", subfolder=model_name) for x in ("preprocessor", "encoder", "decoder")
-    )
-
-
-def _read_h5_weights(group, current_key="", weights={}):
-    for key in group.keys():
-        full_key = f"{current_key}.{key}" if current_key else key
-        if isinstance(group[key], h5py.Dataset):
-            w = np.array(group[key])
-            w = torch.from_numpy(w)
-            if len(w.shape) > 1:
-                if len(w.shape) == 3:
-                    hidden_size = max(list(w.shape))
-                    try:
-                        w = w.reshape(hidden_size, hidden_size)
-                    except RuntimeError:
-                        # meaning its a conv layers
-                        pass
-                w = w.transpose(0, -1)
-            weights[full_key] = w
-        else:
-            _read_h5_weights(group[key], full_key, weights)
-    return weights
-
-
-def _convert_layer_names(name, gated_mlp=False):
-    name = re.sub(
-        r"layers\.functional(?:_(\d+))?\.layers",
-        lambda m: f'layers.{m.group(1) if m.group(1) else "0"}',
-        name,
-        count=1,
-    )
-    if gated_mlp:
-        name = re.sub(r"functional\.layers\.dense\.", "mlp.fc1.", name)
-        name = re.sub(r"functional\.layers\.dense_1\.", "mlp.fc2.", name)
-    else:
-        name = re.sub(r"functional\.layers\.sequential\.layers\.dense\.", "mlp.fc1.", name)
-        name = re.sub(r"functional\.layers\.sequential\.layers\.dense_1\.", "mlp.fc2.", name)
-    name = re.sub(r"layers\.sequential\.layers\.conv1d\.", "conv1.", name)
-    name = re.sub(r"layers\.sequential\.layers\.conv1d_1\.", "conv2.", name)
-    name = re.sub(r"layers\.sequential\.layers\.conv1d_2\.", "conv3.", name)
-    name = re.sub(r"layers\.sequential\.layers\.group_normalization\.", "groupnorm.", name)
-    name = re.sub(r"mha_with_rope\.key_dense", "self_attn.k_proj", name)
-    name = re.sub(r"mha_with_rope\.query_dense", "self_attn.q_proj", name)
-    name = re.sub(r"mha_with_rope\.value_dense", "self_attn.v_proj", name)
-    name = re.sub(r"mha_with_rope\.output_dense", "self_attn.o_proj", name)
-    name = re.sub(r"mha_precomputed_kv\.key_dense", "encoder_attn.k_proj", name)
-    name = re.sub(r"mha_precomputed_kv\.query_dense", "encoder_attn.q_proj", name)
-    name = re.sub(r"mha_precomputed_kv\.value_dense", "encoder_attn.v_proj", name)
-    name = re.sub(r"mha_precomputed_kv\.output_dense", "encoder_attn.o_proj", name)
-    name = re.sub(r"mha_causal_with_rope\.key_dense", "self_attn.k_proj", name)
-    name = re.sub(r"mha_causal_with_rope\.query_dense", "self_attn.q_proj", name)
-    name = re.sub(r"mha_causal_with_rope\.value_dense", "self_attn.v_proj", name)
-    name = re.sub(r"mha_causal_with_rope\.output_dense", "self_attn.o_proj", name)
-    name = re.sub(r"layer_normalization\.", "input_layernorm.", name)
-    name = re.sub(r"layer_normalization_1\.", "post_attention_layernorm.", name)
-    name = re.sub(r"layer_normalization_2\.", "final_layernorm.", name)
-    name = re.sub(r"vars\.0", "weight", name)
-    name = re.sub(r"vars\.1", "bias", name)
-    name = re.sub(r"layers\.reversible_embedding", "embed_tokens", name)
-
-    return name
-
-
-def _convert_weights(weights, encoder=True):
-    if "layers.rotary_embedding.vars.0" in weights:
-        weights.pop("layers.rotary_embedding.vars.0")
-
-    converted_weights = {}
-    if encoder:
-        converted_weights["layer_norm.weight"] = weights.pop("layers.layer_normalization.vars.0")
-    else:
-        converted_weights["norm.weight"] = weights.pop("layers.layer_normalization.vars.0")
-
-    for name, w in weights.items():
-        if encoder:
-            new_name = _convert_layer_names(name)
-        else:
-            new_name = _convert_layer_names(name, gated_mlp=True)
-        converted_weights[new_name] = w
-
-    return converted_weights
-
-
-def convert_usefulsensors_moonshine_to_hf(model_name, pytorch_dump_folder_path):
-    preprocessor_weights_path, encoder_weights_path, decoder_weights_path = _get_weights(model_name)
-
-    with h5py.File(preprocessor_weights_path, "r") as f:
-        loaded_preprocessor_weights = _read_h5_weights(f, weights={})
-
-    with h5py.File(encoder_weights_path, "r") as f:
-        loaded_encoder_weights = _read_h5_weights(f, weights={})
-
-    with h5py.File(decoder_weights_path, "r") as f:
-        loaded_decoder_weights = _read_h5_weights(f, weights={})
-
-    encoder_state_dict = {**loaded_encoder_weights, **loaded_preprocessor_weights}
-    converted_encoder_state_dict = _convert_weights(encoder_state_dict)
-
-    converted_decoder_state_dict = _convert_weights(loaded_decoder_weights, encoder=False)
-    converted_decoder_state_dict["embed_tokens.weight"] = converted_decoder_state_dict["embed_tokens.weight"].T
-
-    final_weights = {}
-    for k, v in converted_encoder_state_dict.items():
-        final_weights[f"model.encoder.{k}"] = v
-
-    for k, v in converted_decoder_state_dict.items():
-        final_weights[f"model.decoder.{k}"] = v
-
-    if model_name == "tiny":
-        config = MoonshineConfig()
-    elif model_name == "base":
-        config = MoonshineConfig(
-            hidden_size=416,
-            intermediate_size=1664,
-            encoder_num_hidden_layers=8,
-            decoder_num_hidden_layers=8,
-            encoder_num_attention_heads=8,
-            decoder_num_attention_heads=8,
-            partial_rotary_factor=0.62,
-        )
-    else:
-        raise ValueError(f"Unknown model name {model_name}")
-
-    final_weights["proj_out.weight"] = converted_decoder_state_dict["embed_tokens.weight"]
-
-    model = MoonshineForConditionalGeneration(config)
-    model.load_state_dict(final_weights)
-    model.save_pretrained(pytorch_dump_folder_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # # Required parameters
-    parser.add_argument("--model_name", type=str, help="Path to the downloaded checkpoints")
-    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    args = parser.parse_args()
-
-    convert_usefulsensors_moonshine_to_hf(args.model_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 4893143e2c16..3d432697bfa1 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1482,9 +1482,14 @@ def get_rope_index(
             if attention_mask is None:
                 attention_mask = torch.ones_like(total_input_ids)
             position_ids = torch.ones(
-                3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
             )
             image_index, video_index = 0, 0
+            attention_mask = attention_mask.to(total_input_ids.device)
             for i, input_ids in enumerate(total_input_ids):
                 input_ids = input_ids[attention_mask[i] == 1]
                 image_nums, video_nums = 0, 0
@@ -1511,15 +1516,21 @@ def get_rope_index(
                             image_grid_thw[image_index][1],
                             image_grid_thw[image_index][2],
                         )
+                        second_per_grid_t = 0
                         image_index += 1
                         remain_images -= 1
                         ed = ed_image
+
                     else:
                         t, h, w = (
                             video_grid_thw[video_index][0],
                             video_grid_thw[video_index][1],
                             video_grid_thw[video_index][2],
                         )
+                        if second_per_grid_ts is not None:
+                            second_per_grid_t = second_per_grid_ts[video_index]
+                        else:
+                            second_per_grid_t = 1.0
                         video_index += 1
                         remain_videos -= 1
                         ed = ed_video
@@ -1533,7 +1544,15 @@ def get_rope_index(
                     st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                     llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
 
-                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                    t_index = (
+                        (
+                            torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
+                            * second_per_grid_t
+                            * self.config.vision_config.tokens_per_second
+                        )
+                        .long()
+                        .flatten()
+                    )
                     h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
                     w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
                     llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
@@ -1553,7 +1572,7 @@ def get_rope_index(
             if attention_mask is not None:
                 position_ids = attention_mask.long().cumsum(-1) - 1
                 position_ids.masked_fill_(attention_mask == 0, 1)
-                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
                 max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
                 mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
             else: