From 785b5cf444d8e6b5334c774cf550f50f2234e9c2 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Mon, 20 Jan 2025 16:20:06 +0100 Subject: [PATCH] v4.48.1 --- setup.py | 2 +- src/transformers/__init__.py | 2 +- .../models/emu3/convert_emu3_weights_to_hf.py | 448 ------------------ .../moonshine/convert_usefulsensors_to_hf.py | 169 ------- .../models/qwen2_vl/modeling_qwen2_vl.py | 25 +- 5 files changed, 24 insertions(+), 622 deletions(-) delete mode 100644 src/transformers/models/emu3/convert_emu3_weights_to_hf.py delete mode 100644 src/transformers/models/moonshine/convert_usefulsensors_to_hf.py diff --git a/setup.py b/setup.py index c6d12f87b78c..d1483f414859 100644 --- a/setup.py +++ b/setup.py @@ -437,7 +437,7 @@ def run(self): setup( name="transformers", - version="4.48.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.48.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index de34fea356db..017776936b94 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.48.0" +__version__ = "4.48.1" from typing import TYPE_CHECKING diff --git a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py b/src/transformers/models/emu3/convert_emu3_weights_to_hf.py deleted file mode 100644 index 8ac8db7e4290..000000000000 --- a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py +++ /dev/null @@ -1,448 +0,0 @@ -# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import json -import os -import re -from typing import Dict, Optional - -import requests -import torch -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - AutoModel, - AutoModelForCausalLM, - AutoTokenizer, - Emu3Config, - Emu3ForConditionalGeneration, - Emu3ImageProcessor, - Emu3Processor, - Emu3TextConfig, - GenerationConfig, -) -from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - - -""" -Sample usage: - -``` -python src/transformers/models/emu3/convert_emu3_weights_to_hf.py \ - --vq_model_id BAAI/Emu3-VisionTokenizer --llm_model_id BAAI/Emu3-Chat --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Emu3ForConditionalGeneration, Emu3Processor - -model = Emu3ForConditionalGeneration.from_pretrained("/output/path") -processor = Emu3Processor.from_pretrained("/output/path") -``` - -""" - - -byte_encoder = bytes_to_unicode() -CHAT_TEMPLATE = "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}" - - -# Tiktoken to HF conversion, thanks for Xenova -def token_bytes_to_string(b): - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) - - -# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960 -def bpe(mergeable_ranks: Dict[bytes, int], token: bytes, max_rank: Optional[int] = None): - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :] - return parts - - -def generate_vocab_and_merges(encoder): - mergeable_ranks = encoder._mergeable_ranks - - merges = [] - vocab = {} - for token, rank in mergeable_ranks.items(): - vocab[token_bytes_to_string(token)] = rank - - if len(token) == 1: - continue - merged = tuple(bpe(mergeable_ranks, token, max_rank=rank)) - assert len(merged) == 2 - merges.append(" ".join(map(token_bytes_to_string, merged))) - - # Also add special tokens - vocab.update(encoder._special_tokens) - return vocab, merges - - -def convert_tiktoken(tokenizer, output_dir): - encoder = tokenizer.tokenizer - vocab, merges = generate_vocab_and_merges(encoder) - added_tokens = [ - { - "id": id, - "content": content, - "single_word": False, - "lstrip": False, - "rstrip": False, - "normalized": False, - "special": True, - } - for content, id in encoder._special_tokens.items() - if content != "<|extra_0|>" - ] - - # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json - tokenizer_config_template = { - "add_prefix_space": False, - "bos_token": "<|extra_203|>", - "clean_up_tokenization_spaces": False, - "eos_token": "<|extra_204|>", - "pad_token": "<|endoftext|>", - } - tokenizer_config_template.update({"tokenizer_class": "GPT2Tokenizer"}) - tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0])) - - # add placeholder image token by taking one of the reserved tokens - reserved_token_id = vocab["<|extra_0|>"] - vocab[""] = reserved_token_id - del vocab["<|extra_0|>"] - added_tokens.append( - { - "id": reserved_token_id, - "content": "", - "single_word": False, - "lstrip": False, - "rstrip": False, - "normalized": False, - "special": True, - } - ) - - os.makedirs(output_dir, exist_ok=True) - - pre_tokenizer = { - "type": "ByteLevel", - "add_prefix_space": False, - "trim_offsets": True, - "use_regex": True, - } - - # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json - tokenizer_template = { - "version": "1.0", - "truncation": None, - "padding": None, - "added_tokens": added_tokens, - "normalizer": None, - "pre_tokenizer": pre_tokenizer, - "post_processor": None, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": True, - "trim_offsets": True, - "use_regex": True, - }, - "model": { - "type": "BPE", - "dropout": None, - "unk_token": None, - "continuing_subword_prefix": "", - "end_of_word_suffix": "", - "fuse_unk": False, - "byte_fallback": False, - "vocab": vocab, - "merges": merges, - }, - } - - # Save to files - with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as fp: - json.dump(vocab, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "tokenizer.json"), "w", encoding="utf-8") as fp: - json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "tokenizer_config.json"), "w", encoding="utf-8") as fp: - json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "special_tokens_map.json"), "w", encoding="utf-8") as fp: - json.dump( - { - "bos_token": "<|extra_203|>", - "eos_token": "<|extra_204|>", - "pad_token": "<|endoftext|>", - }, - fp, - indent=2, - ensure_ascii=False, - ) - - with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as fp: - fp.write("#version: 0.2\n") - fp.write("\n".join(merges)) - - -KEYS_TO_MODIFY_MAPPING = { - "^encoder": "model.vqmodel.encoder", - "^decoder": "model.vqmodel.decoder", - "^post_quant_conv": "model.vqmodel.post_quant_conv", - "^quant_conv": "model.vqmodel.quant_conv", - "^quantize": "model.vqmodel.quantize", - "^model": "text_model.model", - r"lm_head\.weight": "text_model.lm_head.weight", - r"^text_model\.model\.vqmodel": "vqmodel", - # rename QKV proj for the VQ-VAE model because we use SiglipAttention - r"\.q\.": ".q_proj.", - r"\.k\.": ".k_proj.", - r"\.v\.": ".v_proj.", - r"\.proj_out\.": ".out_proj.", - # move the attention norms outside of attention modules - r"mid\.attn_1\.norm\.": "mid.attn_norm.", - r"attn\.0\.norm\.": "attn_norms.0.", - r"attn\.1\.norm\.": "attn_norms.1.", - r"attn\.2\.norm\.": "attn_norms.2.", - r"attn\.3\.norm\.": "attn_norms.3.", - # isolate down/mid/up into separate classes for readability - r"\.down\.": ".down_block.down.", - r"\.up\.": ".up_block.up.", - r"\.mid\.": ".middle_block.", -} - - -def convert_state_dict_to_hf(old_state_dict, new_state_dict): - for key, value in old_state_dict.items(): - # convert conv layers in attn to linear - if ( - any(key.endswith(name) for name in ["q.weight", "k.weight", "v.weight", "proj_out.weight"]) - and value.ndim == 4 - ): - value = value.squeeze() - - for old_pattern, new_pattern in KEYS_TO_MODIFY_MAPPING.items(): - key = re.sub(old_pattern, new_pattern, key) - - new_state_dict[key] = value - return new_state_dict - - -def convert_model(vq_model_id, llm_model_id, output_dir, hub_model_id=None, test_inference=False): - os.makedirs(output_dir, exist_ok=True) - - # Convert and save processor - tokenizer_tiktoken = AutoTokenizer.from_pretrained(llm_model_id, trust_remote_code=True) - convert_tiktoken(tokenizer_tiktoken, output_dir) - extra_special_tokens = extra_special_tokens = { - "image_token": "", - "boi_token": "<|image start|>", - "eoi_token": "<|image end|>", - "image_wrapper_token": "<|image token|>", - "eof_token": "<|extra_201|>", - } - tokenizer_converted = AutoTokenizer.from_pretrained(output_dir, extra_special_tokens=extra_special_tokens) - tokenizer_converted.padding_side = "left" - - image_processor = Emu3ImageProcessor.from_pretrained(vq_model_id) - processor = Emu3Processor(image_processor, tokenizer_converted, chat_template=CHAT_TEMPLATE) - processor.save_pretrained(output_dir) - - # load models - model_llm = AutoModelForCausalLM.from_pretrained( - llm_model_id, - trust_remote_code=True, - ) - model_vqgan = AutoModel.from_pretrained(vq_model_id, trust_remote_code=True) - with open(f"{output_dir}/tokenizer.json", "r") as file: - tokenizer_config = json.load(file) - vocabulary_map = tokenizer_config["model"]["vocab"] - - text_config = Emu3TextConfig( - max_position_embeddings=model_llm.config.max_position_embeddings, - rope_scaling={"rope_type": "default"}, - ) - config = Emu3Config(text_config=text_config, vocabulary_map=vocabulary_map) - - with init_empty_weights(): - model = Emu3ForConditionalGeneration(config=config) - model.generation_config = GenerationConfig( - do_sample=True, - top_k=2048, - max_new_tokens=50_000, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - ) - - state_dict = {} - state_dict = convert_state_dict_to_hf(model_llm.state_dict(), state_dict) - state_dict = convert_state_dict_to_hf(model_vqgan.state_dict(), state_dict) - - model.load_state_dict(state_dict, assign=True, strict=True) - model.save_pretrained(output_dir, safe_serialization=True) - - if hub_model_id is not None: - model.push_to_hub(hub_model_id) - processor.push_to_hub(hub_model_id) - - if test_inference and llm_model_id.endswith("Chat"): - # Short inference on a few examples to check if generation makes sense - print("Loading the checkpoint in a Emu3 model...") - print("*" * 100) - model = Emu3ForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") - processor = Emu3Processor.from_pretrained(output_dir) - - conversation = [ - { - "role": "system", - "content": [ - {"type": "text", "text": "You are a helpful assistant."}, - ], - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "Please tell me about this art work and its artist."}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - elif test_inference and llm_model_id.endswith("Gen"): - processor = Emu3Processor.from_pretrained(output_dir) - model = Emu3ForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") - - inputs = processor( - text=[ - "a portrait of young girl. masterpiece, film grained, best quality.", - "a dog running under the rain", - ], - padding=True, - return_tensors="pt", - return_for_image_generation=True, - ) - inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16) - - neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry." - neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0") - - image_sizes = inputs.pop("image_sizes") - HEIGHT, WIDTH = image_sizes[0] - VISUAL_TOKENS = model.vocabulary_mapping.image_tokens - - def prefix_allowed_tokens_fn(batch_id, input_ids): - height, width = HEIGHT, WIDTH - visual_tokens = VISUAL_TOKENS - image_token_id = processor.tokenizer.encode("<|image token|>", return_tensors="pt")[0].to(model.device) - eoi_token_id = processor.tokenizer.encode("<|image end|>", return_tensors="pt")[0] - eos_token_id = processor.tokenizer.encode("<|extra_204|>", return_tensors="pt")[0] - pad_token_id = processor.tokenizer.encode("<|endoftext|>", return_tensors="pt")[0] - eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] - eof_token_id = processor.tokenizer.encode("<|extra_201|>", return_tensors="pt")[0] - - position = torch.nonzero(input_ids == image_token_id, as_tuple=True)[0][0] - offset = input_ids.shape[0] - position - if offset % (width + 1) == 0: - return (eol_token_id,) - elif offset == (width + 1) * height + 1: - return (eof_token_id,) - elif offset == (width + 1) * height + 2: - return (eoi_token_id,) - elif offset == (width + 1) * height + 3: - return (eos_token_id,) - elif offset > (width + 1) * height + 3: - return (pad_token_id,) - else: - return visual_tokens - - out = model.generate( - **inputs, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - negative_prompt_ids=neg_inputs.input_ids, - negative_prompt_attention_mask=neg_inputs.attention_mask, - ) - - image = model.decode_image_tokens(out[:, inputs.input_ids.shape[1] :], height=HEIGHT, width=WIDTH) - images = processor.postprocess( - list(image.float()), return_tensors="PIL.Image.Image" - ) # internally we convert to np but it's not supported in bf16 precision - for i, image in enumerate(images["pixel_values"]): - image.save(f"result_{i}.png") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--vq_model_id", - help="Model ID of Emu3 VQ-VAE on the hub", - default="BAAI/Emu3-VisionTokenizer", - ) - parser.add_argument( - "--llm_model_id", - help="Model ID of Emu3 bacbone LLM on the hub", - default="BAAI/Emu3-Chat", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--hub_model_id", - help="Model ID in the hub where to push the model.", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - args = parser.parse_args() - convert_model( - vq_model_id=args.vq_model_id, - llm_model_id=args.llm_model_id, - output_dir=args.output_dir, - hub_model_id=args.hub_model_id, - test_inference=args.test_inference, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py b/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py deleted file mode 100644 index fa80f2b70964..000000000000 --- a/src/transformers/models/moonshine/convert_usefulsensors_to_hf.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2025 Useful Sensors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import re - -import h5py -import numpy as np -import torch -from huggingface_hub import hf_hub_download - -from transformers.models.moonshine.modeling_moonshine import MoonshineConfig, MoonshineForConditionalGeneration - - -# Copied from https://github.com/usefulsensors/moonshine/blob/a1d77cc573b0471ac4602b86f67b3f48d67df1a9/moonshine/model.py -def _get_weights(model_name): - repo = "UsefulSensors/moonshine" - - return ( - hf_hub_download(repo, f"{x}.weights.h5", subfolder=model_name) for x in ("preprocessor", "encoder", "decoder") - ) - - -def _read_h5_weights(group, current_key="", weights={}): - for key in group.keys(): - full_key = f"{current_key}.{key}" if current_key else key - if isinstance(group[key], h5py.Dataset): - w = np.array(group[key]) - w = torch.from_numpy(w) - if len(w.shape) > 1: - if len(w.shape) == 3: - hidden_size = max(list(w.shape)) - try: - w = w.reshape(hidden_size, hidden_size) - except RuntimeError: - # meaning its a conv layers - pass - w = w.transpose(0, -1) - weights[full_key] = w - else: - _read_h5_weights(group[key], full_key, weights) - return weights - - -def _convert_layer_names(name, gated_mlp=False): - name = re.sub( - r"layers\.functional(?:_(\d+))?\.layers", - lambda m: f'layers.{m.group(1) if m.group(1) else "0"}', - name, - count=1, - ) - if gated_mlp: - name = re.sub(r"functional\.layers\.dense\.", "mlp.fc1.", name) - name = re.sub(r"functional\.layers\.dense_1\.", "mlp.fc2.", name) - else: - name = re.sub(r"functional\.layers\.sequential\.layers\.dense\.", "mlp.fc1.", name) - name = re.sub(r"functional\.layers\.sequential\.layers\.dense_1\.", "mlp.fc2.", name) - name = re.sub(r"layers\.sequential\.layers\.conv1d\.", "conv1.", name) - name = re.sub(r"layers\.sequential\.layers\.conv1d_1\.", "conv2.", name) - name = re.sub(r"layers\.sequential\.layers\.conv1d_2\.", "conv3.", name) - name = re.sub(r"layers\.sequential\.layers\.group_normalization\.", "groupnorm.", name) - name = re.sub(r"mha_with_rope\.key_dense", "self_attn.k_proj", name) - name = re.sub(r"mha_with_rope\.query_dense", "self_attn.q_proj", name) - name = re.sub(r"mha_with_rope\.value_dense", "self_attn.v_proj", name) - name = re.sub(r"mha_with_rope\.output_dense", "self_attn.o_proj", name) - name = re.sub(r"mha_precomputed_kv\.key_dense", "encoder_attn.k_proj", name) - name = re.sub(r"mha_precomputed_kv\.query_dense", "encoder_attn.q_proj", name) - name = re.sub(r"mha_precomputed_kv\.value_dense", "encoder_attn.v_proj", name) - name = re.sub(r"mha_precomputed_kv\.output_dense", "encoder_attn.o_proj", name) - name = re.sub(r"mha_causal_with_rope\.key_dense", "self_attn.k_proj", name) - name = re.sub(r"mha_causal_with_rope\.query_dense", "self_attn.q_proj", name) - name = re.sub(r"mha_causal_with_rope\.value_dense", "self_attn.v_proj", name) - name = re.sub(r"mha_causal_with_rope\.output_dense", "self_attn.o_proj", name) - name = re.sub(r"layer_normalization\.", "input_layernorm.", name) - name = re.sub(r"layer_normalization_1\.", "post_attention_layernorm.", name) - name = re.sub(r"layer_normalization_2\.", "final_layernorm.", name) - name = re.sub(r"vars\.0", "weight", name) - name = re.sub(r"vars\.1", "bias", name) - name = re.sub(r"layers\.reversible_embedding", "embed_tokens", name) - - return name - - -def _convert_weights(weights, encoder=True): - if "layers.rotary_embedding.vars.0" in weights: - weights.pop("layers.rotary_embedding.vars.0") - - converted_weights = {} - if encoder: - converted_weights["layer_norm.weight"] = weights.pop("layers.layer_normalization.vars.0") - else: - converted_weights["norm.weight"] = weights.pop("layers.layer_normalization.vars.0") - - for name, w in weights.items(): - if encoder: - new_name = _convert_layer_names(name) - else: - new_name = _convert_layer_names(name, gated_mlp=True) - converted_weights[new_name] = w - - return converted_weights - - -def convert_usefulsensors_moonshine_to_hf(model_name, pytorch_dump_folder_path): - preprocessor_weights_path, encoder_weights_path, decoder_weights_path = _get_weights(model_name) - - with h5py.File(preprocessor_weights_path, "r") as f: - loaded_preprocessor_weights = _read_h5_weights(f, weights={}) - - with h5py.File(encoder_weights_path, "r") as f: - loaded_encoder_weights = _read_h5_weights(f, weights={}) - - with h5py.File(decoder_weights_path, "r") as f: - loaded_decoder_weights = _read_h5_weights(f, weights={}) - - encoder_state_dict = {**loaded_encoder_weights, **loaded_preprocessor_weights} - converted_encoder_state_dict = _convert_weights(encoder_state_dict) - - converted_decoder_state_dict = _convert_weights(loaded_decoder_weights, encoder=False) - converted_decoder_state_dict["embed_tokens.weight"] = converted_decoder_state_dict["embed_tokens.weight"].T - - final_weights = {} - for k, v in converted_encoder_state_dict.items(): - final_weights[f"model.encoder.{k}"] = v - - for k, v in converted_decoder_state_dict.items(): - final_weights[f"model.decoder.{k}"] = v - - if model_name == "tiny": - config = MoonshineConfig() - elif model_name == "base": - config = MoonshineConfig( - hidden_size=416, - intermediate_size=1664, - encoder_num_hidden_layers=8, - decoder_num_hidden_layers=8, - encoder_num_attention_heads=8, - decoder_num_attention_heads=8, - partial_rotary_factor=0.62, - ) - else: - raise ValueError(f"Unknown model name {model_name}") - - final_weights["proj_out.weight"] = converted_decoder_state_dict["embed_tokens.weight"] - - model = MoonshineForConditionalGeneration(config) - model.load_state_dict(final_weights) - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # # Required parameters - parser.add_argument("--model_name", type=str, help="Path to the downloaded checkpoints") - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - - convert_usefulsensors_moonshine_to_hf(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 4893143e2c16..3d432697bfa1 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1482,9 +1482,14 @@ def get_rope_index( if attention_mask is None: attention_mask = torch.ones_like(total_input_ids) position_ids = torch.ones( - 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, ) image_index, video_index = 0, 0 + attention_mask = attention_mask.to(total_input_ids.device) for i, input_ids in enumerate(total_input_ids): input_ids = input_ids[attention_mask[i] == 1] image_nums, video_nums = 0, 0 @@ -1511,15 +1516,21 @@ def get_rope_index( image_grid_thw[image_index][1], image_grid_thw[image_index][2], ) + second_per_grid_t = 0 image_index += 1 remain_images -= 1 ed = ed_image + else: t, h, w = ( video_grid_thw[video_index][0], video_grid_thw[video_index][1], video_grid_thw[video_index][2], ) + if second_per_grid_ts is not None: + second_per_grid_t = second_per_grid_ts[video_index] + else: + second_per_grid_t = 1.0 video_index += 1 remain_videos -= 1 ed = ed_video @@ -1533,7 +1544,15 @@ def get_rope_index( st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + t_index = ( + ( + torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w) + * second_per_grid_t + * self.config.vision_config.tokens_per_second + ) + .long() + .flatten() + ) h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) @@ -1553,7 +1572,7 @@ def get_rope_index( if attention_mask is not None: position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device) max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] else: