From 2fceb10b66de5f37a48ef452e51821fa791fa5cb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 27 Jun 2024 16:48:47 +0200 Subject: [PATCH 01/13] inital commit --- .github/ISSUE_TEMPLATE/bug-report.yml | 4 +- .github/PULL_REQUEST_TEMPLATE.md | 4 +- README.md | 24 +- docs/source/de/testing.md | 16 +- docs/source/en/_toctree.yml | 6 + docs/source/en/index.md | 5 + docs/source/en/model_doc/gpt_neox.md | 62 + docs/source/en/model_doc/instructblip.md | 1 + docs/source/en/model_doc/instructblipvideo.md | 74 + docs/source/en/model_doc/llava-next-video.md | 259 ++ docs/source/en/model_doc/rt_detr.md | 96 + docs/source/en/perf_infer_gpu_one.md | 2 + docs/source/en/tasks/mask_generation.md | 4 +- docs/source/en/testing.md | 16 +- docs/source/ja/testing.md | 11 +- docs/source/ko/testing.md | 188 +- .../text-classification/run_classification.py | 5 + README_de.md => i18n/README_de.md | 22 +- README_es.md => i18n/README_es.md | 22 +- README_fr.md => i18n/README_fr.md | 22 +- README_hd.md => i18n/README_hd.md | 22 +- README_ja.md => i18n/README_ja.md | 22 +- README_ko.md => i18n/README_ko.md | 22 +- README_pt-br.md => i18n/README_pt-br.md | 22 +- README_ru.md => i18n/README_ru.md | 22 +- README_te.md => i18n/README_te.md | 22 +- README_vi.md => i18n/README_vi.md | 22 +- README_zh-hans.md => i18n/README_zh-hans.md | 22 +- README_zh-hant.md => i18n/README_zh-hant.md | 22 +- setup.py | 3 +- src/transformers/__init__.py | 89 + src/transformers/agents/prompts.py | 6 +- src/transformers/agents/python_interpreter.py | 4 +- src/transformers/audio_utils.py | 305 +- src/transformers/cache_utils.py | 142 +- src/transformers/convert_slow_tokenizer.py | 15 +- src/transformers/dependency_versions_table.py | 3 +- .../generation/candidate_generator.py | 35 +- .../generation/configuration_utils.py | 2 +- .../generation/flax_logits_process.py | 2 +- src/transformers/generation/utils.py | 18 +- src/transformers/image_processing_utils.py | 5 + .../image_processing_utils_fast.py | 5 + src/transformers/integrations/ggml.py | 22 +- src/transformers/modeling_utils.py | 6 +- src/transformers/models/__init__.py | 4 + .../models/auto/configuration_auto.py | 11 + .../models/auto/image_processing_auto.py | 15 +- src/transformers/models/auto/modeling_auto.py | 10 + .../models/auto/processing_auto.py | 2 + .../models/auto/tokenization_auto.py | 9 + .../models/blip_2/modeling_blip_2.py | 2 +- src/transformers/models/clap/modeling_clap.py | 7 +- .../modeling_deformable_detr.py | 14 +- .../models/donut/modeling_donut_swin.py | 7 +- src/transformers/models/dpt/modeling_dpt.py | 4 +- .../models/falcon/modeling_falcon.py | 1 + src/transformers/models/gemma/diff_gemma.py | 3 +- .../models/gemma/modeling_gemma.py | 20 +- src/transformers/models/gemma2/__init__.py | 61 + .../models/gemma2/configuration_gemma2.py | 150 + .../gemma2/convert_gemma2_weights_to_hf.py | 239 ++ src/transformers/models/gemma2/diff_gemma2.py | 781 +++++ .../models/gemma2/modeling_gemma2.py | 1376 +++++++++ .../gpt_bigcode/modeling_gpt_bigcode.py | 1 + .../models/gpt_neox/modeling_gpt_neox.py | 260 +- .../models/imagegpt/modeling_imagegpt.py | 10 +- .../configuration_instructblip.py | 2 + .../instructblip/modeling_instructblip.py | 2 +- .../models/instructblipvideo/__init__.py | 83 + .../configuration_instructblipvideo.py | 364 +++ ...t_instructblipvideo_original_to_pytorch.py | 305 ++ .../diff_instructblipvideo.py | 430 +++ .../image_processing_instructblipvideo.py | 362 +++ .../modeling_instructblipvideo.py | 1665 ++++++++++ .../processing_instructblipvideo.py 
| 170 ++ .../models/layoutlmv3/modeling_layoutlmv3.py | 12 +- .../llama/convert_llama_weights_to_hf.py | 62 +- .../models/llama/modeling_llama.py | 8 + .../models/llava/configuration_llava.py | 18 - .../models/llava_next/modeling_llava_next.py | 19 +- .../models/llava_next_video/__init__.py | 70 + .../configuration_llava_next_video.py | 153 + .../convert_llava_next_video_weights_to_hf.py | 276 ++ .../llava_next_video/diff_llava_next_video.py | 559 ++++ .../image_processing_llava_next_video.py | 421 +++ .../modeling_llava_next_video.py | 1080 +++++++ .../processing_llava_next_video.py | 220 ++ .../image_processing_mask2former.py | 2 +- .../maskformer/image_processing_maskformer.py | 2 +- .../models/mistral/modeling_mistral.py | 8 +- .../models/mixtral/modeling_mixtral.py | 4 + .../models/mobilevit/modeling_mobilevit.py | 13 +- src/transformers/models/olmo/modeling_olmo.py | 6 + .../oneformer/image_processing_oneformer.py | 2 +- .../models/paligemma/modeling_paligemma.py | 5 +- src/transformers/models/phi/modeling_phi.py | 4 + src/transformers/models/phi3/modeling_phi3.py | 6 + .../models/qwen2/modeling_qwen2.py | 4 + .../models/qwen2_moe/modeling_qwen2_moe.py | 4 + src/transformers/models/rt_detr/__init__.py | 78 + .../models/rt_detr/configuration_rt_detr.py | 352 +++ .../rt_detr/configuration_rt_detr_resnet.py | 111 + ..._detr_original_pytorch_checkpoint_to_hf.py | 782 +++++ .../rt_detr/image_processing_rt_detr.py | 1120 +++++++ .../models/rt_detr/modeling_rt_detr.py | 2676 +++++++++++++++++ .../models/rt_detr/modeling_rt_detr_resnet.py | 426 +++ src/transformers/models/sam/modeling_sam.py | 3 +- .../models/siglip/modeling_siglip.py | 15 +- .../models/starcoder2/modeling_starcoder2.py | 4 + src/transformers/models/swin/modeling_swin.py | 7 +- .../models/swin2sr/modeling_swin2sr.py | 2 + .../models/swinv2/modeling_swinv2.py | 2 + .../timm_backbone/modeling_timm_backbone.py | 4 +- .../models/vipllava/configuration_vipllava.py | 23 - .../models/vit/image_processing_vit_fast.py | 3 +- .../models/vivit/modeling_vivit.py | 3 +- src/transformers/pipelines/base.py | 2 +- .../pipelines/image_classification.py | 7 +- src/transformers/pipelines/text_generation.py | 5 +- src/transformers/testing_utils.py | 30 +- src/transformers/trainer.py | 1 - src/transformers/training_args.py | 2 +- src/transformers/utils/__init__.py | 2 + src/transformers/utils/backbone_utils.py | 1 - src/transformers/utils/dummy_pt_objects.py | 112 + .../utils/dummy_vision_objects.py | 21 + src/transformers/utils/generic.py | 24 + src/transformers/utils/import_utils.py | 7 +- tests/deepspeed/test_deepspeed.py | 10 +- tests/extended/test_trainer_ext.py | 2 +- tests/fsdp/test_fsdp.py | 16 + tests/generation/test_utils.py | 78 +- .../models/albert/test_tokenization_albert.py | 2 +- tests/models/align/test_modeling_align.py | 11 +- tests/models/altclip/test_modeling_altclip.py | 8 +- tests/models/bark/test_modeling_bark.py | 10 +- tests/models/bart/test_modeling_bart.py | 3 +- tests/models/bart/test_tokenization_bart.py | 1 + .../barthez/test_tokenization_barthez.py | 2 +- tests/models/beit/test_modeling_beit.py | 4 +- tests/models/bert/test_modeling_bert.py | 2 +- tests/models/bert/test_tokenization_bert.py | 2 +- .../models/big_bird/test_modeling_big_bird.py | 2 +- .../big_bird/test_tokenization_big_bird.py | 2 +- .../test_modeling_bigbird_pegasus.py | 9 +- tests/models/biogpt/test_modeling_biogpt.py | 2 +- .../blenderbot/test_modeling_blenderbot.py | 2 +- .../test_modeling_blenderbot_small.py | 2 +- 
.../models/blip/test_image_processing_blip.py | 8 +- tests/models/blip/test_modeling_blip.py | 18 +- tests/models/blip/test_modeling_blip_text.py | 2 + tests/models/blip_2/test_modeling_blip_2.py | 2 + tests/models/bloom/test_modeling_bloom.py | 2 +- tests/models/bloom/test_tokenization_bloom.py | 2 +- .../test_image_processing_bridgetower.py | 4 + tests/models/byt5/test_tokenization_byt5.py | 6 +- .../camembert/test_tokenization_camembert.py | 12 +- tests/models/canine/test_modeling_canine.py | 6 +- .../models/canine/test_tokenization_canine.py | 13 +- .../test_image_processing_chinese_clip.py | 18 +- .../test_modeling_chinese_clip.py | 6 +- tests/models/clap/test_modeling_clap.py | 2 +- tests/models/clip/test_modeling_clip.py | 11 +- tests/models/clip/test_tokenization_clip.py | 3 +- tests/models/clipseg/test_modeling_clipseg.py | 13 +- tests/models/clvp/test_tokenization_clvp.py | 2 +- .../test_tokenization_code_llama.py | 15 +- .../codegen/test_tokenization_codegen.py | 4 +- .../models/cohere/test_tokenization_cohere.py | 2 +- .../test_image_processing_conditional_detr.py | 4 + .../test_modeling_conditional_detr.py | 2 +- .../models/convbert/test_modeling_convbert.py | 2 +- .../convnextv2/test_modeling_convnextv2.py | 4 +- tests/models/cpmant/test_modeling_cpmant.py | 2 +- .../data2vec/test_modeling_data2vec_audio.py | 11 +- .../data2vec/test_modeling_data2vec_vision.py | 6 +- tests/models/dbrx/test_modeling_dbrx.py | 8 +- .../test_tokenization_deberta_v2.py | 30 +- .../test_image_processing_deformable_detr.py | 4 + .../test_modeling_deformable_detr.py | 6 +- tests/models/deit/test_modeling_deit.py | 4 +- .../models/detr/test_image_processing_detr.py | 4 + tests/models/detr/test_modeling_detr.py | 2 +- tests/models/dinat/test_modeling_dinat.py | 2 +- .../distilbert/test_modeling_distilbert.py | 2 +- .../models/donut/test_modeling_donut_swin.py | 2 +- .../electra/test_tokenization_electra.py | 2 +- tests/models/encodec/test_modeling_encodec.py | 32 +- .../test_modeling_encoder_decoder.py | 3 + tests/models/ernie/test_modeling_ernie.py | 3 +- tests/models/esm/test_modeling_esm.py | 4 +- tests/models/esm/test_modeling_esmfold.py | 40 +- tests/models/falcon/test_modeling_falcon.py | 4 +- ...test_tokenization_fastspeech2_conformer.py | 2 +- .../models/flaubert/test_modeling_flaubert.py | 2 +- tests/models/flava/test_modeling_flava.py | 55 +- tests/models/fnet/test_modeling_fnet.py | 1 + tests/models/fnet/test_tokenization_fnet.py | 4 +- tests/models/fsmt/test_modeling_fsmt.py | 14 +- tests/models/fsmt/test_tokenization_fsmt.py | 4 +- tests/models/fuyu/test_modeling_fuyu.py | 6 +- tests/models/gemma/test_modeling_gemma.py | 89 +- tests/models/gemma/test_tokenization_gemma.py | 32 +- tests/models/gemma2/__init__.py | 0 tests/models/gemma2/test_modeling_gemma2.py | 498 +++ tests/models/git/test_modeling_git.py | 2 + .../models/glpn/test_image_processing_glpn.py | 2 + tests/models/glpn/test_modeling_glpn.py | 6 +- tests/models/gpt2/test_tokenization_gpt2.py | 7 +- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 12 +- .../models/gpt_neox/test_modeling_gpt_neox.py | 64 +- .../test_tokenization_gpt_neox_japanese.py | 3 +- .../test_image_processing_grounding_dino.py | 4 + .../models/groupvit/test_modeling_groupvit.py | 6 +- .../herbert/test_tokenization_herbert.py | 2 +- tests/models/hubert/test_modeling_hubert.py | 24 +- tests/models/ibert/test_modeling_ibert.py | 2 +- .../idefics/test_image_processing_idefics.py | 12 +- tests/models/idefics/test_modeling_idefics.py | 16 +- 
.../test_image_processing_idefics2.py | 158 +- .../models/idefics2/test_modeling_idefics2.py | 14 +- .../test_image_processing_imagegpt.py | 4 +- .../models/imagegpt/test_modeling_imagegpt.py | 10 +- .../models/informer/test_modeling_informer.py | 6 +- tests/models/instructblipvideo/__init__.py | 0 ...test_image_processing_instrictblipvideo.py | 191 ++ .../test_modeling_instructblipvideo.py | 585 ++++ tests/models/jamba/test_modeling_jamba.py | 6 +- tests/models/jetmoe/test_modeling_jetmoe.py | 6 +- tests/models/kosmos2/test_modeling_kosmos2.py | 4 +- .../layoutlm/test_tokenization_layoutlm.py | 1 + .../test_image_processing_layoutlmv2.py | 2 +- .../layoutlmv2/test_modeling_layoutlmv2.py | 2 +- .../test_tokenization_layoutlmv2.py | 50 +- .../test_tokenization_layoutlmv3.py | 50 +- .../layoutxlm/test_tokenization_layoutxlm.py | 56 +- tests/models/led/test_modeling_led.py | 2 +- tests/models/led/test_tokenization_led.py | 1 + tests/models/levit/test_modeling_levit.py | 4 +- tests/models/llama/test_modeling_llama.py | 4 +- tests/models/llama/test_tokenization_llama.py | 17 +- tests/models/llava/test_modeling_llava.py | 8 + .../test_image_processing_llava_next.py | 4 +- .../llava_next/test_modeling_llava_next.py | 8 + tests/models/llava_next_video/__init__.py | 0 .../test_image_processing_llava_next_video.py | 218 ++ .../test_modeling_llava_next_video.py | 455 +++ .../longformer/test_modeling_longformer.py | 4 +- .../test_tokenization_longformer.py | 1 + tests/models/longt5/test_modeling_longt5.py | 8 +- tests/models/luke/test_tokenization_luke.py | 1 + tests/models/lxmert/test_modeling_lxmert.py | 6 +- .../models/lxmert/test_tokenization_lxmert.py | 2 +- tests/models/marian/test_modeling_marian.py | 9 +- .../markuplm/test_tokenization_markuplm.py | 46 +- .../test_image_processing_mask2former.py | 2 + .../mask2former/test_modeling_mask2former.py | 2 +- .../test_image_processing_maskformer.py | 2 + .../test_modeling_maskformer_swin.py | 4 +- tests/models/mbart/test_modeling_mbart.py | 4 +- tests/models/mbart/test_tokenization_mbart.py | 4 +- .../mbart50/test_tokenization_mbart50.py | 2 +- .../test_modeling_megatron_bert.py | 2 +- .../test_modeling_megatron_gpt2.py | 2 +- .../mgp_str/test_tokenization_mgp_str.py | 6 +- tests/models/mistral/test_modeling_mistral.py | 10 +- tests/models/mixtral/test_modeling_mixtral.py | 6 +- tests/models/mluke/test_tokenization_mluke.py | 1 + .../mobilebert/test_modeling_mobilebert.py | 2 +- .../test_tokenization_mobilebert.py | 2 +- tests/models/mpnet/test_modeling_mpnet.py | 2 +- tests/models/mpt/test_modeling_mpt.py | 2 +- tests/models/mra/test_modeling_mra.py | 4 +- tests/models/mt5/test_modeling_mt5.py | 4 +- .../models/musicgen/test_modeling_musicgen.py | 34 +- .../musicgen/test_processing_musicgen.py | 5 +- .../test_modeling_musicgen_melody.py | 29 +- tests/models/mvp/test_modeling_mvp.py | 2 +- tests/models/mvp/test_tokenization_mvp.py | 1 + tests/models/nllb/test_tokenization_nllb.py | 8 +- .../models/nllb_moe/test_modeling_nllb_moe.py | 4 +- .../models/nougat/test_tokenization_nougat.py | 8 +- tests/models/olmo/test_modeling_olmo.py | 4 +- .../test_image_processing_oneformer.py | 3 + .../oneformer/test_modeling_oneformer.py | 2 +- .../oneformer/test_processor_oneformer.py | 4 + .../models/openai/test_tokenization_openai.py | 2 +- tests/models/opt/test_modeling_opt.py | 2 +- .../owlv2/test_image_processing_owlv2.py | 2 +- tests/models/owlv2/test_modeling_owlv2.py | 4 +- tests/models/owlvit/test_modeling_owlvit.py | 4 +- 
.../paligemma/test_modeling_paligemma.py | 28 +- .../test_modeling_patchtsmixer.py | 2 +- tests/models/pegasus/test_modeling_pegasus.py | 2 +- .../pegasus/test_tokenization_pegasus.py | 8 - .../pegasus_x/test_modeling_pegasus_x.py | 2 +- .../perceiver/test_modeling_perceiver.py | 4 +- .../perceiver/test_tokenization_perceiver.py | 7 +- .../persimmon/test_modeling_persimmon.py | 2 +- .../test_image_processing_pix2struct.py | 12 +- .../pix2struct/test_modeling_pix2struct.py | 12 +- tests/models/plbart/test_modeling_plbart.py | 4 +- .../poolformer/test_modeling_poolformer.py | 6 +- .../test_feature_extraction_pop2piano.py | 6 +- .../pop2piano/test_tokenization_pop2piano.py | 6 +- .../prophetnet/test_modeling_prophetnet.py | 6 +- tests/models/pvt/test_modeling_pvt.py | 6 +- tests/models/pvt_v2/test_modeling_pvt_v2.py | 6 +- tests/models/qwen2/test_modeling_qwen2.py | 6 +- tests/models/qwen2/test_tokenization_qwen2.py | 8 +- .../qwen2_moe/test_modeling_qwen2_moe.py | 6 +- tests/models/rag/test_modeling_rag.py | 2 +- .../test_modeling_recurrent_gemma.py | 29 +- .../models/reformer/test_modeling_reformer.py | 22 +- .../reformer/test_tokenization_reformer.py | 4 +- .../rembert/test_tokenization_rembert.py | 9 +- .../roberta/test_tokenization_roberta.py | 1 + .../roformer/test_tokenization_roformer.py | 4 +- tests/models/rt_detr/__init__.py | 0 .../rt_detr/test_image_processing_rt_detr.py | 364 +++ tests/models/rt_detr/test_modeling_rt_detr.py | 704 +++++ .../rt_detr/test_modeling_rt_detr_resnet.py | 130 + .../test_tokenization_seamless_m4t.py | 18 +- .../segformer/test_modeling_segformer.py | 6 +- tests/models/sew/test_modeling_sew.py | 17 +- tests/models/sew_d/test_modeling_sew_d.py | 17 +- .../siglip/test_image_processing_siglip.py | 2 +- tests/models/siglip/test_modeling_siglip.py | 16 +- .../models/siglip/test_tokenization_siglip.py | 4 +- .../test_modeling_speech_encoder_decoder.py | 6 +- .../test_modeling_speech_to_text.py | 16 +- .../models/speecht5/test_modeling_speecht5.py | 86 +- .../speecht5/test_tokenization_speecht5.py | 2 + .../starcoder2/test_modeling_starcoder2.py | 6 +- .../swin2sr/test_image_processing_swin2sr.py | 4 +- tests/models/swin2sr/test_modeling_swin2sr.py | 21 + tests/models/swinv2/test_modeling_swinv2.py | 20 + .../test_modeling_switch_transformers.py | 2 +- tests/models/t5/test_modeling_t5.py | 4 +- tests/models/t5/test_tokenization_t5.py | 2 +- .../test_modeling_table_transformer.py | 2 +- tests/models/tapas/test_modeling_tapas.py | 4 +- tests/models/tapas/test_tokenization_tapas.py | 32 +- .../test_modeling_time_series_transformer.py | 2 +- .../timesformer/test_modeling_timesformer.py | 2 +- .../test_modeling_timm_backbone.py | 36 +- tests/models/trocr/test_modeling_trocr.py | 10 +- tests/models/udop/test_modeling_udop.py | 2 +- tests/models/udop/test_tokenization_udop.py | 54 +- tests/models/umt5/test_modeling_umt5.py | 4 +- .../unispeech/test_modeling_unispeech.py | 7 +- .../test_modeling_unispeech_sat.py | 22 +- .../test_image_processing_video_llava.py | 4 +- .../video_llava/test_modeling_video_llava.py | 8 + .../models/videomae/test_modeling_videomae.py | 2 +- .../models/vilt/test_image_processing_vilt.py | 4 + tests/models/vilt/test_modeling_vilt.py | 8 +- .../models/vipllava/test_modeling_vipllava.py | 8 + .../test_modeling_vision_encoder_decoder.py | 6 +- .../test_modeling_vision_text_dual_encoder.py | 2 +- tests/models/vit/test_image_processing_vit.py | 6 +- tests/models/vitdet/test_modeling_vitdet.py | 8 +- tests/models/vits/test_modeling_vits.py | 10 +- 
tests/models/vits/test_tokenization_vits.py | 8 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 30 +- .../wav2vec2/test_tokenization_wav2vec2.py | 4 +- .../test_modeling_wav2vec2_conformer.py | 15 +- .../test_tokenization_wav2vec2_phoneme.py | 14 +- tests/models/wavlm/test_modeling_wavlm.py | 9 +- tests/models/whisper/test_modeling_whisper.py | 39 +- .../whisper/test_tokenization_whisper.py | 3 + tests/models/x_clip/test_modeling_x_clip.py | 6 +- tests/models/xglm/test_modeling_xglm.py | 2 +- tests/models/xglm/test_tokenization_xglm.py | 2 +- .../test_tokenization_xlm_roberta.py | 4 +- tests/models/xlnet/test_modeling_xlnet.py | 2 +- .../yolos/test_image_processing_yolos.py | 3 + tests/models/yolos/test_modeling_yolos.py | 2 +- tests/models/yoso/test_modeling_yoso.py | 1 + .../test_pipelines_audio_classification.py | 2 +- ..._pipelines_automatic_speech_recognition.py | 8 +- tests/pipelines/test_pipelines_common.py | 2 +- .../test_pipelines_conversational.py | 439 +++ .../test_pipelines_depth_estimation.py | 4 +- ...t_pipelines_document_question_answering.py | 2 +- .../test_pipelines_feature_extraction.py | 10 +- tests/pipelines/test_pipelines_fill_mask.py | 2 +- .../test_pipelines_image_classification.py | 28 + ...test_pipelines_image_feature_extraction.py | 6 +- .../test_pipelines_image_segmentation.py | 2 +- .../test_pipelines_mask_generation.py | 4 +- .../test_pipelines_object_detection.py | 2 +- .../test_pipelines_text_generation.py | 16 +- .../test_pipelines_video_classification.py | 1 + ...est_pipelines_visual_question_answering.py | 2 +- ...ipelines_zero_shot_audio_classification.py | 4 +- ...ipelines_zero_shot_image_classification.py | 2 +- ...st_pipelines_zero_shot_object_detection.py | 4 +- tests/quantization/autoawq/test_awq.py | 2 +- tests/quantization/bnb/test_4bit.py | 2 +- tests/quantization/bnb/test_mixed_int8.py | 2 +- .../quanto_integration/test_quanto.py | 13 +- tests/test_cache_utils.py | 2 +- tests/test_image_processing_common.py | 90 +- tests/test_modeling_common.py | 250 +- tests/test_modeling_utils.py | 26 +- tests/test_pipeline_mixin.py | 12 +- tests/test_tokenization_common.py | 98 +- tests/test_tokenization_utils.py | 2 +- tests/tokenization/test_tokenization_fast.py | 10 +- tests/trainer/test_trainer.py | 4 +- tests/utils/test_audio_utils.py | 923 ++++++ tests/utils/test_doc_samples.py | 2 +- tests/utils/test_model_output.py | 4 +- utils/check_copies.py | 15 +- utils/check_repo.py | 3 + utils/diff_model_converter.py | 74 +- 418 files changed, 22542 insertions(+), 1929 deletions(-) create mode 100644 docs/source/en/model_doc/instructblipvideo.md create mode 100644 docs/source/en/model_doc/llava-next-video.md create mode 100644 docs/source/en/model_doc/rt_detr.md rename README_de.md => i18n/README_de.md (97%) rename README_es.md => i18n/README_es.md (97%) rename README_fr.md => i18n/README_fr.md (97%) rename README_hd.md => i18n/README_hd.md (98%) rename README_ja.md => i18n/README_ja.md (97%) rename README_ko.md => i18n/README_ko.md (97%) rename README_pt-br.md => i18n/README_pt-br.md (97%) rename README_ru.md => i18n/README_ru.md (98%) rename README_te.md => i18n/README_te.md (98%) rename README_vi.md => i18n/README_vi.md (98%) rename README_zh-hans.md => i18n/README_zh-hans.md (97%) rename README_zh-hant.md => i18n/README_zh-hant.md (97%) create mode 100644 src/transformers/models/gemma2/__init__.py create mode 100644 src/transformers/models/gemma2/configuration_gemma2.py create mode 100644 src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py 
create mode 100644 src/transformers/models/gemma2/diff_gemma2.py create mode 100644 src/transformers/models/gemma2/modeling_gemma2.py create mode 100644 src/transformers/models/instructblipvideo/__init__.py create mode 100644 src/transformers/models/instructblipvideo/configuration_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py create mode 100644 src/transformers/models/instructblipvideo/diff_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/modeling_instructblipvideo.py create mode 100644 src/transformers/models/instructblipvideo/processing_instructblipvideo.py create mode 100644 src/transformers/models/llava_next_video/__init__.py create mode 100644 src/transformers/models/llava_next_video/configuration_llava_next_video.py create mode 100644 src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py create mode 100644 src/transformers/models/llava_next_video/diff_llava_next_video.py create mode 100644 src/transformers/models/llava_next_video/image_processing_llava_next_video.py create mode 100644 src/transformers/models/llava_next_video/modeling_llava_next_video.py create mode 100644 src/transformers/models/llava_next_video/processing_llava_next_video.py create mode 100644 src/transformers/models/rt_detr/__init__.py create mode 100644 src/transformers/models/rt_detr/configuration_rt_detr.py create mode 100644 src/transformers/models/rt_detr/configuration_rt_detr_resnet.py create mode 100644 src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py create mode 100644 src/transformers/models/rt_detr/image_processing_rt_detr.py create mode 100644 src/transformers/models/rt_detr/modeling_rt_detr.py create mode 100644 src/transformers/models/rt_detr/modeling_rt_detr_resnet.py create mode 100644 tests/models/gemma2/__init__.py create mode 100644 tests/models/gemma2/test_modeling_gemma2.py create mode 100644 tests/models/instructblipvideo/__init__.py create mode 100644 tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py create mode 100644 tests/models/instructblipvideo/test_modeling_instructblipvideo.py create mode 100644 tests/models/llava_next_video/__init__.py create mode 100644 tests/models/llava_next_video/test_image_processing_llava_next_video.py create mode 100644 tests/models/llava_next_video/test_modeling_llava_next_video.py create mode 100644 tests/models/rt_detr/__init__.py create mode 100644 tests/models/rt_detr/test_image_processing_rt_detr.py create mode 100644 tests/models/rt_detr/test_modeling_rt_detr.py create mode 100644 tests/models/rt_detr/test_modeling_rt_detr_resnet.py create mode 100644 tests/pipelines/test_pipelines_conversational.py diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 51d713b2e103..e3ad6cd81cde 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -25,7 +25,7 @@ body: Models: - - text models: @ArthurZucker and @younesbelkada + - text models: @ArthurZucker - vision models: @amyeroberts - speech models: @sanchit-gandhi - graph models: @clefourrier @@ -44,7 +44,7 @@ body: - deepspeed: HF Trainer/Accelerate: @muellerzr - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada + - quantization (bitsandbytes, 
autogpt): @SunMarc Documentation: @stevhliu diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 650e13d8dcab..c55edc015deb 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -39,7 +39,7 @@ members/contributors who may be interested in your PR. Models: -- text models: @ArthurZucker and @younesbelkada +- text models: @ArthurZucker - vision models: @amyeroberts - speech models: @sanchit-gandhi - graph models: @clefourrier @@ -58,7 +58,7 @@ Integrations: - deepspeed: HF Trainer/Accelerate: @muellerzr - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc -- quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada +- quantization (bitsandbytes, autogpt): @SunMarc Documentation: @stevhliu and @MKhalusova diff --git a/README.md b/README.md index 9d116a803cbc..2a2830fb1001 100644 --- a/README.md +++ b/README.md @@ -36,18 +36,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/docs/source/de/testing.md b/docs/source/de/testing.md index 1d68c11c3ba0..100151e58c3d 100644 --- a/docs/source/de/testing.md +++ b/docs/source/de/testing.md @@ -185,16 +185,16 @@ pytest -k "test and ada" tests/test_optimization.py Manchmal müssen Sie `accelerate` Tests für Ihre Modelle ausführen. Dazu fügen Sie einfach `-m accelerate_tests` zu Ihrem Befehl hinzu, wenn Sie diese Tests bei einem `OPT`-Lauf ausführen möchten: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` -### Dokumentationstests ausführen +### Dokumentationstests ausführen -Um zu testen, ob die Dokumentationsbeispiele korrekt sind, sollten Sie überprüfen, ob die `doctests` erfolgreich sind. -Lassen Sie uns als Beispiel den docstring von [WhisperModel.forward](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035) verwenden: +Um zu testen, ob die Dokumentationsbeispiele korrekt sind, sollten Sie überprüfen, ob die `doctests` erfolgreich sind. +Lassen Sie uns als Beispiel den docstring von [WhisperModel.forward](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035) verwenden: -```python +```python r""" Returns: @@ -217,8 +217,8 @@ Example: ``` -Führen Sie einfach die folgende Zeile aus, um automatisch jedes docstring-Beispiel in der gewünschten Datei zu testen: -```bash +Führen Sie einfach die folgende Zeile aus, um automatisch jedes docstring-Beispiel in der gewünschten Datei zu testen: +```bash pytest --doctest-modules ``` Wenn die Datei eine Markdown-Erweiterung hat, sollten Sie das Argument `--doctest-glob="*.md"` hinzufügen. @@ -862,7 +862,7 @@ Code, der fehlerhaft ist, einen schlechten Zustand verursacht, der sich auf ande - Hier sehen Sie, wie Sie einen ganzen Test bedingungslos überspringen können: ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index be3001dc761a..94f5d8d19e6f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -627,6 +627,8 @@ title: RegNet - local: model_doc/resnet title: ResNet + - local: model_doc/rt_detr + title: RT-DETR - local: model_doc/segformer title: SegFormer - local: model_doc/seggpt @@ -774,6 +776,8 @@ title: Idefics2 - local: model_doc/instructblip title: InstructBLIP + - local: model_doc/instructblipvideo + title: InstructBlipVideo - local: model_doc/kosmos-2 title: KOSMOS-2 - local: model_doc/layoutlm @@ -790,6 +794,8 @@ title: Llava - local: model_doc/llava_next title: LLaVA-NeXT + - local: model_doc/llava-next-video + title: LLaVa-NeXT-Video - local: model_doc/lxmert title: LXMERT - local: model_doc/matcha diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 72237d138395..ac026067ac24 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -145,6 +145,7 @@ Flax), PyTorch, and/or TensorFlow. | [Funnel Transformer](model_doc/funnel) | ✅ | ✅ | ❌ | | [Fuyu](model_doc/fuyu) | ✅ | ❌ | ❌ | | [Gemma](model_doc/gemma) | ✅ | ❌ | ✅ | +| [Gemma2](model_doc/gemma2) | ✅ | ❌ | ❌ | | [GIT](model_doc/git) | ✅ | ❌ | ❌ | | [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ | | [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ | @@ -165,6 +166,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | +| [InstructBlipVideo](model_doc/instructblipvideo) | ✅ | ❌ | ❌ | | [Jamba](model_doc/jamba) | ✅ | ❌ | ❌ | | [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ | | [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ | @@ -181,6 +183,7 @@ Flax), PyTorch, and/or TensorFlow. | [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ | | [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ | | [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ | +| [LLaVa-NeXT-Video](model_doc/llava-next-video) | ✅ | ❌ | ❌ | | [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ | | [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ | | [LUKE](model_doc/luke) | ✅ | ❌ | ❌ | @@ -262,6 +265,8 @@ Flax), PyTorch, and/or TensorFlow. | [RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm) | ✅ | ✅ | ✅ | | [RoCBert](model_doc/roc_bert) | ✅ | ❌ | ❌ | | [RoFormer](model_doc/roformer) | ✅ | ✅ | ✅ | +| [RT-DETR](model_doc/rt_detr) | ✅ | ❌ | ❌ | +| [RT-DETR-ResNet](model_doc/rt_detr_resnet) | ✅ | ❌ | ❌ | | [RWKV](model_doc/rwkv) | ✅ | ❌ | ❌ | | [SAM](model_doc/sam) | ✅ | ✅ | ❌ | | [SeamlessM4T](model_doc/seamless_m4t) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md index fd105a3e82e1..1319f2e93c14 100644 --- a/docs/source/en/model_doc/gpt_neox.md +++ b/docs/source/en/model_doc/gpt_neox.md @@ -95,6 +95,68 @@ Below is an expected speedup diagram that compares pure inference time between t + +## Using Scaled Dot Product Attention (SDPA) +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +```python +from transformers import GPTNeoXForCausalLM +model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", torch_dtype=torch.float16, attn_implementation="sdpa") +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (rtx3080ti-16GB, PyTorch 2.2.1, OS Ubuntu 22.04) using `float16` with +[pythia-410m-deduped](https://huggingface.co/EleutherAI/pythia-410m-deduped), we saw the +following speedups during training and inference. 
+ +### Training +| Batch size | Seq len | Time per batch (Eager - s) | Time per batch (SDPA - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) | +|-----------:|-----------:|---------------------------:|-----------------------------:|------------:|--------------------:|-------------------:|------------------:| +| 1 | 128 | 0.024 | 0.019 | 28.945 | 1789.95 | 1789.95 | 0 | +| 1 | 256 | 0.039 | 0.031 | 23.18 | 1845.83 | 1844.84 | 0.053 | +| 1 | 512 | 0.08 | 0.055 | 45.524 | 2278.38 | 1953.76 | 16.615 | +| 1 | 1024 | 0.19 | 0.102 | 86.777 | 4772.36 | 2408.35 | 98.159 | +| 1 | 2048 | 0.565 | 0.204 | 177.098 | 13484.1 | 3882.01 | 247.348 | +| 2 | 128 | 0.037 | 0.032 | 15.121 | 1843.86 | 1844.78 | -0.05 | +| 2 | 256 | 0.067 | 0.055 | 21.706 | 1999.72 | 1951.67 | 2.462 | +| 2 | 512 | 0.144 | 0.096 | 50.046 | 3613.16 | 2406.77 | 50.125 | +| 2 | 1024 | 0.366 | 0.193 | 89.666 | 8707.55 | 3878.86 | 124.487 | +| 2 | 2048 | OOM | 0.379 | / | OOM | 6825.13 | SDPA does not OOM | +| 4 | 128 | 0.06 | 0.054 | 11.539 | 1947.6 | 1952.06 | -0.228 | +| 4 | 256 | 0.119 | 0.093 | 28.072 | 3008.39 | 2405.99 | 25.038 | +| 4 | 512 | 0.275 | 0.187 | 47.145 | 6290.58 | 3877.29 | 62.242 | +| 4 | 1024 | OOM | 0.36 | / | OOM | 6821.98 | SDPA does not OOM | +| 4 | 2048 | OOM | 0.731 | / | OOM | 12705.1 | SDPA does not OOM | + +### Inference +| Batch size | Seq len | Per token latency Eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem Eager (MB) | Mem SDPA (MB) | Mem saved (%) | +|--------------:|-------------:|--------------------------------:|-------------------------------:|---------------:|------------------:|----------------:|-----------------:| +| 1 | 128 | 6.569 | 5.858 | 12.14 | 974.831 | 974.826 | 0 | +| 1 | 256 | 7.009 | 5.863 | 19.542 | 1029.01 | 1028.08 | 0.09 | +| 1 | 512 | 7.157 | 5.965 | 19.983 | 1137.54 | 1137.52 | 0.001 | +| 1 | 1024 | 7.523 | 6.506 | 15.637 | 1329.3 | 1329.26 | 0.003 | +| 1 | 2048 | 9.271 | 9.205 | 0.713 | 1752.47 | 1734.51 | 1.036 | +| 2 | 128 | 7.239 | 5.959 | 21.493 | 1044.8 | 1028.37 | 1.597 | +| 2 | 256 | 7.228 | 6.036 | 19.757 | 1167.32 | 1137.73 | 2.601 | +| 2 | 512 | 7.538 | 6.693 | 12.628 | 1352.93 | 1329.55 | 1.758 | +| 2 | 1024 | 8.916 | 8.632 | 3.291 | 1752.56 | 1734.62 | 1.034 | +| 2 | 2048 | 12.628 | 12.606 | 0.181 | 2558.72 | 2545.8 | 0.508 | +| 4 | 128 | 7.278 | 6.046 | 20.373 | 1168.41 | 1137.79 | 2.691 | +| 4 | 256 | 7.614 | 6.588 | 15.574 | 1353.1 | 1329.79 | 1.753 | +| 4 | 512 | 8.798 | 8.144 | 8.028 | 1752.76 | 1734.85 | 1.032 | +| 4 | 1024 | 11.765 | 11.303 | 4.09 | 2558.96 | 2546.04 | 0.508 | +| 4 | 2048 | 19.568 | 17.735 | 10.33 | 4175.5 | 4165.26 | 0.246 | + + ## Resources - [Causal language modeling task guide](../tasks/language_modeling) diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index 1a693493fff1..b5fc634b6216 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -50,6 +50,7 @@ InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but impor [[autodoc]] InstructBlipProcessor + ## InstructBlipVisionModel [[autodoc]] InstructBlipVisionModel diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md new file mode 100644 index 000000000000..aa93feb6b6dc --- /dev/null +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -0,0 +1,74 @@ + + +# InstructBlipVideo + +## Overview + +## Overview + +The InstructBLIPVideo is an extension of the models proposed in 
[InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. +InstructBLIPVideo uses the same architecture as [InstructBLIP](instructblip) and works with the same checkpoints as [InstructBLIP](instructblip). The only difference is the ability to process videos. + +The abstract from the paper is the following: + +*General-purpose language models that can solve various language-domain tasks have emerged driven by the pre-training and instruction-tuning pipeline. However, building general-purpose vision-language models is challenging due to the increased task discrepancy introduced by the additional visual input. Although vision-language pre-training has been widely studied, vision-language instruction tuning remains relatively less explored. In this paper, we conduct a systematic and comprehensive study on vision-language instruction tuning based on the pre-trained BLIP-2 models. We gather a wide variety of 26 publicly available datasets, transform them into instruction tuning format and categorize them into two clusters for held-in instruction tuning and held-out zero-shot evaluation. Additionally, we introduce instruction-aware visual feature extraction, a crucial method that enables the model to extract informative features tailored to the given instruction. The resulting InstructBLIP models achieve state-of-the-art zero-shot performance across all 13 held-out datasets, substantially outperforming BLIP-2 and the larger Flamingo. Our models also lead to state-of-the-art performance when finetuned on individual downstream tasks (e.g., 90.7% accuracy on ScienceQA IMG). Furthermore, we qualitatively demonstrate the advantages of InstructBLIP over concurrent multimodal models.* + + + + InstructBLIPVideo architecture. Taken from the original paper. + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip). 
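Because InstructBlipVideo shares InstructBLIP's architecture and checkpoints, a minimal video question-answering sketch could look like the following. This example is not taken from the original documentation: the checkpoint id is the image-based InstructBLIP one (assumed to be loadable here given the shared weights noted above), and passing the stacked frames through the processor's `images` argument is an assumption modeled on the InstructBLIP API.

```python
# Minimal sketch (assumptions flagged in the comments): ask a question about a short video clip.
import av
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoProcessor

# Assumption: reuse an InstructBLIP checkpoint, per the note above that the video model shares its weights.
checkpoint = "Salesforce/instructblip-vicuna-7b"
processor = InstructBlipVideoProcessor.from_pretrained(checkpoint)
model = InstructBlipVideoForConditionalGeneration.from_pretrained(checkpoint)

# Sample 4 uniformly spaced frames, matching the 4-frame sampling mentioned in the usage tips below.
video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
container = av.open(video_path)
total_frames = container.streams.video[0].frames
wanted = set(np.linspace(0, total_frames - 1, num=4).astype(int).tolist())
frames = [
    frame.to_ndarray(format="rgb24")
    for i, frame in enumerate(container.decode(video=0))
    if i in wanted
]
clip = np.stack(frames)  # shape: (4, height, width, 3)

# Assumption: the processor accepts the frame stack via `images`, mirroring InstructBlipProcessor.
inputs = processor(images=clip, text="What is happening in this video?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0].strip())
```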
+ +## Usage tips + +- The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames per video at inference time. + +## InstructBlipVideoConfig + +[[autodoc]] InstructBlipVideoConfig + - from_vision_qformer_text_configs + +## InstructBlipVideoVisionConfig + +[[autodoc]] InstructBlipVideoVisionConfig + +## InstructBlipVideoQFormerConfig + +[[autodoc]] InstructBlipVideoQFormerConfig + +## InstructBlipVideoProcessor + +[[autodoc]] InstructBlipVideoProcessor + +## InstructBlipVideoImageProcessor + +[[autodoc]] InstructBlipVideoImageProcessor + - preprocess + +## InstructBlipVideoVisionModel + +[[autodoc]] InstructBlipVideoVisionModel + - forward + +## InstructBlipVideoQFormerModel + +[[autodoc]] InstructBlipVideoQFormerModel + - forward + +## InstructBlipVideoForConditionalGeneration + +[[autodoc]] InstructBlipVideoForConditionalGeneration + - forward + - generate \ No newline at end of file diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava-next-video.md new file mode 100644 index 000000000000..88e41efc29c8 --- /dev/null +++ b/docs/source/en/model_doc/llava-next-video.md @@ -0,0 +1,259 @@ + + +# LLaVa-NeXT-Video + +## Overview + +The LLaVa-NeXT-Video model was proposed in [LLaVA-NeXT: A Strong Zero-shot Video Understanding Model +](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/) by Yuanhan Zhang, Bo Li, Haotian Liu, Yong Jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, Chunyuan Li. LLaVa-NeXT-Video improves upon [LLaVa-NeXT](llava_next) by fine-tuning on a mix of video and image data, thus increasing the model's performance on videos. + +[LLaVA-NeXT](llava_next) surprisingly has strong performance in understanding video content in a zero-shot fashion with the AnyRes technique that it uses. The AnyRes technique naturally represents a high-resolution image as multiple images. This technique is naturally generalizable to represent videos because videos can be considered as a set of frames (similar to a set of images in LLaVa-NeXT). The current version of LLaVA-NeXT makes use of AnyRes and trains with supervised fine-tuning (SFT) on top of LLaVA-Next on video data to achieve better video understanding capabilities. The model is currently SOTA among open-source models on [VideoMME bench](https://arxiv.org/abs/2405.21075). + + +The introduction from the blog is the following: + +On January 30, 2024, we released LLaVA-NeXT, an open-source Large Multimodal Model (LMM) that has been trained exclusively on text-image data. With the proposed AnyRes technique, it boosts capabilities in reasoning, OCR, and world knowledge, demonstrating remarkable performance across a spectrum of image-based multimodal understanding tasks, and even exceeding Gemini-Pro on several image benchmarks, e.g. MMMU and MathVista. + +**In today’s exploration, we delve into the performance of LLaVA-NeXT within the realm of video understanding tasks. We reveal that LLaVA-NeXT surprisingly has strong performance in understanding video content. The current version of LLaVA-NeXT for videos has several improvements: + +- Zero-shot video representation capabilities with AnyRes: The AnyRes technique naturally represents a high-resolution image into multiple images that a pre-trained VIT is able to digest, and forms them into a concatenated sequence. This technique is naturally generalizable to represent videos (consisting of multiple frames), allowing the image-only-trained LLaVA-Next model to perform surprisingly well on video tasks. 
Notably, this is the first time that LMMs show strong zero-shot modality transfer ability. +- Inference with length generalization improves on longer videos. The linear scaling technique enables length generalization, allowing LLaVA-NeXT to effectively handle long-video beyond the limitation of the "max_token_length" of the LLM. +- Strong video understanding ability. (1) LLaVA-Next-Image, which combines the above two techniques, yields superior zero-shot performance than open-source LMMs tuned on videos. (2) LLaVA-Next-Video, further supervised fine-tuning (SFT) LLaVA-Next-Image on video data, achieves better video understanding capabilities compared to LLaVA-Next-Image. (3) LLaVA-Next-Video-DPO, which aligns the model response with AI feedback using direct preference optimization (DPO), showing significant performance boost. +- Efficient deployment and inference with SGLang. It allows 5x faster inference on video tasks, allowing more scalable serving such as million-level video re-captioning. See instructions in our repo.** + + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference). + +## Usage tips + +- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + +- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that. + +We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows: + +```python +from transformers import LlavaNextVideoProcessor + +processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") + +conversation = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."}, + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "What’s shown in this image?"}, + {"type": "image"}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": "This image shows a red stop sign."},] + }, + { + + "role": "user", + "content": [ + {"type": "text", "text": "Why is this video funny?"}, + {"type": "video"}, + ], + }, +] + +text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + +# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your visuals +print(text_prompt) +``` + +## Usage example + +### Single Media Mode + +The model can accept both images and videos as input. Here's an example code for inference in half-precision (`torch.float16`): + +```python +import av +import torch +import numpy as np +from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor + +def read_video_pyav(container, indices): + ''' + Decode the video with PyAV decoder. + Args: + container (`av.container.input.InputContainer`): PyAV container. + indices (`List[int]`): List of frame indices to decode. 
+ Returns: + result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ''' + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + +from huggingface_hub import hf_hub_download  # used below to fetch a sample clip + +# Load the model in half-precision +model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf", torch_dtype=torch.float16, device_map="auto") +processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") + +# Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos) +video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset") +container = av.open(video_path) +total_frames = container.streams.video[0].frames +indices = np.arange(0, total_frames, total_frames / 8).astype(int) +video = read_video_pyav(container, indices) + +conversation = [ + { + + "role": "user", + "content": [ + {"type": "text", "text": "Why is this video funny?"}, + {"type": "video"}, + ], + }, +] + +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +inputs = processor(text=prompt, videos=video, return_tensors="pt") + +out = model.generate(**inputs, max_new_tokens=60) +processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) +``` + + +### Mixed Media Mode + +The model can also generate from interleaved image-video inputs. Note, however, that it was not trained on interleaved image-video inputs, which might affect the performance. Below is an example of mixed media input; add the following lines to the above code snippet: + +```python +from PIL import Image +import requests + +# Generate from image and video mixed inputs +# Load an image and write a new prompt +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +conversation = [ + { + + "role": "user", + "content": [ + {"type": "text", "text": "How many cats are there in the image?"}, + {"type": "image"}, + ], + }, + { + + "role": "assistant", + "content": [{"type": "text", "text": "There are two cats"}], + }, + { + + "role": "user", + "content": [ + {"type": "text", "text": "Why is this video funny?"}, + {"type": "video"}, + ], + }, +] +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +inputs = processor(text=prompt, images=image, videos=video, padding=True, return_tensors="pt") + +# Generate +generate_ids = model.generate(**inputs, max_length=50) +processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + +``` + +## Model optimization + +### Quantization using Bitsandbytes for memory efficiency + +The model can be loaded in lower bits, significantly reducing memory burden while maintaining the performance of the original model. This allows for efficient deployment in resource-constrained settings. + +First make sure to install bitsandbytes by running `pip install bitsandbytes`, and make sure you have access to a CUDA-compatible GPU. 
Load the quantized model by simply adding [`BitsAndBytesConfig`](../main_classes/quantization#transformers.BitsAndBytesConfig) as shown below: + + +```python +import torch +from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor + +# specify how to quantize the model +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, +) + +model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf", quantization_config=quantization_config, device_map="auto") +``` + + +### Flash-Attention 2 to speed up generation + +Additionally, we can greatly speed up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. + +First, make sure to install the latest version of Flash Attention 2: + +```bash +pip install -U flash-attn --no-build-isolation +``` + +Also, you should have hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of the [flash attention repository](https://github.com/Dao-AILab/flash-attention). FlashAttention-2 can only be used when a model is loaded in `torch.float16` or `torch.bfloat16`. + +To load and run a model using FlashAttention-2, simply add `attn_implementation="flash_attention_2"` when loading the model as follows: + +```python +import torch +from transformers import LlavaNextVideoForConditionalGeneration + +model = LlavaNextVideoForConditionalGeneration.from_pretrained( + "llava-hf/LLaVA-NeXT-Video-7B-hf", + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", +).to(0) +``` + + + +## LlavaNextVideoConfig + +[[autodoc]] LlavaNextVideoConfig + +## LlavaNextVideoProcessor + +[[autodoc]] LlavaNextVideoProcessor + +## LlavaNextVideoImageProcessor + +[[autodoc]] LlavaNextVideoImageProcessor + +## LlavaNextVideoForConditionalGeneration + +[[autodoc]] LlavaNextVideoForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md new file mode 100644 index 000000000000..6075fbad5335 --- --- /dev/null +++ b/docs/source/en/model_doc/rt_detr.md @@ -0,0 +1,96 @@ + + +# RT-DETR + +## Overview + + +The RT-DETR model was proposed in [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) by Wenyu Lv, Yian Zhao, Shangliang Xu, Jinman Wei, Guanzhong Wang, Cheng Cui, Yuning Du, Qingqing Dang, Yi Liu. + +RT-DETR is an object detection model that stands for "Real-Time DEtection Transformer." This model is designed to perform object detection tasks with a focus on achieving real-time performance while maintaining high accuracy. Leveraging the transformer architecture, which has gained significant popularity in various fields of deep learning, RT-DETR processes images to identify and locate multiple objects within them. + +The abstract from the paper is the following: + +*Recently, end-to-end transformer-based detectors (DETRs) have achieved remarkable performance. However, the issue of the high computational cost of DETRs has not been effectively addressed, limiting their practical application and preventing them from fully exploiting the benefits of no post-processing, such as non-maximum suppression (NMS). In this paper, we first analyze the influence of NMS in modern real-time object detectors on inference speed, and establish an end-to-end speed benchmark. 
To avoid the inference delay caused by NMS, we propose a Real-Time DEtection TRansformer (RT-DETR), the first real-time end-to-end object detector to our best knowledge. Specifically, we design an efficient hybrid encoder to efficiently process multi-scale features by decoupling the intra-scale interaction and cross-scale fusion, and propose IoU-aware query selection to improve the initialization of object queries. In addition, our proposed detector supports flexibly adjustment of the inference speed by using different decoder layers without the need for retraining, which facilitates the practical application of real-time object detectors. Our RT-DETR-L achieves 53.0% AP on COCO val2017 and 114 FPS on T4 GPU, while RT-DETR-X achieves 54.8% AP and 74 FPS, outperforming all YOLO detectors of the same scale in both speed and accuracy. Furthermore, our RT-DETR-R50 achieves 53.1% AP and 108 FPS, outperforming DINO-Deformable-DETR-R50 by 2.2% AP in accuracy and by about 21 times in FPS.* + +The model version was contributed by [rafaelpadilla](https://huggingface.co/rafaelpadilla) and [sangbumchoi](https://github.com/SangbumChoi). The original code can be found [here](https://github.com/lyuwenyu/RT-DETR/). + + +## Usage tips + +Initially, an image is processed using a pre-trained convolutional neural network, specifically a Resnet-D variant as referenced in the original code. This network extracts features from the final three layers of the architecture. Following this, a hybrid encoder is employed to convert the multi-scale features into a sequential array of image features. Then, a decoder, equipped with auxiliary prediction heads is used to refine the object queries. This process facilitates the direct generation of bounding boxes, eliminating the need for any additional post-processing to acquire the logits and coordinates for the bounding boxes. + +```py +>>> import torch +>>> import requests + +>>> from PIL import Image +>>> from transformers import RTDetrForObjectDetection, RTDetrImageProcessor + +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") +>>> model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd") + +>>> inputs = image_processor(images=image, return_tensors="pt") + +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3) + +>>> for result in results: +... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): +... score, label = score.item(), label_id.item() +... box = [round(i, 2) for i in box.tolist()] +... 
print(f"{model.config.id2label[label]}: {score:.2f} {box}") +sofa: 0.97 [0.14, 0.38, 640.13, 476.21] +cat: 0.96 [343.38, 24.28, 640.14, 371.5] +cat: 0.96 [13.23, 54.18, 318.98, 472.22] +remote: 0.95 [40.11, 73.44, 175.96, 118.48] +remote: 0.92 [333.73, 76.58, 369.97, 186.99] +``` + +## RTDetrConfig + +[[autodoc]] RTDetrConfig + +## RTDetrResNetConfig + +[[autodoc]] RTDetrResNetConfig + +## RTDetrImageProcessor + +[[autodoc]] RTDetrImageProcessor + - preprocess + - post_process_object_detection + +## RTDetrModel + +[[autodoc]] RTDetrModel + - forward + +## RTDetrForObjectDetection + +[[autodoc]] RTDetrForObjectDetection + - forward + +## RTDetrResNetBackbone + +[[autodoc]] RTDetrResNetBackbone + - forward diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index fb443a3ce12c..add92a9440c2 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -55,6 +55,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) * [Llava](https://huggingface.co/docs/transformers/model_doc/llava) * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next) +* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava) * [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava) * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100) @@ -203,6 +204,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) +* [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel) * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel) * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel) * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) diff --git a/docs/source/en/tasks/mask_generation.md b/docs/source/en/tasks/mask_generation.md index e16b014f3757..82202f58bca6 100644 --- a/docs/source/en/tasks/mask_generation.md +++ b/docs/source/en/tasks/mask_generation.md @@ -124,6 +124,7 @@ the processor. ```python from transformers import SamModel, SamProcessor +import torch device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -147,7 +148,6 @@ masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), i We can visualize the three masks in the `masks` output. 
```python -import torch import matplotlib.pyplot as plt import numpy as np @@ -211,7 +211,7 @@ import matplotlib.patches as patches fig, ax = plt.subplots() ax.imshow(image) -rectangle = patches.Rectangle((2350, 1600, 500, 500, linewidth=2, edgecolor='r', facecolor='none') +rectangle = patches.Rectangle((2350, 1600), 500, 500, linewidth=2, edgecolor='r', facecolor='none') ax.add_patch(rectangle) ax.axis("off") plt.show() diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md index 4649059872aa..e26411f69e1e 100644 --- a/docs/source/en/testing.md +++ b/docs/source/en/testing.md @@ -184,16 +184,16 @@ pytest -k "test and ada" tests/test_optimization.py Sometimes you need to run `accelerate` tests on your models. For that you can just add `-m accelerate_tests` to your command, if let's say you want to run these tests on `OPT` run: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` -### Run documentation tests +### Run documentation tests -In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. -As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035): +In order to test whether the documentation examples are correct, you should check that the `doctests` are passing. +As an example, let's use [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035): -```python +```python r""" Returns: @@ -216,8 +216,8 @@ Example: ``` -Just run the following line to automatically test every docstring example in the desired file: -```bash +Just run the following line to automatically test every docstring example in the desired file: +```bash pytest --doctest-modules ``` If the file has a markdown extention, you should add the `--doctest-glob="*.md"` argument. 
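For instance, a doctest run for a single Python module and for a markdown documentation file might look like the following sketch (the Whisper paths are only illustrative; `--doctest-modules` and `--doctest-glob` are the pytest options already referenced above):

```bash
# run every docstring example in one Python module
pytest --doctest-modules src/transformers/models/whisper/modeling_whisper.py

# for a markdown file, additionally tell pytest to collect doctests from *.md files
pytest --doctest-modules --doctest-glob="*.md" docs/source/en/model_doc/whisper.md
```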
@@ -881,7 +881,7 @@ code that's buggy causes some bad state that will affect other tests, do not use - Here is how to skip whole test unconditionally: ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` diff --git a/docs/source/ja/testing.md b/docs/source/ja/testing.md index 00a51f13811b..8831d48a3bda 100644 --- a/docs/source/ja/testing.md +++ b/docs/source/ja/testing.md @@ -171,16 +171,16 @@ pytest -k "test and ada" tests/test_optimization.py 時々、モデルに対して `accelerate` テストを実行する必要があります。たとえば、`OPT` 実行に対してこれらのテストを実行したい場合、コマンドに `-m accelerate_tests` を追加するだけで済みます: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` -### Run documentation tests +### Run documentation tests ドキュメンテーションの例が正しいかどうかをテストするには、`doctests` が合格しているかを確認する必要があります。 例として、[`WhisperModel.forward` のドックストリング](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035)を使用しましょう。 -```python +```python r""" Returns: @@ -205,7 +205,7 @@ Example: 指定したファイル内のすべてのドックストリング例を自動的にテストするために、以下の行を実行してください: -```bash +```bash pytest --doctest-modules ``` @@ -809,7 +809,7 @@ with ExtendSysPath(f"{bindir}/.."): ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` @@ -1211,4 +1211,3 @@ cmd_that_may_fail || true - [Github Actions:](https://github.com/actions/toolkit/issues/399) - [CircleCI:](https://ideas.circleci.com/ideas/CCI-I-344) - diff --git a/docs/source/ko/testing.md b/docs/source/ko/testing.md index 390a1c19baac..fd3f548eeb81 100644 --- a/docs/source/ko/testing.md +++ b/docs/source/ko/testing.md @@ -26,19 +26,19 @@ rendered properly in your Markdown viewer. ## Transformers 테스트 방법[[how-transformers-are-tested]] -1. PR이 제출되면 9개의 CircleCi 작업으로 테스트가 진행됩니다. 해당 PR에 대해 새로운 커밋이 생성될 때마다 테스트는 다시 진행됩니다. 이 작업들은 - 이 [config 파일](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml)에 정의되어 있으므로 필요하다면 +1. PR이 제출되면 9개의 CircleCi 작업으로 테스트가 진행됩니다. 해당 PR에 대해 새로운 커밋이 생성될 때마다 테스트는 다시 진행됩니다. 이 작업들은 + 이 [config 파일](https://github.com/huggingface/transformers/tree/main/.circleci/config.yml)에 정의되어 있으므로 필요하다면 사용자의 로컬 환경에서 동일하게 재현해 볼 수 있습니다. 이 CI 작업은 `@slow` 테스트를 실행하지 않습니다. 2. [github actions](https://github.com/huggingface/transformers/actions)에 의해 실행되는 작업은 3개입니다: - - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): + - [torch hub integration](https://github.com/huggingface/transformers/tree/main/.github/workflows/github-torch-hub.yml): torch hub integration이 작동하는지 확인합니다. - - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): `main` 브랜치에서 커밋이 업데이트된 경우에만 GPU를 이용한 빠른 테스트를 실행합니다. - 이는 `src`, `tests`, `.github` 폴더 중 하나에 코드가 업데이트된 경우에만 실행됩니다. + - [self-hosted (push)](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-push.yml): `main` 브랜치에서 커밋이 업데이트된 경우에만 GPU를 이용한 빠른 테스트를 실행합니다. + 이는 `src`, `tests`, `.github` 폴더 중 하나에 코드가 업데이트된 경우에만 실행됩니다. 
(model card, notebook, 기타 등등을 추가한 경우 실행되지 않도록 하기 위해서입니다) - [self-hosted runner](https://github.com/huggingface/transformers/tree/main/.github/workflows/self-scheduled.yml): `tests` 및 `examples`에서 @@ -61,7 +61,7 @@ RUN_SLOW=1 pytest examples/ ### 실행할 테스트 선택[[choosing-which-tests-to-run]] -이 문서는 테스트를 실행하는 다양한 방법에 대해 자세히 설명합니다. +이 문서는 테스트를 실행하는 다양한 방법에 대해 자세히 설명합니다. 모든 내용을 읽은 후에도, 더 자세한 내용이 필요하다면 [여기](https://docs.pytest.org/en/latest/usage.html)에서 확인할 수 있습니다. 다음은 가장 유용한 테스트 실행 방법 몇 가지입니다. @@ -186,7 +186,7 @@ pytest -k "test and ada" tests/test_optimization.py 모델에서 `accelerate` 테스트를 실행해야 할 때가 있습니다. 이를 위해서는 명령어에 `-m accelerate_tests`를 추가하면 됩니다. 예를 들어, `OPT`에서 이러한 테스트를 실행하려면 다음과 같습니다: ```bash -RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py +RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py ``` ### 문서 테스트 실행[[run-documentation-tests]] @@ -194,7 +194,7 @@ RUN_SLOW=1 pytest -m accelerate_tests tests/models/opt/test_modeling_opt.py 예시 문서가 올바른지 테스트하려면 `doctests`가 통과하는지 확인해야 합니다. 예를 들어, [`WhisperModel.forward`'s docstring](https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py#L1017-L1035)를 사용해 봅시다: -```python +```python r""" Returns: @@ -218,7 +218,7 @@ Example: ``` 원하는 파일의 모든 docstring 예제를 자동으로 테스트하려면 다음 명령을 실행하면 됩니다: -```bash +```bash pytest --doctest-modules ``` 파일의 확장자가 markdown인 경우 `--doctest-glob="*.md"` 인수를 추가해야 합니다. @@ -240,9 +240,9 @@ pytest --picked ### 소스 수정 시 실패한 테스트 자동 재실행[[automatically-rerun-failed-tests-on-source-modification]] -[pytest-xdist](https://github.com/pytest-dev/pytest-xdist)는 모든 실패한 테스트를 감지하고, +[pytest-xdist](https://github.com/pytest-dev/pytest-xdist)는 모든 실패한 테스트를 감지하고, 파일을 수정한 후에 파일을 계속 재실행하여 테스트가 성공할 때까지 기다리는 매우 유용한 기능을 제공합니다. -따라서 수정한 내용을 확인한 후 pytest를 다시 시작할 필요가 없습니다. +따라서 수정한 내용을 확인한 후 pytest를 다시 시작할 필요가 없습니다. 모든 테스트가 통과될 때까지 이 과정을 반복한 후 다시 전체 실행이 이루어집니다. ```bash @@ -252,7 +252,7 @@ pip install pytest-xdist 재귀적 모드의 사용: `pytest -f` 또는 `pytest --looponfail` 파일의 변경 사항은 `looponfailroots` 루트 디렉터리와 해당 내용을 (재귀적으로) 확인하여 감지됩니다. -이 값의 기본값이 작동하지 않는 경우, +이 값의 기본값이 작동하지 않는 경우, `setup.cfg`의 설정 옵션을 변경하여 프로젝트에서 변경할 수 있습니다: ```ini @@ -275,7 +275,7 @@ looponfailroots = transformers tests ### 특정 테스트 모듈 건너뛰기[[skip-a-test-module]] -모든 테스트 모듈을 실행하되 특정 모듈을 제외하려면, 실행할 테스트 목록을 명시적으로 지정할 수 있습니다. +모든 테스트 모듈을 실행하되 특정 모듈을 제외하려면, 실행할 테스트 목록을 명시적으로 지정할 수 있습니다. 예를 들어, `test_modeling_*.py` 테스트를 제외한 모든 테스트를 실행하려면 다음을 사용할 수 있습니다: ```bash @@ -292,19 +292,19 @@ pytest --cache-clear tests ### 테스트를 병렬로 실행[[running-tests-in-parallel]] -이전에 언급한 것처럼 `make test`는 테스트를 병렬로 실행하기 위해 +이전에 언급한 것처럼 `make test`는 테스트를 병렬로 실행하기 위해 `pytest-xdist` 플러그인(`-n X` 인수, 예를 들어 `-n 2`를 사용하여 2개의 병렬 작업 실행)을 통해 실행됩니다. -`pytest-xdist`의 `--dist=` 옵션을 사용하여 테스트를 어떻게 그룹화할지 제어할 수 있습니다. +`pytest-xdist`의 `--dist=` 옵션을 사용하여 테스트를 어떻게 그룹화할지 제어할 수 있습니다. `--dist=loadfile`은 하나의 파일에 있는 테스트를 동일한 프로세스로 그룹화합니다. 실행된 테스트의 순서가 다르고 예측할 수 없기 때문에, `pytest-xdist`로 테스트 스위트를 실행하면 실패가 발생할 수 있습니다 (검출되지 않은 결합된 테스트가 있는 경우). -이 경우 [pytest-replay](https://github.com/ESSS/pytest-replay)를 사용하면 동일한 순서로 테스트를 다시 실행해서 +이 경우 [pytest-replay](https://github.com/ESSS/pytest-replay)를 사용하면 동일한 순서로 테스트를 다시 실행해서 실패하는 시퀀스를 최소화하는 데에 도움이 됩니다. ### 테스트 순서와 반복[[test-order-and-repetition]] -잠재적인 종속성 및 상태 관련 버그(tear down)를 감지하기 위해 +잠재적인 종속성 및 상태 관련 버그(tear down)를 감지하기 위해 테스트를 여러 번, 연속으로, 무작위로 또는 세트로 반복하는 것이 좋습니다. 그리고 직접적인 여러 번의 반복은 DL의 무작위성에 의해 발견되는 일부 문제를 감지하는 데에도 유용합니다. 
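다음은 이러한 반복 실행을 보여주는 최소한의 예시입니다(테스트 경로는 예시이며, PyPI의 `pytest-flakefinder` 플러그인과 아래 훅크에 언급된 `--flake-finder`/`--flake-runs` 옵션을 가정합니다):

```bash
pip install pytest-flakefinder

# 선택한 테스트를 연속으로 5번 실행하여 상태 의존적이거나 불안정한(flaky) 실패를 드러냅니다
pytest --flake-finder --flake-runs=5 tests/models/opt/test_modeling_opt.py
```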
@@ -341,10 +341,10 @@ pytest --flake-finder --flake-runs=5 tests/test_failing_test.py pip install pytest-random-order ``` -중요: `pytest-random-order`가 설치되면 테스트가 자동으로 임의의 순서로 섞입니다. +중요: `pytest-random-order`가 설치되면 테스트가 자동으로 임의의 순서로 섞입니다. 구성 변경이나 커맨드 라인 옵션이 필요하지 않습니다. -앞서 설명한 것처럼 이를 통해 한 테스트의 상태가 다른 테스트의 상태에 영향을 미치는 결합된 테스트를 감지할 수 있습니다. +앞서 설명한 것처럼 이를 통해 한 테스트의 상태가 다른 테스트의 상태에 영향을 미치는 결합된 테스트를 감지할 수 있습니다. `pytest-random-order`가 설치되면 해당 세션에서 사용된 랜덤 시드가 출력되며 예를 들어 다음과 같습니다: ```bash @@ -364,7 +364,7 @@ Using --random-order-seed=573663 ``` 정확히 동일한 테스트 목록(또는 목록이 없음)을 사용하는 경우에만 정확한 순서를 재현합니다. -목록을 수동으로 좁히기 시작하면 더 이상 시드에 의존할 수 없고 실패했던 정확한 순서로 수동으로 목록을 나열해야합니다. 그리고 `--random-order-bucket=none`을 사용하여 pytest에게 순서를 임의로 설정하지 않도록 알려야 합니다. +목록을 수동으로 좁히기 시작하면 더 이상 시드에 의존할 수 없고 실패했던 정확한 순서로 수동으로 목록을 나열해야합니다. 그리고 `--random-order-bucket=none`을 사용하여 pytest에게 순서를 임의로 설정하지 않도록 알려야 합니다. 예를 들어 다음과 같습니다: ```bash @@ -377,19 +377,19 @@ pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.p pytest --random-order-bucket=none ``` -기본적으로 `--random-order-bucket=module`이 내재되어 있으므로, 모듈 수준에서 파일을 섞습니다. +기본적으로 `--random-order-bucket=module`이 내재되어 있으므로, 모듈 수준에서 파일을 섞습니다. 또한 `class`, `package`, `global` 및 `none` 수준에서도 섞을 수 있습니다. 자세한 내용은 해당 [문서](https://github.com/jbasko/pytest-random-order)를 참조하세요. 또 다른 무작위화의 대안은 [`pytest-randomly`](https://github.com/pytest-dev/pytest-randomly)입니다. -이 모듈은 매우 유사한 기능/인터페이스를 가지고 있지만, `pytest-random-order`에 있는 버킷 모드를 사용할 수는 없습니다. +이 모듈은 매우 유사한 기능/인터페이스를 가지고 있지만, `pytest-random-order`에 있는 버킷 모드를 사용할 수는 없습니다. 설치 후에는 자동으로 적용되는 문제도 동일하게 가집니다. ### 외관과 느낌을 변경[[look-and-feel-variations] #### pytest-sugar 사용[[pytest-sugar]] -[pytest-sugar](https://github.com/Frozenball/pytest-sugar)는 테스트가 보여지는 형태를 개선하고, +[pytest-sugar](https://github.com/Frozenball/pytest-sugar)는 테스트가 보여지는 형태를 개선하고, 진행 상황 바를 추가하며, 실패한 테스트와 검증을 즉시 표시하는 플러그인입니다. 설치하면 자동으로 활성화됩니다. ```bash @@ -416,7 +416,7 @@ pytest --pspec tests/test_optimization.py #### 실패한 테스트 즉시 표시[[instantly-shows-failed-tests]] -[pytest-instafail](https://github.com/pytest-dev/pytest-instafail)은 테스트 세션의 끝까지 기다리지 않고 +[pytest-instafail](https://github.com/pytest-dev/pytest-instafail)은 테스트 세션의 끝까지 기다리지 않고 실패 및 오류를 즉시 표시합니다. ```bash @@ -435,7 +435,7 @@ GPU가 활성화된 환경에서, CPU 전용 모드로 테스트하려면 `CUDA_ CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py ``` -또는 다중 GPU가 있는 경우 `pytest`에서 사용할 GPU를 지정할 수도 있습니다. +또는 다중 GPU가 있는 경우 `pytest`에서 사용할 GPU를 지정할 수도 있습니다. 예를 들어, GPU `0` 및 `1`이 있는 경우 다음을 실행할 수 있습니다: ```bash @@ -444,7 +444,7 @@ CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py 이렇게 하면 다른 GPU에서 다른 작업을 실행하려는 경우 유용합니다. -일부 테스트는 반드시 CPU 전용으로 실행해야 하며, 일부는 CPU 또는 GPU 또는 TPU에서 실행해야 하고, 일부는 여러 GPU에서 실행해야 합니다. +일부 테스트는 반드시 CPU 전용으로 실행해야 하며, 일부는 CPU 또는 GPU 또는 TPU에서 실행해야 하고, 일부는 여러 GPU에서 실행해야 합니다. 다음 스킵 데코레이터는 테스트의 요구 사항을 CPU/GPU/TPU별로 설정하는 데 사용됩니다: - `require_torch` - 이 테스트는 torch에서만 실행됩니다. @@ -480,7 +480,7 @@ def test_example_with_multi_gpu(): def test_tf_thing_with_tensorflow(): ``` -이러한 데코레이터는 중첩될 수 있습니다. +이러한 데코레이터는 중첩될 수 있습니다. 예를 들어, 느린 테스트로 진행되고 pytorch에서 적어도 하나의 GPU가 필요한 경우 다음과 같이 설정할 수 있습니다: ```python no-style @@ -489,7 +489,7 @@ def test_tf_thing_with_tensorflow(): def test_example_slow_on_gpu(): ``` -`@parametrized`와 같은 일부 데코레이터는 테스트 이름을 다시 작성하기 때문에 `@require_*` 스킵 데코레이터는 올바르게 작동하려면 항상 맨 마지막에 나열되어야 합니다. +`@parametrized`와 같은 일부 데코레이터는 테스트 이름을 다시 작성하기 때문에 `@require_*` 스킵 데코레이터는 올바르게 작동하려면 항상 맨 마지막에 나열되어야 합니다. 
다음은 올바른 사용 예입니다: ```python no-style @@ -498,7 +498,7 @@ def test_example_slow_on_gpu(): def test_integration_foo(): ``` -`@pytest.mark.parametrize`에는 이러한 순서 문제는 없으므로 처음 혹은 마지막에 위치시킬 수 있고 이러한 경우에도 잘 작동할 것입니다. +`@pytest.mark.parametrize`에는 이러한 순서 문제는 없으므로 처음 혹은 마지막에 위치시킬 수 있고 이러한 경우에도 잘 작동할 것입니다. 하지만 unittest가 아닌 경우에만 작동합니다. 테스트 내부에서 다음을 사용할 수 있습니다: @@ -513,7 +513,7 @@ n_gpu = get_gpu_count() #torch와 tf와 함께 작동 ### 분산 훈련[[distributed-training]] -`pytest`는 분산 훈련을 직접적으로 다루지 못합니다. +`pytest`는 분산 훈련을 직접적으로 다루지 못합니다. 이를 시도하면 하위 프로세스가 올바른 작업을 수행하지 않고 `pytest`라고 생각하기에 테스트 스위트를 반복해서 실행하게 됩니다. 그러나 일반 프로세스를 생성한 다음 여러 워커를 생성하고 IO 파이프를 관리하도록 하면 동작합니다. @@ -532,7 +532,7 @@ CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py ### 출력 캡처[[output-capture]] -테스트 실행 중 `stdout` 및 `stderr`로 전송된 모든 출력이 캡처됩니다. +테스트 실행 중 `stdout` 및 `stderr`로 전송된 모든 출력이 캡처됩니다. 테스트나 설정 메소드가 실패하면 캡처된 출력은 일반적으로 실패 추적 정보와 함께 표시됩니다. 출력 캡처를 비활성화하고 `stdout` 및 `stderr`를 정상적으로 받으려면 `-s` 또는 `--capture=no`를 사용하세요: @@ -563,7 +563,7 @@ pytest --color=no tests/utils/test_logging.py pytest --pastebin=failed tests/utils/test_logging.py ``` -이렇게 하면 각 실패에 대한 URL을 제공하는 remote Paste service에 테스트 실행 정보를 제출합니다. +이렇게 하면 각 실패에 대한 URL을 제공하는 remote Paste service에 테스트 실행 정보를 제출합니다. 일반적인 테스트를 선택할 수도 있고 혹은 특정 실패만 보내려면 `-x`와 같이 추가할 수도 있습니다. 전체 테스트 세션 로그에 대한 URL을 생성합니다: @@ -574,17 +574,17 @@ pytest --pastebin=all tests/utils/test_logging.py ## 테스트 작성[[writing-tests]] -🤗 transformers 테스트는 대부분 `unittest`를 기반으로 하지만, +🤗 transformers 테스트는 대부분 `unittest`를 기반으로 하지만, `pytest`에서 실행되므로 대부분의 경우 두 시스템의 기능을 사용할 수 있습니다. -지원되는 기능에 대해 [여기](https://docs.pytest.org/en/stable/unittest.html)에서 확인할 수 있지만, +지원되는 기능에 대해 [여기](https://docs.pytest.org/en/stable/unittest.html)에서 확인할 수 있지만, 기억해야 할 중요한 점은 대부분의 `pytest` fixture가 작동하지 않는다는 것입니다. 파라미터화도 작동하지 않지만, 우리는 비슷한 방식으로 작동하는 `parameterized` 모듈을 사용합니다. ### 매개변수화[[parametrization]] -동일한 테스트를 다른 인수로 여러 번 실행해야 하는 경우가 종종 있습니다. +동일한 테스트를 다른 인수로 여러 번 실행해야 하는 경우가 종종 있습니다. 테스트 내에서 이 작업을 수행할 수 있지만, 그렇게 하면 하나의 인수 세트에 대해 테스트를 실행할 수 없습니다. ```python @@ -605,7 +605,7 @@ class TestMathUnitTest(unittest.TestCase): assert_equal(math.floor(input), expected) ``` -이제 기본적으로 이 테스트는 `test_floor`의 마지막 3개 인수가 +이제 기본적으로 이 테스트는 `test_floor`의 마지막 3개 인수가 매개변수 목록의 해당 인수에 할당되는 것으로 3번 실행될 것입니다. 그리고 `negative` 및 `integer` 매개변수 집합만 실행하려면 다음과 같이 실행할 수 있습니다: @@ -620,7 +620,7 @@ pytest -k "negative and integer" tests/test_mytest.py pytest -k "not negative" tests/test_mytest.py ``` -앞에서 언급한 `-k` 필터를 사용하는 것 외에도, +앞에서 언급한 `-k` 필터를 사용하는 것 외에도, 각 서브 테스트의 정확한 이름을 확인한 후에 일부 혹은 전체 서브 테스트를 실행할 수 있습니다. ```bash @@ -641,10 +641,10 @@ test_this1.py::TestMathUnitTest::test_floor_2_large_fraction pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer ``` -`transformers`의 개발자 종속성에 이미 있는 [parameterized](https://pypi.org/project/parameterized/) 모듈은 +`transformers`의 개발자 종속성에 이미 있는 [parameterized](https://pypi.org/project/parameterized/) 모듈은 `unittests`와 `pytest` 테스트 모두에서 작동합니다. -그러나 테스트가 `unittest`가 아닌 경우 `pytest.mark.parametrize`를 사용할 수 있습니다(이미 있는 일부 테스트에서 사용되는 경우도 있습니다. +그러나 테스트가 `unittest`가 아닌 경우 `pytest.mark.parametrize`를 사용할 수 있습니다(이미 있는 일부 테스트에서 사용되는 경우도 있습니다. 주로 `examples` 하위에 있습니다). 다음은 `pytest`의 `parametrize` 마커를 사용한 동일한 예입니다: @@ -666,8 +666,8 @@ def test_floor(name, input, expected): assert_equal(math.floor(input), expected) ``` -`parameterized`와 마찬가지로 `pytest.mark.parametrize`를 사용하면 -`-k` 필터가 작동하지 않는 경우에도 실행할 서브 테스트를 정확하게 지정할 수 있습니다. 
+`parameterized`와 마찬가지로 `pytest.mark.parametrize`를 사용하면 +`-k` 필터가 작동하지 않는 경우에도 실행할 서브 테스트를 정확하게 지정할 수 있습니다. 단, 이 매개변수화 함수는 서브 테스트의 이름 집합을 약간 다르게 생성합니다. 다음과 같은 모습입니다: ```bash @@ -694,7 +694,7 @@ pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[i ### 파일 및 디렉터리[[files-and-directories]] -테스트에서 종종 현재 테스트 파일과 관련된 상대적인 위치를 알아야 하는 경우가 있습니다. +테스트에서 종종 현재 테스트 파일과 관련된 상대적인 위치를 알아야 하는 경우가 있습니다. 테스트가 여러 디렉터리에서 호출되거나 깊이가 다른 하위 디렉터리에 있을 수 있기 때문에 그 위치를 아는 것은 간단하지 않습니다. `transformers.test_utils.TestCasePlus`라는 헬퍼 클래스는 모든 기본 경로를 처리하고 간단한 액세서를 제공하여 이 문제를 해결합니다: @@ -717,7 +717,7 @@ pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[i - `repo_root_dir_str` - `src_dir_str` -위의 내용을 사용하려면 테스트가 'transformers.test_utils.TestCasePlus'의 서브클래스에 있는지 확인해야 합니다. +위의 내용을 사용하려면 테스트가 'transformers.test_utils.TestCasePlus'의 서브클래스에 있는지 확인해야 합니다. 예를 들어 다음과 같습니다: ```python @@ -729,7 +729,7 @@ class PathExampleTest(TestCasePlus): data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" ``` -만약 `pathlib`를 통해 경로를 조작할 필요가 없거나 경로를 문자열로만 필요로 하는 경우에는 `pathlib` 객체에 `str()`을 호출하거나 `_str`로 끝나는 접근자를 사용할 수 있습니다. +만약 `pathlib`를 통해 경로를 조작할 필요가 없거나 경로를 문자열로만 필요로 하는 경우에는 `pathlib` 객체에 `str()`을 호출하거나 `_str`로 끝나는 접근자를 사용할 수 있습니다. 예를 들어 다음과 같습니다: ```python @@ -743,14 +743,14 @@ class PathExampleTest(TestCasePlus): ### 임시 파일 및 디렉터리[[temporary-files-and-directories]] -고유한 임시 파일 및 디렉터리를 사용하는 것은 병렬 테스트 실행에 있어 필수적입니다. -이렇게 함으로써 테스트들이 서로의 데이터를 덮어쓰지 않게 할 수 있습니다. 또한 우리는 생성된 테스트의 종료 단계에서 이러한 임시 파일 및 디렉터리를 제거하고 싶습니다. +고유한 임시 파일 및 디렉터리를 사용하는 것은 병렬 테스트 실행에 있어 필수적입니다. +이렇게 함으로써 테스트들이 서로의 데이터를 덮어쓰지 않게 할 수 있습니다. 또한 우리는 생성된 테스트의 종료 단계에서 이러한 임시 파일 및 디렉터리를 제거하고 싶습니다. 따라서 이러한 요구 사항을 충족시켜주는 `tempfile`과 같은 패키지를 사용하는 것이 중요합니다. -그러나 테스트를 디버깅할 때는 임시 파일이나 디렉터리에 들어가는 내용을 확인할 수 있어야 하며, +그러나 테스트를 디버깅할 때는 임시 파일이나 디렉터리에 들어가는 내용을 확인할 수 있어야 하며, 재실행되는 각 테스트마다 임시 파일이나 디렉터리의 경로에 대해 무작위 값이 아닌 정확한 값을 알고 싶을 것입니다. -`transformers.test_utils.TestCasePlus`라는 도우미 클래스는 이러한 목적에 가장 적합합니다. +`transformers.test_utils.TestCasePlus`라는 도우미 클래스는 이러한 목적에 가장 적합합니다. 이 클래스는 `unittest.TestCase`의 하위 클래스이므로, 우리는 이것을 테스트 모듈에서 쉽게 상속할 수 있습니다. 다음은 해당 클래스를 사용하는 예시입니다: @@ -773,7 +773,7 @@ def test_whatever(self): tmp_dir = self.get_auto_remove_tmp_dir() ``` -`tmp_dir`에는 생성된 임시 디렉터리의 경로가 포함됩니다. +`tmp_dir`에는 생성된 임시 디렉터리의 경로가 포함됩니다. 이는 테스트의 종료 단계에서 자동으로 제거됩니다. - 선택한 경로로 임시 디렉터리 생성 후에 테스트 시작 전에 비어 있는 상태인지 확인하고, 테스트 후에는 비우지 마세요. @@ -783,10 +783,10 @@ def test_whatever(self): tmp_dir = self.get_auto_remove_tmp_dir("./xxx") ``` -이것은 디버깅할 때 특정 디렉터리를 모니터링하고, +이것은 디버깅할 때 특정 디렉터리를 모니터링하고, 그 디렉터리에 이전에 실행된 테스트가 데이터를 남기지 않도록 하는 데에 유용합니다. -- `before` 및 `after` 인수를 직접 오버라이딩하여 기본 동작을 변경할 수 있으며 +- `before` 및 `after` 인수를 직접 오버라이딩하여 기본 동작을 변경할 수 있으며 다음 중 하나의 동작으로 이어집니다: - `before=True`: 테스트 시작 시 임시 디렉터리가 항상 지워집니다. @@ -804,7 +804,7 @@ def test_whatever(self): -각 테스트는 여러 개의 임시 디렉터리를 등록할 수 있으며, +각 테스트는 여러 개의 임시 디렉터리를 등록할 수 있으며, 별도로 요청하지 않는 한 모두 자동으로 제거됩니다. @@ -826,17 +826,17 @@ with ExtendSysPath(f"{bindir}/.."): ### 테스트 건너뛰기[[skipping-tests]] -이것은 버그가 발견되어 새로운 테스트가 작성되었지만 아직 그 버그가 수정되지 않은 경우에 유용합니다. +이것은 버그가 발견되어 새로운 테스트가 작성되었지만 아직 그 버그가 수정되지 않은 경우에 유용합니다. 이 테스트를 주 저장소에 커밋하려면 `make test` 중에 건너뛰도록 해야 합니다. 방법: -- **skip**은 테스트가 일부 조건이 충족될 경우에만 통과될 것으로 예상되고, 그렇지 않으면 pytest가 전체 테스트를 건너뛰어야 함을 의미합니다. -일반적인 예로는 Windows가 아닌 플랫폼에서 Windows 전용 테스트를 건너뛰거나 +- **skip**은 테스트가 일부 조건이 충족될 경우에만 통과될 것으로 예상되고, 그렇지 않으면 pytest가 전체 테스트를 건너뛰어야 함을 의미합니다. +일반적인 예로는 Windows가 아닌 플랫폼에서 Windows 전용 테스트를 건너뛰거나 외부 리소스(예를 들어 데이터베이스)에 의존하는 테스트를 건너뛰는 것이 있습니다. 
-- **xfail**은 테스트가 특정한 이유로 인해 실패할 것으로 예상하는 것을 의미합니다. -일반적인 예로는 아직 구현되지 않은 기능이나 아직 수정되지 않은 버그의 테스트가 있습니다. +- **xfail**은 테스트가 특정한 이유로 인해 실패할 것으로 예상하는 것을 의미합니다. +일반적인 예로는 아직 구현되지 않은 기능이나 아직 수정되지 않은 버그의 테스트가 있습니다. `xfail`로 표시된 테스트가 예상대로 실패하지 않고 통과된 경우, 이것은 xpass이며 테스트 결과 요약에 기록됩니다. 두 가지 중요한 차이점 중 하나는 `skip`은 테스트를 실행하지 않지만 `xfail`은 실행한다는 것입니다. @@ -847,7 +847,7 @@ with ExtendSysPath(f"{bindir}/.."): - 전체 테스트를 무조건 건너뛰려면 다음과 같이 할 수 있습니다: ```python no-style -@unittest.skip("this bug needs to be fixed") +@unittest.skip(reason="this bug needs to be fixed") def test_feature_x(): ``` @@ -920,7 +920,7 @@ class TestClass(): ### 느린 테스트[[slow-tests]] -테스트 라이브러리는 지속적으로 확장되고 있으며, 일부 테스트는 실행하는 데 몇 분이 걸립니다. +테스트 라이브러리는 지속적으로 확장되고 있으며, 일부 테스트는 실행하는 데 몇 분이 걸립니다. 그리고 우리에게는 테스트 스위트가 CI를 통해 완료되기까지 한 시간을 기다릴 여유가 없습니다. 따라서 필수 테스트를 위한 일부 예외를 제외하고 느린 테스트는 다음과 같이 표시해야 합니다. @@ -936,7 +936,7 @@ def test_integration_foo(): RUN_SLOW=1 pytest tests ``` -`@parameterized`와 같은 몇 가지 데코레이터는 테스트 이름을 다시 작성합니다. +`@parameterized`와 같은 몇 가지 데코레이터는 테스트 이름을 다시 작성합니다. 그러므로 `@slow`와 나머지 건너뛰기 데코레이터 `@require_*`가 올바르게 작동되려면 마지막에 나열되어야 합니다. 다음은 올바른 사용 예입니다. ```python no-style @@ -945,25 +945,25 @@ RUN_SLOW=1 pytest tests def test_integration_foo(): ``` -이 문서의 초반부에 설명된 것처럼 느린 테스트는 PR의 CI 확인이 아닌 예약된 일정 기반으로 실행됩니다. +이 문서의 초반부에 설명된 것처럼 느린 테스트는 PR의 CI 확인이 아닌 예약된 일정 기반으로 실행됩니다. 따라서 PR 제출 중에 일부 문제를 놓친 채로 병합될 수 있습니다. -이러한 문제들은 다음번의 예정된 CI 작업 중에 감지됩니다. +이러한 문제들은 다음번의 예정된 CI 작업 중에 감지됩니다. 하지만 PR을 제출하기 전에 자신의 컴퓨터에서 느린 테스트를 실행하는 것 또한 중요합니다. 느린 테스트로 표시해야 하는지 여부를 결정하는 대략적인 결정 기준은 다음과 같습니다. -만약 테스트가 라이브러리의 내부 구성 요소 중 하나에 집중되어 있다면(예: 모델링 파일, 토큰화 파일, 파이프라인), +만약 테스트가 라이브러리의 내부 구성 요소 중 하나에 집중되어 있다면(예: 모델링 파일, 토큰화 파일, 파이프라인), 해당 테스트를 느린 테스트 스위트에서 실행해야 합니다. -만약 라이브러리의 다른 측면(예: 문서 또는 예제)에 집중되어 있다면, +만약 라이브러리의 다른 측면(예: 문서 또는 예제)에 집중되어 있다면, 해당 테스트를 느린 테스트 스위트에서 실행해야 합니다. 그리고 이 접근 방식을 보완하기 위해 예외를 만들어야 합니다. -- 무거운 가중치 세트나 50MB보다 큰 데이터셋을 다운로드해야 하는 모든 테스트(예: 모델 통합 테스트, 토크나이저 통합 테스트, 파이프라인 통합 테스트)를 +- 무거운 가중치 세트나 50MB보다 큰 데이터셋을 다운로드해야 하는 모든 테스트(예: 모델 통합 테스트, 토크나이저 통합 테스트, 파이프라인 통합 테스트)를 느린 테스트로 설정해야 합니다. - 새로운 모델을 추가하는 경우 통합 테스트용으로 무작위 가중치로 작은 버전을 만들어 허브에 업로드해야 합니다. + 새로운 모델을 추가하는 경우 통합 테스트용으로 무작위 가중치로 작은 버전을 만들어 허브에 업로드해야 합니다. 이 내용은 아래 단락에서 설명됩니다. - 특별히 빠르게 실행되도록 최적화되지 않은 학습을 수행해야 하는 테스트는 느린 테스트로 설정해야 합니다. -- 느리지 않아야 할 테스트 중 일부가 극도로 느린 경우 - 예외를 도입하고 이를 `@slow`로 설정할 수 있습니다. +- 느리지 않아야 할 테스트 중 일부가 극도로 느린 경우 + 예외를 도입하고 이를 `@slow`로 설정할 수 있습니다. 대용량 파일을 디스크에 저장하고 불러오는 자동 모델링 테스트는 `@slow`으로 표시된 테스트의 좋은 예입니다. - CI에서 1초 이내에 테스트가 완료되는 경우(다운로드 포함)에는 느린 테스트가 아니어야 합니다. @@ -976,22 +976,22 @@ def test_integration_foo(): grep tiny tests examples ``` -다음은 작은 모델[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de)을 만든 -[script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) 예시입니다. +다음은 작은 모델[stas/tiny-wmt19-en-de](https://huggingface.co/stas/tiny-wmt19-en-de)을 만든 +[script](https://github.com/huggingface/transformers/tree/main/scripts/fsmt/fsmt-make-tiny-model.py) 예시입니다. 특정 모델의 아키텍처에 맞게 쉽게 조정할 수 있습니다. -예를 들어 대용량 모델을 다운로드하는 경우 런타임을 잘못 측정하기 쉽지만, -로컬에서 테스트하면 다운로드한 파일이 캐시되어 다운로드 시간이 측정되지 않습니다. +예를 들어 대용량 모델을 다운로드하는 경우 런타임을 잘못 측정하기 쉽지만, +로컬에서 테스트하면 다운로드한 파일이 캐시되어 다운로드 시간이 측정되지 않습니다. 대신 CI 로그의 실행 속도 보고서를 확인하세요(`pytest --durations=0 tests`의 출력). -이 보고서는 느린 이상값으로 표시되지 않거나 빠르게 다시 작성해야 하는 느린 이상값을 찾는 데도 유용합니다. +이 보고서는 느린 이상값으로 표시되지 않거나 빠르게 다시 작성해야 하는 느린 이상값을 찾는 데도 유용합니다. CI에서 테스트 스위트가 느려지기 시작하면 이 보고서의 맨 위 목록에 가장 느린 테스트가 표시됩니다. 
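예를 들어, 가장 느린 테스트를 로컬에서 확인하려면 다음과 같이 실행해 볼 수 있습니다(경로는 예시이며, `--durations=N`은 가장 느린 N개의 테스트를 출력하는 표준 pytest 옵션입니다):

```bash
# 가장 느린 10개의 테스트와 각 소요 시간을 출력합니다
pytest --durations=10 tests/models/bert
```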
### stdout/stderr 출력 테스트[[testing-the-stdout/stderr-output]] -`stdout` 및/또는 `stderr`로 쓰는 함수를 테스트하려면 `pytest`의 [capsys 시스템](https://docs.pytest.org/en/latest/capture.html)을 사용하여 해당 스트림에 액세스할 수 있습니다. +`stdout` 및/또는 `stderr`로 쓰는 함수를 테스트하려면 `pytest`의 [capsys 시스템](https://docs.pytest.org/en/latest/capture.html)을 사용하여 해당 스트림에 액세스할 수 있습니다. 다음과 같이 수행할 수 있습니다. ```python @@ -1019,7 +1019,7 @@ def test_result_and_stdout(capsys): assert msg in err ``` -그리고, 물론 대부분의 경우에는 `stderr`는 예외의 일부로 제공됩니다. +그리고, 물론 대부분의 경우에는 `stderr`는 예외의 일부로 제공됩니다. 그러므로 해당 경우에는 try/except를 사용해야 합니다. ```python @@ -1061,11 +1061,11 @@ def test_result_and_stdout(): ``` `stdout` 캡처에 관련된 중요한 문제 중 하나는 보통 `print`에서 이전에 인쇄된 내용을 재설정하는 `\r` 문자가 포함될 수 있다는 것입니다. -`pytest`에서는 문제가 없지만 `pytest -s`에서는 이러한 문자가 버퍼에 포함되므로 +`pytest`에서는 문제가 없지만 `pytest -s`에서는 이러한 문자가 버퍼에 포함되므로 `-s`가 있거나 없는 상태에서 태스트를 수행할 수 있으려면 캡처된 출력에 대해 추가적인 정리가 필요합니다. 이 경우에는 `re.sub(r'~.*\r', '', buf, 0, re.M)`을 사용할 수 있습니다. -하지만 도우미 컨텍스트 관리자 래퍼를 사용하면 +하지만 도우미 컨텍스트 관리자 래퍼를 사용하면 출력에 `\r`이 포함되어 있는지의 여부에 관계없이 모든 것을 자동으로 처리하므로 편리합니다. ```python @@ -1108,7 +1108,7 @@ with CaptureStd() as cs: print(cs.err, cs.out) ``` -또한, 테스트의 디버깅을 지원하기 위해 +또한, 테스트의 디버깅을 지원하기 위해 이러한 컨텍스트 관리자는 기본적으로 컨텍스트에서 종료할 때 캡처된 스트림을 자동으로 다시 실행합니다. @@ -1130,7 +1130,7 @@ assert cl.out, msg + "\n" ### 환경 변수를 이용하여 테스트[[testing-with-environment-variables]] -특정 테스트의 환경 변수 영향을 검증하려면 +특정 테스트의 환경 변수 영향을 검증하려면 `transformers.testing_utils.mockenv`라는 도우미 데코레이터를 사용할 수 있습니다. ```python @@ -1143,7 +1143,7 @@ class HfArgumentParserTest(unittest.TestCase): env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) ``` -일부 경우에는 외부 프로그램을 호출해야할 수도 있는데, 이 때에는 여러 개의 로컬 경로를 포함하는 `os.environ`에서 `PYTHONPATH`의 설정이 필요합니다. +일부 경우에는 외부 프로그램을 호출해야할 수도 있는데, 이 때에는 여러 개의 로컬 경로를 포함하는 `os.environ`에서 `PYTHONPATH`의 설정이 필요합니다. 헬퍼 클래스 `transformers.test_utils.TestCasePlus`가 도움이 됩니다: ```python @@ -1156,8 +1156,8 @@ class EnvExampleTest(TestCasePlus): # 이제 `env`를 사용하여 외부 프로그램 호출 ``` -테스트 파일이 `tests` 테스트 스위트 또는 `examples`에 있는지에 따라 -`env[PYTHONPATH]`가 두 디렉터리 중 하나를 포함하도록 설정되며, +테스트 파일이 `tests` 테스트 스위트 또는 `examples`에 있는지에 따라 +`env[PYTHONPATH]`가 두 디렉터리 중 하나를 포함하도록 설정되며, 현재 저장소에 대해 테스트가 수행되도록 `src` 디렉터리도 포함됩니다. 테스트 호출 이전에 설정된 경우에는 `env[PYTHONPATH]`를 그대로 사용합니다. @@ -1166,7 +1166,7 @@ class EnvExampleTest(TestCasePlus): ### 재현 가능한 결과 얻기[[getting-reproducible-results]] -일부 상황에서 테스트에서 임의성을 제거하여 동일하게 재현 가능한 결과를 얻고 싶을 수 있습니다. +일부 상황에서 테스트에서 임의성을 제거하여 동일하게 재현 가능한 결과를 얻고 싶을 수 있습니다. 이를 위해서는 다음과 같이 시드를 고정해야 합니다. ```python @@ -1207,11 +1207,11 @@ pytest tests/utils/test_logging.py -W error::UserWarning --pdb 셀프 푸시 워크플로우 CI 작업을 트리거하려면, 다음을 수행해야 합니다. 1. `transformers` 원본에서 새 브랜치를 만듭니다(포크가 아닙니다!). -2. 브랜치 이름은 `ci_` 또는 `ci-`로 시작해야 합니다(`main`도 트리거하지만 `main`에서는 PR을 할 수 없습니다). - 또한 특정 경로에 대해서만 트리거되므로 이 문서가 작성된 후에 변경된 내용은 +2. 브랜치 이름은 `ci_` 또는 `ci-`로 시작해야 합니다(`main`도 트리거하지만 `main`에서는 PR을 할 수 없습니다). + 또한 특정 경로에 대해서만 트리거되므로 이 문서가 작성된 후에 변경된 내용은 [여기](https://github.com/huggingface/transformers/blob/main/.github/workflows/self-push.yml)의 *push:*에서 확인할 수 있습니다. 3. 이 브랜치에서 PR을 생성합니다 -4. 그런 다음 [여기](https://github.com/huggingface/transformers/actions/workflows/self-push.yml)에서 작업이 나타나는지 확인할 수 있습니다. +4. 그런 다음 [여기](https://github.com/huggingface/transformers/actions/workflows/self-push.yml)에서 작업이 나타나는지 확인할 수 있습니다. 백로그가 있는 경우, 바로 실행되지 않을 수도 있습니다. @@ -1219,13 +1219,13 @@ pytest tests/utils/test_logging.py -W error::UserWarning --pdb ## 실험적인 CI 기능 테스트[[testing-Experimental-CI-Features]] -CI 기능을 테스트하는 것은 일반 CI 작동에 방해가 될 수 있기 때문에 잠재적으로 문제가 발생할 수 있습니다. 
+CI 기능을 테스트하는 것은 일반 CI 작동에 방해가 될 수 있기 때문에 잠재적으로 문제가 발생할 수 있습니다. 따라서 새로운 CI 기능을 추가하는 경우 다음과 같이 수행해야 합니다. 1. 테스트해야 할 내용을 테스트하는 새로운 전용 작업을 생성합니다. 2. 새로운 작업은 항상 성공해야만 녹색 ✓를 받을 수 있습니다(아래에 자세한 내용이 있습니다). -3. 다양한 PR 유형에 대한 확인을 위해 - (사용자 포크 브랜치, 포크되지 않은 브랜치, github.com UI 직접 파일 편집에서 생성된 브랜치, 강제 푸시 등 PR의 유형은 아주 다양합니다.) +3. 다양한 PR 유형에 대한 확인을 위해 + (사용자 포크 브랜치, 포크되지 않은 브랜치, github.com UI 직접 파일 편집에서 생성된 브랜치, 강제 푸시 등 PR의 유형은 아주 다양합니다.) 며칠 동안 실험 작업의 로그를 모니터링하면서 실행해봅니다. (의도적으로 항상 녹색을 표시하므로 작업 전체가 녹색은 아니라는 점에 유의합니다.) 4. 모든 것이 안정적인지 확인한 후, 새로운 변경 사항을 기존 작업에 병합합니다. @@ -1234,7 +1234,7 @@ CI 기능을 테스트하는 것은 일반 CI 작동에 방해가 될 수 있기 그러나 새로운 CI 기능이 개발 중인 동안, 항상 성공하도록 할 수 있는 방법은 무엇일까요? -TravisCI와 같은 일부 CI는 `ignore-step-failure`를 지원하며 전체 작업을 성공한 것으로 보고하지만, +TravisCI와 같은 일부 CI는 `ignore-step-failure`를 지원하며 전체 작업을 성공한 것으로 보고하지만, 현재 우리가 사용하는 CircleCI와 Github Actions는 이를 지원하지 않습니다. 따라서 다음과 같은 해결책을 사용할 수 있습니다. @@ -1264,12 +1264,12 @@ TravisCI와 같은 일부 CI는 `ignore-step-failure`를 지원하며 전체 작 cmd_that_may_fail || true ``` -결과에 만족한 후에는 물론, 실험적인 단계 또는 작업을 일반 작업의 나머지 부분과 통합하면서 -`set +euo pipefail` 또는 기타 추가한 요소를 제거하여 +결과에 만족한 후에는 물론, 실험적인 단계 또는 작업을 일반 작업의 나머지 부분과 통합하면서 +`set +euo pipefail` 또는 기타 추가한 요소를 제거하여 실험 작업이 일반 CI 작동에 방해되지 않도록 해야 합니다. -이 전반적인 과정은 실험 단계가 PR의 전반적인 상태에 영향을 주지 않고 실패하도록 -`allow-failure`와 같은 기능을 설정할 수 있다면 훨씬 더 쉬웠을 것입니다. +이 전반적인 과정은 실험 단계가 PR의 전반적인 상태에 영향을 주지 않고 실패하도록 +`allow-failure`와 같은 기능을 설정할 수 있다면 훨씬 더 쉬웠을 것입니다. 그러나 앞에서 언급한 바와 같이 CircleCI와 Github Actions는 현재 이러한 기능들 지원하지 않습니다. 이 기능의 지원을 위한 투표에 참여하고 CI 관련 스레드들에서 이러한 상황을 확인할 수도 있습니다. diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 5decef3656b9..ff05b78cb538 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -133,6 +133,10 @@ class DataTrainingArguments: ) }, ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) overwrite_cache: bool = field( default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} ) @@ -573,6 +577,7 @@ def preprocess_function(examples): raw_datasets = raw_datasets.map( preprocess_function, batched=True, + num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) diff --git a/README_de.md b/i18n/README_de.md similarity index 97% rename from README_de.md rename to i18n/README_de.md index b212371f02e2..f837501f3ca6 100644 --- a/README_de.md +++ b/i18n/README_de.md @@ -36,18 +36,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | Deutsch | - Tiếng Việt | + Tiếng Việt |

diff --git a/README_es.md b/i18n/README_es.md similarity index 97% rename from README_es.md rename to i18n/README_es.md index 95909a45f18a..c437e2ab6f78 100644 --- a/README_es.md +++ b/i18n/README_es.md @@ -31,18 +31,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | + 简体中文 | + 繁體中文 | + 한국어 | Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_fr.md b/i18n/README_fr.md similarity index 97% rename from README_fr.md rename to i18n/README_fr.md index d315cc5083eb..b3be5e44112c 100644 --- a/README_fr.md +++ b/i18n/README_fr.md @@ -36,18 +36,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | Français | - Deutsch | - Tiếng Việt | + Deutsch | + Tiếng Việt |

diff --git a/README_hd.md b/i18n/README_hd.md similarity index 98% rename from README_hd.md rename to i18n/README_hd.md index fd4e244baf14..b3f5daf6a28d 100644 --- a/README_hd.md +++ b/i18n/README_hd.md @@ -56,18 +56,18 @@ checkpoint: जाँच बिंदु

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_ja.md b/i18n/README_ja.md similarity index 97% rename from README_ja.md rename to i18n/README_ja.md index a824ae6445fe..f91b109138b6 100644 --- a/README_ja.md +++ b/i18n/README_ja.md @@ -66,18 +66,18 @@ user: ユーザ

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | + 简体中文 | + 繁體中文 | + 한국어 | + Español | 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_ko.md b/i18n/README_ko.md similarity index 97% rename from README_ko.md rename to i18n/README_ko.md index f6492ba96766..9b9661687812 100644 --- a/README_ko.md +++ b/i18n/README_ko.md @@ -31,18 +31,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | + 简体中文 | + 繁體中文 | 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_pt-br.md b/i18n/README_pt-br.md similarity index 97% rename from README_pt-br.md rename to i18n/README_pt-br.md index 6d74068bd40f..e5143e686d31 100644 --- a/README_pt-br.md +++ b/i18n/README_pt-br.md @@ -36,18 +36,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_ru.md b/i18n/README_ru.md similarity index 98% rename from README_ru.md rename to i18n/README_ru.md index bb9e18ef2fff..6261bd0aeada 100644 --- a/README_ru.md +++ b/i18n/README_ru.md @@ -36,18 +36,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_te.md b/i18n/README_te.md similarity index 98% rename from README_te.md rename to i18n/README_te.md index 28397038cf77..13fd0ade17cd 100644 --- a/README_te.md +++ b/i18n/README_te.md @@ -38,18 +38,18 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | తెలుగు | - Français | - Deutsch | - Tiếng Việt | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_vi.md b/i18n/README_vi.md similarity index 98% rename from README_vi.md rename to i18n/README_vi.md index 36e6dfb1cba7..4a48edf7eb0b 100644 --- a/README_vi.md +++ b/i18n/README_vi.md @@ -36,17 +36,17 @@ limitations under the License.

English | - 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | Tiếng việt |

diff --git a/README_zh-hans.md b/i18n/README_zh-hans.md similarity index 97% rename from README_zh-hans.md rename to i18n/README_zh-hans.md index b8c7a0129939..ef059dd0b0f0 100644 --- a/README_zh-hans.md +++ b/i18n/README_zh-hans.md @@ -57,17 +57,17 @@ checkpoint: 检查点

English | 简体中文 | - 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/README_zh-hant.md b/i18n/README_zh-hant.md similarity index 97% rename from README_zh-hant.md rename to i18n/README_zh-hant.md index 405801e67b1f..1dceb5cb9027 100644 --- a/README_zh-hant.md +++ b/i18n/README_zh-hant.md @@ -68,18 +68,18 @@ user: 使用者

English | - 简体中文 | + 简体中文 | 繁體中文 | - 한국어 | - Español | - 日本語 | - हिन्दी | - Русский | - Рortuguês | - తెలుగు | - Français | - Deutsch | - Tiếng Việt | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt |

diff --git a/setup.py b/setup.py index 4edffc724e92..f438dd8225a4 100644 --- a/setup.py +++ b/setup.py @@ -124,6 +124,7 @@ "jax>=0.4.1,<=0.4.13", "jaxlib>=0.4.1,<=0.4.13", "jieba", + "jinja2>=3.1.0", "kenlm", # Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support. "keras>2.9,<2.16", @@ -131,7 +132,7 @@ "librosa", "nltk", "natten>=0.14.6,<0.15.0", - "numpy>=1.17", + "numpy>=1.17,<2.0", "onnxconverter-common", "onnxruntime-tools>=1.4.2", "onnxruntime>=1.4.0", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4976a4a1b90e..7b39fd479edf 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -435,6 +435,7 @@ ], "models.fuyu": ["FuyuConfig"], "models.gemma": ["GemmaConfig"], + "models.gemma2": ["Gemma2Config"], "models.git": [ "GitConfig", "GitProcessor", @@ -473,6 +474,12 @@ "InstructBlipQFormerConfig", "InstructBlipVisionConfig", ], + "models.instructblipvideo": [ + "InstructBlipVideoConfig", + "InstructBlipVideoProcessor", + "InstructBlipVideoQFormerConfig", + "InstructBlipVideoVisionConfig", + ], "models.jamba": ["JambaConfig"], "models.jetmoe": ["JetMoeConfig"], "models.kosmos2": [ @@ -510,6 +517,10 @@ "LlavaNextConfig", "LlavaNextProcessor", ], + "models.llava_next_video": [ + "LlavaNextVideoConfig", + "LlavaNextVideoProcessor", + ], "models.longformer": [ "LongformerConfig", "LongformerTokenizer", @@ -654,6 +665,7 @@ "RoFormerConfig", "RoFormerTokenizer", ], + "models.rt_detr": ["RTDetrConfig", "RTDetrResNetConfig"], "models.rwkv": ["RwkvConfig"], "models.sam": [ "SamConfig", @@ -1136,10 +1148,12 @@ _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) + _import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) _import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"]) _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"]) _import_structure["models.llava_next"].append("LlavaNextImageProcessor") + _import_structure["models.llava_next_video"].append("LlavaNextVideoImageProcessor") _import_structure["models.mask2former"].append("Mask2FormerImageProcessor") _import_structure["models.maskformer"].extend(["MaskFormerFeatureExtractor", "MaskFormerImageProcessor"]) _import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"]) @@ -1153,6 +1167,7 @@ _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"]) _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) + _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) _import_structure["models.seggpt"].extend(["SegGptImageProcessor"]) @@ -2167,6 +2182,15 @@ "GemmaPreTrainedModel", ] ) + _import_structure["models.gemma2"].extend( + [ + "Gemma2ForCausalLM", + "Gemma2ForSequenceClassification", + "Gemma2ForTokenClassification", + "Gemma2Model", + "Gemma2PreTrainedModel", + ] + ) 
_import_structure["models.git"].extend( [ "GitForCausalLM", @@ -2316,6 +2340,14 @@ "InstructBlipVisionModel", ] ) + _import_structure["models.instructblipvideo"].extend( + [ + "InstructBlipVideoForConditionalGeneration", + "InstructBlipVideoPreTrainedModel", + "InstructBlipVideoQFormerModel", + "InstructBlipVideoVisionModel", + ] + ) _import_structure["models.jamba"].extend( [ "JambaForCausalLM", @@ -2415,6 +2447,12 @@ "LlavaNextPreTrainedModel", ] ) + _import_structure["models.llava_next_video"].extend( + [ + "LlavaNextVideoForConditionalGeneration", + "LlavaNextVideoPreTrainedModel", + ] + ) _import_structure["models.longformer"].extend( [ "LongformerForMaskedLM", @@ -3004,6 +3042,15 @@ "load_tf_weights_in_roformer", ] ) + _import_structure["models.rt_detr"].extend( + [ + "RTDetrForObjectDetection", + "RTDetrModel", + "RTDetrPreTrainedModel", + "RTDetrResNetBackbone", + "RTDetrResNetPreTrainedModel", + ] + ) _import_structure["models.rwkv"].extend( [ "RwkvForCausalLM", @@ -5025,6 +5072,7 @@ ) from .models.fuyu import FuyuConfig from .models.gemma import GemmaConfig + from .models.gemma2 import Gemma2Config from .models.git import ( GitConfig, GitProcessor, @@ -5068,6 +5116,12 @@ InstructBlipQFormerConfig, InstructBlipVisionConfig, ) + from .models.instructblipvideo import ( + InstructBlipVideoConfig, + InstructBlipVideoProcessor, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, + ) from .models.jamba import JambaConfig from .models.jetmoe import JetMoeConfig from .models.kosmos2 import ( @@ -5105,6 +5159,10 @@ LlavaNextConfig, LlavaNextProcessor, ) + from .models.llava_next_video import ( + LlavaNextVideoConfig, + LlavaNextVideoProcessor, + ) from .models.longformer import ( LongformerConfig, LongformerTokenizer, @@ -5270,6 +5328,10 @@ RoFormerConfig, RoFormerTokenizer, ) + from .models.rt_detr import ( + RTDetrConfig, + RTDetrResNetConfig, + ) from .models.rwkv import RwkvConfig from .models.sam import ( SamConfig, @@ -5757,6 +5819,7 @@ from .models.idefics import IdeficsImageProcessor from .models.idefics2 import Idefics2ImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor + from .models.instructblipvideo import InstructBlipVideoImageProcessor from .models.layoutlmv2 import ( LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor, @@ -5767,6 +5830,7 @@ ) from .models.levit import LevitFeatureExtractor, LevitImageProcessor from .models.llava_next import LlavaNextImageProcessor + from .models.llava_next_video import LlavaNextVideoImageProcessor from .models.mask2former import Mask2FormerImageProcessor from .models.maskformer import ( MaskFormerFeatureExtractor, @@ -5792,6 +5856,7 @@ PoolFormerImageProcessor, ) from .models.pvt import PvtImageProcessor + from .models.rt_detr import RTDetrImageProcessor from .models.sam import SamImageProcessor from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.seggpt import SegGptImageProcessor @@ -6640,6 +6705,13 @@ GemmaModel, GemmaPreTrainedModel, ) + from .models.gemma2 import ( + Gemma2ForCausalLM, + Gemma2ForSequenceClassification, + Gemma2ForTokenClassification, + Gemma2Model, + Gemma2PreTrainedModel, + ) from .models.git import ( GitForCausalLM, GitModel, @@ -6755,6 +6827,12 @@ InstructBlipQFormerModel, InstructBlipVisionModel, ) + from .models.instructblipvideo import ( + InstructBlipVideoForConditionalGeneration, + InstructBlipVideoPreTrainedModel, + InstructBlipVideoQFormerModel, + InstructBlipVideoVisionModel, + ) from .models.jamba import ( 
JambaForCausalLM, JambaForSequenceClassification, @@ -6830,6 +6908,10 @@ LlavaNextForConditionalGeneration, LlavaNextPreTrainedModel, ) + from .models.llava_next_video import ( + LlavaNextVideoForConditionalGeneration, + LlavaNextVideoPreTrainedModel, + ) from .models.longformer import ( LongformerForMaskedLM, LongformerForMultipleChoice, @@ -7295,6 +7377,13 @@ RoFormerPreTrainedModel, load_tf_weights_in_roformer, ) + from .models.rt_detr import ( + RTDetrForObjectDetection, + RTDetrModel, + RTDetrPreTrainedModel, + RTDetrResNetBackbone, + RTDetrResNetPreTrainedModel, + ) from .models.rwkv import ( RwkvForCausalLM, RwkvModel, diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 661df9bd24e7..3a867e8dc9bf 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -123,7 +123,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): ``` --- -Above example were using tools that might not exist for you. You only have acces to those Tools: +Above example were using tools that might not exist for you. You only have access to those Tools: <> Remember to make sure that variables you use are all defined. @@ -145,7 +145,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): "action_input": $INPUT } -Make sure to have the $INPUT as a dictionnary in the right format for the tool you are using, and do not put variable names as input if you can find the right values. +Make sure to have the $INPUT as a dictionary in the right format for the tool you are using, and do not put variable names as input if you can find the right values. You should ALWAYS use the following format: @@ -250,7 +250,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): } -Above example were using notional tools that might not exist for you. You only have acces to those tools: +Above example were using notional tools that might not exist for you. You only have access to those tools: <> Here are the rules you should always follow to solve your task: diff --git a/src/transformers/agents/python_interpreter.py b/src/transformers/agents/python_interpreter.py index 39814daa7f56..04f62a8acfb9 100644 --- a/src/transformers/agents/python_interpreter.py +++ b/src/transformers/agents/python_interpreter.py @@ -628,7 +628,7 @@ def evaluate_ast( Args: expression (`ast.AST`): - The code to evaluate, as an abastract syntax tree. + The code to evaluate, as an abstract syntax tree. state (`Dict[str, Any]`): A dictionary mapping variable names to values. The `state` is updated if need be when the evaluation encounters assignements. @@ -640,7 +640,7 @@ def evaluate_ast( Add more at your own risk! """ if isinstance(expression, ast.Assign): - # Assignement -> we evaluate the assignement which should update the state + # Assignement -> we evaluate the assignment which should update the state # We return the variable assigned as it may be used to determine the final result. 
return evaluate_assign(expression, state, tools) elif isinstance(expression, ast.AugAssign): diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index 4dc408bfa299..dc51cda1b76d 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -18,7 +18,7 @@ """ import warnings -from typing import Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy as np @@ -581,6 +581,213 @@ def spectrogram( return spectrogram +def spectrogram_batch( + waveform_list: List[np.ndarray], + window: np.ndarray, + frame_length: int, + hop_length: int, + fft_length: Optional[int] = None, + power: Optional[float] = 1.0, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + preemphasis: Optional[float] = None, + mel_filters: Optional[np.ndarray] = None, + mel_floor: float = 1e-10, + log_mel: Optional[str] = None, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, + remove_dc_offset: Optional[bool] = None, + dtype: np.dtype = np.float32, +) -> List[np.ndarray]: + """ + Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing. + This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting. + + It supports generating various types of spectrograms: + + - amplitude spectrogram (`power = 1.0`) + - power spectrogram (`power = 2.0`) + - complex-valued spectrogram (`power = None`) + - log spectrogram (use `log_mel` argument) + - mel spectrogram (provide `mel_filters`) + - log-mel spectrogram (provide `mel_filters` and `log_mel`) + + How this works: + + 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length + - hop_length` samples. + 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. + 3. The DFT is taken of each windowed frame. + 4. The results are stacked into a spectrogram. + + We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: + + - The analysis frame. This is the size of the time slices that the input waveform is split into. + - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. + - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. + + In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A + padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + typically the next power of two. + + Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`. + + Args: + waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`): + The list of input waveforms, each a single-channel (mono) signal. + window (`np.ndarray` of shape `(frame_length,)`): + The windowing function to apply, including zero-padding if necessary. + frame_length (`int`): + The length of each frame for analysis. + hop_length (`int`): + The step size between successive frames. + fft_length (`int`, *optional*): + The size of the FFT buffer, defining frequency bin resolution. + power (`float`, *optional*, defaults to 1.0): + Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex. 
+ center (`bool`, *optional*, defaults to `True`): + Whether to center-pad the waveform frames. + pad_mode (`str`, *optional*, defaults to `"reflect"`): + The padding strategy when `center` is `True`. + onesided (`bool`, *optional*, defaults to `True`): + If True, returns a one-sided spectrogram for real input signals. + preemphasis (`float`, *optional*): + Applies a pre-emphasis filter to each frame. + mel_filters (`np.ndarray`, *optional*): + Mel filter bank for converting to mel spectrogram. + mel_floor (`float`, *optional*, defaults to 1e-10): + Floor value for mel spectrogram to avoid log(0). + log_mel (`str`, *optional*): + Specifies log scaling strategy; options are None, "log", "log10", "dB". + reference (`float`, *optional*, defaults to 1.0): + Reference value for dB conversion in log_mel. + min_value (`float`, °optional*, defaults to 1e-10): + Minimum floor value for log scale conversions. + db_range (`float`, *optional*): + Dynamic range for dB scale spectrograms. + remove_dc_offset (`bool`, *optional*): + Whether to remove the DC offset from each frame. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + Data type of the output spectrogram. + + Returns: + List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform. + """ + window_length = len(window) + + if fft_length is None: + fft_length = frame_length + + if frame_length > fft_length: + raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") + + if window_length != frame_length: + raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") + + if hop_length <= 0: + raise ValueError("hop_length must be greater than zero") + + # Check the dimensions of the waveform + for waveform in waveform_list: + if waveform.ndim != 1: + raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") + + # Check if waveform is complex + for waveform in waveform_list: + if np.iscomplexobj(waveform): + raise ValueError("Complex-valued input waveforms are not currently supported") + + # Center pad the waveform + if center: + padding = [(int(frame_length // 2), int(frame_length // 2))] + waveform_list = [ + np.pad( + waveform, + padding, + mode=pad_mode, + ) + for waveform in waveform_list + ] + original_waveform_lengths = [ + len(waveform) for waveform in waveform_list + ] # these lengths will be used to remove padding later + + # Batch pad the waveform + max_length = max(original_waveform_lengths) + padded_waveform_batch = np.array( + [ + np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0) + for waveform in waveform_list + ], + dtype=dtype, + ) + + # Promote to float64, since np.fft uses float64 internally + padded_waveform_batch = padded_waveform_batch.astype(np.float64) + window = window.astype(np.float64) + + # Split waveform into frames of frame_length size + num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length)) + # these lengths will be used to remove padding later + true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths] + num_batches = padded_waveform_batch.shape[0] + + num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length + spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64) + + # rfft is faster than fft + fft_func = np.fft.rfft if onesided else np.fft.fft + buffer = np.zeros((num_batches, fft_length)) + + for 
frame_idx in range(num_frames): + timestep = frame_idx * hop_length + buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length] + + if remove_dc_offset: + buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True) + + if preemphasis is not None: + buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1] + buffer[:, 0] *= 1 - preemphasis + + buffer[:, :frame_length] *= window + + spectrogram[:, frame_idx] = fft_func(buffer) + + # Note: ** is much faster than np.power + if power is not None: + spectrogram = np.abs(spectrogram, dtype=np.float64) ** power + + # Apply mel filters if provided + if mel_filters is not None: + result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1])) + spectrogram = np.maximum(mel_floor, result) + + # Convert to log scale if specified + if power is not None and log_mel is not None: + if log_mel == "log": + spectrogram = np.log(spectrogram) + elif log_mel == "log10": + spectrogram = np.log10(spectrogram) + elif log_mel == "dB": + if power == 1.0: + spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range) + elif power == 2.0: + spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range) + else: + raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") + else: + raise ValueError(f"Unknown log_mel option: {log_mel}") + + spectrogram = np.asarray(spectrogram, dtype) + + spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))] + + return spectrogram_list + + def power_to_db( spectrogram: np.ndarray, reference: float = 1.0, @@ -632,6 +839,55 @@ def power_to_db( return spectrogram +def power_to_db_batch( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`, + using basic logarithm properties for numerical stability. + + This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram. + + Args: + spectrogram (`np.ndarray`): + The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). + Note that a power spectrogram has the amplitudes squared! + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. 
+ + Returns: + `np.ndarray`: the batch of spectrograms in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + # Apply db_range clipping per batch item + max_values = spectrogram.max(axis=(1, 2), keepdims=True) + spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) + + return spectrogram + + def amplitude_to_db( spectrogram: np.ndarray, reference: float = 1.0, @@ -681,6 +937,51 @@ def amplitude_to_db( return spectrogram +def amplitude_to_db_batch( + spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: Optional[float] = None +) -> np.ndarray: + """ + Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`, + using basic logarithm properties for numerical stability. + + The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram. + + Args: + spectrogram (`np.ndarray`): + The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-5`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the batch of spectrograms in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + # Apply db_range clipping per batch item + max_values = spectrogram.max(axis=(1, 2), keepdims=True) + spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) + + return spectrogram + + ### deprecated functions below this line ### @@ -773,7 +1074,7 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = frames (`np.array` of dimension `(num_frames, fft_window_size)`): A framed audio signal obtained using `audio_utils.fram_wav`. windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`: - A array reprensenting the function that will be used to reduces the amplitude of the discontinuities at the + A array representing the function that will be used to reduces the amplitude of the discontinuities at the boundaries of each frame when computing the STFT. 
Each frame will be multiplied by the windowing_function. For more information on the discontinuities, called *Spectral leakage*, refer to [this tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 04ba337ef436..b167cd1d1170 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -214,7 +214,7 @@ class QuantizedCacheConfig(CacheConfig): compute_dtype (`torch.dtype`, *optional*, defaults to `torch.float16`): The defualt dtype used for computations in the model. Keys and Values will be cast to this dtype after dequantization. device (`str`, *optional*, defaults to `"cpu"`): - Device on which to peform computations, should be same as the model's device. + Device on which to perform computations, should be same as the model's device. """ def __init__( @@ -395,21 +395,21 @@ def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTens cache.update(key_states, value_states, layer_idx) return cache - def crop(self, maximum_length: int): - """Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be - negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search.""" + def crop(self, max_length: int): + """Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be + negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search.""" # In case it is negative - if maximum_length < 0: - maximum_length = self.get_seq_length() - abs(maximum_length) + if max_length < 0: + max_length = self.get_seq_length() - abs(max_length) - if self.get_seq_length() <= maximum_length: + if self.get_seq_length() <= max_length: return - self._seen_tokens = maximum_length + self._seen_tokens = max_length for idx in range(len(self.key_cache)): - self.key_cache[idx] = self.key_cache[idx][..., :maximum_length, :] - self.value_cache[idx] = self.value_cache[idx][..., :maximum_length, :] + self.key_cache[idx] = self.key_cache[idx][..., :max_length, :] + self.value_cache[idx] = self.value_cache[idx][..., :max_length, :] def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]: """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by @@ -970,3 +970,125 @@ def get_max_length(self) -> Optional[int]: # in theory there is no limit because the sliding window size is fixed # no matter how long the sentence is return None + + +class HybridCache(Cache): + def __init__(self, config: PretrainedConfig, max_batch_size, max_cache_len, device="cpu", dtype=None) -> None: + if not hasattr(config, "sliding_window") or config.sliding_window is None: + raise ValueError( + "Setting `cache_implementation` to 'sliding_window' requires the model config supporting " + "sliding window attention, please check if there is a `sliding_window` field in the model " + "config and it's not set to None." 
+ ) + self.max_cache_len = max_cache_len + self.max_batch_size = max_batch_size + # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads + self.head_dim = ( + config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + ) + + self.dtype = dtype if dtype is not None else torch.float32 + self.num_key_value_heads = ( + config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads + ) + self.is_sliding = torch.tensor( + [i % 2 for i in range(config.num_hidden_layers)], dtype=torch.bool, device=device + ) + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + global_cache_shape = (max_batch_size, self.num_key_value_heads, max_cache_len, self.head_dim) + sliding_cache_shape = ( + max_batch_size, + self.num_key_value_heads, + min(config.sliding_window, max_cache_len), + self.head_dim, + ) + for i in range(config.num_hidden_layers): + # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph + # breaks when updating the cache. + cache_shape = global_cache_shape if not self.is_sliding[i] else sliding_cache_shape + new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.key_cache.append(new_layer_key_cache) + self.value_cache.append(new_layer_value_cache) + + def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + if cache_position.shape[0] > max_cache_len: + k_out = key_states[:, :, -max_cache_len:, :] + v_out = value_states[:, :, -max_cache_len:, :] + # Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly + self.key_cache[layer_idx] += k_out + self.value_cache[layer_idx] += v_out + # we should return the whole states instead of k_out, v_out to take the whole prompt + # into consideration when building kv cache instead of just throwing away tokens outside of the window + return key_states, value_states + + slicing = torch.ones(max_cache_len, dtype=torch.long, device=value_states.device).cumsum(0) + cache_position = cache_position.clamp(0, max_cache_len - 1) + to_shift = cache_position >= max_cache_len - 1 + indices = (slicing + to_shift[-1].int() - 1) % max_cache_len + k_out = k_out[:, :, indices] + v_out = v_out[:, :, indices] + + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment) + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + self.key_cache[layer_idx] += k_out + self.value_cache[layer_idx] += v_out + return k_out, v_out + + def _static_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + self.key_cache[layer_idx] = k_out + self.value_cache[layer_idx] = v_out + return k_out, v_out + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + sliding_window: Optional[int] = None, + ) -> Tuple[torch.Tensor]: + cache_position = cache_kwargs.get("cache_position") + self.key_cache[layer_idx] = 
self.key_cache[layer_idx].to(device=key_states.device) + self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device) + k_out = self.key_cache[layer_idx] + v_out = self.value_cache[layer_idx] + if sliding_window: + update_fn = self._sliding_update + else: + update_fn = self._static_update + + return update_fn( + cache_position, + layer_idx, + key_states, + value_states, + k_out, + v_out, + k_out.shape[2], + ) + + def get_max_length(self) -> Optional[int]: + # in theory there is no limit because the sliding window size is fixed + # no matter how long the sentence is + return self.max_cache_len + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + return None + + def reset(self): + """Resets the cache values while preserving the objects""" + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 1f7fcf12f9f9..987646301196 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -622,6 +622,17 @@ def decoder(self, replacement, add_prefix_space): def converted(self) -> Tokenizer: tokenizer = self.tokenizer(self.proto) + # Add user defined symbols (type == 4) from sentnecepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33) + user_defined_symbols = [ + AddedToken(token, normalized=False, special=False) + for token in [p.piece for p in self.proto.pieces if p.type == 4] + ] + control_symbols = [ + AddedToken(token, normalized=False, special=True) for token in self.proto.trainer_spec.control_symbols + ] + + tokenizer.add_tokens(user_defined_symbols + control_symbols) + # Tokenizer assemble normalizer = self.normalizer(self.proto) if normalizer is not None: @@ -1330,10 +1341,6 @@ def tokenizer(self, proto): raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" ) - user_defined_symbols = [ - AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols - ] - tokenizer.add_tokens(user_defined_symbols) return tokenizer diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 3148d0f3393c..7f8c3e4433fe 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -31,13 +31,14 @@ "jax": "jax>=0.4.1,<=0.4.13", "jaxlib": "jaxlib>=0.4.1,<=0.4.13", "jieba": "jieba", + "jinja2": "jinja2>=3.1.0", "kenlm": "kenlm", "keras": "keras>2.9,<2.16", "keras-nlp": "keras-nlp>=0.3.1", "librosa": "librosa", "nltk": "nltk", "natten": "natten>=0.14.6,<0.15.0", - "numpy": "numpy>=1.17", + "numpy": "numpy>=1.17,<2.0", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", "onnxruntime": "onnxruntime>=1.4.0", diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index ccbbb412bd1c..e735d0a2ca7f 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -111,24 +111,11 @@ def __init__( # Prepare the kwargs for the assistant model assistant_kwargs = {} for key, value in model_kwargs.items(): # deepcopy crashes if we attempt to copy encoder outputs with grads 
- if key not in ("encoder_outputs", "assistant_encoder_outputs"): + if key not in ("encoder_outputs", "assistant_encoder_outputs", "past_key_values"): assistant_kwargs[key] = ( value.detach().to(device) if isinstance(value, torch.Tensor) else copy.deepcopy(value) ) - # Remove potential default DynamicCache if assistant does not support it - if "past_key_values" in assistant_kwargs.keys(): - if ( - isinstance(assistant_kwargs["past_key_values"], DynamicCache) - and not self.assistant_model._supports_cache_class - ): - # Cache is empty -> remove it from kwargs - if len(assistant_kwargs["past_key_values"]) == 0: - del assistant_kwargs["past_key_values"] - # Cache is not empty -> convert to legacy - else: - assistant_kwargs["past_key_values"] = assistant_kwargs["past_key_values"].to_legacy_cache() - if "assistant_encoder_outputs" in model_kwargs: assistant_kwargs["encoder_outputs"] = model_kwargs["assistant_encoder_outputs"] elif assistant_model.config.is_encoder_decoder: @@ -363,15 +350,15 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F return -def _crop_past_key_values(model, past_key_values, maximum_length): +def _crop_past_key_values(model, past_key_values, max_length): """Crops the past key values up to a certain maximum length.""" new_past = [] if model.config.is_encoder_decoder: for idx in range(len(past_key_values)): new_past.append( ( - past_key_values[idx][0][:, :, :maximum_length, :], - past_key_values[idx][1][:, :, :maximum_length, :], + past_key_values[idx][0][:, :, :max_length, :], + past_key_values[idx][1][:, :, :max_length, :], past_key_values[idx][2], past_key_values[idx][3], ) @@ -384,8 +371,8 @@ def _crop_past_key_values(model, past_key_values, maximum_length): for idx in range(len(past_key_values)): new_past.append( ( - past_key_values[idx][0][:, :, :maximum_length], - past_key_values[idx][1][:, :maximum_length, :], + past_key_values[idx][0][:, :, :max_length], + past_key_values[idx][1][:, :max_length, :], ) ) past_key_values = tuple(new_past) @@ -395,19 +382,19 @@ def _crop_past_key_values(model, past_key_values, maximum_length): ): if model.config.multi_query: for idx in range(len(past_key_values)): - past_key_values[idx] = past_key_values[idx][:, :maximum_length, :] + past_key_values[idx] = past_key_values[idx][:, :max_length, :] else: for idx in range(len(past_key_values)): - past_key_values[idx] = past_key_values[idx][:, :, :maximum_length, :] + past_key_values[idx] = past_key_values[idx][:, :, :max_length, :] elif isinstance(past_key_values, DynamicCache): - past_key_values.crop(maximum_length) + past_key_values.crop(max_length) elif past_key_values is not None: for idx in range(len(past_key_values)): new_past.append( ( - past_key_values[idx][0][:, :, :maximum_length, :], - past_key_values[idx][1][:, :, :maximum_length, :], + past_key_values[idx][0][:, :, :max_length, :], + past_key_values[idx][1][:, :, :max_length, :], ) ) past_key_values = tuple(new_past) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 8bb5e091d6db..8ba17a6a350f 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -400,7 +400,7 @@ def __init__(self, **kwargs): # Cache implementation self.cache_implementation = kwargs.pop("cache_implementation", None) self.cache_config = kwargs.pop("cache_config", None) - if self.cache_implementation is not None: + if self.cache_implementation is not None and self.cache_implementation in 
NEEDS_CACHE_CONFIG: cache_config_class = NEEDS_CACHE_CONFIG[self.cache_implementation] if self.cache_config is None: self.cache_config = cache_config_class() diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py index 84b5a38d5de4..9b2ab5fb1afa 100644 --- a/src/transformers/generation/flax_logits_process.py +++ b/src/transformers/generation/flax_logits_process.py @@ -476,7 +476,7 @@ def __init__(self, ngram_size: int): def get_previous_ngrams(self, input_ids: jnp.ndarray, vocab_size: int, cur_len: int): """ get a matrix of size (batch_size,) + (vocab_size,)*n (for n-grams) that - represent the n-grams that occured previously. + represent the n-grams that occurred previously. The BCOO representation allow to store only the few non-zero entries, instead of the full (huge) matrix """ batch_size, seq_len = input_ids.shape diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index dd1719294e8f..231427856860 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -28,6 +28,7 @@ Cache, DynamicCache, HQQQuantizedCache, + HybridCache, QuantizedCacheConfig, QuantoQuantizedCache, SlidingWindowCache, @@ -112,7 +113,7 @@ if is_accelerate_available(): from accelerate.hooks import AlignDevicesHook, add_hook_to_module -NEED_SETUP_CACHE_CLASSES_MAPPING = {"static": StaticCache, "sliding_window": SlidingWindowCache} +NEED_SETUP_CACHE_CLASSES_MAPPING = {"static": StaticCache, "sliding_window": SlidingWindowCache, "hybrid": HybridCache} QUANT_BACKEND_CLASSES_MAPPING = {"quanto": QuantoQuantizedCache, "HQQ": HQQQuantizedCache} @@ -1395,10 +1396,10 @@ def _get_initial_cache_position(self, input_ids, model_kwargs): past_length = 0 if "past_key_values" in model_kwargs: - if isinstance(model_kwargs["past_key_values"], Cache): - past_length = model_kwargs["past_key_values"].get_seq_length() - else: - past_length = model_kwargs["past_key_values"][0][0].shape[2] + cache = model_kwargs["past_key_values"] + if not isinstance(cache, Cache): + past_length = cache[0][0].shape[2] + if "inputs_embeds" in model_kwargs: cur_len = model_kwargs["inputs_embeds"].shape[1] else: @@ -1739,7 +1740,9 @@ def generate( "issue: https://github.com/huggingface/transformers/issues/28981" ) model_kwargs["past_key_values"] = self._get_cache( - generation_config.cache_implementation, batch_size, generation_config.max_length + generation_config.cache_implementation, + getattr(generation_config, "num_beams", 1) * batch_size, + generation_config.max_length, ) elif generation_config.cache_implementation == "quantized": if not self._supports_quantized_cache: @@ -3697,11 +3700,10 @@ def _assisted_decoding( model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) # This is needed if return_dict_in_generate is True + start_from_empty_dynamic_cache = False if isinstance(model_kwargs.get("past_key_values", None), DynamicCache): if len(model_kwargs["past_key_values"]) == 0: start_from_empty_dynamic_cache = True - else: - start_from_empty_dynamic_cache = False this_peer_finished = False while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 4b263446b54e..0279f26a963e 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -151,6 +151,11 @@ def center_crop( **kwargs, ) + def to_dict(self): + encoder_dict = 
super().to_dict() + encoder_dict.pop("_valid_processor_keys", None) + return encoder_dict + VALID_SIZE_DICT_KEYS = ( {"height", "width"}, diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index daeee3e1bd5b..d1a08132d73d 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -61,3 +61,8 @@ def _validate_params(self, **kwargs) -> None: def get_transforms(self, **kwargs) -> "Compose": self._validate_params(**kwargs) return self._build_transforms(**kwargs) + + def to_dict(self): + encoder_dict = super().to_dict() + encoder_dict.pop("_transform_params", None) + return encoder_dict diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index be953ef08b7c..5c2d72c345ec 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -632,7 +632,27 @@ def decoder(self, replacement, add_prefix_space): return decoders.Sequence(sequence) def converted(self): - tokenizer = super().converted() + # Copied partly from converted method in SpmConverter class + tokenizer = self.tokenizer(self.proto) + + # Tokenizer assemble + normalizer = self.normalizer(self.proto) + if normalizer is not None: + tokenizer.normalizer = normalizer + + replacement = "▁" + add_prefix_space = True + if hasattr(self.original_tokenizer, "add_prefix_space"): + add_prefix_space = self.original_tokenizer.add_prefix_space + + pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space) + if pre_tokenizer is not None: + tokenizer.pre_tokenizer = pre_tokenizer + + tokenizer.decoder = self.decoder(replacement, add_prefix_space) + post_processor = self.post_processor() + if post_processor: + tokenizer.post_processor = post_processor # HACK: patch the llama-3 tokenizer to use the correspinding pre-tokenizer # and normalizer diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index f7b0db6d77f8..c991c1c95ba2 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2958,6 +2958,8 @@ def from_pretrained( using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how the model was trained. Since it could be trained in one of half precision dtypes, but saved in fp32. + 3. A string that is a valid `torch.dtype`. E.g. "float32" loads the model in `torch.float32`, "float16" loads in `torch.float16` etc. 
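To illustrate the string handling added to `torch_dtype` in `from_pretrained` above, a short sketch; the checkpoint name is only a convenient example and any model would do:

```python
import torch
from transformers import AutoModelForCausalLM

# A plain string naming a valid `torch.dtype` is now accepted...
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="float16")
assert model.dtype == torch.float16

# ...and is equivalent to the previously supported forms.
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype="auto")
```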
+ For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or @@ -3661,9 +3663,11 @@ def from_pretrained( "Since the `torch_dtype` attribute can't be found in model's config object, " "will use torch_dtype={torch_dtype} as derived from model's weights" ) + elif hasattr(torch, torch_dtype): + torch_dtype = getattr(torch, torch_dtype) else: raise ValueError( - f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}' + f'`torch_dtype` can be one of: `torch.dtype`, `"auto"` or a string of a valid `torch.dtype`, but received {torch_dtype}' ) dtype_orig = cls._set_default_torch_dtype(torch_dtype) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 24b602f18c8f..f4c334914728 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -92,6 +92,7 @@ funnel, fuyu, gemma, + gemma2, git, glpn, gpt2, @@ -111,6 +112,7 @@ imagegpt, informer, instructblip, + instructblipvideo, jamba, jetmoe, kosmos2, @@ -124,6 +126,7 @@ llama, llava, llava_next, + llava_next_video, longformer, longt5, luke, @@ -193,6 +196,7 @@ roberta_prelayernorm, roc_bert, roformer, + rt_detr, rwkv, sam, seamless_m4t, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 40e282166ef9..7f52b3dc280a 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -108,6 +108,7 @@ ("funnel", "FunnelConfig"), ("fuyu", "FuyuConfig"), ("gemma", "GemmaConfig"), + ("gemma2", "Gemma2Config"), ("git", "GitConfig"), ("glpn", "GLPNConfig"), ("gpt-sw3", "GPT2Config"), @@ -128,6 +129,7 @@ ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), ("instructblip", "InstructBlipConfig"), + ("instructblipvideo", "InstructBlipVideoConfig"), ("jamba", "JambaConfig"), ("jetmoe", "JetMoeConfig"), ("jukebox", "JukeboxConfig"), @@ -140,6 +142,7 @@ ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), + ("llava-next-video", "LlavaNextVideoConfig"), ("llava_next", "LlavaNextConfig"), ("longformer", "LongformerConfig"), ("longt5", "LongT5Config"), @@ -214,6 +217,8 @@ ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), ("roc_bert", "RoCBertConfig"), ("roformer", "RoFormerConfig"), + ("rt_detr", "RTDetrConfig"), + ("rt_detr_resnet", "RTDetrResNetConfig"), ("rwkv", "RwkvConfig"), ("sam", "SamConfig"), ("seamless_m4t", "SeamlessM4TConfig"), @@ -381,6 +386,7 @@ ("funnel", "Funnel Transformer"), ("fuyu", "Fuyu"), ("gemma", "Gemma"), + ("gemma2", "Gemma2"), ("git", "GIT"), ("glpn", "GLPN"), ("gpt-sw3", "GPT-Sw3"), @@ -402,6 +408,7 @@ ("imagegpt", "ImageGPT"), ("informer", "Informer"), ("instructblip", "InstructBLIP"), + ("instructblipvideo", "InstructBlipVideo"), ("jamba", "Jamba"), ("jetmoe", "JetMoe"), ("jukebox", "Jukebox"), @@ -417,6 +424,7 @@ ("llama2", "Llama2"), ("llama3", "Llama3"), ("llava", "LLaVa"), + ("llava-next-video", "LLaVa-NeXT-Video"), ("llava_next", "LLaVA-NeXT"), ("longformer", "Longformer"), ("longt5", "LongT5"), @@ -499,6 +507,8 @@ ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), ("roc_bert", "RoCBert"), ("roformer", "RoFormer"), + ("rt_detr", "RT-DETR"), + ("rt_detr_resnet", "RT-DETR-ResNet"), ("rwkv", "RWKV"), ("sam", "SAM"), ("seamless_m4t", "SeamlessM4T"), @@ -623,6 +633,7 @@ ("clip_vision_model", "clip"), ("siglip_vision_model", "siglip"), ("chinese_clip_vision_model", "chinese_clip"), + ("rt_detr_resnet", "rt_detr"), ] ) diff --git 
a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 055f2ca733ce..efc2d4d998cc 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -89,11 +89,13 @@ ("idefics2", ("Idefics2ImageProcessor",)), ("imagegpt", ("ImageGPTImageProcessor",)), ("instructblip", ("BlipImageProcessor",)), + ("instructblipvideo", ("InstructBlipVideoImageProcessor",)), ("kosmos-2", ("CLIPImageProcessor",)), ("layoutlmv2", ("LayoutLMv2ImageProcessor",)), ("layoutlmv3", ("LayoutLMv3ImageProcessor",)), ("levit", ("LevitImageProcessor",)), ("llava", ("CLIPImageProcessor",)), + ("llava-next-video", ("LlavaNextVideoImageProcessor",)), ("llava_next", ("LlavaNextImageProcessor",)), ("mask2former", ("Mask2FormerImageProcessor",)), ("maskformer", ("MaskFormerImageProcessor",)), @@ -114,6 +116,7 @@ ("pvt_v2", ("PvtImageProcessor",)), ("regnet", ("ConvNextImageProcessor",)), ("resnet", ("ConvNextImageProcessor",)), + ("rt_detr", "RTDetrImageProcessor"), ("sam", ("SamImageProcessor",)), ("segformer", ("SegformerImageProcessor",)), ("seggpt", ("SegGptImageProcessor",)), @@ -155,7 +158,6 @@ IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class) - IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES) @@ -398,7 +400,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): kwargs["token"] = use_auth_token config = kwargs.pop("config", None) - use_fast = kwargs.pop("use_fast", False) + use_fast = kwargs.pop("use_fast", None) trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True @@ -429,10 +431,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if image_processor_class is not None: # Update class name to reflect the use_fast option. If class is not found, None is returned. 
- if use_fast and not image_processor_class.endswith("Fast"): - image_processor_class += "Fast" - elif not use_fast and image_processor_class.endswith("Fast"): - image_processor_class = image_processor_class[:-4] + if use_fast is not None: + if use_fast and not image_processor_class.endswith("Fast"): + image_processor_class += "Fast" + elif not use_fast and image_processor_class.endswith("Fast"): + image_processor_class = image_processor_class[:-4] image_processor_class = image_processor_class_from_name(image_processor_class) has_remote_code = image_processor_auto_map is not None diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index adfcc7af9fbc..f674b777fca7 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -105,6 +105,7 @@ ("fsmt", "FSMTModel"), ("funnel", ("FunnelModel", "FunnelBaseModel")), ("gemma", "GemmaModel"), + ("gemma2", "Gemma2Model"), ("git", "GitModel"), ("glpn", "GLPNModel"), ("gpt-sw3", "GPT2Model"), @@ -202,6 +203,7 @@ ("roberta-prelayernorm", "RobertaPreLayerNormModel"), ("roc_bert", "RoCBertModel"), ("roformer", "RoFormerModel"), + ("rt_detr", "RTDetrModel"), ("rwkv", "RwkvModel"), ("sam", "SamModel"), ("seamless_m4t", "SeamlessM4TModel"), @@ -298,6 +300,7 @@ ("idefics2", "Idefics2ForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), + ("llava-next-video", "LlavaNextVideoForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("luke", "LukeForMaskedLM"), @@ -452,6 +455,7 @@ ("falcon", "FalconForCausalLM"), ("fuyu", "FuyuForCausalLM"), ("gemma", "GemmaForCausalLM"), + ("gemma2", "Gemma2ForCausalLM"), ("git", "GitForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), @@ -696,8 +700,10 @@ ("git", "GitForCausalLM"), ("idefics2", "Idefics2ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), + ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), + ("llava-next-video", "LlavaNextVideoForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), @@ -765,6 +771,7 @@ ("deformable_detr", "DeformableDetrForObjectDetection"), ("deta", "DetaForObjectDetection"), ("detr", "DetrForObjectDetection"), + ("rt_detr", "RTDetrForObjectDetection"), ("table-transformer", "TableTransformerForObjectDetection"), ("yolos", "YolosForObjectDetection"), ] @@ -858,6 +865,7 @@ ("fnet", "FNetForSequenceClassification"), ("funnel", "FunnelForSequenceClassification"), ("gemma", "GemmaForSequenceClassification"), + ("gemma2", "Gemma2ForSequenceClassification"), ("gpt-sw3", "GPT2ForSequenceClassification"), ("gpt2", "GPT2ForSequenceClassification"), ("gpt_bigcode", "GPTBigCodeForSequenceClassification"), @@ -1039,6 +1047,7 @@ ("fnet", "FNetForTokenClassification"), ("funnel", "FunnelForTokenClassification"), ("gemma", "GemmaForTokenClassification"), + ("gemma2", "Gemma2ForTokenClassification"), ("gpt-sw3", "GPT2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("gpt_bigcode", "GPTBigCodeForTokenClassification"), @@ -1252,6 +1261,7 @@ ("nat", "NatBackbone"), ("pvt_v2", "PvtV2Backbone"), ("resnet", "ResNetBackbone"), + ("rt_detr_resnet", "RTDetrResNetBackbone"), ("swin", 
"SwinBackbone"), ("swinv2", "Swinv2Backbone"), ("timm_backbone", "TimmBackbone"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 4a8295cc8304..7c7342bb9fb7 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -64,10 +64,12 @@ ("idefics", "IdeficsProcessor"), ("idefics2", "Idefics2Processor"), ("instructblip", "InstructBlipProcessor"), + ("instructblipvideo", "InstructBlipVideoProcessor"), ("kosmos-2", "Kosmos2Processor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), + ("llava-next-video", "LlavaNextVideoProcessor"), ("llava_next", "LlavaNextProcessor"), ("markuplm", "MarkupLMProcessor"), ("mctct", "MCTCTProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index e99bc89205cb..dddab5379f56 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -188,6 +188,13 @@ "GemmaTokenizerFast" if is_tokenizers_available() else None, ), ), + ( + "gemma2", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), @@ -205,6 +212,7 @@ ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ( "jamba", ( @@ -241,6 +249,7 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava-next-video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 8fa55d01ee88..7aad5bea66ca 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -317,7 +317,7 @@ def _init_weights(self, module): module.bias.data.zero_() if isinstance(module, Blip2VisionEmbeddings): - if hasattr(self.config, "vision_config"): + if hasattr(self.config, "vision_config") and not isinstance(self.config, Blip2VisionConfig): factor = self.config.vision_config.initializer_range nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 1c236d29d4e7..3e83daa942c0 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -37,6 +37,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + torch_int, ) from .configuration_clap import 
ClapAudioConfig, ClapConfig, ClapTextConfig @@ -590,8 +591,10 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): def set_shift_and_window_size(self, input_resolution): if min(input_resolution) <= self.window_size: # if window size is larger than input resolution, we don't partition windows - self.shift_size = 0 - self.window_size = min(input_resolution) + self.shift_size = torch_int(0) + self.window_size = ( + torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution) + ) def get_attn_mask(self, height, width, dtype, device): if self.shift_size > 0: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 492026244303..cfa08e3974b7 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -29,22 +29,24 @@ from torch.autograd.function import once_differentiable from ...activations import ACT2FN -from ...file_utils import ( +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + is_accelerate_available, + is_ninja_available, is_scipy_available, is_timm_available, is_torch_cuda_available, is_vision_available, + logging, replace_return_docstrings, requires_backends, ) -from ...modeling_attn_mask_utils import _prepare_4d_attention_mask -from ...modeling_outputs import BaseModelOutput -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import meshgrid -from ...utils import is_accelerate_available, is_ninja_available, logging from ...utils.backbone_utils import load_backbone from .configuration_deformable_detr import DeformableDetrConfig diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 7e899f453f1c..115808a6b11a 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -35,6 +35,7 @@ add_start_docstrings, add_start_docstrings_to_model_forward, logging, + torch_int, ) from .configuration_donut_swin import DonutSwinConfig @@ -562,8 +563,10 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): def set_shift_and_window_size(self, input_resolution): if min(input_resolution) <= self.window_size: # if window size is larger than input resolution, we don't partition windows - self.shift_size = 0 - self.window_size = min(input_resolution) + self.shift_size = torch_int(0) + self.window_size = ( + torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution) + ) def get_attn_mask(self, height, width, dtype, device): if self.shift_size > 0: diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index a7e554742f2d..db5db0eae118 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -39,7 +39,7 @@ from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ModelOutput, logging +from ...utils import 
ModelOutput, logging, torch_int from ...utils.backbone_utils import load_backbone from .configuration_dpt import DPTConfig @@ -226,7 +226,7 @@ def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_ind posemb_tok = posemb[:, :start_index] posemb_grid = posemb[0, start_index:] - old_grid_size = int(math.sqrt(len(posemb_grid))) + old_grid_size = torch_int(posemb_grid.size(0) ** 0.5) posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2) posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index a30891bddbc1..5f92ea0bd5a2 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -782,6 +782,7 @@ def forward( head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, + **kwargs, ): residual = hidden_states diff --git a/src/transformers/models/gemma/diff_gemma.py b/src/transformers/models/gemma/diff_gemma.py index 1165b05483fc..d1df9d8cfb07 100644 --- a/src/transformers/models/gemma/diff_gemma.py +++ b/src/transformers/models/gemma/diff_gemma.py @@ -257,6 +257,7 @@ def __init__(self, config: GemmaConfig, layer_idx: Optional[int] = None): self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.is_causal = True + self.scaling = 1 / math.sqrt(config.head_dim) if self.hidden_size % self.num_heads != 0: raise ValueError( @@ -305,7 +306,7 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index c0a8c193d4cc..c0da2530fe2c 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -240,6 +240,7 @@ def __init__(self, config: GemmaConfig, layer_idx: Optional[int] = None): self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.is_causal = True + self.scaling = 1 / math.sqrt(config.head_dim) if self.hidden_size % self.num_heads != 0: raise ValueError( @@ -288,7 +289,7 @@ def forward( key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] @@ -628,6 +629,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -642,6 +644,11 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed 
up decoding (see `past_key_values`). past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model """ residual = hidden_states @@ -892,6 +899,13 @@ def forward( # See https://github.com/huggingface/transformers/pull/29402 normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer + if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = True + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + ) # decoder layers all_hidden_states = () if output_hidden_states else None @@ -1391,7 +1405,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, @@ -1401,7 +1415,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + ) -> Union[Tuple, TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/gemma2/__init__.py b/src/transformers/models/gemma2/__init__.py new file mode 100644 index 000000000000..0d0aa148be5e --- /dev/null +++ b/src/transformers/models/gemma2/__init__.py @@ -0,0 +1,61 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
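For context on the backwards-compatibility branch added to the Gemma forward pass above, a small sketch of the conversion it performs, using only `cache_utils`; the tensor shapes are illustrative:

```python
import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each of shape (batch, num_heads, seq_len, head_dim).
legacy_past = tuple((torch.zeros(1, 8, 5, 64), torch.zeros(1, 8, 5, 64)) for _ in range(2))

# The deprecation branch wraps such inputs once and then works against the `Cache` API.
cache = DynamicCache.from_legacy_cache(legacy_past)
print(cache.get_seq_length())  # 5

# `crop()` (whose argument this patch renames from `maximum_length` to `max_length`) trims the
# cached tokens, as used by assisted decoding and contrastive search.
cache.crop(3)
print(cache.get_seq_length())  # 3

# Converting back to the tuple format remains possible.
legacy_again = cache.to_legacy_cache()
```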
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_gemma2": ["Gemma2Config"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_gemma2"] = [ + "Gemma2ForCausalLM", + "Gemma2Model", + "Gemma2PreTrainedModel", + "Gemma2ForSequenceClassification", + "Gemma2ForTokenClassification", + ] + +if TYPE_CHECKING: + from .configuration_gemma import Gemma2Config + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_gemma import ( + Gemma2ForCausalLM, + Gemma2ForSequenceClassification, + Gemma2ForTokenClassification, + Gemma2Model, + Gemma2PreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py new file mode 100644 index 000000000000..47350af0ffd0 --- /dev/null +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -0,0 +1,150 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from transformers import PretrainedConfig + + +class Gemma2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Gemma2-7B. + e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Gemma2Model`] + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 24576): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 28): + Number of hidden layers in the Transformer decoder. 
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
+            `num_attention_heads`.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_activation (`str` or `function`, *optional*):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+            Scaling factor when applying tanh softcapping on the logits.
+        query_pre_attn_scalar (`float`, *optional*, defaults to 224):
+            Scaling factor used on the attention scores.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            In Gemma2, every other layer uses sliding window attention. This is the size of the sliding window.
+ ```python + >>> from transformers import Gemma2Model, Gemma2Config + >>> # Initializing a Gemma2 gemma2-9b style configuration + >>> configuration = Gemma2Config() + >>> # Initializing a model from the gemma2-9b style configuration + >>> model = Gemma2Model(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=3072, + intermediate_size=24576, + num_hidden_layers=28, + num_attention_heads=16, + num_key_value_heads=16, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + bos_token_id=2, + tie_word_embeddings=True, + rope_theta=10000.0, + attention_bias=False, + attention_dropout=0.0, + final_logit_softcapping=30.0, + query_pre_attn_scalar=224, + sliding_window=4096, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.hidden_activation = hidden_activation + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.final_logit_softcapping = final_logit_softcapping + self.query_pre_attn_scalar = query_pre_attn_scalar + self.sliding_window = sliding_window + self.cache_implementation = "hybrid" diff --git a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py new file mode 100644 index 000000000000..1ad7d23c3c3e --- /dev/null +++ b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py @@ -0,0 +1,239 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import warnings + +import torch +from accelerate import init_empty_weights + +from transformers import Gemma2Config, Gemma2ForCausalLM, GemmaTokenizer + + +try: + from transformers import GemmaTokenizerFast +except ImportError as e: + warnings.warn(e) + warnings.warn( + "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" + ) + GemmaTokenizerFast = None + +""" +Sample usage: + +``` +python src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py \ + --input_dir /path/to/downloaded/gemma/weights --model_size 9B --output_dir /output/path +``` + +Thereafter, models can be loaded via: + +```py +from transformers import Gemma2ForCausalLM, GemmaTokenizerFast + +model = Gemma2ForCausalLM.from_pretrained("/output/path") +tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") +``` + +Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). +""" + +gemma_9b_config = Gemma2Config( + num_hidden_layers=42, + num_attention_heads=16, + num_key_value_heads=8, + hidden_size=3584, + intermediate_size=14336, + final_logit_softcapping=30.0, + attn_logit_softcapping=50.0, + head_dim=256, + sliding_window=4096, + query_pre_attn_scalar=224, +) + +gemma_27b_config = Gemma2Config( + num_hidden_layers=46, + num_attention_heads=32, + num_key_value_heads=16, + hidden_size=4608, + intermediate_size=36864, + final_logit_softcapping=30.0, + attn_logit_softcapping=50.0, + head_dim=128, + sliding_window=4096, + query_pre_attn_scalar=144, +) + +CONFIG_MAPPING = {"9B": gemma_9b_config, "27B": gemma_27b_config} +LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} + + +def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): + num_attn_heads = config.num_attention_heads + hidden_size = config.hidden_size + num_kv_heads = config.num_key_value_heads + head_dim = config.head_dim + + print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") + + if os.path.isdir(input_base_path): + print("Model seems sharded") + + model_state_dict = {} + files = [file for file in os.listdir(input_base_path) if file.endswith(".bin")] + + for file in files: + print(file) + loaded_state_dict = torch.load(os.path.join(input_base_path, file), map_location="cpu") + model_state_dict.update(loaded_state_dict) + else: + print("Model does not seem to be sharded") + model_state_dict = torch.load(input_base_path, map_location="cpu")["model_state_dict"] + model_state_dict.pop("freqs_cis") + + state_dict = {} + for k, v in model_state_dict.items(): + if "qkv_proj" in k: + if num_kv_heads == 1: + v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) + q_proj = v[:num_attn_heads, ...] 
+ k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) + v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) + + state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( + num_attn_heads * head_dim, hidden_size + ).clone() + state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( + num_kv_heads * head_dim, hidden_size + ).clone() + state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() + else: + q_proj, k_proj, v_proj = torch.split( + v, [num_attn_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], 0 + ) + state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( + num_attn_heads * head_dim, hidden_size + ).clone() + state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( + num_kv_heads * head_dim, hidden_size + ).clone() + state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.reshape( + num_kv_heads * head_dim, hidden_size + ).clone() + + elif k == "embedder.weight": + state_dict[LAYER_NAME_MAPPING[k]] = v + state_dict["lm_head.weight"] = v + else: + state_dict[k] = v + + torch.set_default_dtype(dtype) + + print("Loading the checkpoint in a Gemma2 model.") + with init_empty_weights(): + model = Gemma2ForCausalLM(config) + model.load_state_dict(state_dict, assign=True, strict=False) + + model.config.torch_dtype = torch.float32 + del model.config._name_or_path + print("Saving in the Transformers format.") + + if push_to_hub: + print(f"pushing the model to {save_path}") + model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) + else: + model.save_pretrained(save_path, safe_serialization=safe_serialization) + + +def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): + # Initialize the tokenizer based on the `spm` model + tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast + print(f"Saving a {tokenizer_class.__name__} to {save_path}.") + tokenizer = tokenizer_class(input_tokenizer_path) + if push_to_hub: + tokenizer.push_to_hub(save_path) + else: + tokenizer.save_pretrained(save_path) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_checkpoint", + help="Absolute path to the target Gemma2 weights.", + required=True, + ) + parser.add_argument( + "--tokenizer_checkpoint", + help="Location of Gemma2 tokenizer model", + ) + parser.add_argument( + "--model_size", + default="9B", + choices=["9B", "27B", "tokenizer_only"], + help="'f' models correspond to the finetuned versions, and are specific to the Gemma22 official release. 
For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", + ) + parser.add_argument( + "--output_dir", + default="google/gemma-9b", + help="Location to write HF model and tokenizer", + ) + parser.add_argument( + "--pickle_serialization", + help="Whether or not to save using `safetensors`.", + action="store_true", + default=False, + ) + parser.add_argument( + "--convert_tokenizer", + help="Whether or not to convert the tokenizer as well.", + action="store_true", + default=False, + ) + parser.add_argument( + "--push_to_hub", + help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", + action="store_true", + default=False, + ) + parser.add_argument( + "--dtype", + default="float32", + help="Target dtype of the converted model", + ) + args = parser.parse_args() + + if args.convert_tokenizer: + if args.tokenizer_checkpoint is None: + raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") + + spm_path = os.path.join(args.tokenizer_checkpoint) + write_tokenizer(spm_path, args.output_dir, args.push_to_hub) + if not args.model_size == "tokenizer_only": + config = CONFIG_MAPPING[args.model_size] + dtype = getattr(torch, args.dtype) + write_model( + config=config, + input_base_path=args.input_checkpoint, + save_path=args.output_dir, + safe_serialization=not args.pickle_serialization, + push_to_hub=args.push_to_hub, + dtype=dtype, + ) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/gemma2/diff_gemma2.py b/src/transformers/models/gemma2/diff_gemma2.py new file mode 100644 index 000000000000..443be0cf87f5 --- /dev/null +++ b/src/transformers/models/gemma2/diff_gemma2.py @@ -0,0 +1,781 @@ +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
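The two `Gemma2Config` presets in the conversion script above both set `final_logit_softcapping=30.0` and `attn_logit_softcapping=50.0`. As a minimal standalone sketch (with made-up logit values), the snippet below shows what such a soft cap does to a tensor; it is the same `cap * tanh(x / cap)` transformation that `Gemma2ForCausalLM.forward` applies to its output logits later in this patch.

```py
import torch


def soft_cap(x: torch.Tensor, cap: float) -> torch.Tensor:
    # Squash x smoothly into (-cap, cap); values much smaller than the cap
    # pass through almost unchanged, large magnitudes saturate near +/- cap.
    return cap * torch.tanh(x / cap)


logits = torch.tensor([-200.0, -5.0, 0.0, 5.0, 200.0])  # toy values
print(soft_cap(logits, 30.0))  # final_logit_softcapping -> ~[-30.0, -4.95, 0.0, 4.95, 30.0]
print(soft_cap(logits, 50.0))  # attn_logit_softcapping  -> ~[-49.97, -4.98, 0.0, 4.98, 49.97]
```
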
+import inspect +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from transformers.models.gemma.configuration_gemma import GemmaConfig +from transformers.models.gemma.modeling_gemma import ( + GemmaAttention, + GemmaDecoderLayer, + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, + GemmaModel, + GemmaRMSNorm, + apply_rotary_pos_emb, + repeat_kv, +) + +from ...cache_utils import Cache +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +logger = logging.get_logger(__name__) + + +class Gemma2Config(GemmaConfig): + cache_implementation = "hybrid" # TODO this is not properly ported, but cls attr is better + + def __init__( + self, + query_pre_attn_scalar=224, + sliding_window=4096, + final_logit_softcapping=30.0, + **super_kwargs, + ): + super().__init__(self, **super_kwargs) + self.query_pre_attn_scalar = query_pre_attn_scalar + self.sliding_window = sliding_window + self.cache_implementation = "hybrid" + self.final_logit_softcapping = final_logit_softcapping + + +class Gemma2RMSNorm(GemmaRMSNorm): + pass + + +class Gemma2Attention(GemmaAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + self.scaling = config.query_pre_attn_scalar**-0.5 + + super().__init__(config, layer_idx) + + +class Gemma2FlashAttention2(Gemma2Attention): + """ + Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (Gemma2RMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + ########### ONLY DIFFERENCE IS WE USE SLIDING AND PASS THE SOFTMAX SCALING + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + softmax_scale=self.scaling, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + cache_position=0, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in Gemma2FlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + use_sliding_windows = ( + _flash_supports_window_size and self.sliding_window is not None and cache_position > self.sliding_window + ) + flash_kwargs = {"window_size": (self.sliding_window, self.sliding_window)} if use_sliding_windows else {} + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class Gemma2SdpaAttention(Gemma2Attention): + """ + Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Gemma2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Gemma2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
+ logger.warning_once( + "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
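+        # The `scale=self.scaling` argument passed to SDPA below replaces the default
+        # 1/sqrt(head_dim) scaling with Gemma2's `query_pre_attn_scalar**-0.5`, which
+        # `Gemma2Attention.__init__` stores in `self.scaling`.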
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + scale=self.scaling, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +class Gemma2DecoderLayer(GemmaDecoderLayer): + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__(config, layer_idx) + + self.is_sliding = bool(layer_idx % 2) + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.sliding_window = config.sliding_window + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding + attention_mask = attention_mask * torch.tril( + torch.ones_like(attention_mask), diagonal=(self.sliding_window - cache_position[-1]) + ) + if cache_position[0] > 0: + attention_mask = attention_mask[:, -self.sliding_window :] + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class Gemma2Model(GemmaModel): + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # embed positions + hidden_states = inputs_embeds + + # normalized + # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 + # See https://github.com/huggingface/transformers/pull/29402 + normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) + hidden_states = hidden_states * normalizer + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = past_key_values if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + @torch.no_grad() + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if past_key_values is not None: + target_length = past_key_values.get_max_length() + else: + target_length = attention_mask.shape[-1] + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, 
dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + return causal_mask + + +class Gemma2ForCausalLM(GemmaForCausalLM): + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GemmaForCausalLM + + >>> model = GemmaForCausalLM.from_pretrained("google/gemma-2-9b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + logits = logits.float() + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + use_cache=True, + **kwargs, + ): + past_length = 0 + if past_key_values is not None: + # Past key values are always initialized with a `Cache` object -> no need for if-else anymore + past_length = cache_position[0] if cache_position is not None else torch.tensor(0, device=input_ids.device) + max_cache_length = ( + torch.tensor(past_key_values.get_max_length(), device=input_ids.device) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_length == 0: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {"input_ids": input_ids.contiguous()} + + input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] + if cache_position is None: + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) + elif use_cache: + cache_position = cache_position[-input_length:] + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + +class Gemma2ForSequenceClassification(GemmaForSequenceClassification): + pass + + +class Gemma2ForTokenClassification(GemmaForTokenClassification): + pass diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py new file mode 100644 index 000000000000..4b4aef214c65 --- /dev/null +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -0,0 +1,1376 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
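The `position_ids` bookkeeping in `prepare_inputs_for_generation` above can be seen in isolation with a small sketch: positions are a running count of non-padding tokens, and padding slots receive a placeholder value since they are masked out of attention anyway. The batch below is an invented example.

```py
import torch

# Left-padded toy batch: 0 marks padding, 1 marks a real token (invented example).
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

position_ids = attention_mask.long().cumsum(-1) - 1  # 0-based running count of real tokens
position_ids.masked_fill_(attention_mask == 0, 1)    # harmless placeholder on padding slots
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```

During cached decoding the same method then keeps only the last `input_ids.shape[1]` columns, so each step passes positions for the not-yet-processed tokens only.
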
+import inspect +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_gemma2 import Gemma2Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +class Gemma2RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.zeros(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()) + # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + output = output * (1.0 + self.weight.float()) + return output.type_as(x) + + +class Gemma2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies 
Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Gemma2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Gemma2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.scaling = config.query_pre_attn_scalar**-0.5 + + if self.hidden_size % self.num_heads != 0: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + self.rotary_emb = Gemma2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + self.sliding_window = config.sliding_window if layer_idx % 2 else None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" 
{attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Gemma2FlashAttention2(Gemma2Attention): + """ + Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. 
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (Gemma2RMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + softmax_scale=self.scaling, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + cache_position=0, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in Gemma2FlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + use_sliding_windows = ( + _flash_supports_window_size and self.sliding_window is not None and cache_position > self.sliding_window + ) + flash_kwargs = {"window_size": (self.sliding_window, self.sliding_window)} if use_sliding_windows else {} + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class Gemma2SdpaAttention(Gemma2Attention): + """ + Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Gemma2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Gemma2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
+ logger.warning_once( + "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = { + "sin": sin, + "cos": cos, + "sliding_window": self.sliding_window, + "cache_position": cache_position, + } + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + scale=self.scaling, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +GEMMA2_ATTENTION_CLASSES = { + "eager": Gemma2Attention, + "flash_attention_2": Gemma2FlashAttention2, + "sdpa": Gemma2SdpaAttention, +} + + +class Gemma2DecoderLayer(nn.Module): + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = Gemma2MLP(config) + self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.is_sliding = bool(layer_idx % 2) + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.sliding_window = config.sliding_window + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding + attention_mask = attention_mask * torch.tril( + torch.ones_like(attention_mask), diagonal=-self.sliding_window + ) + if attention_mask.shape[1] <= 1: # when decoding + attention_mask = attention_mask[:, -self.sliding_window :] + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +GEMMA2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Gemma2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.", + GEMMA2_START_DOCSTRING, +) +class Gemma2PreTrainedModel(PreTrainedModel): + config_class = Gemma2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Gemma2DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = False + _supports_quantized_cache = False + _supports_static_cache = True + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +_CONFIG_FOR_DOC = "Gemma2Config" + + +GEMMA2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. 
+ + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.", + GEMMA2_START_DOCSTRING, +) +class Gemma2Model(Gemma2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Gemma2DecoderLayer`] + + Args: + config: Gemma2Config + """ + + def __init__(self, config: Gemma2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + # embed positions + hidden_states = inputs_embeds + + # normalized + # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 + # See https://github.com/huggingface/transformers/pull/29402 + normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) + hidden_states = hidden_states * normalizer + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = past_key_values if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if past_key_values is not None: + target_length = past_key_values.get_max_length() + else: + target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1] + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, 
:mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+
+
+class Gemma2ForCausalLM(Gemma2PreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Gemma2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
+
+        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        if self.training and self.config._attn_implementation != "eager":
+            logger.warning_once(
+                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`."
+ ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + logits = logits.float() + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + use_cache=True, + **kwargs, + ): + past_length = 0 + if past_key_values is not None: + # Past key values are always initialized with a `Cache` object -> no need for if-else anymore + past_length = cache_position[0] if cache_position is not None else torch.tensor(0, device=input_ids.device) + max_cache_length = ( + torch.tensor(past_key_values.get_max_length(), device=input_ids.device) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
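One Gemma2-specific detail in the `forward` above is the final logit soft-capping: rather than clipping, the logits are squashed with `cap * tanh(logits / cap)`, which keeps every value strictly inside `(-cap, cap)` while remaining differentiable. A small numeric sketch follows; the cap of 30.0 is only an illustrative value, not necessarily what `config.final_logit_softcapping` is set to.

```python
import torch

cap = 30.0  # illustrative stand-in for config.final_logit_softcapping
logits = torch.tensor([-100.0, -10.0, 0.0, 10.0, 100.0])

capped = cap * torch.tanh(logits / cap)
print(capped)  # large logits saturate smoothly; every value stays strictly inside (-30, 30)
```

Because `tanh` never reaches exactly ±1, gradients stay nonzero even for extreme logits, unlike a hard clamp.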
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_length == 0: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {"input_ids": input_ids.contiguous()} + + input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] + if cache_position is None: + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) + elif use_cache: + cache_position = cache_position[-input_length:] + + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Gemma2 Model transformer with a sequence classification head on top (linear layer). + + [`Gemma2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
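The pooling rule described in the paragraph above is implemented in the forward pass with an `argmax` trick: the index of the first `pad_token_id` minus one points at the last real token, and a modulo keeps the index valid for rows that contain no padding. A minimal sketch of that arithmetic, with a hypothetical `pad_token_id = 0` and random logits standing in for the model output:

```python
import torch

pad_token_id = 0  # hypothetical; the real value comes from config.pad_token_id
input_ids = torch.tensor(
    [
        [5, 6, 7, 0, 0],  # padded row: last real token sits at index 2
        [8, 9, 3, 4, 2],  # unpadded row: last real token sits at index 4
    ]
)

# Index of the first pad token, minus one -> position of the last non-padding token.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
# Rows without padding give argmax == 0, so the -1 wraps around to the last index via the modulo.
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)  # tensor([2, 4])

logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)
pooled_logits = logits[torch.arange(2), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3]) -- one logit row per sequence
```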
+ """, + GEMMA2_START_DOCSTRING, +) +class Gemma2ForSequenceClassification(Gemma2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Gemma2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The Gemma2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+    output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    GEMMA2_START_DOCSTRING,
+)
+class Gemma2ForTokenClassification(Gemma2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Gemma2Model(config)
+        if getattr(config, "classifier_dropout", None) is not None:
+            classifier_dropout = config.classifier_dropout
+        elif getattr(config, "hidden_dropout", None) is not None:
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(GEMMA2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 22735a7a0a38..2b9707c5d5f8 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -706,6 +706,7 @@ def forward( encoder_attention_mask: Optional[torch.Tensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, + **kwargs, ) -> Union[ Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor] ]: diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index bde881226fb8..85ee61e7fe31 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -18,6 +18,7 @@ import torch import torch.utils.checkpoint +from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from torch.nn import functional as F @@ -29,6 +30,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) +from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -37,7 +39,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging +from ...utils import get_torch_version, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging from .configuration_gpt_neox import GPTNeoXConfig @@ -78,6 +80,7 @@ class GPTNeoXPreTrainedModel(PreTrainedModel): _no_split_modules = ["GPTNeoXLayer"] _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): """Initialize the weights""" @@ -162,7 +165,56 @@ def forward( layer_past: Optional[Tuple[torch.Tensor]] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, - padding_mask: Optional[torch.Tensor] = None, + ): + # Apply attention-specific projections and rope + query, key, value, present = self._attn_projections_and_rope( + hidden_states=hidden_states, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache + ) + + # Compute attention + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + # Reshape outputs + attn_output = self._merge_heads(attn_output, 
self.num_attention_heads, self.head_size) + attn_output = self.dense(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + @classmethod + def _split_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Splits hidden dim into attn_head_size and num_attention_heads + """ + # tensor: [bs, seq_len, hidden_size] + new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) + # -> [bs, seq_len, num_attention_heads, attn_head_size] + tensor = tensor.view(new_shape) + # -> [bs, num_attention_heads, seq_len, attn_head_size] + tensor = tensor.permute(0, 2, 1, 3) + return tensor + + @classmethod + def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden dim + """ + # tensor [bs, num_attention_heads, seq_len, attn_head_size] + tensor = tensor.permute(0, 2, 1, 3).contiguous() + # -> [bs, seq_len, num_attention_heads, attn_head_size] + tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size) + # -> [bs, seq_len, hidden_size] + return tensor + + def _attn_projections_and_rope( + self, + hidden_states: torch.FloatTensor, + position_ids: torch.LongTensor, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, ): has_layer_past = layer_past is not None @@ -204,43 +256,7 @@ def forward( value = torch.cat((past_value, value), dim=-2) present = (key, value) if use_cache else None - # Compute attention - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - # Reshape outputs - attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) - attn_output = self.dense(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs - - @classmethod - def _split_heads(cls, tensor, num_attention_heads, attn_head_size): - """ - Splits hidden dim into attn_head_size and num_attention_heads - """ - # tensor: [bs, seq_len, hidden_size] - new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) - # -> [bs, seq_len, num_attention_heads, attn_head_size] - tensor = tensor.view(new_shape) - # -> [bs, num_attention_heads, seq_len, attn_head_size] - tensor = tensor.permute(0, 2, 1, 3) - return tensor - - @classmethod - def _merge_heads(cls, tensor, num_attention_heads, attn_head_size): - """ - Merges attn_head_size dim and num_attn_heads dim into hidden dim - """ - # tensor [bs, num_attention_heads, seq_len, attn_head_size] - tensor = tensor.permute(0, 2, 1, 3).contiguous() - # -> [bs, seq_len, num_attention_heads, attn_head_size] - tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size) - # -> [bs, seq_len, hidden_size] - return tensor + return query, key, value, present def _attn(self, query, key, value, attention_mask=None, head_mask=None): # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] @@ -319,48 +335,13 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, ): - has_layer_past = layer_past is not None - - # Compute QKV - # Attention heads [batch, seq_len, hidden_size] - # --> [batch, seq_len, (np * 3 * head_size)] - qkv = self.query_key_value(hidden_states) - - # [batch, seq_len, (num_heads * 3 * head_size)] - # --> [batch, seq_len, num_heads, 3 * head_size] - new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size) - qkv = 
qkv.view(*new_qkv_shape) - - # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size] - query = qkv[..., : self.head_size].permute(0, 2, 1, 3) - key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3) - value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3) + # Apply attention-specific projections and rope + query, key, value, present = self._attn_projections_and_rope( + hidden_states=hidden_states, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache + ) query_length = query.shape[-2] - # Compute rotary embeddings on rotary_ndims - query_rot = query[..., : self.rotary_ndims] - query_pass = query[..., self.rotary_ndims :] - key_rot = key[..., : self.rotary_ndims] - key_pass = key[..., self.rotary_ndims :] - - # Compute token offset for rotary embeddings (when decoding) - seq_len = key.shape[-2] - if has_layer_past: - seq_len += layer_past[0].shape[-2] - cos, sin = self.rotary_emb(value, seq_len=seq_len) - query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) - query = torch.cat((query, query_pass), dim=-1) - key = torch.cat((key, key_pass), dim=-1) - - # Cache QKV values - if has_layer_past: - past_key = layer_past[0] - past_value = layer_past[1] - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - present = (key, value) if use_cache else None - # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision target_dtype = value.dtype if query.dtype != target_dtype: @@ -516,6 +497,90 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query ) +class GPTNeoXSdpaAttention(GPTNeoXAttention): + """ + GPTNeoX attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `GPTNeoXAttention` as the weights of the module stays untouched. The only changes are on the forward pass + to adapt to the SDPA API. + """ + + def __init__(self, config): + super().__init__(config) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()`. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + position_ids: torch.LongTensor, + head_mask: Optional[torch.FloatTensor] = None, + layer_past: Optional[Tuple[torch.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ): + if output_attentions or head_mask is not None: + logger.warning_once( + "`GPTNeoXSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + layer_past=layer_past, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + bsz, q_len, _ = hidden_states.size() + + # Apply attention-specific projections and rope + query, key, value, present = self._attn_projections_and_rope( + hidden_states=hidden_states, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache + ) + + # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision + target_dtype = value.dtype + if query.dtype != target_dtype: + query = query.to(target_dtype) + if key.dtype != target_dtype: + key = key.to(target_dtype) + + # Avoid torch==2.1.2 specific bug for the memory-efficient backend in SDPA + if self.require_contiguous_qkv and query.device.type == "cuda" and attention_mask is not None: + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if attention_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query=query, + key=key, + value=value, + attn_mask=attention_mask, + dropout_p=self.attention_dropout.p if self.training else 0.0, + is_causal=is_causal, + ) + + # Reshape outputs + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.dense(attn_output) + + return attn_output, present, None + + def attention_mask_func(attention_scores, ltor_mask): attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min) return attention_scores @@ -660,6 +725,7 @@ def forward(self, hidden_states): GPT_NEOX_ATTENTION_CLASSES = { "eager": GPTNeoXAttention, "flash_attention_2": GPTNeoXFlashAttention2, + "sdpa": GPTNeoXSdpaAttention, } @@ -786,7 +852,8 @@ def __init__(self, config): self.emb_dropout = nn.Dropout(config.hidden_dropout) self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)]) self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self._attn_implementation = config._attn_implementation self.gradient_checkpointing = False @@ -859,27 +926,29 @@ def forward( position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0) + if inputs_embeds is None: + inputs_embeds = self.embed_in(input_ids) + # Attention mask. if attention_mask is not None: assert batch_size > 0, "batch_size has to be defined and > 0" attention_mask = attention_mask.view(batch_size, -1) - if self._use_flash_attention_2: + if self._attn_implementation == "flash_attention_2": attention_mask = attention_mask if 0 in attention_mask else None + elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) else: - # We create a 3D attention mask from a 2D tensor mask. 
- # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -888,9 +957,6 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - if inputs_embeds is None: - inputs_embeds = self.embed_in(input_ids) - hidden_states = self.emb_dropout(inputs_embeds) if self.gradient_checkpointing and self.training: diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index c0b0a83c24d6..5d59a4ed90e4 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -33,7 +33,13 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_float, +) from .configuration_imagegpt import ImageGPTConfig @@ -229,7 +235,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: - attn_weights = attn_weights / (float(value.size(-1)) ** 0.5) + attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5) # Layer-wise attention scaling if self.scale_attn_by_inverse_layer_idx: diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index 31dfacea92c4..77014e6f4667 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -164,6 +164,8 @@ class InstructBlipQFormerConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Token id used for padding sequences. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. 
For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 386b69cd3b0f..8ad47b308fd0 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -324,7 +324,7 @@ def _init_weights(self, module): module.bias.data.zero_() if isinstance(module, InstructBlipVisionEmbeddings): - if hasattr(self.config, "vision_config"): + if hasattr(self.config, "vision_config") and not isinstance(self.config, InstructBlipVisionConfig): factor = self.config.vision_config.initializer_range nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) diff --git a/src/transformers/models/instructblipvideo/__init__.py b/src/transformers/models/instructblipvideo/__init__.py new file mode 100644 index 000000000000..18d20d040150 --- /dev/null +++ b/src/transformers/models/instructblipvideo/__init__.py @@ -0,0 +1,83 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_instructblipvideo": [ + "InstructBlipVideoConfig", + "InstructBlipVideoQFormerConfig", + "InstructBlipVideoVisionConfig", + ], + "processing_instructblipvideo": ["InstructBlipVideoProcessor"], +} + + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_instructblipvideo"] = ["InstructBlipVideoImageProcessor"] + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_instructblipvideo"] = [ + "InstructBlipVideoQFormerModel", + "InstructBlipVideoPreTrainedModel", + "InstructBlipVideoForConditionalGeneration", + "InstructBlipVideoVisionModel", + ] + +if TYPE_CHECKING: + from .configuration_instructblipvideo import ( + InstructBlipVideoConfig, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, + ) + from .processing_instructblipvideo import InstructBlipVideoProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_instructblipvideo import InstructBlipVideoImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_instructblipvideo import ( + InstructBlipVideoForConditionalGeneration, + InstructBlipVideoPreTrainedModel, + InstructBlipVideoQFormerModel, + InstructBlipVideoVisionModel, + ) + +else: + import sys + + 
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py
new file mode 100644
index 000000000000..180372f35d18
--- /dev/null
+++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py
@@ -0,0 +1,364 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from .
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the diff. If any change should be done, please apply the change to the
+# diff.py file directly.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Union
+
+from ...configuration_utils import PretrainedConfig
+from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+from ...utils import (
+    logging,
+)
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class InstructBlipVideoVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InstructBlipVideoVisionModel`]. It is used to
+    instantiate a Instructblipvideo vision encoder according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Instructblipvideo
+    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1408):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 39):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + + Example: + + ```python + >>> from transformers import InstructBlipVideoVisionConfig, InstructBlipVideoVisionModel + + >>> # Initializing a InstructBlipVideoVisionConfig with Salesforce/instruct-blip-flan-t5 style configuration + >>> configuration = InstructBlipVideoVisionConfig() + + >>> # Initializing a InstructBlipVideoVisionModel (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration + >>> model = InstructBlipVideoVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "instructblipvideo_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=1e-6, + attention_dropout=0.0, + initializer_range=1e-10, + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from InstructBlipVideoConfig + if config_dict.get("model_type") == "instructblipvideo": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class InstructBlipVideoQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InstructBlipVideoQFormerModel`]. It is used to + instantiate a Instructblipvideo Querying Transformer (Q-Former) model according to the specified arguments, defining the + model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the Instructblipvideo [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) + architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. + Read the documentation from [`PretrainedConfig`] for more information. + + Note that [`InstructBlipVideoQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. 
Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Token id used for padding sequences. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
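Of the arguments listed above, `cross_attention_frequency` is the one that most directly shapes the Q-Former: assuming the same convention as the BLIP-2 Q-Former it is modelled on (the modeling code is not part of this hunk), cross-attention to the vision features is only added in layers whose index is a multiple of that value. A tiny sketch of which layers that selects under that assumption, using a hypothetical helper:

```python
# Hypothetical helper illustrating the assumed rule `layer_idx % cross_attention_frequency == 0`.
def cross_attention_layer_indices(num_hidden_layers: int, cross_attention_frequency: int) -> list:
    return [idx for idx in range(num_hidden_layers) if idx % cross_attention_frequency == 0]


# With the documented defaults (12 hidden layers, frequency 2), every other layer gets a cross-attention block.
print(cross_attention_layer_indices(12, 2))  # [0, 2, 4, 6, 8, 10]
```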
+ + Examples: + + ```python + >>> from transformers import InstructBlipVideoQFormerConfig, InstructBlipVideoQFormerModel + + >>> # Initializing a Instructblipvideo Salesforce/instruct-blip-flan-t5 style configuration + >>> configuration = InstructBlipVideoQFormerConfig() + + >>> # Initializing a model (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration + >>> model = InstructBlipVideoQFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "instructblipvideo_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from InstructBlipVideoConfig + if config_dict.get("model_type") == "instructblipvideo": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class InstructBlipVideoConfig(PretrainedConfig): + r""" + [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a + [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified + arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with + the defaults will yield a similar configuration to that of the Instructblipvideo + [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`]. 
+        qformer_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
+        num_query_tokens (`int`, *optional*, defaults to 32):
+            The number of query tokens passed through the Transformer.
+
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import (
+    ...     InstructBlipVideoVisionConfig,
+    ...     InstructBlipVideoQFormerConfig,
+    ...     OPTConfig,
+    ...     InstructBlipVideoConfig,
+    ...     InstructBlipVideoForConditionalGeneration,
+    ... )
+
+    >>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
+    >>> configuration = InstructBlipVideoConfig()
+
+    >>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
+    >>> model = InstructBlipVideoForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig
+
+    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
+    >>> vision_config = InstructBlipVideoVisionConfig()
+    >>> qformer_config = InstructBlipVideoQFormerConfig()
+    >>> text_config = OPTConfig()
+
+    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
+    ```"""
+
+    model_type = "instructblipvideo"
+
+    def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. Initializing the InstructBlipVideoVisionConfig with default values.")
+
+        if qformer_config is None:
+            qformer_config = {}
+            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
+
+        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
+        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
+        text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+
+        self.tie_word_embeddings = self.text_config.tie_word_embeddings
+        self.is_encoder_decoder = self.text_config.is_encoder_decoder
+
+        self.num_query_tokens = num_query_tokens
+        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
+        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+        self.initializer_factor = 1.0
+        self.initializer_range = 0.02
+
+    @classmethod
+    def from_vision_qformer_text_configs(
+        cls,
+        vision_config: InstructBlipVideoVisionConfig,
+        qformer_config: InstructBlipVideoQFormerConfig,
+        text_config: PretrainedConfig,
+        **kwargs,
+    ):
+        r"""
+        Instantiate an [`InstructBlipVideoConfig`] (or a derived class) from an Instructblipvideo vision model, Q-Former and
+        language model configurations.
+ + Returns: + [`InstructBlipVideoConfig`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) diff --git a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py new file mode 100644 index 000000000000..9b3d508db6ff --- /dev/null +++ b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py @@ -0,0 +1,305 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Convert InstructBlipVideo checkpoints from the original repository. + +URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblipvideo +""" + +import argparse + +import requests +import torch + +# pip3 install salesforce-lavis +# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) +# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +# same for Vicuna-13b +from lavis.models import load_model_and_preprocess +from PIL import Image + +from transformers import ( + AutoTokenizer, + BlipImageProcessor, + InstructBlipProcessor, + InstructBlipVideoConfig, + InstructBlipVideoForConditionalGeneration, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, + LlamaConfig, + LlamaTokenizerFast, + T5Config, + T5TokenizerFast, +) +from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + + +def load_demo_image(): + url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + return image + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config): + rename_keys = [] + # fmt: off + + # vision encoder + rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) + rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) + rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) + rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) + rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) + rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) + + for i in range(config.vision_config.num_hidden_layers): + rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", 
f"vision_model.encoder.layers.{i}.layer_norm1.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) + + # QFormer + rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) + rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def read_in_q_v_bias(state_dict, config): + for i in range(config.vision_config.num_hidden_layers): + # read in original q and v biases + q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") + v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") + + # next, set bias in the state dict + qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) + state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias + + +def get_blip2_config(model_name): + image_size = 364 if "coco" in model_name else 224 + vision_config = InstructBlipVideoVisionConfig(image_size=image_size).to_dict() + + # make sure the models have proper bos_token_id and eos_token_id set (important for generation) + # seems like flan-T5 models don't have bos_token_id properly set? + if "t5-xl" in model_name: + text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() + elif "t5-xxl" in model_name: + text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() + elif "vicuna-7b" in model_name: + text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() + elif "vicuna-13b" in model_name: + text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() + else: + raise ValueError("Model name not supported") + + # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 + qformer_config = InstructBlipVideoQFormerConfig(vocab_size=30523).to_dict() + config = InstructBlipVideoConfig( + vision_config=vision_config, text_config=text_config, qformer_config=qformer_config + ) + + return config, image_size + + +@torch.no_grad() +def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): + """ + Copy/paste/tweak model's weights to Transformers design. 
+    """
+    qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left")
+    qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
+
+    if "t5" in model_name:
+        tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left")
+    elif "vicuna" in model_name:
+        # the following was used in the original implementation:
+        # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left")
+        # tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+        # tokenizer.add_special_tokens({"bos_token": "</s>"})
+        # tokenizer.add_special_tokens({"eos_token": "</s>"})
+        # tokenizer.add_special_tokens({"unk_token": "</s>"})
+        tokenizer = LlamaTokenizerFast.from_pretrained(
+            "huggyllama/llama-7b", truncation_side="left", bos_token="</s>", unk_token="</s>"
+        )
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    config, image_size = get_blip2_config(model_name)
+    hf_model = InstructBlipVideoForConditionalGeneration(config).eval()
+
+    model_name_to_original = {
+        "instructblipvideo-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"),
+        "instructblipvideo-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"),
+        "instructblipvideo-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"),
+        "instructblipvideo-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"),
+    }
+
+    name, type = model_name_to_original[model_name]
+
+    # load original model
+    print("Loading original model...")
+    hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu"
+    lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu"
+    original_model, vis_processors, _ = load_model_and_preprocess(
+        name=name, model_type=type, is_eval=True, device=lavis_device
+    )
+    original_model.eval()
+    print("Done!")
+
+    # update state dict keys
+    state_dict = original_model.state_dict()
+    rename_keys = create_rename_keys(config)
+    for src, dest in rename_keys:
+        rename_key(state_dict, src, dest)
+
+    # some keys can be renamed efficiently
+    for key, val in state_dict.copy().items():
+        val = state_dict.pop(key)
+        if key.startswith("Qformer.bert"):
+            key = key.replace("Qformer.bert", "qformer")
+        if "attention.self" in key:
+            key = key.replace("self", "attention")
+        if "llm_proj" in key:
+            key = key.replace("llm_proj", "language_projection")
+        if "t5_proj" in key:
+            key = key.replace("t5_proj", "language_projection")
+        if key.startswith("llm_model"):
+            key = key.replace("llm_model", "language_model")
+        if key.startswith("t5"):
+            key = key.replace("t5", "language")
+        state_dict[key] = val
+
+    # read in qv biases
+    read_in_q_v_bias(state_dict, config)
+
+    # note: weights get loaded in torch.float32 by default
+    hf_model.load_state_dict(state_dict, strict=True)
+
+    image = load_demo_image()
+    prompt = "What is unusual about this image?"
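+
+    # NOTE: the demo image and prompt above are only used for the parity checks further below, where the
+    # pixel values, logits and generations of the HF port are compared against the original LAVIS model.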
+ + # create processor + image_processor = BlipImageProcessor( + size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD + ) + processor = InstructBlipProcessor( + image_processor=image_processor, + tokenizer=tokenizer, + qformer_tokenizer=qformer_tokenizer, + ) + inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) + + # make sure processor creates exact same pixel values + original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) + pixel_values = inputs.pixel_values + assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) + + original_model.to(lavis_device) + hf_model.to(hf_model_device) + with torch.no_grad(): + if "vicuna" in model_name: + original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits + logits = hf_model(**inputs).logits + else: + original_logits = original_model( + {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} + ).logits + label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) + labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) + logits = hf_model(**inputs, labels=labels).logits + + print("First values of original logits:", original_logits[0, :3, :3]) + print("First values of HF logits:", logits[0, :3, :3]) + + # assert values + assert original_logits.shape == logits.shape + atol = 1e-4 if "vicuna" in model_name else 1e-5 + assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) + print("Looks ok!") + + print("Generating with original model...") + original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) + + # important: we need to cast the weights of the HF model to the appropriate type + print("Generating with HF model...") + outputs = hf_model.generate( + **inputs, + do_sample=False, + num_beams=5, + max_length=256, + min_length=1, + top_p=0.9, + repetition_penalty=1.5, + length_penalty=1.0, + temperature=1, + ) + if "vicuna" in model_name: + # convert output id 0 to 2 (eos_token_id) + # TODO add this in the generate method? 
+ outputs[outputs == 0] = 2 + print("Original generation:", original_outputs) + output_text = processor.batch_decode(outputs, skip_special_tokens=True) + output_text = [text.strip() for text in output_text] + print("HF generation:", output_text) + + if pytorch_dump_folder_path is not None: + processor.save_pretrained(pytorch_dump_folder_path) + hf_model.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + processor.push_to_hub(f"Salesforce/{model_name}") + hf_model.push_to_hub(f"Salesforce/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + choices = [ + "instructblipvideo-vicuna-7b", + "instructblipvideo-vicuna-13b", + "instructblipvideo-flan-t5-xl", + "instructblipvideo-flan-t5-xxl", + ] + parser.add_argument( + "--model_name", + default="instructblipvideo-flan-t5-xl", + choices=choices, + type=str, + help="Path to hf config.json of model to convert", + ) + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model and processor to the hub after converting", + ) + + args = parser.parse_args() + + convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/diff_instructblipvideo.py b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py new file mode 100644 index 000000000000..08bc960e041d --- /dev/null +++ b/src/transformers/models/instructblipvideo/diff_instructblipvideo.py @@ -0,0 +1,430 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
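+
+# NOTE: this "diff" file defines InstructBlipVideo as a thin layer on top of InstructBlip: the configuration
+# and most model classes below are re-exported unchanged, and only `forward`/`generate` are overridden to fold
+# the frame dimension into the batch before the vision encoder and Q-Former, then unfold it again for the
+# language model.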
+ +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from transformers.models.instructblip.configuration_instructblip import ( + InstructBlipConfig, + InstructBlipQFormerConfig, + InstructBlipVisionConfig, +) +from transformers.models.instructblip.modeling_instructblip import ( + InstructBlipAttention, + InstructBlipEncoder, + InstructBlipEncoderLayer, + InstructBlipForConditionalGeneration, + InstructBlipForConditionalGenerationModelOutput, + InstructBlipMLP, + InstructBlipPreTrainedModel, + InstructBlipQFormerAttention, + InstructBlipQFormerEmbeddings, + InstructBlipQFormerEncoder, + InstructBlipQFormerIntermediate, + InstructBlipQFormerLayer, + InstructBlipQFormerModel, + InstructBlipQFormerOutput, + InstructBlipQFormerSelfOutput, + InstructBlipVisionEmbeddings, + InstructBlipVisionModel, +) + +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class InstructBlipVideoVisionConfig(InstructBlipVisionConfig): + pass + + +class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig): + pass + + +class InstructBlipVideoConfig(InstructBlipConfig): + pass + + +@dataclass +class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput): + pass + + +class InstructBlipVideoVisionEmbeddings(InstructBlipVisionEmbeddings): + pass + + +class InstructBlipVideoAttention(InstructBlipAttention): + pass + + +class InstructBlipVideoMLP(InstructBlipMLP): + pass + + +class InstructBlipVideoEncoderLayer(InstructBlipEncoderLayer): + pass + + +class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel): + pass + + +class InstructBlipVideoEncoder(InstructBlipEncoder): + pass + + +class InstructBlipVideoVisionModel(InstructBlipVisionModel): + pass + + +class InstructBlipVideoQFormerSelfOutput(InstructBlipQFormerSelfOutput): + pass + + +class InstructBlipVideoQFormerAttention(InstructBlipQFormerAttention): + pass + + +class InstructBlipVideoQFormerIntermediate(InstructBlipQFormerIntermediate): + pass + + +class InstructBlipVideoQFormerOutput(InstructBlipQFormerOutput): + pass + + +class InstructBlipVideoQFormerLayer(InstructBlipQFormerLayer): + pass + + +class InstructBlipVideoQFormerEncoder(InstructBlipQFormerEncoder): + pass + + +class InstructBlipVideoQFormerEmbeddings(InstructBlipQFormerEmbeddings): + pass + + +class InstructBlipVideoQFormerModel(InstructBlipQFormerModel): + pass + + +class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration): + def forward( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: torch.FloatTensor, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size - + 1]`. 
All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
+        >>> import torch
+        >>> from huggingface_hub import hf_hub_download
+        >>> import av
+        >>> import numpy as np
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`List[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
+        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
+
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+        >>> # sample uniformly 4 frames from the video
+        >>> total_frames = container.streams.video[0].frames
+        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
+        >>> clip = read_video_pyav(container, indices)
+
+        >>> prompt = "What is happening in the video?"
+        >>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(model.device)
+
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     num_beams=5,
+        ...     max_length=256,
+        ...     repetition_penalty=1.5,
+        ...     length_penalty=1.0,
+        ... )
+        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+        >>> print(generated_text)
+        "A person is eating a bowl of pasta, and they are using a fork to eat it. 
The person is sitting at a table, and the plate of pasta is on the table in front" + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # we process in a batched way, later unbatch it back (video has frames=4 always) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0][:, : query_tokens.size(1), :] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + + # unbatch inputs back, each video-frame gets `num_query_tokens` seq length + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(logits.device) + + # Flatten the tokens + 
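+                # shift_logits: (batch_size, seq_len - 1, vocab_size); shift_labels: (batch_size, seq_len - 1)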
loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return InstructBlipVideoForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: Optional[torch.LongTensor] = None, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or + (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed. + qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt to be fed to the Q-Former module. + qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the positional encoding of the image embeddings. + + Returns: + captions (list): A list of strings of length batch_size * num_captions. 
+        """
+        if hasattr(self, "hf_device_map"):
+            # preprocess for `accelerate`
+            self._preprocess_accelerate()
+
+        # we process in a batched way, later unbatch it back (video has frames=4)
+        batch_size, frames, channel, height, width = pixel_values.shape
+        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
+
+        image_embeds = self.vision_model(
+            pixel_values,
+            return_dict=True,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        ).last_hidden_state
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
+        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+        if qformer_attention_mask is None:
+            qformer_attention_mask = torch.ones_like(qformer_input_ids)
+
+        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
+        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
+        query_outputs = self.qformer(
+            input_ids=qformer_input_ids,
+            attention_mask=qformer_attention_mask,
+            query_embeds=query_tokens,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=True,
+        )
+        query_output = query_outputs.last_hidden_state[:, : query_tokens.size(1), :]
+
+        language_model_inputs = self.language_projection(query_output)
+
+        # unbatch the embeddings back by moving frames to seq-len
+        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
+        language_attention_mask = torch.ones(
+            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
+        )
+
+        if input_ids is None:
+            input_ids = (
+                torch.LongTensor([[self.config.text_config.bos_token_id]])
+                .repeat(batch_size, 1)
+                .to(image_embeds.device)
+            )
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)
+
+        # concatenate query embeddings with prompt embeddings
+        inputs_embeds = self.get_input_embeddings()(input_ids)
+        inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
+
+        # add image_embeds length to max_length, so that the final max_length is counted only on token embeds
+        # -1 is to account for the prepended BOS after `generate`.
+        if not self.language_model.config.is_encoder_decoder:
+            generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
+            generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]
+
+        outputs = self.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            **generate_kwargs,
+        )
+
+        # this is a temporary workaround to be consistent with other generation models and
+        # have BOS as the first token, even though under the hood we are calling LM with embeds
+        if not self.language_model.config.is_encoder_decoder:
+            # the InstructBLIP authors used inconsistent tokenizer/model files during training,
+            # with the tokenizer's bos token being set to `</s>` which has ID=2,
+            # whereas the model's text config has bos token id = 0
+            bos_token_id = (
+                2
+                if self.config.text_config.architectures[0] == "LLaMAForCausalLM"
+                else self.config.text_config.bos_token_id
+            )
+            bos_tokens = 
torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) + if not isinstance(outputs, torch.Tensor): + outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) + else: + outputs = torch.cat([bos_tokens, outputs], dim=-1) + + return outputs diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py new file mode 100644 index 000000000000..69f2feebd39c --- /dev/null +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -0,0 +1,362 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Image processor class for InstructBLIPVideo. Largely copy of Blip2Processor with addition of a video processing abilities +""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + VideoInput, + infer_channel_dimension_format, + is_scaled_image, + is_valid_image, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +def make_batched_videos(videos) -> List[VideoInput]: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] + + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] + + raise ValueError(f"Could not make batched video from {videos}") + + +# Copied from transformers.models.blip.image_processing_blip.BlipImageProcessor with Blip->InstructBlipVideo, BLIP->InstructBLIPVideo +class InstructBlipVideoImageProcessor(BaseImageProcessor): + r""" + Constructs a InstructBLIPVideo image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. 
Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. 
+            size (`Dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    # Ignore copy
+    def preprocess(
+        self,
+        images: VideoInput = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[Dict[str, int]] = None,
+        resample: PILImageResampling = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: bool = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess a video or batch of videos.
+
+        Args:
+            images (`VideoInput`):
+                Video frames to preprocess. Expects a single video or a batch of videos, each as a list of frames with
+                pixel values ranging from 0 to 255. If passing in frames with pixel values between 0 and 1, set
+                `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the video.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Controls the size of each frame after `resize`. Frames are resized to exactly
+                `(size["height"], size["width"])`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the video. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the video values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the video by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the video. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the video by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the video by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + videos = make_batched_videos(images) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if not valid_images(videos): + raise ValueError( + "Invalid input type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) + + pixel_values = [ + [ + self._preprocess_image( + image=frame, + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_convert_rgb=do_convert_rgb, + data_format=data_format, + input_data_format=input_data_format, + ) + for frame in video + ] + for video in videos + ] + + encoded_outputs = BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + return encoded_outputs + + # Ignore copy + def _preprocess_image( + self, + image: ImageInput = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + # PIL RGBA images are converted to RGB + if do_convert_rgb: + image = convert_to_rgb(image) + + # All transformations expect numpy arrays. + image = to_numpy_array(image) + + if is_scaled_image(image) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled video frames. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + + return image diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py new file mode 100644 index 000000000000..4bd97249efd6 --- /dev/null +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -0,0 +1,1665 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM +from .configuration_instructblipvideo import ( + InstructBlipVideoConfig, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, +) + + +logger = logging.get_logger(__name__) + + +@dataclass +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGenerationModelOutput with Blip2->InstructBlipVideo +class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`]. + + Args: + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + vision_outputs: Optional[torch.FloatTensor] = None + qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None + language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->InstructBlipVideo +class InstructBlipVideoVisionEmbeddings(nn.Module): + def __init__(self, config: InstructBlipVideoVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embedding.shape[1] - 1 + + if num_patches == num_positions and height == width: + return self.position_embedding + + class_pos_embed = self.position_embedding[:, 0, :] + patch_pos_embed = self.position_embedding[:, 1:, :] + dim = embeddings.shape[-1] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + position_embedding = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embedding = self.position_embedding + embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2Attention with Blip2->InstructBlipVideo +class InstructBlipVideoAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False) + + if config.qkv_bias: + q_bias = nn.Parameter(torch.zeros(self.embed_dim)) + v_bias = nn.Parameter(torch.zeros(self.embed_dim)) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) + self.qkv.bias = nn.Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute( + 2, 0, 3, 1, 4 + ) + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
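+        # `attention_probs` has shape (batch_size, num_heads, seq_len, seq_len); dropout zeroes whole
+        # attention weights at random during training, and the dropped-out probabilities are also the
+        # ones returned when `output_attentions=True`.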
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from transformers.models.blip.modeling_blip.BlipMLP +class InstructBlipVideoMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->InstructBlipVideo +class InstructBlipVideoEncoderLayer(nn.Module): + def __init__(self, config: InstructBlipVideoConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = InstructBlipVideoAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = InstructBlipVideoMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class InstructBlipVideoPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = InstructBlipVideoConfig + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _no_split_modules = [ + "InstructBlipVideoQFormerEmbeddings", + "InstructBlipVideoAttention", + "InstructBlipVideoQFormerMultiHeadAttention", + "InstructBlipVideoQFormerSelfOutput", + ] + _keep_in_fp32_modules = [] + + # Copied from transformers.models.blip_2.modeling_blip_2.Blip2PreTrainedModel._init_weights with Blip2->InstructBlipVideo + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, InstructBlipVideoVisionEmbeddings): + if hasattr(self.config, "vision_config") and not isinstance(self.config, InstructBlipVideoVisionConfig): + factor = self.config.vision_config.initializer_range + nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) + nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See + [`InstructBlipVideoProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + + +# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->InstructBlipVideo +class InstructBlipVideoEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`InstructBlipVideoEncoderLayer`]. + + Args: + config (`InstructBlipVideoConfig`): + The corresponding vision configuration for the `InstructBlipVideoEncoder`. + """ + + def __init__(self, config: InstructBlipVideoConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([InstructBlipVideoEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +INSTRUCTBLIPVIDEO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InstructBlipVideoConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See + [`InstructBlipVideoProcessor.__call__`] for details. + + qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided + to serve as text prompt, which the Q-Former model will encode. + + Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for + details. 
+ + [What are input IDs?](../glossary#input-ids) + + qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for + details. + + [What are input IDs?](../glossary#input-ids) + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an + encoder-decoder language model (like T5) is used. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) + + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + Only relevant in case an encoder-decoder language model (like T5) is used. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. 
+""" + + +# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->InstructBlipVideo, BLIP->INSTRUCTBLIPVIDEO +class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel): + main_input_name = "pixel_values" + config_class = InstructBlipVideoVisionConfig + + def __init__(self, config: InstructBlipVideoVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = InstructBlipVideoVisionEmbeddings(config) + self.encoder = InstructBlipVideoEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @add_start_docstrings_to_model_forward(INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=InstructBlipVideoVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class InstructBlipVideoQFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_scores_dtype = attention_scores.dtype + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
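+        # Unlike the vision attention above, the pre-dropout `attention_probs` are kept: they are what is
+        # returned when `output_attentions=True` (and saved for the attention-map hooks), while the
+        # dropped-out copy below is used to weight the value vectors.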
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->InstructBlipVideoQFormer +class InstructBlipVideoQFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerAttention with Blip2->InstructBlipVideo +class InstructBlipVideoQFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = InstructBlipVideoQFormerMultiHeadAttention(config, is_cross_attention) + self.output = InstructBlipVideoQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->InstructBlipVideoQFormer +class InstructBlipVideoQFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = 
nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->InstructBlipVideoQFormer +class InstructBlipVideoQFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class InstructBlipVideoQFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = InstructBlipVideoQFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = InstructBlipVideoQFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = InstructBlipVideoQFormerIntermediate(config) + self.output = InstructBlipVideoQFormerOutput(config) + + self.intermediate_query = InstructBlipVideoQFormerIntermediate(config) + self.output_query = InstructBlipVideoQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, 
:], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerEncoder with Blip2->InstructBlipVideo +class InstructBlipVideoQFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [InstructBlipVideoQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class InstructBlipVideoQFormerEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids.to(embeddings.device)) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = embeddings.to(self.layernorm.weight.dtype) + embeddings = self.layernorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel): + """ + Querying Transformer (Q-Former), used in Instructblipvideo. Slightly modified from BLIP-2 as it also takes the + instruction as input. 
+ """ + + def __init__(self, config: InstructBlipVideoQFormerConfig): + super().__init__(config) + self.config = config + + self.embeddings = InstructBlipVideoQFormerEmbeddings(config) + + self.encoder = InstructBlipVideoQFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device: (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})", + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + query_embeds: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and query_embeds is None: + raise ValueError("You have to specify query_embeds when input_ids is None") + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
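+        # `get_extended_attention_mask` (defined above) broadcasts the 2D/3D mask to
+        # (batch_size, 1, [1,] seq_length) and turns it into an additive mask: 0.0 for positions to attend
+        # to and -10000.0 for masked positions, which is then added to the raw attention scores.
+        # The cross-attention mask over the vision features goes through `invert_attention_mask` below in
+        # the same spirit.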
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Instructblipvideo Model for generating text given an image and an optional text prompt. The model consists of a vision + encoder, Querying Transformer (Q-Former) and a language model. + + One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue + the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token. 
+ """, + INSTRUCTBLIPVIDEO_START_DOCSTRING, +) +class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel): + config_class = InstructBlipVideoConfig + main_input_name = "pixel_values" + + def __init__(self, config: InstructBlipVideoConfig): + super().__init__(config) + + self.vision_model = InstructBlipVideoVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.qformer = InstructBlipVideoQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + + if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) + else: + language_model = AutoModelForSeq2SeqLM.from_config( + config.text_config, attn_implementation=config._attn_implementation + ) + + if language_model._no_split_modules is not None: + self._no_split_modules.extend(language_model._no_split_modules) + + if language_model._keep_in_fp32_modules is not None: + self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules) + + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def _preprocess_accelerate(self): + r""" + Some pre-processing hacks to make the model `accelerate` compatible. Check + https://github.com/huggingface/transformers/pull/21707 for more details. + """ + hf_device_map = self.hf_device_map + + if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1: + # warn users about unexpected behavior when using multi-GPU + Instructblipvideo + `accelerate`. + logger.warning( + "The `language_model` is not in the `hf_device_map` dictionary and you are running your script" + " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`." + " Please pass a `device_map` that contains `language_model` to remove this warning." 
+                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
+                " more details on creating a `device_map` for large models.",
+            )
+
+        if hasattr(self.language_model, "_hf_hook"):
+            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility
+
+    @add_start_docstrings_to_model_forward(INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=InstructBlipVideoForConditionalGenerationModelOutput, config_class=InstructBlipVideoVisionConfig
+    )
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        qformer_input_ids: torch.FloatTensor,
+        qformer_attention_mask: Optional[torch.LongTensor] = None,
+        input_ids: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
+            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size]`
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
+        >>> import torch
+        >>> from huggingface_hub import hf_hub_download
+        >>> import av
+        >>> import numpy as np
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`List[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
+        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
+
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+        >>> # sample uniformly 4 frames from the video
+        >>> total_frames = container.streams.video[0].frames
+        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
+        >>> clip = read_video_pyav(container, indices)
+
+        >>> prompt = "What is happening in the video?"
+        >>> inputs = processor(videos=clip, text=prompt, return_tensors="pt").to(model.device)
+
+        >>> outputs = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     num_beams=5,
+        ...     max_length=256,
+        ...     repetition_penalty=1.5,
+        ...     length_penalty=1.0,
+        ...
) + >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front" + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # we process in a batched way, later unbatch it back (video has frames=4 always) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0][:, : query_tokens.size(1), :] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + + # unbatch inputs back, each video-frame gets `num_query_tokens` seq length + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_model_attention_mask.to(attention_mask.device), attention_mask], dim=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, 
-labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(logits.device) + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return InstructBlipVideoForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + qformer_input_ids: Optional[torch.LongTensor] = None, + qformer_attention_mask: Optional[torch.LongTensor] = None, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or + (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed. + qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt to be fed to the Q-Former module. + qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the positional encoding of the image embeddings. + + Returns: + captions (list): A list of strings of length batch_size * num_captions. 
+ """ + if hasattr(self, "hf_device_map"): + # preprocess for `accelerate` + self._preprocess_accelerate() + + # we process in a batched way, later unbatch it back (video has frames=4) + batch_size, frames, channel, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width) + + image_embeds = self.vision_model( + pixel_values, + return_dict=True, + interpolate_pos_encoding=interpolate_pos_encoding, + ).last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device) + if qformer_attention_mask is None: + qformer_attention_mask = torch.ones_like(qformer_input_ids) + + qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0) + qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0) + qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1) + query_outputs = self.qformer( + input_ids=qformer_input_ids, + attention_mask=qformer_attention_mask, + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state[:, : query_tokens.size(1), :] + + language_model_inputs = self.language_projection(query_output) + + # unbatch the embeddings back by moving frames to seq-len + language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1) + language_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + if input_ids is None: + input_ids = ( + torch.LongTensor([[self.config.text_config.bos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1) + + # concatenate query embeddings with prompt embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + + # add image_embeds length to max_length, so that the final max_length in counted only on token embeds + # -1 is to account for the prepended BOS after `generate.` + if not self.language_model.config.is_encoder_decoder: + generate_kwargs["max_length"] = generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 + generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + # this is a temporary workaround to be consistent with other generation models and + # have BOS as the first token, even though under the hood we are calling LM with embeds + if not self.language_model.config.is_encoder_decoder: + # the InstructBLIP authors used inconsistent tokenizer/model files during training, + # with the tokenizer's bos token being set to which has ID=2, + # whereas the model's text config has bos token id = 0 + bos_token_id = ( + 2 + if self.config.text_config.architectures[0] == "LLaMAForCausalLM" + else self.config.text_config.bos_token_id + ) + bos_tokens = 
torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) + if not isinstance(outputs, torch.Tensor): + outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) + else: + outputs = torch.cat([bos_tokens, outputs], dim=-1) + + return outputs diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py new file mode 100644 index 000000000000..095715807751 --- /dev/null +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former. +""" + +import os +from typing import List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import VideoInput +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType +from ..auto import AutoTokenizer + + +class InstructBlipVideoProcessor(ProcessorMixin): + r""" + Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single + processor. + + [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the + docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information. + + Args: + image_processor (`InstructBlipVideoImageProcessor`): + An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + qformer_tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input. 
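+
+    Example (a minimal sketch; the checkpoint id and the random 4-frame clip are illustrative only):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import InstructBlipVideoProcessor
+
+    >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")  # illustrative id
+    >>> clip = np.random.randint(0, 256, (4, 224, 224, 3), dtype=np.uint8)  # one video of 4 frames
+    >>> inputs = processor(images=clip, text="Describe the video.", return_tensors="pt")
+    >>> sorted(inputs.keys())
+    ['attention_mask', 'input_ids', 'pixel_values', 'qformer_attention_mask', 'qformer_input_ids']
+    ```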
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "InstructBlipVideoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer, qformer_tokenizer): + super().__init__(image_processor, tokenizer) + + # add QFormer tokenizer + self.qformer_tokenizer = qformer_tokenizer + + def __call__( + self, + images: VideoInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_token_type_ids: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + """ + encoding = BatchFeature() + + if text is not None: + text_encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + encoding.update(text_encoding) + qformer_text_encoding = self.qformer_tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_token_type_ids=return_token_type_ids, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") + encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") + + if images is not None: + image_encoding = self.image_processor(images, return_tensors=return_tensors) + encoding.update(image_encoding) + + return encoding + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + # overwrite to save the Q-Former tokenizer in a separate folder + def save_pretrained(self, save_directory, **kwargs): + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + os.makedirs(save_directory, exist_ok=True) + qformer_tokenizer_path = os.path.join(save_directory, "qformer_tokenizer") + self.qformer_tokenizer.save_pretrained(qformer_tokenizer_path) + return super().save_pretrained(save_directory, **kwargs) + + # overwrite to load the Q-Former tokenizer from a separate folder + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + qformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="qformer_tokenizer") + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + args.append(qformer_tokenizer) + return cls(*args) diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index 941ff860042a..629490350c7d 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -33,7 +33,13 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) from .configuration_layoutlmv3 import LayoutLMv3Config @@ -910,8 +916,8 @@ def forward( patch_height = patch_width = None if pixel_values is not None: patch_height, patch_width = ( - int(pixel_values.shape[2] / self.config.patch_size), - int(pixel_values.shape[3] / self.config.patch_size), + torch_int(pixel_values.shape[2] / self.config.patch_size), + torch_int(pixel_values.shape[3] / self.config.patch_size), ) visual_embeddings = self.forward_image(pixel_values) visual_attention_mask = torch.ones( diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index a98d44b7484a..a0fbe4680add 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -105,21 +105,18 @@ def write_json(text, path): def write_model( model_path, input_base_path, - model_size, + model_size=None, safe_serialization=True, llama_version=1, vocab_size=None, + num_shards=None, ): - # for backward compatibility, before you needed the repo to be called `my_repo/model_size` - if not os.path.isfile(os.path.join(input_base_path, 
"params.json")): - input_base_path = os.path.join(input_base_path, model_size) - os.makedirs(model_path, exist_ok=True) tmp_model_path = os.path.join(model_path, "tmp") os.makedirs(tmp_model_path, exist_ok=True) params = read_json(os.path.join(input_base_path, "params.json")) - num_shards = NUM_SHARDS[model_size] + num_shards = NUM_SHARDS[model_size] if num_shards is None else num_shards params = params.get("model", params) n_layers = params["n_layers"] n_heads = params["n_heads"] @@ -142,12 +139,13 @@ def write_model( vocab_size = vocab_size if vocab_size is not None else 32000 if params.get("n_kv_heads", None) is not None: num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = n_heads_per_shard // num_key_value_heads - key_value_dim = dim // num_key_value_heads + num_key_value_heads_per_shard = num_key_value_heads // num_shards + key_value_dim = dims_per_head * num_key_value_heads else: # compatibility with other checkpoints num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim + num_key_value_heads_per_shard = n_heads_per_shard + key_value_dim = dims_per_head * num_key_value_heads + print(num_shards, num_key_value_heads, num_key_value_heads_per_shard, key_value_dim) # permute for sliced rotary def permute(w, n_heads, dim1=dim, dim2=dim): @@ -162,8 +160,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): else: # Sharded loaded = [ - torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") - for i in range(num_shards) + torch.load(os.path.join(input_base_path, file), map_location="cpu") + for file in os.listdir(input_base_path) + if file.endswith(".pth") ] param_count = 0 index_dict = {"weight_map": {}} @@ -178,7 +177,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( loaded[f"layers.{layer_i}.attention.wk.weight"], n_heads=num_key_value_heads, - dim1=dim // num_local_key_value_heads, + dim1=key_value_dim, ), f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], @@ -206,7 +205,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): torch.cat( [ loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) + for i in range(len(loaded)) ], dim=0, ).reshape(dim, dim), @@ -216,9 +215,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): torch.cat( [ loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim + num_key_value_heads_per_shard, dims_per_head, dim ) - for i in range(num_shards) + for i in range(len(loaded)) ], dim=0, ).reshape(key_value_dim, dim), @@ -229,24 +228,24 @@ def permute(w, n_heads, dim1=dim, dim2=dim): state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( [ loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_local_key_value_heads, dims_per_head, dim + num_key_value_heads_per_shard, dims_per_head, dim ) - for i in range(num_shards) + for i in range(len(loaded)) ], dim=0, ).reshape(key_value_dim, dim) state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(len(loaded))], dim=1 ) state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - 
[loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(len(loaded))], dim=0 ) state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(len(loaded))], dim=1 ) state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(len(loaded))], dim=0 ) state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq @@ -268,9 +267,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): state_dict = { "model.norm.weight": loaded[0]["norm.weight"], "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=concat_dim + [loaded[i]["tok_embeddings.weight"] for i in range(len(loaded))], dim=concat_dim ), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(len(loaded))], dim=0), } for k, v in state_dict.items(): @@ -310,7 +309,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): model.config.torch_dtype = torch.float16 print("Saving in the Transformers format.") model.save_pretrained(model_path, safe_serialization=safe_serialization) - shutil.rmtree(tmp_model_path) + shutil.rmtree(tmp_model_path, ignore_errors=True) class Llama3Converter(TikTokenConverter): @@ -371,8 +370,8 @@ def main(): ) parser.add_argument( "--model_size", - choices=["7B", "8B", "8Bf", "7Bf", "13B", "13Bf", "30B", "34B", "65B", "70B", "70Bf", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", + default=None, + help="'f' Deprecated in favor of `num_shards`: models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", ) parser.add_argument( "--output_dir", @@ -389,7 +388,15 @@ def main(): type=int, help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. Controls the context size", ) + parser.add_argument( + "--num_shards", + default=None, + type=int, + help="The number of individual shards used for the model. 
Does not have to be the same as the number of consolidated_xx.pth", + ) args = parser.parse_args() + if args.model_size is None and args.num_shards is None: + raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") spm_path = os.path.join(args.input_dir, "tokenizer.model") vocab_size = len(write_tokenizer(args.output_dir, spm_path, llama_version=args.llama_version)) if args.model_size != "tokenizer_only": @@ -400,6 +407,7 @@ def main(): safe_serialization=args.safe_serialization, llama_version=args.llama_version, vocab_size=vocab_size, + num_shards=args.num_shards, ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 7f3c30703304..8cbe8fe35926 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -301,6 +301,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -590,6 +591,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. @@ -687,6 +689,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -701,6 +704,11 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model """ residual = hidden_states diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 6930dcc78c46..34e67ee4221f 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -131,23 +131,5 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self._vocab_size = self.text_config.vocab_size super().__init__(**kwargs) - - @property - def vocab_size(self): - warnings.warn( - "The `vocab_size` attribute is deprecated and will be removed in v4.42, Please use `text_config.vocab_size` instead.", - FutureWarning, - ) - return self._vocab_size - - @vocab_size.setter - def vocab_size(self, value): - self._vocab_size = value - - def to_dict(self): - output = super().to_dict() - output.pop("_vocab_size", None) - return output diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index c052af3b3c8a..23e3c25025fc 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -545,8 +545,9 @@ def _merge_input_ids_with_image_features( ) # Compute the maximum embed dimension # max_image_feature_lens is max_feature_lens per batch + feature_lens = feature_lens.to(input_ids.device) feature_lens_batch = feature_lens.split(num_special_image_tokens.tolist(), dim=0) - feature_lens_batch_sum = torch.tensor([x.sum() for x in feature_lens_batch], device=feature_lens.device) + feature_lens_batch_sum = torch.tensor([x.sum() for x in feature_lens_batch], device=input_ids.device) embed_sequence_lengths = ( (attention_mask == 1).long().sum(-1) - num_special_image_tokens + feature_lens_batch_sum ) @@ -577,9 +578,9 @@ def _merge_input_ids_with_image_features( final_attention_mask = torch.zeros( batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device ) - final_labels = None - if labels is not None: - final_labels = torch.full_like(final_attention_mask, ignore_index).to(torch.long) + final_input_ids = torch.full( + (batch_size, max_embed_dim), self.pad_token_id, dtype=input_ids.dtype, device=inputs_embeds.device + ) # In case the Vision model or the Language model has been offloaded to CPU, we need to manually # set the corresponding tensors into their correct target device. target_device = inputs_embeds.device @@ -589,12 +590,17 @@ def _merge_input_ids_with_image_features( text_to_overwrite.to(target_device), ) attention_mask = attention_mask.to(target_device) + input_ids = input_ids.to(target_device) # 4. Fill the embeddings based on the mask. 
If we have ["hey" "", "how", "are"] # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices] final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices] + final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_image_indices] + final_labels = None if labels is not None: + labels = labels.to(target_device) + final_labels = torch.full_like(final_attention_mask, ignore_index).to(torch.long) final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices] # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835) @@ -609,6 +615,7 @@ def _merge_input_ids_with_image_features( if left_padding: # exclude padding on the left + max_embed_dim = max_embed_dim.to(target_device) val = (max_embed_dim - embed_indices) <= embed_seq_lens else: # exclude padding on the right @@ -626,7 +633,7 @@ def _merge_input_ids_with_image_features( final_attention_mask |= image_to_overwrite position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1) - return final_embedding, final_attention_mask, position_ids, final_labels + return final_embedding, final_attention_mask, position_ids, final_labels, final_input_ids def pack_image_features(self, image_features, image_sizes, image_newline=None): """ @@ -796,7 +803,7 @@ def forward( ) inputs_embeds = inputs_embeds.to(image_features.dtype) - inputs_embeds, attention_mask, position_ids, labels = self._merge_input_ids_with_image_features( + inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features( image_features, feature_lens, inputs_embeds, diff --git a/src/transformers/models/llava_next_video/__init__.py b/src/transformers/models/llava_next_video/__init__.py new file mode 100644 index 000000000000..d079643e73e9 --- /dev/null +++ b/src/transformers/models/llava_next_video/__init__.py @@ -0,0 +1,70 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_llava_next_video": ["LlavaNextVideoConfig"], + "processing_llava_next_video": ["LlavaNextVideoProcessor"], +} + + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_llava_next_video"] = ["LlavaNextVideoImageProcessor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_llava_next_video"] = [ + "LlavaNextVideoForConditionalGeneration", + "LlavaNextVideoPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_llava_next_video import LlavaNextVideoConfig + from .processing_llava_next_video import LlavaNextVideoProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_llava_next_video import LlavaNextVideoImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_llava_next_video import ( + LlavaNextVideoForConditionalGeneration, + LlavaNextVideoPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py new file mode 100644 index 000000000000..59bf460e84a6 --- /dev/null +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -0,0 +1,153 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from transformers import PretrainedConfig + +from ..auto import CONFIG_MAPPING + + +class LlavaNextVideoConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlavaNextVideoForConditionalGeneration`]. It is used to instantiate an + Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [llava-hf/LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) + model. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + image_token_index (`int`, *optional*, defaults to 32001): + The image token index to encode the image prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_feature_layer (`int`, *optional*, defaults to -2): + The index of the layer to select the vision feature. + image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`): + A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list + of the form `(height, width)`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + video_token_index (`int`, *optional*, defaults to 32000): + The video token index to encode the image prompt. + spatial_pool_mode (`str`, *optional*, defaults to `"average"`): + Pooling mode to use for videos. Can be "average", "max" or "conv". + spatial_pool_stride (`int`, *optional*, defaults to 2): + Stride used in the pooling layer for videos. + + Example: + + ```python + >>> from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoConfig, CLIPVisionConfig, LlamaConfig + + >>> # Initializing a CLIP-vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> configuration = LlavaNextVideoConfig(vision_config, text_config) + + >>> model = LlavaNextVideoForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava_next_video" + is_composition = True + + def __init__( + self, + vision_config=None, + text_config=None, + ignore_index=-100, + image_token_index=32001, + projector_hidden_act="gelu", + vision_feature_select_strategy="default", + vision_feature_layer=-2, + image_grid_pinpoints=None, + tie_word_embeddings=False, + video_token_index=32000, + spatial_pool_mode="average", + spatial_pool_stride=2, + **kwargs, + ): + self.video_token_index = video_token_index + self.spatial_pool_mode = spatial_pool_mode + self.spatial_pool_stride = spatial_pool_stride + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." 
+ f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + image_grid_pinpoints = ( + image_grid_pinpoints + if image_grid_pinpoints is not None + else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] + ) + self.image_grid_pinpoints = image_grid_pinpoints + + if isinstance(vision_config, dict): + vision_config["model_type"] = ( + vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model" + ) + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["clip_vision_model"]( + intermediate_size=4096, + hidden_size=1024, + patch_size=14, + image_size=336, + num_hidden_layers=24, + num_attention_heads=16, + vocab_size=32000, + projection_dim=768, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["llama"]() + + self.text_config = text_config + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py b/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py new file mode 100644 index 000000000000..aae44eee97a0 --- /dev/null +++ b/src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py @@ -0,0 +1,276 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert LLaVa-NeXT-Video checkpoints from the original repository. 
+
+URL: https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference
+"""
+
+import argparse
+import glob
+import json
+from pathlib import Path
+
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download, snapshot_download
+from safetensors import safe_open
+
+from transformers import (
+    AddedToken,
+    AutoConfig,
+    AutoTokenizer,
+    LlavaNextImageProcessor,
+    LlavaNextVideoConfig,
+    LlavaNextVideoForConditionalGeneration,
+    LlavaNextVideoImageProcessor,
+    LlavaNextVideoProcessor,
+)
+
+
+KEYS_TO_MODIFY_MAPPING = {
+    "model.vision_tower.": "",
+    ".vision_resampler": "",  # all lmms-lab models do avg pooling, so no vision_resampler
+    "model.mm_projector": "multi_modal_projector",
+    "model": "model.model",
+    "vision_model.model": "vision_model",
+    "lm_head": "language_model.lm_head",
+    "model.model": "language_model.model",
+    "multi_modal_projector.0": "multi_modal_projector.linear_1",
+    "multi_modal_projector.2": "multi_modal_projector.linear_2",
+    "language_model.model.image_newline": "image_newline",
+}
+
+# {{SYSTEM_PROMPT}} USER: <image>\n{{PROMPT}} ASSISTANT:" assistant end with "</s> "
+chat_vicuna = (
+    "{% for message in messages %}"
+    "{% if message['role'] == 'system' %}"
+    "{{ message['content'][0]['text'] }}"
+    "{% else %}"
+    "{{ message['role'].upper() + ': '}}"
+    "{% endif %}"
+    "{# Render all images first #}"
+    "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
+    "{{ '<image>\n' }}"
+    "{% endfor %}"
+    "{# Render all text next #}"
+    "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
+    "{{ content['text'] + ' '}}"
+    "{% endfor %}"
+    "{% endfor %}"
+    "{% if add_generation_prompt %}"
+    "{{ 'ASSISTANT:' }}"
+    "{% endif %}"
+)
+
+# "[INST] <image>\nWhat is shown in this image? [/INST]" assistant end with "</s> "
+chat_mistral = (
+    "{% for message in messages %}"
+    "{% if message['role'] == 'user' %}"
+    "{{ '[INST] ' }}"
+    "{# Render all images first #}"
+    "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
+    "{{ '<image>\n' }}"
+    "{% endfor %}"
+    "{# Render all text next #}"
+    "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
+    "{{ content['text'] }}"
+    "{% endfor %}"
+    "{{' [/INST]' }}"
+    "{% elif message['role'] == 'assistant' %}"
+    r"{{ ' ' + message['content'][0]['text'] + '<\s> '}}"
+    "{% else %}"
+    "{{ raise_exception('Only user and assistant roles are supported!') }}"
+    "{% endif %}"
+    "{% endfor %}"
+)
+
+# "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
+chat_yi = (
+    "{% for message in messages %}"
+    "{{'<|im_start|>' + message['role'] + '\n'}}"
+    "{# Render all images first #}"
+    "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
+    "{{ '<image>\n' }}"
+    "{% endfor %}"
+    "{# Render all text next #}"
+    "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
+    "{{ content['text'] }}"
+    "{% endfor %}"
+    "{{'<|im_end|>' + '\n'}}"
+    "{% endfor %}"
+    "{% if add_generation_prompt %}"
+    "{{ '<|im_start|>assistant\n' }}"
+    "{% endif %}"
+)
+
+model2template = {
+    "lmms-lab/LLaVA-NeXT-Video-7B-32K": chat_mistral,
+    "lmms-lab/LLaVA-NeXT-Video-7B": chat_vicuna,
+    "lmms-lab/LLaVA-NeXT-Video-7B-DPO": chat_vicuna,
+    "lmms-lab/LLaVA-NeXT-Video-34B": chat_yi,
+    "lmms-lab/LLaVA-NeXT-Video-34B-DPO": chat_yi,
+}
+
+
+def load_original_state_dict(model_id):
+    directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"])
+
+    original_state_dict = {}
+    for path in glob.glob(f"{directory_path}/*"):
+        if path.endswith(".safetensors"):
+            with safe_open(path, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    original_state_dict[key] = f.get_tensor(key)
+
+    return original_state_dict
+
+
+def convert_state_dict_to_hf(state_dict):
+    new_state_dict = {}
+    for key, value in state_dict.items():
+        if key.endswith(".inv_freq"):
+            continue
+        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+
+        new_state_dict[key] = value.to(torch.bfloat16)
+    return new_state_dict
+
+
+def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
+    # load original config
+    filepath = hf_hub_download(repo_id=model_id, filename="config.json", repo_type="model")
+    with open(filepath) as f:
+        data = json.load(f)
+        print(data)
+
+    if model_id == "lmms-lab/LLaVA-NeXT-Video-7B-32K":
+        text_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+        video_token_index = 32000
+        image_token_index = 32001
+        overwrite_text_config = {}
+    elif model_id in ["lmms-lab/LLaVA-NeXT-Video-7B", "lmms-lab/LLaVA-NeXT-Video-7B-DPO"]:
+        text_model_id = "lmsys/vicuna-7b-v1.5"
+        video_token_index = 32000
+        image_token_index = 32001
+        overwrite_text_config = {"factor": 2.0, "type": "linear"}
+    elif model_id in ["lmms-lab/LLaVA-NeXT-Video-34B", "lmms-lab/LLaVA-NeXT-Video-34B-DPO"]:
+        text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B"
+        video_token_index = 64000
+        image_token_index = 64001
+        overwrite_text_config = {}
+    else:
+        raise ValueError("Incorrect checkpoint referenced.
Text model-id not identified!") + + vision_model_id = data["mm_vision_tower"] + + torch.set_default_dtype(torch.bfloat16) + text_config = AutoConfig.from_pretrained(text_model_id) + text_config = text_config.to_dict() + text_config.update(overwrite_text_config) + + tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=True, padding_side="left") + tokenizer.add_tokens(AddedToken(" diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index ce87bc862313..ddd329817aff 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -60,6 +60,8 @@ tensor_size, to_numpy, to_py_obj, + torch_float, + torch_int, transpose, working_or_temp_dir, ) diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index e689fee20fe8..86a1fae4ad0c 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -313,7 +313,6 @@ def load_backbone(config): use_pretrained_backbone = getattr(config, "use_pretrained_backbone", None) backbone_checkpoint = getattr(config, "backbone", None) backbone_kwargs = getattr(config, "backbone_kwargs", None) - backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs if backbone_kwargs and backbone_config is not None: diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0cda4ed7b963..c9267debc5de 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4197,6 +4197,41 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Gemma2ForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Gemma2ForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Gemma2ForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Gemma2Model(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Gemma2PreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GitForCausalLM(metaclass=DummyObject): _backends = ["torch"] @@ -4755,6 +4790,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class InstructBlipVideoPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class InstructBlipVideoQFormerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class InstructBlipVideoVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class JambaForCausalLM(metaclass=DummyObject): _backends = ["torch"] @@ -5112,6 +5175,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class LlavaNextVideoForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def 
__init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class LlavaNextVideoPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class LongformerForMaskedLM(metaclass=DummyObject): _backends = ["torch"] @@ -7492,6 +7569,41 @@ def load_tf_weights_in_roformer(*args, **kwargs): requires_backends(load_tf_weights_in_roformer, ["torch"]) +class RTDetrForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RTDetrModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RTDetrPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RTDetrResNetBackbone(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RTDetrResNetPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RwkvForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index a27dc024447f..359c5481757d 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -303,6 +303,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class InstructBlipVideoImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class LayoutLMv2FeatureExtractor(metaclass=DummyObject): _backends = ["vision"] @@ -352,6 +359,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class LlavaNextVideoImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class Mask2FormerImageProcessor(metaclass=DummyObject): _backends = ["vision"] @@ -492,6 +506,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class RTDetrImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class SamImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 4a3c1d970116..80232898ce47 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -753,6 +753,30 @@ def infer_framework(model_class): raise TypeError(f"Could not infer framework from class {model_class}.") +def torch_int(x): + """ + Casts an input to a torch int64 tensor if we are in a tracing context, otherwise to a Python int. + """ + if not is_torch_available(): + return int(x) + + import torch + + return x.to(torch.int64) if torch.jit.is_tracing() else int(x) + + +def torch_float(x): + """ + Casts an input to a torch float32 tensor if we are in a tracing context, otherwise to a Python float. 
+ """ + if not is_torch_available(): + return int(x) + + import torch + + return x.to(torch.float32) if torch.jit.is_tracing() else int(x) + + def filter_out_non_signature_kwargs(extra: Optional[list] = None): """ Decorator to filter out named arguments that are not in the function signature. diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 310101e0a9a2..53a21ccae8b4 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -754,10 +754,13 @@ def is_torch_xpu_available(check_device=False): if not is_torch_available(): return False - import torch - + torch_version = version.parse(_torch_version) if is_ipex_available(): import intel_extension_for_pytorch # noqa: F401 + elif torch_version.major < 2 or (torch_version.major == 2 and torch_version.minor < 4): + return False + + import torch if check_device: try: diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 58425fecbcb9..2eb3a40b99d6 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -108,13 +108,13 @@ def require_deepspeed_aio(test_case): Decorator marking a test that requires deepspeed aio (nvme) """ if not is_deepspeed_available(): - return unittest.skip("test requires deepspeed")(test_case) + return unittest.skip(reason="test requires deepspeed")(test_case) import deepspeed from deepspeed.ops.aio import AsyncIOBuilder if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: - return unittest.skip("test requires deepspeed async-io")(test_case) + return unittest.skip(reason="test requires deepspeed async-io")(test_case) else: return test_case @@ -643,7 +643,7 @@ def test_early_get_last_lr(self, stage, dtype): # print(trainer.model.b.item()) # need to investigate at some point if (stage == ZERO3 and dtype == FP16) or (dtype == BF16): - return + self.skipTest(reason="When using zero3/fp16 or any/bf16 the optimizer seems run oddly") # it's enough that train didn't fail for this test, but we must check that # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing) @@ -795,7 +795,7 @@ def test_can_resume_training_normal(self, stage, dtype, optim, scheduler): # ToDo: Currently, hf_optim + hf_scheduler resumes with the correct states and # also has same losses for few steps but then slowly diverges. Need to figure it out. 
if optim == HF_OPTIM and scheduler == HF_SCHEDULER: - return + self.skipTest(reason="hf_optim + hf_scheduler resumes with the correct states but slowly diverges") output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) ds_config_dict = self.get_config_dict(stage) @@ -1113,7 +1113,7 @@ def test_resume_train_not_from_ds_checkpoint(self, stage, dtype): @require_torch_multi_accelerator def test_inference(self, dtype): if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device): - self.skipTest("test requires bfloat16 hardware support") + self.skipTest(reason="test requires bfloat16 hardware support") # this is just inference, so no optimizer should be loaded # it only works for z3 (makes no sense with z1-z2) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index a35ea1a8e7eb..9bf34c366927 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -80,7 +80,7 @@ def run_seq2seq_quick( logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history if not do_eval: - return + self.skipTest(reason="do_eval is False") eval_metrics = [log for log in logs if "eval_loss" in log.keys()] diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index 9ae55ecdec2d..ff5bd8510697 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -14,6 +14,7 @@ import itertools import os +import subprocess import unittest from copy import deepcopy from functools import partial @@ -31,6 +32,7 @@ require_accelerate, require_fsdp, require_torch_accelerator, + require_torch_gpu, require_torch_multi_accelerator, slow, torch_device, @@ -276,6 +278,20 @@ def test_training_and_can_resume_normally(self, state_dict_type): if "learning_rate" in log: self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5) + @require_torch_multi_accelerator + @slow + @require_torch_gpu + @require_fsdp + def test_fsdp_cpu_offloading(self): + try: + subprocess.run( + "accelerate launch utils/testing_scripts/fsdp_cpu_offloading.py --config utils/testing_scripts/dummy_fsdp_config.yml", + shell=True, + check=True, + ) + except: # noqa + raise AssertionError("CPU offloading failed with FSDP!") + def run_cmd_and_get_logs(self, use_accelerate, sharding_strategy, launcher, script, args, output_dir): if not use_accelerate: fsdp_args = [ diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index f61adbbd906c..3293cc279d01 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -463,9 +463,9 @@ def test_greedy_generate_dict_outputs_use_cache(self): config, input_ids, attention_mask = self._get_input_ids_and_config() if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): - self.skipTest("Won't fix: model with non-standard dictionary output shapes") + self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") config.use_cache = True config.is_decoder = True @@ -625,9 +625,9 @@ def test_beam_search_generate_dict_outputs_use_cache(self): config, input_ids, attention_mask = self._get_input_ids_and_config() if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): - 
self.skipTest("Won't fix: model with non-standard dictionary output shapes") + self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") model = model_class(config).to(torch_device).eval() logits_process_kwargs, _ = self._get_logits_processor_and_warper_kwargs( @@ -667,7 +667,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): def test_model_parallel_beam_search(self): for model_class in self.all_generative_model_classes: if "xpu" in torch_device: - return unittest.skip("device_map='auto' does not work with XPU devices") + return unittest.skip(reason="device_map='auto' does not work with XPU devices") if model_class._no_split_modules is None: continue @@ -765,7 +765,7 @@ def test_generate_without_input_ids(self): # if no bos token id => cannot generate from None if config.bos_token_id is None: - return + self.skipTest(reason="bos_token_id is None") # hack in case they are equal, otherwise the attn mask will be [0] if config.bos_token_id == config.pad_token_id: @@ -982,17 +982,17 @@ def test_constrained_beam_search_generate_dict_output(self): def test_contrastive_generate(self): for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support contrastive search generation") + self.skipTest(reason="Stateful models don't support contrastive search generation") # won't fix: FSMT and Reformer have a different cache variable type (and format). if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1009,17 +1009,17 @@ def test_contrastive_generate(self): def test_contrastive_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support contrastive search generation") + self.skipTest(reason="Stateful models don't support contrastive search generation") # won't fix: FSMT and Reformer have a different cache variable type (and format). if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1045,18 +1045,18 @@ def test_contrastive_generate_low_memory(self): # Check that choosing 'low_memory' does not change the model output for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support contrastive search generation") + self.skipTest(reason="Stateful models don't support contrastive search generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer", "speech2text"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]): - self.skipTest("TODO: fix me") + self.skipTest(reason="TODO: fix me") config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1087,9 +1087,9 @@ def test_beam_search_low_memory(self): # Check that choosing 'low_memory' does not change the model output for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("May fix in the future: need custom cache handling") + self.skipTest(reason="May fix in the future: need custom cache handling") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1102,7 +1102,7 @@ def test_beam_search_low_memory(self): "jamba", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") config, input_ids, _ = self._get_input_ids_and_config(batch_size=2) # batch_size=1 is ok, but batch_size>1 will cause non-identical output @@ -1135,9 +1135,9 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support assisted generation") + self.skipTest(reason="Stateful models don't support assisted generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1151,14 +1151,14 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): "clvp", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1206,9 +1206,9 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support assisted generation") + self.skipTest(reason="Stateful models don't support assisted generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1222,14 +1222,14 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): "clvp", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1268,9 +1268,9 @@ def test_assisted_decoding_sample(self): # different shapes, see https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535). for model_class in self.all_generative_model_classes: if model_class._is_stateful: - self.skipTest("Stateful models don't support assisted generation") + self.skipTest(reason="Stateful models don't support assisted generation") if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): - self.skipTest("Won't fix: old model with different cache format") + self.skipTest(reason="Won't fix: old model with different cache format") if any( model_name in model_class.__name__.lower() for model_name in [ @@ -1284,14 +1284,14 @@ def test_assisted_decoding_sample(self): "clvp", ] ): - self.skipTest("May fix in the future: need model-specific fixes") + self.skipTest(reason="May fix in the future: need model-specific fixes") # enable cache config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1) # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") config.use_cache = True config.is_decoder = True @@ -1436,7 +1436,7 @@ def test_past_key_values_format(self): # If it doesn't support cache, pass the test if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") model = model_class(config).to(torch_device) if "use_cache" not in inputs: @@ -1445,7 +1445,7 @@ def test_past_key_values_format(self): # If "past_key_values" is not returned, pass the test (e.g. 
RWKV uses a different cache name and format) if "past_key_values" not in outputs: - self.skipTest("This model doesn't return `past_key_values`") + self.skipTest(reason="This model doesn't return `past_key_values`") num_hidden_layers = ( getattr(config, "decoder_layers", None) @@ -1553,14 +1553,14 @@ def test_generate_continue_from_past_key_values(self): # Tests that we can continue generating from past key values, returned from a previous `generate` call for model_class in self.all_generative_model_classes: if any(model_name in model_class.__name__.lower() for model_name in ["imagegpt"]): - self.skipTest("Won't fix: old model with unique inputs/caches/other") + self.skipTest(reason="Won't fix: old model with unique inputs/caches/other") if any(model_name in model_class.__name__.lower() for model_name in ["umt5"]): - self.skipTest("TODO: needs modeling or test input preparation fixes for compatibility") + self.skipTest(reason="TODO: needs modeling or test input preparation fixes for compatibility") config, inputs = self.model_tester.prepare_config_and_inputs_for_common() if not hasattr(config, "use_cache"): - self.skipTest("This model doesn't support caching") + self.skipTest(reason="This model doesn't support caching") # Let's make it always: # 1. use cache (for obvious reasons) @@ -1582,7 +1582,7 @@ def test_generate_continue_from_past_key_values(self): # If "past_key_values" is not returned, skip the test (e.g. RWKV uses a different cache name and format) outputs = model(**inputs) if "past_key_values" not in outputs: - self.skipTest("This model doesn't return `past_key_values`") + self.skipTest(reason="This model doesn't return `past_key_values`") # Traditional way of generating text, with `return_dict_in_generate` to return the past key values outputs = model.generate(**inputs, do_sample=False, max_new_tokens=4, return_dict_in_generate=True) @@ -1632,7 +1632,7 @@ def test_new_cache_format(self, num_beams, do_sample): # 👉 tests with and without sampling so we can cover the most common use cases. 
for model_class in self.all_generative_model_classes: if not model_class._supports_cache_class: - self.skipTest("This model does not support the new cache format") + self.skipTest(reason="This model does not support the new cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() config.use_cache = True @@ -1689,7 +1689,7 @@ def test_new_cache_format(self, num_beams, do_sample): def test_generate_with_quant_cache(self): for model_class in self.all_generative_model_classes: if not model_class._supports_quantized_cache: - self.skipTest("This model does not support the quantized cache format") + self.skipTest(reason="This model does not support the quantized cache format") config, input_ids, attention_mask = self._get_input_ids_and_config() config.use_cache = True diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py index e3f39257a68c..beb910b9d155 100644 --- a/tests/models/albert/test_tokenization_albert.py +++ b/tests/models/albert/test_tokenization_albert.py @@ -67,7 +67,7 @@ def test_vocab_size(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 50a29eb550ca..35000db677d3 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -23,7 +23,6 @@ from transformers import AlignConfig, AlignProcessor, AlignTextConfig, AlignVisionConfig from transformers.testing_utils import ( - is_flax_available, require_torch, require_vision, slow, @@ -56,10 +55,6 @@ from PIL import Image -if is_flax_available(): - pass - - class AlignVisionModelTester: def __init__( self, @@ -215,9 +210,11 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -355,9 +352,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -518,7 +517,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 4f258e4ddb23..83b6d60595d3 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -178,9 +178,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -309,7 +311,7 @@ class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False # TODO (@SunMarc): Fix me - @unittest.skip("It's broken.") + @unittest.skip(reason="It's broken.") def 
test_resize_tokens_embeddings(self): super().test_resize_tokens_embeddings() @@ -324,9 +326,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -487,7 +491,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index 47e0beaeae80..9bb8ef33d759 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -754,7 +754,7 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs)[0] - @unittest.skip("FineModel relies on codebook idx and does not return same logits") + @unittest.skip(reason="FineModel relies on codebook idx and does not return same logits") def test_inputs_embeds_matches_input_ids(self): pass @@ -826,7 +826,7 @@ def test_resize_tokens_embeddings(self): # resizing tokens_embeddings of a ModuleList original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -877,7 +877,7 @@ def test_resize_embeddings_untied(self): # resizing tokens_embeddings of a ModuleList original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is False") original_config.tie_word_embeddings = False @@ -931,7 +931,7 @@ def test_resize_embeddings_untied(self): def test_flash_attn_2_inference_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support flash_attention_2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -988,7 +988,7 @@ def test_flash_attn_2_inference_equivalence(self): def test_flash_attn_2_inference_equivalence_right_padding(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support flash_attention_2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index ba9e112c186e..a65ec043de82 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1515,9 +1515,10 @@ def test_decoder_model_attn_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return + @unittest.skip def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index f3a63d6d417f..274312983f18 100644 --- 
a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -147,6 +147,7 @@ def test_special_tokens(self): self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item()) self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item()) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py index b2b0c7b058d2..c76435958c6a 100644 --- a/tests/models/barthez/test_tokenization_barthez.py +++ b/tests/models/barthez/test_tokenization_barthez.py @@ -75,7 +75,7 @@ def test_prepare_batch(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 0e3e3e32d270..ac64f0fd3b0b 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -301,7 +301,7 @@ def test_for_semantic_segmentation(self): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -325,7 +325,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 8b2dbc3634ba..6ae9f6c279de 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -614,7 +614,7 @@ def test_torchscript_device_change(self): for model_class in self.all_model_classes: # BertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == BertForMultipleChoice: - return + self.skipTest(reason="BertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index 5cebf58029f9..747b0cf2a732 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -79,7 +79,7 @@ def test_full_tokenizer(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index dd22eb4a6d22..7a7ad5071df2 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -716,7 +716,7 @@ def test_block_sparse_attention_probs(self): """ if not self.test_attention_probs: - return + self.skipTest(reason="test_attention_probs is set to False") model = BigBirdModel.from_pretrained( "google/bigbird-roberta-base", attention_type="block_sparse", num_random_blocks=3, block_size=16 diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index 863d30e84990..25f8de17700f 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -63,7 +63,7 @@ def test_vocab_size(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 5e4ce9f0bbfc..357b91a41e57 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -335,14 +335,15 @@ def test_model_various_attn_type(self): def test_generate_without_input_ids(self): if self.model_tester.attention_type == "block_sparse": - # this test can never pass for BigBird-block-sparse attention since input_ids must be multiple of block_size - return + self.skipTest( + "Cannot pass for BigBird-block-sparse attention since input_ids must be multiple of block_size" + ) super().test_generate_without_input_ids() def test_retain_grad_hidden_states_attentions(self): if self.model_tester.attention_type == "block_sparse": # this test can't pass since attention matrix (which is getting returned) can't have gradients (& just 0 at many locations) - return + self.skipTest(reason="Cannot pass since returned attention matrix can't have gradients") super().test_retain_grad_hidden_states_attentions() # BigBirdPegasusForSequenceClassification does not support inputs_embeds @@ -811,6 +812,6 @@ def test_decoder_model_attn_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="Decoder cannot retain gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index 51e836a1f90c..1ccb2b54cc9a 100644 --- 
a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -414,7 +414,7 @@ def test_biogpt_sequence_classification_model_for_multi_label(self): result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @unittest.skip("The `input_embeds` when fed don't produce the same results.") + @unittest.skip(reason="The `input_embeds` when fed don't produce the same results.") def test_beam_sample_generate(self): pass diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py index 6c6a0185f397..fa0797cbeed8 100644 --- a/tests/models/blenderbot/test_modeling_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_blenderbot.py @@ -565,6 +565,6 @@ def test_decoder_model_attn_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 6f28b5959c74..6be86a66b98e 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -564,6 +564,6 @@ def test_decoder_model_attn_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) + @unittest.skip(reason="decoder cannot keep gradients") def test_retain_grad_hidden_states_attentions(self): - # decoder cannot keep gradients return diff --git a/tests/models/blip/test_image_processing_blip.py b/tests/models/blip/test_image_processing_blip.py index 905e1dad55e2..9be86359a1c3 100644 --- a/tests/models/blip/test_image_processing_blip.py +++ b/tests/models/blip/test_image_processing_blip.py @@ -130,18 +130,18 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processor, "image_std")) self.assertTrue(hasattr(image_processor, "do_convert_rgb")) - @unittest.skip("BlipImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="BlipImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_numpy(self): return super().test_call_numpy() - @unittest.skip("BlipImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="BlipImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_pytorch(self): return super().test_call_torch() - @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip(reason="BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_pil(self): pass - @unittest.skip("BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip(reason="BLIP doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index db71336e3466..2f8ee3229ff2 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -193,9 +193,11 @@ def test_model(self): 
config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -335,9 +337,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -491,7 +495,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -932,7 +936,7 @@ def test_forward_signature(self): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -951,7 +955,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1008,7 +1012,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -1160,7 +1164,7 @@ def test_forward_signature(self): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1179,7 +1183,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not setup for training") for model_class in self.all_model_classes[:-1]: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -1224,7 +1228,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index ea6e138b7e72..85ab462a0d54 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -141,9 +141,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 9e295325b3fd..28ed3a79cae5 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ 
b/tests/models/blip_2/test_modeling_blip_2.py @@ -187,9 +187,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index d0ee36dc3ca1..0952cfee3b74 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -389,7 +389,7 @@ def test_bloom_weight_initialization(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bloom_weight_initialization(*config_and_inputs) - @unittest.skip("Bloom has a non-standard KV cache format.") + @unittest.skip(reason="Bloom has a non-standard KV cache format.") def test_past_key_values_format(self): pass diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 7a62f8f1a7fe..a477d2538c7c 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -43,7 +43,7 @@ def get_rust_tokenizer(self, **kwargs): kwargs.update(self.special_tokens_map) return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) - @unittest.skip("This needs a slow tokenizer. Bloom does not have one!") + @unittest.skip(reason="This needs a slow tokenizer. Bloom does not have one!") def test_encode_decode_with_spaces(self): return diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index 1dc5419b77c8..48268c8d3f56 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -17,6 +17,8 @@ import unittest from typing import Dict, List, Optional, Union +import numpy as np + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available @@ -84,6 +86,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] scale = size / min(w, h) diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 0033021ea726..c3075beb5063 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -300,15 +300,15 @@ def test_decode_single_bytes(self): self.assertTrue(tokenizer.decode([255]) == "") - # tokenizer does not have vocabulary + @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary") def test_get_vocab(self): pass - # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string") def test_pretokenized_inputs(self): pass - # tests all ids in vocab => vocab doesn't exist so unnecessary to test + @unittest.skip(reason="ByT5Tokenizer does not have a vocabulary") def test_conversion_reversible(self): pass diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index 624338b7f0b1..1ff43e359d5e 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ 
b/tests/models/camembert/test_tokenization_camembert.py @@ -94,7 +94,7 @@ def test_rust_and_python_bpe_tokenizers(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -144,7 +144,7 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir) self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens) self.assertIn(new_eos, tokenizer.added_tokens_decoder.values()) self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos) - self.assertDictEqual(expected, tokenizer.added_tokens_decoder) + self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items())) return tokenizer new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False) @@ -198,7 +198,12 @@ def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir) self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values())) # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"): - self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder) + self.assertTrue( + all( + item in tokenizer_fast.added_tokens_decoder.items() + for item in EXPECTED_ADDED_TOKENS_DECODER.items() + ) + ) EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder with tempfile.TemporaryDirectory() as tmp_dir_4: diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 3e7b27638c24..efc70dff499c 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -441,7 +441,7 @@ def recursive_check(tuple_object, dict_object): def test_headmasking(self): if not self.test_head_masking: - return + self.skipTest(reason="test_head_masking is set to False") global_rng.seed(42) config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -496,7 +496,7 @@ def check_attentions_validity(attentions): check_attentions_validity(outputs.attentions) - @unittest.skip("CANINE does not have a get_input_embeddings() method.") + @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") def test_inputs_embeds(self): # ViT does not use inputs_embeds pass @@ -505,7 +505,7 @@ def test_inputs_embeds(self): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("CANINE does not have a get_input_embeddings() method.") + @unittest.skip(reason="CANINE does not have a get_input_embeddings() method.") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index d34ac324eac4..e7e19c63ce93 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -303,31 +303,32 @@ def test_tokenizers_common_ids_setters(self): self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [additional_special_token]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [additional_special_token_id]) - # tokenizer has a fixed vocab_size (namely all possible unicode code 
points) + @unittest.skip(reason="tokenizer has a fixed vocab_size (namely all possible unicode code points)") def test_add_tokens_tokenizer(self): pass # CanineTokenizer does not support do_lower_case = True, as each character has its own Unicode code point # ("b" and "B" for example have different Unicode code points) + @unittest.skip(reason="CanineTokenizer does not support do_lower_case = True") def test_added_tokens_do_lower_case(self): pass - # CanineModel does not support the get_input_embeddings nor the get_vocab method + @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method") def test_np_encode_plus_sent_to_model(self): pass - # CanineModel does not support the get_input_embeddings nor the get_vocab method + @unittest.skip(reason="CanineModel does not support the get_input_embeddings nor the get_vocab method") def test_torch_encode_plus_sent_to_model(self): pass - # tokenizer does not have vocabulary + @unittest.skip(reason="CanineTokenizer does not have vocabulary") def test_get_vocab(self): pass - # inputs cannot be pretokenized since ids depend on whole input string and not just on single characters + @unittest.skip(reason="inputs cannot be pretokenized since ids depend on whole input string") def test_pretokenized_inputs(self): pass - # tests all ids in vocab => vocab doesn't exist so unnecessary to test + @unittest.skip(reason="CanineTokenizer does not have vocabulary") def test_conversion_reversible(self): pass diff --git a/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/tests/models/chinese_clip/test_image_processing_chinese_clip.py index 94e41e8eaa06..168f84e98426 100644 --- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py @@ -17,7 +17,7 @@ import unittest from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_torch_available, is_vision_available +from transformers.utils import is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -26,10 +26,6 @@ from transformers import ChineseCLIPImageProcessor -if is_torch_available(): - pass - - class ChineseCLIPImageProcessingTester(unittest.TestCase): def __init__( self, @@ -125,7 +121,9 @@ def test_image_processor_from_dict_with_kwargs(self): self.assertEqual(image_processor.size, {"shortest_edge": 42}) self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) - @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip( + reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass @@ -155,14 +153,16 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "image_std")) self.assertTrue(hasattr(image_processing, "do_convert_rgb")) - @unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_numpy(self): return super().test_call_numpy() - @unittest.skip("ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy + @unittest.skip(reason="ChineseCLIPImageProcessor does not support 4 channels yet") # FIXME Amy def test_call_pytorch(self): return super().test_call_torch() - @unittest.skip("ChineseCLIPImageProcessor doesn't treat 4 
channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip( + reason="ChineseCLIPImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 17d8ddcb1c43..7046f28b5f94 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -388,9 +388,11 @@ def test_model_from_pretrained(self): model = ChineseCLIPTextModel.from_pretrained(model_name) self.assertIsNotNone(model) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -466,9 +468,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -621,7 +625,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 7cb558b97a9d..8e3392133f1f 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -562,7 +562,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 5221274ffae9..78a5fb6f9adf 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -220,9 +220,11 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -381,9 +383,11 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -535,7 +539,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -636,7 +640,7 @@ def test_equivalence_pt_to_flax(self): fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) @@ -692,8 +696,7 @@ def test_equivalence_flax_to_pt(self): fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, 
fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index 5885f8933c18..c24f554a0788 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -178,7 +178,6 @@ def test_log_warning(self): def test_tokenization_python_rust_equals(self): super().test_tokenization_python_rust_equals() - # overwrite common test + @unittest.skip(reason="CLIP always lower cases letters") def test_added_tokens_do_lower_case(self): - # CLIP always lower cases letters pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index e9bfd2201e88..a6f286c4c6b7 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -194,9 +194,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -331,9 +333,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -540,7 +544,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -641,7 +645,7 @@ def test_equivalence_pt_to_flax(self): fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) @@ -697,8 +701,7 @@ def test_equivalence_flax_to_pt(self): fx_model_class_name = "Flax" + model_class.__name__ if not hasattr(transformers, fx_model_class_name): - # no flax model exists for this class - return + self.skipTest(reason="No Flax model exists for this class") fx_model_class = getattr(transformers, fx_model_class_name) @@ -744,7 +747,7 @@ def test_equivalence_flax_to_pt(self): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="Training test is skipped as the model was not trained") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py index 7bb522f41442..71ea9c08c831 100644 --- a/tests/models/clvp/test_tokenization_clvp.py +++ b/tests/models/clvp/test_tokenization_clvp.py @@ -102,7 +102,7 @@ def test_add_special_tokens(self): # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) diff --git 
a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py index fd4b38a17ec8..ee07c54c16aa 100644 --- a/tests/models/code_llama/test_tokenization_code_llama.py +++ b/tests/models/code_llama/test_tokenization_code_llama.py @@ -26,7 +26,6 @@ AddedToken, CodeLlamaTokenizer, CodeLlamaTokenizerFast, - is_torch_available, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( @@ -44,10 +43,6 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_torch_available(): - pass - - @require_sentencepiece @require_tokenizers class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -220,7 +215,7 @@ def test_save_pretrained(self): @require_torch def test_batch_tokenization(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -240,7 +235,7 @@ def test_batch_tokenization(self): return_tensors="pt", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") self.assertEqual(batch.input_ids.shape[1], 3) # max_target_length will default to max_length if not specified batch = tokenizer(text, max_length=3, return_tensors="pt") @@ -251,7 +246,7 @@ def test_batch_tokenization(self): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -306,11 +301,11 @@ def test_picklable(self): pickled_tokenizer = pickle.dumps(tokenizer) pickle.loads(pickled_tokenizer) - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_subword_regularization_tokenizer(self): pass diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index e7945089c076..4832bf1962e4 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -99,7 +99,7 @@ def test_full_tokenizer(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) @@ -127,6 +127,7 @@ def test_rust_and_python_full_tokenizers(self): input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + @unittest.skip def test_pretokenized_inputs(self, *args, **kwargs): # It's very difficult to mix/test pretokenization with byte-level # And get both CodeGen and Roberta to work at the same time (mostly an issue of adding a space before the string) @@ -262,6 +263,7 @@ def test_truncation(self): # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR # tokenizer has no padding token + @unittest.skip(reason="tokenizer has no padding token") def 
test_padding_different_model_input_name(self): pass diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index d1caf041cd93..a8ab85fe3b89 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -51,7 +51,7 @@ def get_rust_tokenizer(self, **kwargs): def test_torch_encode_plus_sent_to_model(self): super().test_torch_encode_plus_sent_to_model() - @unittest.skip("This needs a slow tokenizer. Cohere does not have one!") + @unittest.skip(reason="This needs a slow tokenizer. Cohere does not have one!") def test_encode_decode_with_spaces(self): return diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index 171ec2d44f49..99a06613e141 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -18,6 +18,8 @@ import pathlib import unittest +import numpy as np + from transformers.testing_utils import require_torch, require_vision, slow from transformers.utils import is_torch_available, is_vision_available @@ -87,6 +89,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] if w < h: diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index 18f85a71e071..2e2973679e91 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -263,8 +263,8 @@ def test_resize_tokens_embeddings(self): pass @slow + @unittest.skip(reason="TODO Niels: fix me!") def test_model_outputs_equivalence(self): - # TODO Niels: fix me! pass def test_attention_outputs(self): diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index 0866f7679874..84b50f572908 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -433,7 +433,7 @@ def test_torchscript_device_change(self): for model_class in self.all_model_classes: # ConvBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == ConvBertForMultipleChoice: - return + self.skipTest(reason="ConvBertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py index ba3878ba51ec..e5bb8e3d190a 100644 --- a/tests/models/convnextv2/test_modeling_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_convnextv2.py @@ -216,7 +216,7 @@ def test_feed_forward_chunking(self): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not set to test training") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() @@ -237,7 +237,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="ModelTester is not set to test training") for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_with_labels() diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index 64ee96b1e8e6..404280428ef9 100644 --- a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -154,7 +154,7 @@ def test_config(self): self.config_tester.run_common_tests() def test_inputs_embeds(self): - unittest.skip("CPMAnt doesn't support input_embeds.")(self.test_inputs_embeds) + unittest.skip(reason="CPMAnt doesn't support input_embeds.")(self.test_inputs_embeds) def test_retain_grad_hidden_states_attentions(self): unittest.skip( diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 8e9fb0d82fda..8bb16760ce61 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -426,22 +426,19 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Data2VecAudio has no inputs_embeds + @unittest.skip(reason="Data2VecAudio has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="`input_ids` is renamed to `input_values`") def test_forward_signature(self): pass - # Data2VecAudio cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Data2VecAudio has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # Data2VecAudio has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Data2VecAudio has no inputs_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index 8f8a1fad447e..c729d88d614f 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -196,8 +196,8 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + @unittest.skip(reason="Data2VecVision does not use inputs_embeds") def test_inputs_embeds(self): - # Data2VecVision does not use inputs_embeds pass @require_torch_multi_gpu @@ -226,7 +226,7 @@ def test_for_image_segmentation(self): def test_training(self): if not 
self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -245,7 +245,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 31031c8f7afa..06c82c949cb3 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -350,21 +350,21 @@ def test_model_from_pretrained(self): model = DbrxModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip("Dbrx models have weight tying disabled.") + @unittest.skip(reason="Dbrx models have weight tying disabled.") def test_tied_weights_keys(self): pass # Offload does not work with Dbrx models because of the forward of DbrxExperts where we chunk the experts. # The issue is that the offloaded weights of the mlp layer are still on meta device (w1_chunked, v1_chunked, w2_chunked) - @unittest.skip("Dbrx models do not work with offload") + @unittest.skip(reason="Dbrx models do not work with offload") def test_cpu_offload(self): pass - @unittest.skip("Dbrx models do not work with offload") + @unittest.skip(reason="Dbrx models do not work with offload") def test_disk_offload_safetensors(self): pass - @unittest.skip("Dbrx models do not work with offload") + @unittest.skip(reason="Dbrx models do not work with offload") def test_disk_offload_bin(self): pass diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index 55f7e8b54290..da59fa282928 100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -79,18 +79,18 @@ def test_do_lower_case(self): self.assertListEqual(rust_tokens, tokens_target) - @unittest.skip("There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") + @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): pass - @unittest.skip("There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") + @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.") def test_sentencepiece_tokenize_and_decode(self): pass def test_split_by_punct(self): # fmt: off - sequence = "I was born in 92000, and this is falsé." - tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ] # fmt: on tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", split_by_punct=True) @@ -105,8 +105,8 @@ def test_split_by_punct(self): def test_do_lower_case_split_by_punct(self): # fmt: off - sequence = "I was born in 92000, and this is falsé." 
- tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ] # fmt: on tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True) @@ -121,8 +121,8 @@ def test_do_lower_case_split_by_punct(self): def test_do_lower_case_split_by_punct_false(self): # fmt: off - sequence = "I was born in 92000, and this is falsé." - tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", ".", ] + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "!", ] # fmt: on tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False) @@ -139,8 +139,8 @@ def test_do_lower_case_split_by_punct_false(self): def test_do_lower_case_false_split_by_punct(self): # fmt: off - sequence = "I was born in 92000, and this is falsé." - tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", ".", ] + sequence = "I was born in 92000, and this is falsé!" + tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ] # fmt: on tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True) @@ -177,7 +177,7 @@ def test_rust_and_python_full_tokenizers(self): tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() - sequence = "I was born in 92000, and this is falsé." + sequence = "I was born in 92000, and this is falsé!" tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False)) rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False)) @@ -216,10 +216,10 @@ def test_full_tokenizer(self): self.assertListEqual(rust_back_tokens, back_tokens_target) # fmt: off - sequence = "I was born in 92000, and this is falsé." - ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9] - tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", ".", ] - back_tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", ".", ] + sequence = "I was born in 92000, and this is falsé!" 
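For context on the DeBERTa-v2 hunks above, which keep the hard-coded target tokens in sync with the new test sentence: the slow/fast parity check itself only needs `encode` plus `convert_ids_to_tokens` on both tokenizers. A hedged sketch outside the test harness (the checkpoint name is an assumption; any DeBERTa-v2/v3 checkpoint with both tokenizer classes works):

from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast

checkpoint = "microsoft/deberta-v3-base"  # assumption, not taken from this patch
slow = DebertaV2Tokenizer.from_pretrained(checkpoint)
fast = DebertaV2TokenizerFast.from_pretrained(checkpoint)

sequence = "I was born in 92000, and this is falsé!"
slow_tokens = slow.convert_ids_to_tokens(slow.encode(sequence, add_special_tokens=False))
fast_tokens = fast.convert_ids_to_tokens(fast.encode(sequence, add_special_tokens=False))

# The two lists are expected to match token for token; a divergence here is the
# kind of slow/fast inconsistency the skipped test_sentencepiece_* tests mention.
print(slow_tokens == fast_tokens, slow_tokens, fast_tokens)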
+ ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 187] + tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "!", ] + back_tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "!", ] # fmt: on ids = tokenizer.encode(sequence, add_special_tokens=False) diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 51fbfc33f8c1..41e5a81e2f93 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -18,6 +18,8 @@ import pathlib import unittest +import numpy as np + from transformers.testing_utils import require_torch, require_vision, slow from transformers.utils import is_torch_available, is_vision_available @@ -87,6 +89,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] if w < h: diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index f648e28f1da1..b77ffb6e7778 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -606,15 +606,15 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index daf13d207b0a..1b4ca6e206a9 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -274,7 +274,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -296,7 +296,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index fc6d56512724..4174df0f8cc7 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -17,6 +17,8 @@ import pathlib import unittest +import numpy as np + from transformers.testing_utils import require_torch, require_vision, slow 
from transformers.utils import is_torch_available, is_vision_available @@ -86,6 +88,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] if w < h: diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index be75423827a3..d1e36e32824d 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -263,8 +263,8 @@ def test_resize_tokens_embeddings(self): pass @slow + @unittest.skip(reason="TODO Niels: fix me!") def test_model_outputs_equivalence(self): - # TODO Niels: fix me! pass def test_attention_outputs(self): diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index dcebd82aa91f..7cfb5846e071 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -256,7 +256,7 @@ def test_model_get_set_embeddings(self): self.assertTrue(x is None or isinstance(x, nn.Linear)) def test_attention_outputs(self): - self.skipTest("Dinat's attention operation is handled entirely by NATTEN.") + self.skipTest(reason="Dinat's attention operation is handled entirely by NATTEN.") def check_hidden_states_output(self, inputs_dict, config, model_class, image_size): model = model_class(config) diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 6bd821859ea2..cde65080d2de 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -281,7 +281,7 @@ def test_torchscript_device_change(self): for model_class in self.all_model_classes: # BertForMultipleChoice behaves incorrectly in JIT environments. 
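On the two `get_expected_values` hunks above (DeformableDETR and DETR image-processing tests): PIL images report `(width, height)` via `.size`, NumPy arrays arrive channels-last as `(height, width, channels)`, and the torch tensors used in these tests are channels-first `(channels, height, width)`, hence the three branches. The same dispatch in isolation (the helper name is illustrative only):

import numpy as np
import torch
from PIL import Image


def image_height_width(image):
    # PIL: .size is (width, height)
    if isinstance(image, Image.Image):
        w, h = image.size
    # NumPy: channels-last array of shape (height, width, channels)
    elif isinstance(image, np.ndarray):
        h, w = image.shape[0], image.shape[1]
    # torch: channels-first tensor of shape (channels, height, width)
    else:
        h, w = image.shape[1], image.shape[2]
    return h, w


print(image_height_width(Image.new("RGB", (640, 480))))             # (480, 640)
print(image_height_width(np.zeros((480, 640, 3), dtype=np.uint8)))  # (480, 640)
print(image_height_width(torch.zeros(3, 480, 640)))                 # (480, 640)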
if model_class == DistilBertForMultipleChoice: - return + self.skipTest(reason="DistilBertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index 5a47856afed8..11c01c39fa6c 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -168,8 +168,8 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="DonutSwin does not use inputs_embeds") def test_inputs_embeds(self): - # DonutSwin does not use inputs_embeds pass def test_model_get_set_embeddings(self): diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py index f3648e9863a5..2a9c47b93c24 100644 --- a/tests/models/electra/test_tokenization_electra.py +++ b/tests/models/electra/test_tokenization_electra.py @@ -78,7 +78,7 @@ def test_full_tokenizer(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index f720327ec714..e4f66d85641b 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -178,29 +178,35 @@ def test_forward_signature(self): expected_arg_names = ["input_values", "padding_mask", "bandwidth"] self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) - @unittest.skip("The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") + @unittest.skip(reason="The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") def test_inputs_embeds(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") + @unittest.skip(reason="The EncodecModel is not transformers based, thus it does not have `inputs_embeds` logics") def test_model_get_set_embeddings(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) def test_torchscript_output_attentions(self): pass - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic" + ) def test_torchscript_output_hidden_state(self): pass def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -288,7 +294,9 @@ def 
_create_and_check_torchscript(self, config, inputs_dict): # (Even with this call, there are still memory leak by ~0.04MB) self.clear_torch_jit_class_registry() - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `attention` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `attention` logic" + ) def test_attention_outputs(self): pass @@ -321,19 +329,21 @@ def test_feed_forward_chunking(self): hidden_states_with_chunk = model(**inputs)[0] self.assertTrue(torch.allclose(hidden_states_no_chunk, hidden_states_with_chunk, atol=1e-3)) - @unittest.skip("The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic") + @unittest.skip( + reason="The EncodecModel is not transformers based, thus it does not have the usual `hidden_states` logic" + ) def test_hidden_states_output(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_checkpoints(self): pass - @unittest.skip("No support for low_cpu_mem_usage=True.") + @unittest.skip(reason="No support for low_cpu_mem_usage=True.") def test_save_load_low_cpu_mem_usage_no_safetensors(self): pass diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 63bb11ee57e4..5e5263b6afb9 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -1005,6 +1005,7 @@ def get_pretrained_model(self): "google-bert/bert-base-cased", "openai-community/gpt2" ) + @unittest.skip def test_encoder_decoder_model_shared_weights(self): pass @@ -1079,6 +1080,7 @@ def get_pretrained_model(self): "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased" ) + @unittest.skip def test_encoder_decoder_model_shared_weights(self): pass @@ -1135,6 +1137,7 @@ def get_pretrained_model(self): "google-bert/bert-large-uncased", "facebook/bart-large" ) + @unittest.skip def test_encoder_decoder_model_shared_weights(self): pass diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index da19d08e4661..232d91760344 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -577,9 +577,8 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # ErnieForMultipleChoice behaves incorrectly in JIT environments. 
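One behavioral detail behind the `return` -> `self.skipTest(...)` edits inside loops (for example the various `test_torchscript_device_change` hunks): `skipTest` raises `unittest.SkipTest`, so it still ends the whole test method at that point exactly like the old `return` did, but the run is reported as skipped with the reason attached. A small sketch with placeholder class names, not transformers classes:

import unittest


class LoopSkipSketch(unittest.TestCase):
    all_model_classes = ("GoodModel", "JitUnfriendlyModel", "NeverReached")

    def test_torchscript_like_loop(self):
        for model_class in self.all_model_classes:
            if model_class == "JitUnfriendlyModel":
                # Raises unittest.SkipTest: the loop (and the test) stops here,
                # and the report shows the reason instead of a silent pass.
                self.skipTest(reason="JitUnfriendlyModel behaves incorrectly in JIT environments.")
            print(f"checked {model_class}")


if __name__ == "__main__":
    unittest.main(verbosity=2)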
if model_class == ErnieForMultipleChoice: - return + self.skipTest(reason="ErnieForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 3171264e2540..56a7e4d0c67f 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -290,11 +290,11 @@ def test_create_position_ids_from_inputs_embeds(self): self.assertEqual(position_ids.shape, expected_positions.shape) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_embeddings_untied(self): pass - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_tokens_embeddings(self): pass diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 11306e736861..5c05efb03f2f 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -184,7 +184,7 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip("Does not support attention outputs") + @unittest.skip(reason="Does not support attention outputs") def test_attention_outputs(self): pass @@ -192,75 +192,77 @@ def test_attention_outputs(self): def test_correct_missing_keys(self): pass - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_embeddings_untied(self): pass - @unittest.skip("Esm does not support embedding resizing") + @unittest.skip(reason="Esm does not support embedding resizing") def test_resize_tokens_embeddings(self): pass - @unittest.skip("ESMFold does not support passing input embeds!") + @unittest.skip(reason="ESMFold does not support passing input embeds!") def test_inputs_embeds(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning_integration(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning_save_load_from_config_init(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_head_pruning_save_load_from_pretrained(self): pass - @unittest.skip("ESMFold does not support head pruning.") + @unittest.skip(reason="ESMFold does not support head pruning.") def test_headmasking(self): pass - @unittest.skip("ESMFold does not output hidden states in the normal way.") + @unittest.skip(reason="ESMFold does not output hidden states in the normal way.") def test_hidden_states_output(self): pass - @unittest.skip("ESMfold does not output hidden states in the normal way.") + @unittest.skip(reason="ESMfold does not output hidden states in the normal way.") def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip("ESMFold only has one output format.") + @unittest.skip(reason="ESMFold only has one output 
format.") def test_model_outputs_equivalence(self): pass - @unittest.skip("This test doesn't work for ESMFold and doesn't test core functionality") + @unittest.skip(reason="This test doesn't work for ESMFold and doesn't test core functionality") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("ESMFold does not support input chunking.") + @unittest.skip(reason="ESMFold does not support input chunking.") def test_feed_forward_chunking(self): pass - @unittest.skip("ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments.") + @unittest.skip( + reason="ESMFold doesn't respect you and it certainly doesn't respect your initialization arguments." + ) def test_initialization(self): pass - @unittest.skip("ESMFold doesn't support torchscript compilation.") + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_output_attentions(self): pass - @unittest.skip("ESMFold doesn't support torchscript compilation.") + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_output_hidden_state(self): pass - @unittest.skip("ESMFold doesn't support torchscript compilation.") + @unittest.skip(reason="ESMFold doesn't support torchscript compilation.") def test_torchscript_simple(self): pass - @unittest.skip("ESMFold doesn't support data parallel.") + @unittest.skip(reason="ESMFold doesn't support data parallel.") def test_multi_gpu_data_parallel_forward(self): pass diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 50e8fcdbb4b0..2fb9e664c7b3 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -381,7 +381,7 @@ def test_past_key_values_format(self): # If it doesn't support cache, pass the test if not hasattr(config, "use_cache"): - return + self.skipTest(reason="Model does not support cache") model = model_class(config).to(torch_device) if "use_cache" not in inputs: @@ -390,7 +390,7 @@ def test_past_key_values_format(self): # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format) if "past_key_values" not in outputs: - return + self.skipTest(reason="Model does not return past_key_values") num_hidden_layers = ( getattr(config, "decoder_layers", None) diff --git a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py index 119e35555a8f..72acb83999b9 100644 --- a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py @@ -174,7 +174,7 @@ def test_encode_decode_with_spaces(self): def test_convert_tokens_to_string_format(self): pass - @unittest.skip("FastSpeech2Conformer tokenizer does not support pairs.") + @unittest.skip(reason="FastSpeech2Conformer tokenizer does not support pairs.") def test_maximum_encoding_length_pair_input(self): pass diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index de0fd88db466..17502dc27353 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -477,7 +477,7 @@ def test_torchscript_device_change(self): for model_class in self.all_model_classes: # FlauBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == FlaubertForMultipleChoice: - return + self.skipTest(reason="FlauBertForMultipleChoice behaves incorrectly in JIT environments.") config.torchscript = True model = model_class(config=config) diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 388e2f041f2a..d8c8f385e9ce 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -176,8 +176,8 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + @unittest.skip("Flava does not use input_ids") def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds pass def test_model_get_set_embeddings(self): @@ -300,9 +300,11 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -318,13 +320,13 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - # skip this test as FlavaImageModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass # skip this test as FlavaImageModel has no base class and is # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -459,9 +461,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -477,17 +481,16 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="FLAVA does not use input_embeds") def test_inputs_embeds(self): # FLAVA does not use inputs_embeds pass - # skip this test as FlavaTextModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - # skip this test as FlavaTextModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -619,13 +622,15 @@ def test_forward_signature(self): expected_arg_names = ["hidden_states"] self.assertListEqual(arg_names[:1], expected_arg_names) + @unittest.skip("FLAVA does not have input embeddings") def test_model_get_set_embeddings(self): - # No embedding in multimodal model pass + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -641,17 +646,15 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="FLAVA does not use input_embeds") def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds pass - # skip this test as FlavaMultimodalModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaMultimodalModel has no base class and is not 
available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - # skip this test as FlavaMultimodalModel has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaMultimodalModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -742,20 +745,23 @@ def test_forward_signature(self): def test_attention_outputs(self): pass + @unittest.skip(reason="No embedding in multimodal model") def test_model_get_set_embeddings(self): - # No embedding in multimodal model pass + @unittest.skip def test_training(self): pass + @unittest.skip def test_hidden_states_output(self): pass + @unittest.skip(reason="FlavaImageCodebook has no attentions") def test_retain_grad_hidden_states_attentions(self): - # no attentions pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -771,20 +777,19 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="FLAVA does not use input_embeds") def test_inputs_embeds(self): - # FLAVA does not use inputs_embeds pass + @unittest.skip def test_model_outputs_equivalence(self): pass - # skip this test as FlavaImageCodebook has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageCodebook has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass - # skip this test as FlavaImageCodebook has no base class and is - # not available in MODEL_MAPPING + @unittest.skip(reason="FlavaImageCodebook has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_to_base(self): pass @@ -931,19 +936,19 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) - # hidden_states are tested in individual model tests + @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): pass - # input_embeds are tested in individual model tests + @unittest.skip(reason="tested in individual model tests") def test_inputs_embeds(self): pass - # tested in individual model tests + @unittest.skip(reason="tested in individual model tests") def test_retain_grad_hidden_states_attentions(self): pass - # FlavaModel does not have input/output embeddings + @unittest.skip(reason="FlavaModel does not have input/output embeddings") def test_model_get_set_embeddings(self): pass @@ -973,7 +978,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 8686c60ab698..b7acf3610c08 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -321,6 +321,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict # Overriden Tests + @unittest.skip def test_attention_outputs(self): pass diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index a3492cf966c8..16f2e4950ef0 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -69,7 
+69,7 @@ def test_vocab_size(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -194,7 +194,7 @@ def test_special_tokens_initialization_from_slow(self): def test_padding(self, max_length=50): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index c3ad05e300f7..af95e0dca895 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -263,7 +263,7 @@ def test_save_load_missing_keys(self): model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) self.assertEqual(info["missing_keys"], []) - @unittest.skip("Test has a segmentation fault on torch 1.8.0") + @unittest.skip(reason="Test has a segmentation fault on torch 1.8.0") def test_export_to_onnx(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() model = FSMTModel(config).to(torch_device) @@ -312,23 +312,23 @@ def test_ensure_weights_are_shared(self): 2, ) - @unittest.skip("can't be implemented for FSMT due to dual vocab.") + @unittest.skip(reason="can't be implemented for FSMT due to dual vocab.") def test_resize_tokens_embeddings(self): pass - @unittest.skip("Passing inputs_embeds not implemented for FSMT.") + @unittest.skip(reason="Passing inputs_embeds not implemented for FSMT.") def test_inputs_embeds(self): pass - @unittest.skip("Input ids is required for FSMT.") + @unittest.skip(reason="Input ids is required for FSMT.") def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("model weights aren't tied in FSMT.") + @unittest.skip(reason="model weights aren't tied in FSMT.") def test_tie_model_weights(self): pass - @unittest.skip("TODO: Decoder embeddings cannot be resized at the moment") + @unittest.skip(reason="TODO: Decoder embeddings cannot be resized at the moment") def test_resize_embeddings_untied(self): pass @@ -582,7 +582,7 @@ def test_odd_embed_dim(self): # odd num_embeddings is allowed SinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=self.padding_idx).to(torch_device) - @unittest.skip("different from marian (needs more research)") + @unittest.skip(reason="different from marian (needs more research)") def test_positional_emb_weights_against_marian(self): desired_weights = torch.tensor( [ diff --git a/tests/models/fsmt/test_tokenization_fsmt.py b/tests/models/fsmt/test_tokenization_fsmt.py index 4be15cbee133..bac487767ba2 100644 --- a/tests/models/fsmt/test_tokenization_fsmt.py +++ b/tests/models/fsmt/test_tokenization_fsmt.py @@ -160,10 +160,10 @@ def test_tokenizer_lower(self): expected = ["us", "a", "is", "un", "i", "ted", "st", "ates", "of", "am", "er", "ica"] self.assertListEqual(tokens, expected) - @unittest.skip("FSMTConfig.__init__ requires non-optional args") + @unittest.skip(reason="FSMTConfig.__init__ requires non-optional args") def test_torch_encode_plus_sent_to_model(self): pass - @unittest.skip("FSMTConfig.__init__ requires non-optional args") + @unittest.skip(reason="FSMTConfig.__init__ requires non-optional args") def 
test_np_encode_plus_sent_to_model(self): pass diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py index f65498af33b7..6065251c5bb9 100644 --- a/tests/models/fuyu/test_modeling_fuyu.py +++ b/tests/models/fuyu/test_modeling_fuyu.py @@ -295,17 +295,17 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): pass # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model.") + @unittest.skip(reason="Does not work on the tiny model.") def test_disk_offload_bin(self): super().test_disk_offload() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model.") + @unittest.skip(reason="Does not work on the tiny model.") def test_disk_offload_safetensors(self): super().test_disk_offload() # TODO: Fix me (once this model gets more usage) - @unittest.skip("Does not work on the tiny model.") + @unittest.skip(reason="Does not work on the tiny model.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 6aeb5f23c387..06eacdd65a35 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -47,11 +47,16 @@ GemmaForSequenceClassification, GemmaForTokenClassification, GemmaModel, - GemmaTokenizer, ) class GemmaModelTester: + config_class = GemmaConfig + model_class = GemmaModel + for_causal_lm_class = GemmaForCausalLM + for_sequence_class = GemmaForSequenceClassification + for_token_class = GemmaForTokenClassification + def __init__( self, parent, @@ -129,9 +134,8 @@ def prepare_config_and_inputs(self): return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - # Ignore copy def get_config(self): - return GemmaConfig( + return self.config_class( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -149,18 +153,16 @@ def get_config(self): head_dim=self.head_dim, ) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Gemma def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = GemmaModel(config=config) + model = self.model_class(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask) result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Gemma def create_and_check_model_as_decoder( self, config, @@ -174,7 +176,7 @@ def create_and_check_model_as_decoder( encoder_attention_mask, ): config.add_cross_attention = True - model = GemmaModel(config) + model = self.model_class(config) model.to(torch_device) model.eval() result = model( @@ -191,7 +193,6 @@ def create_and_check_model_as_decoder( result = model(input_ids, attention_mask=input_mask) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Gemma def create_and_check_for_causal_lm( self, config, @@ -204,13 +205,12 @@ def create_and_check_for_causal_lm( encoder_hidden_states, encoder_attention_mask, ): - model = 
GemmaForCausalLM(config=config) + model = self.for_causal_lm_class(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Gemma def create_and_check_decoder_model_past_large_inputs( self, config, @@ -225,7 +225,7 @@ def create_and_check_decoder_model_past_large_inputs( ): config.is_decoder = True config.add_cross_attention = True - model = GemmaForCausalLM(config=config) + model = self.for_causal_lm_class(config=config) model.to(torch_device) model.eval() @@ -348,7 +348,7 @@ def test_Gemma_sequence_classification_model(self): input_ids = input_dict["input_ids"] attention_mask = input_ids.ne(1).to(torch_device) sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = GemmaForSequenceClassification(config) + model = self.model_tester.for_sequence_class(config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) @@ -361,7 +361,7 @@ def test_Gemma_sequence_classification_model_for_single_label(self): input_ids = input_dict["input_ids"] attention_mask = input_ids.ne(1).to(torch_device) sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size) - model = GemmaForSequenceClassification(config) + model = self.model_tester.for_sequence_class(config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) @@ -376,20 +376,19 @@ def test_Gemma_sequence_classification_model_for_multi_label(self): sequence_labels = ids_tensor( [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size ).to(torch.float) - model = GemmaForSequenceClassification(config) + model = self.model_tester.for_sequence_class(config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Gemma,llama->Gemma def test_Gemma_token_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() config.num_labels = 3 input_ids = input_dict["input_ids"] attention_mask = input_ids.ne(1).to(torch_device) token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) - model = GemmaForTokenClassification(config=config) + model = self.model_tester.for_token_class(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=attention_mask, labels=token_labels) @@ -398,11 +397,11 @@ def test_Gemma_token_classification_model(self): (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Gemma buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Gemma buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("Gemma uses GQA on all models so the KV cache is a non standard format") + @unittest.skip(reason="Gemma uses GQA on all models so the KV cache 
is a non standard format") def test_past_key_values_format(self): pass @@ -456,7 +455,7 @@ def test_flash_attn_2_generate_use_cache(self): @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("Gemma flash attention does not support right padding") + self.skipTest(reason="Gemma flash attention does not support right padding") @require_torch_sdpa @require_torch_gpu @@ -464,7 +463,7 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): def test_sdpa_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_sdpa: - return + self.skipTest(reason="Model does not support SDPA") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -493,12 +492,12 @@ def test_sdpa_equivalence(self): @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test - @is_flaky + @is_flaky() @slow def test_flash_attn_2_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: - return + self.skipTest(reason="Model does not support Flash Attention 2") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) @@ -539,47 +538,9 @@ def setUpClass(cls): # 8 is for A100 / A10 and 7 for T4 cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] - @require_read_token - def test_model_2b_fp32(self): - model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) - @require_read_token def test_model_2b_fp16(self): - model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] - - model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( - torch_device - ) - - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) - - output = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - - self.assertEqual(output_text, EXPECTED_TEXTS) - - @require_read_token - def test_model_2b_fp16_static_cache(self): - model_id = "google/gemma-2b" + model_id = "google/gemma-2-9b" EXPECTED_TEXTS = [ "Hello I am doing a project on the 1990s and I need to know what the most popular music", "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", @@ -749,7 +710,7 @@ def test_model_2b_4bit(self): self.assertEqual(output_text, EXPECTED_TEXTS) - @unittest.skip("The test will not fit our CI runners") + @unittest.skip(reason="The test will not fit our CI runners") @require_read_token def test_model_7b_fp32(self): model_id = "google/gemma-7b" @@ -877,7 +838,7 @@ def 
test_compile_static_cache(self): # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 # work as intended. See https://github.com/pytorch/pytorch/issues/121943 if version.parse(torch.__version__) < version.parse("2.3.0"): - self.skipTest("This test requires torch >= 2.3 to run.") + self.skipTest(reason="This test requires torch >= 2.3 to run.") NUM_TOKENS_TO_GENERATE = 40 # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test @@ -903,7 +864,7 @@ def test_compile_static_cache(self): } prompts = ["Hello I am doing", "Hi today"] - tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2b", pad_token="", padding_side="right") + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", pad_token="", padding_side="right") model = GemmaForCausalLM.from_pretrained("google/gemma-2b", device_map="sequential", torch_dtype=torch.float16) inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index d36f1b7dc176..4201e31e6f54 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -23,7 +23,6 @@ AddedToken, GemmaTokenizer, GemmaTokenizerFast, - is_torch_available, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( @@ -43,10 +42,6 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_torch_available(): - pass - - @require_sentencepiece @require_tokenizers class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -68,7 +63,7 @@ def setUp(self): @require_torch def test_batch_tokenization(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -88,7 +83,7 @@ def test_batch_tokenization(self): return_tensors="pt", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") self.assertEqual(batch.input_ids.shape[1], 3) # max_target_length will default to max_length if not specified batch = tokenizer(text, max_length=3, return_tensors="pt") @@ -99,7 +94,7 @@ def test_batch_tokenization(self): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -147,15 +142,15 @@ def test_tokenizer_integration(self): padding=False, ) - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_subword_regularization_tokenizer(self): pass - @unittest.skip("Skipping") + @unittest.skip(reason="Skipping") def test_torch_encode_plus_sent_to_model(self): pass @@ -193,6 +188,19 @@ def integration_tests(self): }, ) + def test_user_added_tokens(self): + # Ensure that user added tokens are not split in the fast tokenizer + slow_tokenizer = self.tokenizer + fast_tokenizer = self.rust_tokenizer + + 
user_added_token = "" + + slow_tokens = slow_tokenizer.convert_ids_to_tokens(slow_tokenizer.encode(user_added_token)) + fast_tokens = slow_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(user_added_token)) + + self.assertTrue(user_added_token in fast_tokens) + self.assertEqual(slow_tokens, fast_tokens) + def test_fast_special_tokens(self): slow_tokenizer = self.tokenizer fast_tokenizer = self.rust_tokenizer @@ -214,7 +222,7 @@ def test_fast_special_tokens(self): self.tokenizer.add_eos_token = False self.rust_tokenizer.add_eos_token = False - @unittest.skip("Not super important and always failing. Let's skip it") + @unittest.skip(reason="Not super important and always failing. Let's skip it") @slow def test_conversion(self): # This is excruciatingly slow since it has to recreate the entire merge diff --git a/tests/models/gemma2/__init__.py b/tests/models/gemma2/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py new file mode 100644 index 000000000000..f7f50a4733f4 --- /dev/null +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -0,0 +1,498 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
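The GemmaModelTester refactor earlier in this patch (class attributes such as `config_class`, `model_class`, `for_causal_lm_class` instead of hard-coded Gemma classes) exists so the new Gemma2 tests below can reuse every helper by overriding only those attributes. A stripped-down sketch of the same pattern, using placeholder `Toy*` classes rather than anything from transformers:

class ToyConfig:
    def __init__(self, hidden_size=32):
        self.hidden_size = hidden_size


class ToyModel:
    def __init__(self, config):
        self.config = config


class ToyV2Config(ToyConfig):
    pass


class ToyV2Model(ToyModel):
    pass


class ToyModelTester:
    config_class = ToyConfig
    model_class = ToyModel

    def get_config(self):
        return self.config_class(hidden_size=32)

    def create_and_check_model(self):
        model = self.model_class(config=self.get_config())
        assert isinstance(model.config, self.config_class)


class ToyV2ModelTester(ToyModelTester):
    # Only the class handles change; get_config and create_and_check_model are inherited as-is.
    config_class = ToyV2Config
    model_class = ToyV2Model


ToyV2ModelTester().create_and_check_model()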
+"""Testing suite for the PyTorch Gemma2 model.""" + +import unittest + +import pytest +from packaging import version + +from transformers import AutoModelForCausalLM, AutoTokenizer, Gemma2Config, is_torch_available +from transformers.testing_utils import ( + require_bitsandbytes, + require_flash_attn, + require_read_token, + require_torch, + require_torch_gpu, + require_torch_sdpa, + slow, + torch_device, +) + +from ...models.gemma.test_modeling_gemma import GemmaModelTest, GemmaModelTester +from ...test_configuration_common import ConfigTester + + +if is_torch_available(): + import torch + + from transformers import ( + Gemma2ForCausalLM, + Gemma2ForSequenceClassification, + Gemma2ForTokenClassification, + Gemma2Model, + GemmaTokenizer, + ) + + +class Gemma2ModelTester(GemmaModelTester): + config_class = Gemma2Config + model_class = Gemma2Model + for_causal_lm_class = Gemma2ForCausalLM + for_sequence_class = Gemma2ForSequenceClassification + for_token_class = Gemma2ForTokenClassification + + +@require_torch +class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): + all_model_classes = ( + (Gemma2Model, Gemma2ForCausalLM, Gemma2ForSequenceClassification, Gemma2ForTokenClassification) + if is_torch_available() + else () + ) + all_generative_model_classes = () + pipeline_model_mapping = ( + { + "feature-extraction": Gemma2Model, + "text-classification": Gemma2ForSequenceClassification, + "token-classification": Gemma2ForTokenClassification, + "text-generation": Gemma2ForCausalLM, + "zero-shot": Gemma2ForSequenceClassification, + } + if is_torch_available() + else {} + ) + test_headmasking = False + test_pruning = False + _is_stateful = True + model_split_percents = [0.5, 0.6] + _torch_compile_test_ckpt = "google/gemma-2-9b" + + def setUp(self): + self.model_tester = Gemma2ModelTester(self) + self.config_tester = ConfigTester(self, config_class=Gemma2Config, hidden_size=37) + + @unittest.skip("Eager and SDPA do not produce the same outputs, thus this test fails") + def test_model_outputs_equivalence(self, **kwargs): + pass + + @unittest.skip("Gemma2's outputs are expected to be different") + def test_eager_matches_sdpa_inference(self): + pass + + +@slow +@require_torch_gpu +class Gemma2IntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @require_read_token + def test_model_2b_fp32(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_2b_fp16(self): + model_id = 
"google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_2b_fp16_static_cache(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + model.generation_config.cache_implementation = "static" + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_2b_bf16(self): + model_id = "google/gemma-2b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ], + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + } + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + + @require_read_token + def test_model_2b_eager(self): + model_id = "google/gemma-2b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. 
+ EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project on the 1990s and I am looking for some information on the ", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + } + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + + @require_torch_sdpa + @require_read_token + def test_model_2b_sdpa(self): + model_id = "google/gemma-2b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ], + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + } + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + + @pytest.mark.flash_attn_test + @require_flash_attn + @require_read_token + def test_model_2b_flash_attn(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ] + + model = AutoModelForCausalLM.from_pretrained( + model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model.to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + 
@require_bitsandbytes + @require_read_token + def test_model_2b_4bit(self): + model_id = "google/gemma-2b" + EXPECTED_TEXTS = [ + "Hello I am doing a project and I need to make a 3d model of a house. I have been using", + "Hi today I'd like to share with you my experience with the new wattpad wattpad wattpad wattpad wattpad wattpad wattpad", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @unittest.skip("The test will not fit our CI runners") + @require_read_token + def test_model_7b_fp32(self): + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + "Hello my name is ***** ***** I will be assisting you today. I am sorry to hear about your issue. I will", + "Hi,\n\nI have a problem with my 2005 1.6 16", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_7b_fp16(self): + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + """Hello I am doing a project on a 1999 4.0L 4x4. I""", + "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_model_7b_bf16(self): + model_id = "google/gemma-7b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. 
+ EXPECTED_TEXTS = { + 7: [ + """Hello I am doing a project on a 1991 240sx and I am trying to find""", + "Hi today I am going to show you how to make a very simple and easy to make a very simple and", + ], + 8: [ + "Hello I am doing a project for my school and I am trying to make a program that will read a .txt file", + "Hi today I am going to show you how to make a very simple and easy to make a very simple and", + ], + 9: [ + "Hello I am doing a project for my school and I am trying to get a servo to move a certain amount of degrees", + "Hi today I am going to show you how to make a very simple and easy to make DIY light up sign", + ], + } + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( + torch_device + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + + @require_read_token + def test_model_7b_fp16_static_cache(self): + model_id = "google/gemma-7b" + EXPECTED_TEXTS = [ + """Hello I am doing a project on a 1999 4.0L 4x4. I""", + "Hi today I am going to show you how to make a simple and easy to make a DIY 3D", + ] + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16).to( + torch_device + ) + + model.generation_config.cache_implementation = "static" + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_bitsandbytes + @require_read_token + def test_model_7b_4bit(self): + model_id = "google/gemma-7b" + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", + """Hi today I am going to talk about the new update for the game called "The new update" and I""", + ], + 8: [ + "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", + "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very", + ], + } + + model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + + @slow + @require_torch_gpu + @require_read_token + def test_compile_static_cache(self): + # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 + # work as intended. 
See https://github.com/pytorch/pytorch/issues/121943 + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest("This test requires torch >= 2.3 to run.") + + NUM_TOKENS_TO_GENERATE = 40 + # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test + # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. + # + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. + EXPECTED_TEXT_COMPLETION = { + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found", + "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the", + ], + 7: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found", + "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the", + ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found", + "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the", + ], + } + + prompts = ["Hello I am doing", "Hi today"] + tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2b", pad_token="<pad>", padding_side="right") + model = Gemma2ForCausalLM.from_pretrained( + "google/gemma-2b", device_map="sequential", torch_dtype=torch.float16 + ) + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[8], dynamic_text) # Both GPU architectures have the same output + + # Static Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_text) + + # Static Cache + compile + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index ee2cf4927a56..a9c94f54f1fc 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -167,9 +167,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + + @unittest.skip def 
test_training_gradient_checkpointing(self): pass diff --git a/tests/models/glpn/test_image_processing_glpn.py b/tests/models/glpn/test_image_processing_glpn.py index abffb31a6693..d4aa78656af5 100644 --- a/tests/models/glpn/test_image_processing_glpn.py +++ b/tests/models/glpn/test_image_processing_glpn.py @@ -66,6 +66,8 @@ def prepare_image_processor_dict(self): def expected_output_image_shape(self, images): if isinstance(images[0], Image.Image): width, height = images[0].size + elif isinstance(images[0], np.ndarray): + height, width = images[0].shape[0], images[0].shape[1] else: height, width = images[0].shape[1], images[0].shape[2] diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 151162fb1bb9..81e95ab244f9 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -168,11 +168,11 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - @unittest.skip("GLPN does not use inputs_embeds") + @unittest.skip(reason="GLPN does not use inputs_embeds") def test_inputs_embeds(self): pass - @unittest.skip("GLPN does not have get_input_embeddings method and get_output_embeddings methods") + @unittest.skip(reason="GLPN does not have get_input_embeddings method and get_output_embeddings methods") def test_model_get_set_embeddings(self): pass @@ -283,7 +283,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index 1e7c81e4be2c..9d13822ac64b 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -98,7 +98,7 @@ def test_full_tokenizer(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) @@ -126,6 +126,7 @@ def test_rust_and_python_full_tokenizers(self): input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + @unittest.skip def test_pretokenized_inputs(self, *args, **kwargs): # It's very difficult to mix/test pretokenization with byte-level # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string) @@ -247,7 +248,7 @@ def test_add_bos_token_slow(self): self.assertTrue(decode_s.startswith(bos_token)) self.assertTrue(all(d.startswith(bos_token) for d in decode_s2)) - # tokenizer has no padding token + @unittest.skip(reason="tokenizer has no padding token") def test_padding_different_model_input_name(self): pass @@ -331,7 +332,7 @@ def test_fast_slow_equivalence(self): # Same as above self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758]) - @unittest.skip("This test is failing because of a bug in the fast tokenizer") + @unittest.skip(reason="This test is failing because of a bug in the fast tokenizer") def test_users_can_modify_bos(self): tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True) diff --git 
a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 3d4dd27fa472..cb1545c7fb56 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -458,27 +458,27 @@ def tearDown(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip("MQA models does not support retain_grad") + @unittest.skip(reason="MQA models does not support retain_grad") def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip("Contrastive search not supported due to non-standard caching mechanism") + @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") def test_contrastive_generate(self): pass - @unittest.skip("Contrastive search not supported due to non-standard caching mechanism") + @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") def test_contrastive_generate_dict_outputs_use_cache(self): pass - @unittest.skip("CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") + @unittest.skip(reason="CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") def test_cpu_offload(self): pass - @unittest.skip("Disk offload seems to be broken for some reason - tiny models keep hitting corner cases") + @unittest.skip(reason="Disk offload seems to be broken for some reason - tiny models keep hitting corner cases") def test_disk_offload(self): pass - @unittest.skip("BigCodeGPT has a non-standard KV cache format.") + @unittest.skip(reason="BigCodeGPT has a non-standard KV cache format.") def test_past_key_values_format(self): pass diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index ed5bcac55e45..51a4d235c3bc 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -19,7 +19,7 @@ from parameterized import parameterized from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -396,6 +396,68 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """ + Based on tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate + which also overwrites the common test as the test is flaky on tiny models. 
+ """ + max_new_tokens = 30 + + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b") + + model_sdpa = GPTNeoXForCausalLM.from_pretrained( + "EleutherAI/pythia-1b", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = GPTNeoXForCausalLM.from_pretrained( + "EleutherAI/pythia-1b", + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + texts = [ + "hi here's a longer context, getting longer and", + "Hello this is a very long sentence my friend, very long for real", + "Today I am in Paris and", + ] + + for padding_side in ["left", "right"]: + tokenizer.padding_side = padding_side + tokenizer.pad_token = tokenizer.eos_token + + inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) + + res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + + with self.subTest(f"{padding_side}"): + torch.testing.assert_close( + res_eager, + res_sdpa, + msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", + ) + @require_torch class GPTNeoXLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py index ec505da4a004..029c8b99d44b 100644 --- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py @@ -128,10 +128,11 @@ def test_sequence_builders(self): assert encoded_sentence == ids_1 assert encoded_pair == ids_1 + ids_2 + @unittest.skip def test_conversion_reversible(self): # Intentionally convert some words to accommodate character fluctuations unique to Japanese pass + @unittest.skip(reason="tokenizer has no padding token") def test_padding_different_model_input_name(self): - # tokenizer has no padding token pass diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py index 68618fb256aa..5a2839784707 100644 --- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -18,6 +18,8 @@ import pathlib import unittest +import numpy as np + from transformers.testing_utils import require_torch, require_vision, slow from transformers.utils import is_torch_available, is_vision_available @@ -93,6 +95,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] if w < h: diff --git a/tests/models/groupvit/test_modeling_groupvit.py 
b/tests/models/groupvit/test_modeling_groupvit.py index 74c07b775bc9..ce31bc44a611 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -262,9 +262,11 @@ def test_attention_outputs(self): ], ) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -458,9 +460,11 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip def test_training(self): pass + @unittest.skip def test_training_gradient_checkpointing(self): pass @@ -618,7 +622,7 @@ def test_initialization(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py index b8bbd7775812..02b2c54a2f08 100644 --- a/tests/models/herbert/test_tokenization_herbert.py +++ b/tests/models/herbert/test_tokenization_herbert.py @@ -95,7 +95,7 @@ def test_full_tokenizer(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index b040c57082ff..cd801be41d7b 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -350,22 +350,21 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Hubert has no inputs_embeds + @unittest.skip(reason="Hubert has no inputs_embeds") def test_inputs_embeds(self): pass - # `input_ids` is renamed to `input_values` + @unittest.skip(reason="Hubert has no inputs_embeds") def test_forward_signature(self): pass # Hubert cannot resize token embeddings # since it has no tokens embeddings + @unittest.skip(reason="Hubert has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # Hubert has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Hubert has no inputs_embeds") def test_model_get_set_embeddings(self): pass @@ -438,10 +437,10 @@ def test_initialization(self): # Hubert cannot be TorchScripted because of torch.nn.utils.weight_norm def _create_and_check_torch_fx_tracing(self, config, inputs_dict, output_loss=False): # TODO: fix it - self.skipTest("torch 2.1 breaks torch fx tests for wav2vec2/hubert.") + self.skipTest(reason="torch 2.1 breaks torch fx tests for wav2vec2/hubert.") if not is_torch_fx_available() or not self.fx_compatible: - return + self.skipTest(reason="torch fx is not available or not compatible with this model") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.return_dict = False @@ -615,22 +614,19 @@ def test_labels_out_of_vocab(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.check_labels_out_of_vocab(*config_and_inputs) - # Hubert has no inputs_embeds + @unittest.skip(reason="Hubert has no inputs_embeds") def test_inputs_embeds(self): pass - # 
`input_ids` is renamed to `input_values` + @unittest.skip(reason="Hubert has input_values instead of input_ids") def test_forward_signature(self): pass - # Hubert cannot resize token embeddings - # since it has no tokens embeddings + @unittest.skip(reason="Hubert has no tokens embeddings") def test_resize_tokens_embeddings(self): pass - # Hubert has no inputs_embeds - # and thus the `get_input_embeddings` fn - # is not implemented + @unittest.skip(reason="Hubert has no inputs_embeds") def test_model_get_set_embeddings(self): pass diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index 342d81754553..b9b5054d9044 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -379,7 +379,7 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs)[0] - @unittest.skip("ibert overrides scaling to None if inputs_embeds") + @unittest.skip(reason="ibert overrides scaling to None if inputs_embeds") def test_inputs_embeds_matches_input_ids(self): pass diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py index 0273480333f1..2f7a8993df53 100644 --- a/tests/models/idefics/test_image_processing_idefics.py +++ b/tests/models/idefics/test_image_processing_idefics.py @@ -16,6 +16,8 @@ import unittest +import numpy as np + from transformers.testing_utils import require_torch, require_torchvision, require_vision from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available @@ -75,6 +77,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] scale = size / min(w, h) @@ -187,18 +191,18 @@ def convert_to_rgb(image): torch.testing.assert_close(pixel_values_transform_implied, pixel_values_transform_supplied, rtol=0.0, atol=0.0) - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_numpy(self): pass - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_numpy_4_channels(self): pass - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_pil(self): pass - @unittest.skip("not supported") + @unittest.skip(reason="not supported") def test_call_pytorch(self): pass diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 91a9f661660e..0197ebcaff53 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -316,7 +316,7 @@ def prepare_pixel_values(self): @slow @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest("Idefics has a hard requirement on SDPA, skipping this test") + self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @@ -422,13 +422,13 @@ def test_cross_attention_gates(self): def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes: # IdeficsModel does not support training, users should use # IdeficsForVisionText2Text for this purpose if model_class == 
IdeficsModel: - return + self.skipTest(reason="IdeficsModel does not support training") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -442,13 +442,13 @@ def test_training(self): def test_training_gradient_checkpointing(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") for model_class in self.all_model_classes: # IdeficsModel does not support training, users should use # IdeficsForVisionText2Text for this purpose if model_class == IdeficsModel: - return + self.skipTest(reason="IdeficsModel does not support training") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.use_cache = False @@ -575,7 +575,7 @@ def test_model_from_pretrained(self): @slow @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest("Idefics has a hard requirement on SDPA, skipping this test") + self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @@ -590,11 +590,11 @@ def setUp(self): ) self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) - @unittest.skip("We only test the model that takes in multiple images") + @unittest.skip(reason="We only test the model that takes in multiple images") def test_model(self): pass - @unittest.skip("We only test the model that takes in multiple images") + @unittest.skip(reason="We only test the model that takes in multiple images") def test_for_token_classification(self): pass diff --git a/tests/models/idefics2/test_image_processing_idefics2.py b/tests/models/idefics2/test_image_processing_idefics2.py index 2e0d36e75c8a..624fdd6c98b3 100644 --- a/tests/models/idefics2/test_image_processing_idefics2.py +++ b/tests/models/idefics2/test_image_processing_idefics2.py @@ -99,6 +99,8 @@ def get_expected_values(self, image_inputs, batched=False): image = image_inputs[0] if isinstance(image, Image.Image): w, h = image.size + elif isinstance(image, np.ndarray): + h, w = image.shape[0], image.shape[1] else: h, w = image.shape[1], image.shape[2] @@ -176,6 +178,10 @@ def prepare_image_inputs( if torchify: images_list = [[torch.from_numpy(image) for image in images] for images in images_list] + if numpify: + # Numpy images are typically in channels last format + images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list] + return images_list @@ -206,66 +212,100 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "do_image_splitting")) def test_call_numpy(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random numpy tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) - for sample_images in image_inputs: - for image in sample_images: - self.assertIsInstance(image, np.ndarray) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - 
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_numpy_4_channels(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processor_dict = self.image_processor_dict + image_processor_dict["image_mean"] = [0.5, 0.5, 0.5, 0.5] + image_processor_dict["image_std"] = [0.5, 0.5, 0.5, 0.5] + image_processing = self.image_processing_class(**image_processor_dict) + # create random numpy tensors + self.image_processor_tester.num_channels = 4 + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing( + image_inputs[0], input_data_format="channels_last", return_tensors="pt" + ).pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing( + image_inputs, input_data_format="channels_last", return_tensors="pt" + ).pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) def test_call_pil(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PIL images - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) - for images in image_inputs: - for image in images: - self.assertIsInstance(image, Image.Image) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - self.assertEqual( - 
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for images in image_inputs: + for image in images: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) def test_call_pytorch(self): - # Initialize image_processing - image_processing = self.image_processing_class(**self.image_processor_dict) - # create random PyTorch tensors - image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) - - for images in image_inputs: - for image in images: - self.assertIsInstance(image, torch.Tensor) - - # Test not batched input - encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) - self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) - - # Test batched - expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) - encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values - self.assertEqual( - tuple(encoded_images.shape), - (self.image_processor_tester.batch_size, *expected_output_image_shape), - ) + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + for images in image_inputs: + for image in images: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 63e6316773b9..057ce93cd87e 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -176,19 +176,19 @@ def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) 
self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) - @unittest.skip("input_embeds cannot be passed in without input_ids") + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): pass - @unittest.skip("input_embeds cannot be passed in without input_ids") + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_generate_padding_right(self): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -336,15 +336,15 @@ def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) - @unittest.skip("input_embeds cannot be passed in without input_ids") + @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_generate_padding_right(self): pass - @unittest.skip("Model does not support padding right") + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index a9dbc636ef30..aa1039103e95 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -176,7 +176,7 @@ def test_image_processor_save_load_with_autoimageprocessor(self): else: self.assertEqual(image_processor_first[key], value) - @unittest.skip("ImageGPT requires clusters at initialization") + @unittest.skip(reason="ImageGPT requires clusters at initialization") def test_init_without_params(self): pass @@ -220,7 +220,7 @@ def test_call_numpy(self): tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) ) - @unittest.skip("ImageGPT assumes clusters for 3 channels") + @unittest.skip(reason="ImageGPT assumes clusters for 3 channels") def test_call_numpy_4_channels(self): pass diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index afb5ce87764c..9cf45a3f21b6 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -357,7 +357,7 @@ def test_resize_tokens_embeddings(self): inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") for model_class in self.all_model_classes: config = copy.deepcopy(original_config) @@ -404,13 +404,13 @@ def test_resize_embeddings_untied(self): inputs_dict, ) = self.model_tester.prepare_config_and_inputs_for_common() if not self.test_resize_embeddings: - return + self.skipTest(reason="test_resize_embeddings is set to False") original_config.tie_word_embeddings = False # if model cannot untied embeddings -> leave test if original_config.tie_word_embeddings: - return + self.skipTest(reason="tie_word_embeddings is set to False") for model_class 
in self.all_model_classes: config = copy.deepcopy(original_config) @@ -493,7 +493,7 @@ def test_inputs_embeds_matches_input_ids(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True @@ -573,7 +573,7 @@ def _create_and_check_torchscript(self, config, inputs_dict): self.assertTrue(models_equal) - @unittest.skip("The model doesn't support left padding") # and it's not used enough to be worth fixing :) + @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) def test_left_padding_compatibility(self): pass diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index 5eab89a3adb3..e4e86fb69527 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -278,17 +278,19 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # Ignore since we have no tokens embeddings + @unittest.skip(reason="Informer does not have tokens embeddings") def test_resize_tokens_embeddings(self): pass + @unittest.skip def test_model_outputs_equivalence(self): pass + @unittest.skip def test_determinism(self): pass - @unittest.skip("randomly selects U keys while calculating attentions") + @unittest.skip(reason="randomly selects U keys while calculating attentions") def test_batching_equivalence(self): pass diff --git a/tests/models/instructblipvideo/__init__.py b/tests/models/instructblipvideo/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py new file mode 100644 index 000000000000..d53342416d28 --- /dev/null +++ b/tests/models/instructblipvideo/test_image_processing_instrictblipvideo.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import InstructBlipVideoImageProcessor + + +class InstructBlipVideoProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=5, + num_channels=3, + image_size=24, + min_resolution=30, + max_resolution=80, + do_resize=True, + size=None, + do_normalize=True, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + do_convert_rgb=True, + frames=4, + ): + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.frames = frames + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, images): + return self.frames, self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + images = prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + # let's simply copy the frames to fake a long video-clip + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(self.frames, 0) + else: + video = image[None, ...].repeat(self.frames, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * self.frames) + + return videos + + +@require_torch +@require_vision +class InstructBlipVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = InstructBlipVideoImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = InstructBlipVideoProcessingTester(self) + + @property + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = 
self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, numpify=True) + for video in video_inputs: + self.assertIsInstance(video, np.ndarray) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + video_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True, torchify=True) + for video in video_inputs: + self.assertIsInstance(video, torch.Tensor) + + # Test not batched input + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values + expected_output_video_shape = (1, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values + expected_output_video_shape = (5, 4, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py new file mode 100644 index 000000000000..1265db3a2a2e --- /dev/null +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -0,0 +1,585 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch InstructBlipVideo model.""" + +import inspect +import tempfile +import unittest + +import numpy as np +from huggingface_hub import hf_hub_download + +from transformers import ( + CONFIG_MAPPING, + InstructBlipVideoConfig, + InstructBlipVideoProcessor, + InstructBlipVideoQFormerConfig, + InstructBlipVideoVisionConfig, +) +from transformers.testing_utils import ( + require_accelerate, + require_bitsandbytes, + require_torch, + require_vision, + slow, + torch_device, +) +from transformers.utils import is_torch_available, is_vision_available + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel + + +if is_vision_available(): + pass + + +class InstructBlipVideoVisionModelTester: + def __init__( + self, + parent, + batch_size=12, + image_size=30, + frames=4, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.frames = frames + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in case of a vision transformer, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [self.batch_size * self.frames, self.num_channels, self.image_size, self.image_size] + ) + config = self.get_config() + + return config, pixel_values + + def get_config(self): + return InstructBlipVideoVisionConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values): + model = InstructBlipVideoVisionModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the 
[CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size * self.frames, num_patches + 1, self.hidden_size) + ) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size * self.frames, self.hidden_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class InstructBlipVideoVisionModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as InstructBlipVideo's vision encoder does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (InstructBlipVideoVisionModel,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = InstructBlipVideoVisionModelTester(self) + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoVisionConfig, has_text_modality=False, hidden_size=37 + ) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="InstructBlipVideo's vision encoder does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="InstructBlipVideo's vision encoder is an nn.Embeddings layer") + def test_model_get_set_embeddings(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip( + reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" + ) + def test_training(self): + pass + + @unittest.skip( + reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + 
@unittest.skip(reason="InstructBlipVideoVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="InstructBlipVideoVisionModel has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + @slow + def test_model_from_pretrained(self): + model_name = "Salesforce/instructblip-vicuna-7b" + model = InstructBlipVideoVisionModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +class InstructBlipVideoQFormerModelTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = scope + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + qformer_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + qformer_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + if input_mask is not None: + batch_size, seq_length = input_mask.shape + rnd_start_indices = np.random.randint(1, seq_length - 1, size=(batch_size,)) + for batch_idx, start_index in enumerate(rnd_start_indices): + input_mask[batch_idx, :start_index] = 1 + input_mask[batch_idx, start_index:] = 0 + + config = self.get_config() + + return config, input_ids, input_mask, qformer_input_ids, qformer_attention_mask + + def get_config(self): + return InstructBlipVideoQFormerConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + bos_token_id=self.bos_token_id, + ) + + +# this class is based on `OPTModelTester` found in tests/models/opt/test_modeling_opt.py +class InstructBlipVideoTextModelDecoderOnlyTester: + def __init__( + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=100, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + 
type_sequence_label_size=2, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.embed_dim = embed_dim + self.num_labels = num_labels + self.type_sequence_label_size = type_sequence_label_size + self.word_embed_proj_dim = word_embed_proj_dim + self.is_encoder_decoder = False + + def prepare_config_and_inputs(self): + config = self.get_config() + + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(3) + input_ids[:, -1] = self.eos_token_id # Eos Token + + attention_mask = input_ids.ne(self.pad_token_id) + + return config, input_ids, attention_mask + + def get_config(self): + return CONFIG_MAPPING["opt"]( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + embed_dim=self.embed_dim, + is_encoder_decoder=False, + word_embed_proj_dim=self.word_embed_proj_dim, + ) + + +# this model tester uses a decoder-only language model (OPT) +class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: + def __init__( + self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + ): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {} + if text_kwargs is None: + text_kwargs = {} + + self.parent = parent + self.vision_model_tester = InstructBlipVideoVisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) + self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) + self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test + self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.is_training = is_training + self.num_query_tokens = num_query_tokens + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() + _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + frames = self.vision_model_tester.frames + _, c, h, w = pixel_values.shape + pixel_values = pixel_values.reshape(-1, frames, c, h, w) + + config = self.get_config() + + return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + + def get_config(self): + return InstructBlipVideoConfig.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + 
qformer_config=self.qformer_model_tester.get_config(), + text_config=self.text_model_tester.get_config(), + num_query_tokens=self.num_query_tokens, + ) + + def create_and_check_for_conditional_generation( + self, config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values + ): + model = InstructBlipVideoForConditionalGeneration(config).to(torch_device).eval() + with torch.no_grad(): + result = model( + pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + qformer_input_ids=qformer_input_ids, + qformer_attention_mask=qformer_attention_mask, + ) + + expected_seq_length = ( + self.num_query_tokens * self.vision_model_tester.frames + ) + self.text_model_tester.seq_length + self.parent.assertEqual( + result.logits.shape, + (self.vision_model_tester.batch_size, expected_seq_length, self.text_model_tester.vocab_size), + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + "qformer_input_ids": qformer_input_ids, + "qformer_attention_mask": qformer_attention_mask, + "labels": input_ids, + } + return config, inputs_dict + + +@require_torch +class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( + ModelTesterMixin, GenerationTesterMixin, unittest.TestCase +): + all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_torchscript = False + + def setUp(self): + self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + + def test_for_conditional_generation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="InstructBlipVideoForConditionalGeneration doesn't support inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Tied weights are tested in individual model tests") + def test_tied_weights_keys(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="InstructBlipVideoModel does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="There's no base InstructBlipVideoModel") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="There's no base InstructBlipVideoModel") + def test_save_load_fast_init_to_base(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_load_vision_qformer_text_config(self): + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common()
+
+        # Save InstructBlipVideoConfig and check if we can load InstructBlipVideoVisionConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            vision_config = InstructBlipVideoVisionConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict())
+
+        # Save InstructBlipVideoConfig and check if we can load InstructBlipVideoQFormerConfig from it
+        with tempfile.TemporaryDirectory() as tmp_dir_name:
+            config.save_pretrained(tmp_dir_name)
+            qformer_config = InstructBlipVideoQFormerConfig.from_pretrained(tmp_dir_name)
+            self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict())
+
+    @slow
+    def test_model_from_pretrained(self):
+        model_name = "Salesforce/instructblip-vicuna-7b"
+        model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name)
+        self.assertIsNotNone(model)
+
+
+# We will verify our results on a short demo video clip
+def prepare_video():
+    video_file = hf_hub_download(
+        repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset"
+    )
+    video = np.load(video_file)[::2]  # sample every 2nd frame to get 4 frames total
+    return video
+
+
+@require_vision
+@require_torch
+@require_bitsandbytes
+@require_accelerate
+@slow
+class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
+    def test_inference_vicuna_7b(self):
+        processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
+        model = InstructBlipVideoForConditionalGeneration.from_pretrained(
+            "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
+        )
+
+        clip = prepare_video()
+        prompt = "Explain what is happening in this short video."
+ inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + # verify generation + outputs = model.generate(**inputs, max_new_tokens=30) + generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() + self.assertEqual( + generated_text, + "a baby girl wearing glasses is reading a book on the bed 1080p", + ) diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index f69eb0d806b8..1688c685e1d4 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -390,7 +390,7 @@ def test_mismatched_shapes_have_properly_initialized_weights(self): Overriding the test_mismatched_shapes_have_properly_initialized_weights test because A_log and D params of the Mamba block are initialized differently and we tested that in test_initialization """ - self.skipTest("Cumbersome and redundant for Jamba") + self.skipTest(reason="Cumbersome and redundant for Jamba") def test_attention_outputs(self): r""" @@ -638,9 +638,9 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): Overriding the test_flash_attn_2_inference_padding_right test as the Jamba model, like Mixtral, doesn't support right padding + use cache with FA2 """ - self.skipTest("Jamba flash attention does not support right padding") + self.skipTest(reason="Jamba flash attention does not support right padding") - @unittest.skip("Jamba has its own special cache type") + @unittest.skip(reason="Jamba has its own special cache type") @parameterized.expand([(1, False), (1, True), (4, False)]) def test_new_cache_format(self, num_beams, do_sample): pass diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index 12e5dd682c6c..cdb82cb5a955 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -378,11 +378,11 @@ def test_jetmoe_sequence_classification_model_for_multi_label(self): result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @unittest.skip("JetMoe buffers include complex numbers, which breaks this test") + @unittest.skip(reason="JetMoe buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass - @unittest.skip("JetMoe uses MoA on all models so the KV cache is a non standard format") + @unittest.skip(reason="JetMoe uses MoA on all models so the KV cache is a non standard format") def test_past_key_values_format(self): pass @@ -470,7 +470,7 @@ def test_flash_attn_2_generate_use_cache(self): @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest("JetMoe flash attention does not support right padding") + self.skipTest(reason="JetMoe flash attention does not support right padding") @require_torch diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 66f070ed4623..6f34689004ef 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -375,7 +375,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): # overwrite from common in order to use `config.text_config.vocab_size` instead of `config.vocab_size` def test_tie_model_weights(self): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to 
False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -429,7 +429,7 @@ def test_model_from_pretrained(self): def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: - return + self.skipTest(reason="test_torchscript is set to False") configs_no_init = _config_zero_init(config) # To be sure we have no Nan configs_no_init.torchscript = True diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index 3ddd6e766031..eb0e1de626a5 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -69,6 +69,7 @@ def test_full_tokenizer(self): self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + @unittest.skip def test_special_tokens_as_you_expect(self): """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids""" pass diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py index 5e213e0a3644..09dabfc5bed4 100644 --- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py @@ -96,7 +96,7 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - @unittest.skip("Tesseract version is not correct in ci. @Arthur FIXME") + @unittest.skip(reason="Tesseract version is not correct in ci. @Arthur FIXME") def test_layoutlmv2_integration_test(self): # with apply_OCR = True image_processing = LayoutLMv2ImageProcessor() diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 284ba82e3ff9..3d366fe3e84e 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -414,7 +414,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - @unittest.skip("We cannot configure detectron2 to output a smaller backbone") + @unittest.skip(reason="We cannot configure detectron2 to output a smaller backbone") def test_model_is_small(self): pass diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index 9f9a86a999dd..0dbeef0c4176 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -195,7 +195,7 @@ def test_basic_tokenizer_respects_never_split_tokens(self): tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] ) - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -385,11 +385,11 @@ def test_encode_decode_with_spaces(self): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_split_special_tokens(self): pass @@ -814,7 +814,7 @@ def test_padding(self, max_length=50): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -835,7 +835,7 @@ def test_padding_warning_message_fast_tokenizer(self): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -942,7 +942,7 @@ def test_batch_encode_plus_batch_sequence_length(self): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -1003,7 +1003,7 @@ def test_padding_to_multiple_of(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -1046,7 +1046,7 @@ def test_tokenizer_slow_store_full_signature(self): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1290,13 +1290,13 @@ def test_torch_encode_plus_sent_to_model(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -1327,11 +1327,11 @@ def test_torch_encode_plus_sent_to_model(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1349,7 +1349,7 @@ def test_rust_and_python_full_tokenizers(self): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we 
don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1403,7 +1403,7 @@ def test_tokenization_python_rust_equals(self): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1593,7 +1593,7 @@ def test_special_tokens_initialization(self): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1630,7 +1630,7 @@ def test_training_new_tokenizer(self): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1743,7 +1743,7 @@ def test_prepare_for_model(self): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1837,7 +1837,7 @@ def test_batch_encode_dynamic_overflowing(self): self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(tokens[key].shape[-1], 4) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass @@ -1875,7 +1875,7 @@ def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, return words, boxes, output_ids - # @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.") + # @unittest.skip(reason="LayoutLMv2 tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_pair_input(self): tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) for tokenizer in tokenizers: @@ -2237,7 +2237,7 @@ def test_maximum_encoding_length_pair_input(self): self.assertEqual(bbox, bbox_second_sequence) self.assertEqual(overflowing_bbox, overflowing_token_bbox_second_sequence_slow) - # @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.") + # @unittest.skip(reason="LayoutLMv2 tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_single_input(self): tokenizers = self.get_tokenizers(do_lower_case=False, model_max_length=100) for tokenizer in tokenizers: @@ -2359,15 +2359,15 @@ def test_maximum_encoding_length_single_input(self): self.assertEqual(bbox, sequence["bbox"][:-2]) self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :]) - @unittest.skip("LayoutLMv2 tokenizer requires boxes besides sequences.") + @unittest.skip(reason="LayoutLMv2 tokenizer requires boxes besides 
sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv2 tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="LayoutLMv2 tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv2 fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="LayoutLMv2 fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -2476,10 +2476,10 @@ def test_layoutlmv2_integration_test(self): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 80d29d3a46b1..e478e0ac62cb 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -140,7 +140,7 @@ def get_input_output_texts(self, tokenizer): output_text = "lower newer" return input_text, output_text - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -265,11 +265,11 @@ def test_encode_decode_with_spaces(self): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_split_special_tokens(self): pass @@ -694,7 +694,7 @@ def test_padding(self, max_length=50): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -715,7 +715,7 @@ def test_padding_warning_message_fast_tokenizer(self): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -822,7 +822,7 @@ def test_batch_encode_plus_batch_sequence_length(self): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -883,7 +883,7 @@ def test_padding_to_multiple_of(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -926,7 +926,7 @@ def test_tokenizer_slow_store_full_signature(self): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_rust_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in 
self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1168,13 +1168,13 @@ def test_torch_encode_plus_sent_to_model(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -1205,11 +1205,11 @@ def test_torch_encode_plus_sent_to_model(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1227,7 +1227,7 @@ def test_rust_and_python_full_tokenizers(self): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1281,7 +1281,7 @@ def test_tokenization_python_rust_equals(self): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1471,7 +1471,7 @@ def test_special_tokens_initialization(self): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1508,7 +1508,7 @@ def test_training_new_tokenizer(self): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1621,7 +1621,7 @@ def test_prepare_for_model(self): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1720,7 +1720,7 @@ def test_batch_encode_dynamic_overflowing(self): self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(tokens[key].shape[-1], 4) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def 
test_alignement_methods(self): pass @@ -2272,15 +2272,15 @@ def test_maximum_encoding_length_single_input(self): # self.assertEqual(bbox, sequence["bbox"][:-2]) # self.assertEqual(overflowing_bbox, sequence["bbox"][-(2 + stride) :]) - @unittest.skip("LayoutLMv3 tokenizer requires boxes besides sequences.") + @unittest.skip(reason="LayoutLMv3 tokenizer requires boxes besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv3 tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="LayoutLMv3 tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("LayoutLMv3 fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="LayoutLMv3 fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -2393,7 +2393,7 @@ def test_layoutlmv3_integration_test(self): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass @@ -2408,13 +2408,13 @@ def test_tf_encode_plus_sent_to_model(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -2433,6 +2433,6 @@ def test_tf_encode_plus_sent_to_model(self): model(encoded_sequence) model(batch_encoded_sequence) - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 03f2bf414bd6..2f8b19a662ab 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -107,7 +107,7 @@ def get_input_output_texts(self, tokenizer): output_text = "unwanted, running" return input_text, output_text - @unittest.skip("Chat template tests don't play well with table/layout models.") + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass @@ -115,7 +115,7 @@ def test_chat_template_batched(self): # this tokenizer def test_save_sentencepiece_tokenizer(self) -> None: if not self.test_sentencepiece or not self.test_slow_tokenizer: - return + self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False") # We want to verify that we will be able to save the tokenizer even if the original files that were used to # build the tokenizer have been deleted in the meantime. 
words, boxes = self.get_words_and_boxes() @@ -745,7 +745,7 @@ def test_padding(self, max_length=50): def test_padding_warning_message_fast_tokenizer(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") words, boxes = self.get_words_and_boxes_batch() @@ -766,7 +766,7 @@ def test_padding_warning_message_fast_tokenizer(self): ) if not self.test_slow_tokenizer: - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer_slow = self.get_tokenizer() @@ -873,7 +873,7 @@ def test_batch_encode_plus_batch_sequence_length(self): encoded_sequences_batch_padded_2[key], ) - @unittest.skip("batch_encode_plus does not handle overflowing tokens.") + @unittest.skip(reason="batch_encode_plus does not handle overflowing tokens.") def test_batch_encode_plus_overflowing_tokens(self): pass @@ -934,7 +934,7 @@ def test_padding_to_multiple_of(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.pad_token is None: - self.skipTest("No padding token.") + self.skipTest(reason="No padding token.") else: words, boxes = self.get_words_and_boxes() @@ -977,7 +977,7 @@ def test_tokenizer_slow_store_full_signature(self): def test_build_inputs_with_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1066,7 +1066,7 @@ def test_save_and_load_tokenizer(self): shutil.rmtree(tmpdirname) - @unittest.skip("Not implemented") + @unittest.skip(reason="Not implemented") def test_right_and_left_truncation(self): pass @@ -1224,13 +1224,13 @@ def test_torch_encode_plus_sent_to_model(self): for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: - return + self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING") config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] config = config_class() if config.is_encoder_decoder or config.pad_token_id is None: - return + self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.") model = model_class(config) @@ -1256,11 +1256,11 @@ def test_torch_encode_plus_sent_to_model(self): def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() @@ -1278,7 +1278,7 @@ def test_rust_and_python_full_tokenizers(self): def test_tokenization_python_rust_equals(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1332,7 +1332,7 @@ def test_tokenization_python_rust_equals(self): def test_embeded_special_tokens(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't 
compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1522,7 +1522,7 @@ def test_special_tokens_initialization(self): def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) @@ -1559,7 +1559,7 @@ def test_training_new_tokenizer(self): def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: - return + self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map @@ -1672,7 +1672,7 @@ def test_prepare_for_model(self): def test_padding_different_model_input_name(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -1770,7 +1770,7 @@ def test_batch_encode_dynamic_overflowing(self): def test_save_pretrained(self): if not self.test_slow_tokenizer: # as we don't have a slow version, we can't compare the outputs between slow and fast versions - return + self.skipTest(reason="test_slow_tokenizer is set to False") self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -1838,27 +1838,27 @@ def test_save_pretrained(self): shutil.rmtree(tmpdirname2) - @unittest.skip("TO DO: overwrite this very extensive test.") + @unittest.skip(reason="TO DO: overwrite this very extensive test.") def test_alignement_methods(self): pass - @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_pair_input(self): pass - @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") def test_maximum_encoding_length_single_input(self): pass - @unittest.skip("layoutxlm tokenizer requires boxes besides sequences.") + @unittest.skip(reason="layoutxlm tokenizer requires boxes besides sequences.") def test_pretokenized_inputs(self): pass - @unittest.skip("layoutxlm tokenizer always expects pretokenized inputs.") + @unittest.skip(reason="layoutxlm tokenizer always expects pretokenized inputs.") def test_compare_pretokenized_inputs(self): pass - @unittest.skip("layoutxlm fast tokenizer does not support prepare_for_model") + @unittest.skip(reason="layoutxlm fast tokenizer does not support prepare_for_model") def test_compare_prepare_for_model(self): pass @@ -1962,18 +1962,18 @@ def test_layoutxlm_integration_test(self): self.assertDictEqual(dict(encoding_p), expected_results) self.assertDictEqual(dict(encoding_r), expected_results) - @unittest.skip("Doesn't support another framework than PyTorch") + @unittest.skip(reason="Doesn't support another framework than PyTorch") def 
test_np_encode_plus_sent_to_model(self): pass - @unittest.skip("Doesn't use SentencePiece") + @unittest.skip(reason="Doesn't use SentencePiece") def test_sentencepiece_tokenize_and_convert_tokens_to_string(self): pass - @unittest.skip("Doesn't use SentencePiece") + @unittest.skip(reason="Doesn't use SentencePiece") def test_sentencepiece_tokenize_and_decode(self): pass - @unittest.skip("Chat is not supported") + @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py index 6f5c645855e3..2247a64374dd 100644 --- a/tests/models/led/test_modeling_led.py +++ b/tests/models/led/test_modeling_led.py @@ -378,8 +378,8 @@ def test_generate_fp16(self): model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + @unittest.skip(reason="Longformer cannot keep gradients in attentions or hidden states") def test_retain_grad_hidden_states_attentions(self): - # longformer cannot keep gradients in attentions or hidden states return def test_attention_outputs(self): diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py index f287677a1295..7d677bf3f5e2 100644 --- a/tests/models/led/test_tokenization_led.py +++ b/tests/models/led/test_tokenization_led.py @@ -154,6 +154,7 @@ def test_global_attention_mask(self): outputs = tokenizer.pad(encoded_output) self.assertSequenceEqual(outputs["global_attention_mask"], expected_global_attention_mask) + @unittest.skip def test_pretokenized_inputs(self): pass diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py index 833e949d6e16..6199d9cdfcfd 100644 --- a/tests/models/levit/test_modeling_levit.py +++ b/tests/models/levit/test_modeling_levit.py @@ -281,7 +281,7 @@ def test_for_image_classification(self): # special case for LevitForImageClassificationWithTeacher model def test_training(self): if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True @@ -303,7 +303,7 @@ def test_training(self): def test_training_gradient_checkpointing(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: - return + self.skipTest(reason="model_tester.is_training is set to False") config.use_cache = False config.return_dict = True diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 3e84552ab7e2..0935e802c685 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -393,7 +393,7 @@ def test_llama_token_classification_model(self): (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @unittest.skip("Llama buffers include complex numbers, which breaks this test") + @unittest.skip(reason="Llama buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass @@ -710,7 +710,7 @@ def test_compile_static_cache(self): # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 # work as intended. 
See https://github.com/pytorch/pytorch/issues/121943 if version.parse(torch.__version__) < version.parse("2.3.0"): - self.skipTest("This test requires torch >= 2.3 to run.") + self.skipTest(reason="This test requires torch >= 2.3 to run.") NUM_TOKENS_TO_GENERATE = 40 # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index a41774e9f5db..e45149672a8e 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -26,7 +26,6 @@ AddedToken, LlamaTokenizer, LlamaTokenizerFast, - is_torch_available, ) from transformers.convert_slow_tokenizer import convert_slow_tokenizer from transformers.testing_utils import ( @@ -45,10 +44,6 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") -if is_torch_available(): - pass - - @require_sentencepiece @require_tokenizers class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -144,7 +139,7 @@ def test_full_tokenizer(self): ], ) - @unittest.skip("Let's wait for the fast tokenizer!") + @unittest.skip(reason="Let's wait for the fast tokenizer!") def test_save_pretrained(self): self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: @@ -213,7 +208,7 @@ def test_save_pretrained(self): @require_torch def test_batch_tokenization(self): if not self.test_seq2seq: - return + self.skipTest(reason="test_seq2seq is set to False") tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -233,7 +228,7 @@ def test_batch_tokenization(self): return_tensors="pt", ) except NotImplementedError: - return + self.skipTest(reason="Encountered NotImplementedError when calling tokenizer") self.assertEqual(batch.input_ids.shape[1], 3) # max_target_length will default to max_length if not specified batch = tokenizer(text, max_length=3, return_tensors="pt") @@ -244,7 +239,7 @@ def test_batch_tokenization(self): self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) - @unittest.skip("Unfortunately way too slow to build a BPE with SentencePiece.") + @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.") def test_save_slow_from_fast_and_reload_fast(self): pass @@ -299,11 +294,11 @@ def test_picklable(self): pickled_tokenizer = pickle.dumps(tokenizer) pickle.loads(pickled_tokenizer) - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_pickle_subword_regularization_tokenizer(self): pass - @unittest.skip("worker 'gw4' crashed on CI, passing locally.") + @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.") def test_subword_regularization_tokenizer(self): pass diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index aaf0284c0587..b37e4df3cc10 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -204,6 +204,14 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_compile_dynamic(self): + pass + + @unittest.skip(reason="Compile not yet supported because in 
LLava models") + def test_sdpa_can_dispatch_on_flash(self): + pass + @require_torch class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/llava_next/test_image_processing_llava_next.py b/tests/models/llava_next/test_image_processing_llava_next.py index ff5c9e970874..fc399298c39a 100644 --- a/tests/models/llava_next/test_image_processing_llava_next.py +++ b/tests/models/llava_next/test_image_processing_llava_next.py @@ -197,7 +197,9 @@ def test_call_pytorch(self): expected_output_image_shape = (7, 1445, 3, 18, 18) self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) - @unittest.skip("LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy + @unittest.skip( + reason="LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index c060a892c9d4..69794a85d9fe 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -265,6 +265,14 @@ def test_feed_forward_chunking(self): def test_cpu_offload(self): pass + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_compile_dynamic(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_dispatch_on_flash(self): + pass + @require_torch class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/llava_next_video/__init__.py b/tests/models/llava_next_video/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/llava_next_video/test_image_processing_llava_next_video.py b/tests/models/llava_next_video/test_image_processing_llava_next_video.py new file mode 100644 index 000000000000..8c525fa256da --- /dev/null +++ b/tests/models/llava_next_video/test_image_processing_llava_next_video.py @@ -0,0 +1,218 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import LlavaNextVideoImageProcessor + + +class LlavaNextVideoProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=5, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=80, + do_resize=True, + size=None, + do_center_crop=True, + crop_size=None, + do_normalize=True, + image_mean=OPENAI_CLIP_MEAN, + image_std=OPENAI_CLIP_STD, + do_convert_rgb=True, + ): + size = size if size is not None else {"shortest_edge": 20} + crop_size = crop_size if crop_size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_center_crop": self.do_center_crop, + "crop_size": self.crop_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.expected_output_image_shape + def expected_output_image_shape(self, images): + return self.num_channels, self.crop_size["height"], self.crop_size["width"] + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTester.prepare_image_inputs + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): + images = prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + # let's simply copy the frames to fake a long video-clip + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(8, 0) + else: + video = image[None, ...].repeat(8, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * 8) + + return videos + + +@require_torch +@require_vision +class LlavaNextVideoProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = LlavaNextVideoImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = LlavaNextVideoProcessingTester(self) + + @property + # Copied from 
tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.image_processor_dict + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_center_crop")) + self.assertTrue(hasattr(image_processing, "center_crop")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + # Copied from tests.models.clip.test_image_processing_clip.CLIPImageProcessingTest.test_image_processor_from_dict_with_kwargs + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 20}) + self.assertEqual(image_processor.crop_size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42, crop_size=84) + self.assertEqual(image_processor.size, {"shortest_edge": 42}) + self.assertEqual(image_processor.crop_size, {"height": 84, "width": 84}) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, numpify=True) + for video in video_inputs: + self.assertIsInstance(video, np.ndarray) + + # Test not batched input (pass as `videos` arg to test that ImageProcessor can handle videos in absence of images!) 
+ encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True, torchify=True) + for video in video_inputs: + self.assertIsInstance(video, torch.Tensor) + + # Test not batched input + encoded_videos = image_processing(images=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + @unittest.skip("LlavaNextVideoImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") + def test_call_numpy_4_channels(self): + pass diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py new file mode 100644 index 000000000000..afe3062fb50e --- /dev/null +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -0,0 +1,455 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Llava-NeXT model.""" + +import gc +import unittest + +import numpy as np +from huggingface_hub import hf_hub_download + +from transformers import ( + AutoProcessor, + LlavaNextVideoConfig, + LlavaNextVideoForConditionalGeneration, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ( + ModelTesterMixin, + _config_zero_init, + floats_tensor, + ids_tensor, +) + + +if is_torch_available(): + import torch + +else: + is_torch_greater_or_equal_than_2_0 = False + +if is_vision_available(): + from PIL import Image + + +class LlavaNextVideoVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + video_token_index=1, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 580, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 0, + }, + is_training=True, + vision_config={ + "image_size": 16, + "patch_size": 2, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.video_token_index = video_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.seq_length = seq_length + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 30 + self.encoder_seq_length = 468 + self.image_grid_pinpoints = [[32, 32]] + + def get_config(self): + return LlavaNextVideoConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + video_token_index=self.video_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + image_grid_pinpoints=self.image_grid_pinpoints, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + 5, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + pixel_values_videos = floats_tensor( + [ + self.batch_size, + 8, + 
self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values, pixel_values_videos + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 + # make attention mask left-padded to avoid issues with "model has no attribute padding_side" + attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) + attention_mask[:, :1] = 0 + # we are giving 3 images and videos let's make sure we pass in 3 special tokens + input_ids[:, 1] = config.image_token_index + input_ids[:, 2] = config.video_token_index + labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device) + # maskout where the image/video token is + labels[:, 1] == self.ignore_index + labels[:, 2] == self.ignore_index + inputs_dict = { + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_sizes": torch.tensor( + [[self.vision_config["image_size"], self.vision_config["image_size"]]] * self.batch_size + ), + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + } + return config, inputs_dict + + def create_and_check_llava_next_video_model_fp16_forward( + self, config, input_ids, pixel_values, pixel_values_videos, attention_mask, image_sizes + ): + model = LlavaNextVideoForConditionalGeneration(config=config) + model.to(torch_device) + model.half() + model.eval() + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + image_sizes=image_sizes, + pixel_values=pixel_values.to(torch.bfloat16), + pixel_values_videos=pixel_values_videos.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + def create_and_check_llava_next_video_model_fp16_autocast_forward( + self, config, input_ids, pixel_values, pixel_values_videos, attention_mask, image_sizes + ): + config.torch_dtype = torch.float16 + model = LlavaNextVideoForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + with torch.autocast(device_type="cuda", dtype=torch.float16): + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + image_sizes=image_sizes, + pixel_values=pixel_values.to(torch.bfloat16), + pixel_values_videos=pixel_values_videos.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + +@require_torch +class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `LlavaNextVideoForConditionalGeneration`. 
+ """ + + all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if "image_newline" in name: + continue + elif param.requires_grad: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + def test_inputs_embeds(self): + # overwrite because llava can't support both inputs_embeds and pixel values at ipnut + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + del inputs["pixel_values_videos"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="Feedforward chunking is not yet supported") + def test_feed_forward_chunking(self): + pass + + @unittest.skip(reason="CPU offload is not yet supported") + def test_cpu_offload(self): + pass + + @unittest.skip( + reason="Compile not yet supported because in LLava models (https://github.com/huggingface/transformers/issues/29891)" + ) + def test_sdpa_can_compile_dynamic(self): + pass + + @unittest.skip( + reason="Compile not yet supported because in LLava models (https://github.com/huggingface/transformers/issues/29891)" + ) + def test_sdpa_can_dispatch_on_flash(self): + pass + + +@require_torch +class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") + image_file = hf_hub_download( + repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset" + ) + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + self.image = Image.open(image_file) + self.video = np.load(video_file) + self.prompt_image = "USER: \nWhat is shown in this image? ASSISTANT:" + self.prompt_video = "USER: