From 4d127eb6e81204084edefa43ba44bfb9605d8220 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 2 Jul 2024 11:03:40 -0700 Subject: [PATCH 001/116] toctree --- docs/source/en/_toctree.yml | 40 ++++++------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4b34ccf0e3e9..f15fda0e781d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -1,8 +1,9 @@ -- sections: +- title: Get started + sections: - local: index - title: 🤗 Transformers + title: Transformers - local: quicktour - title: Quick tour + title: Quicktour - local: installation title: Installation - local: add_new_model @@ -84,7 +85,7 @@ - local: tasks/mask_generation title: Mask Generation - local: tasks/keypoint_detection - title: Keypoint Detection + title: Keypoint detection - local: tasks/knowledge_distillation_for_image_classification title: Knowledge Distillation for Computer Vision title: Computer Vision @@ -247,44 +248,15 @@ title: Performance and scalability - sections: - local: contributing - title: How to contribute to 🤗 Transformers? - - local: add_new_model - title: How to add a model to 🤗 Transformers? - - local: add_new_pipeline - title: How to add a pipeline to 🤗 Transformers? + title: How to contribute to Transformers? - local: testing title: Testing - local: pr_checks title: Checks on a Pull Request - title: Contribute -- sections: - local: philosophy title: Philosophy - local: glossary title: Glossary - - local: task_summary - title: What 🤗 Transformers can do - - local: tasks_explained - title: How 🤗 Transformers solve tasks - - local: model_summary - title: The Transformer model family - - local: tokenizer_summary - title: Summary of the tokenizers - - local: attention - title: Attention mechanisms - - local: pad_truncation - title: Padding and truncation - - local: bertology - title: BERTology - - local: perplexity - title: Perplexity of fixed-length models - - local: pipeline_webserver - title: Pipelines for webserver inference - - local: model_memory_anatomy - title: Model training anatomy - - local: llm_tutorial_optimization - title: Getting the most out of LLMs - title: Conceptual guides - sections: - sections: - local: main_classes/agent From 3c1955b5dcffde6f1f7318848d29dbcf994618bf Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 2 Jul 2024 11:30:56 -0700 Subject: [PATCH 002/116] not-doctested.txt --- utils/not_doctested.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index 24a8a4ba7a28..19f157b2b9ba 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -287,7 +287,6 @@ docs/source/en/perf_train_gpu_many.md docs/source/en/perf_train_gpu_one.md docs/source/en/perf_train_special.md docs/source/en/perf_train_tpu_tf.md -docs/source/en/performance.md docs/source/en/perplexity.md docs/source/en/philosophy.md docs/source/en/pipeline_webserver.md From 72dd45333acbe9546ae22e2f8d49966f0fb9b906 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 3 Jul 2024 10:32:06 -0700 Subject: [PATCH 003/116] collapse sections --- docs/source/en/_toctree.yml | 258 +++++++++++++++++++----------------- 1 file changed, 139 insertions(+), 119 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f15fda0e781d..aa83ac970322 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -24,131 +24,149 @@ title: Set up distributed training with 🤗 Accelerate - local: peft title: Load and 
train adapters with 🤗 PEFT - - local: model_sharing - title: Share your model - - local: agents - title: Agents 101 - - local: agents_advanced - title: Agents, supercharged - Multi-agents, External tools, and more - - local: llm_tutorial - title: Generation with LLMs - - local: conversations - title: Chatting with Transformers - title: Tutorials -- sections: - - isExpanded: false - sections: - - local: tasks/sequence_classification - title: Text classification - - local: tasks/token_classification - title: Token classification - - local: tasks/question_answering - title: Question answering - - local: tasks/language_modeling - title: Causal language modeling - - local: tasks/masked_language_modeling - title: Masked language modeling - - local: tasks/translation - title: Translation - - local: tasks/summarization - title: Summarization - - local: tasks/multiple_choice - title: Multiple choice - title: Natural Language Processing - - isExpanded: false - sections: - - local: tasks/audio_classification - title: Audio classification - - local: tasks/asr - title: Automatic speech recognition - title: Audio - - isExpanded: false - sections: - - local: tasks/image_classification - title: Image classification - - local: tasks/semantic_segmentation - title: Image segmentation - - local: tasks/video_classification - title: Video classification - - local: tasks/object_detection - title: Object detection - - local: tasks/zero_shot_object_detection - title: Zero-shot object detection - - local: tasks/zero_shot_image_classification - title: Zero-shot image classification - - local: tasks/monocular_depth_estimation - title: Depth estimation - - local: tasks/image_to_image - title: Image-to-Image - - local: tasks/image_feature_extraction - title: Image Feature Extraction - - local: tasks/mask_generation - title: Mask Generation - - local: tasks/keypoint_detection - title: Keypoint detection - - local: tasks/knowledge_distillation_for_image_classification - title: Knowledge Distillation for Computer Vision - title: Computer Vision - - isExpanded: false + - local: sagemaker + title: Run training on Amazon SageMaker + - local: debugging + title: Debugging + - local: model_memory_anatomy + title: Model training anatomy +- title: Inference + sections: + - title: Pipeline API sections: - - local: tasks/image_captioning - title: Image captioning - - local: tasks/document_question_answering - title: Document Question Answering - - local: tasks/visual_question_answering - title: Visual Question Answering - - local: tasks/text-to-speech - title: Text to speech - - local: tasks/image_text_to_text - title: Image-text-to-text - - local: tasks/video_text_to_text - title: Video-text-to-text - title: Multimodal - - isExpanded: false + - local: pipeline_tutorial + title: Run inference with pipelines + - local: pipeline_webserver + title: Pipelines for webserver inference + - local: add_new_pipeline + title: How to add a pipeline to 🤗 Transformers? 
+ - title: LLMs sections: + - local: tasks/prompting + title: LLM prompting guide + - local: llm_optims + title: LLM inference optimization + - local: llm_tutorial + title: Generation with LLMs - local: generation_strategies title: Customize the generation strategy - - local: kv_cache - title: Best Practices for Generation with Cache - title: Generation - - isExpanded: false + - local: llm_tutorial_optimization + title: Getting the most out of LLMs + - local: perplexity + title: Perplexity of fixed-length models + - title: Chat models sections: - - local: chat_template_basics - title: Getting Started with Chat Templates for Text LLMs - - local: chat_template_multimodal - title: Multimodal Chat Templates for Vision and Audio LLMs - - local: chat_template_tools_and_documents - title: Expanding Chat Templates with Tools and Documents - - local: chat_template_advanced - title: Advanced Usage and Customizing Your Chat Templates - title: Chat Templates - - isExpanded: false + - local: conversations + title: Chatting with Transformers + - local: chat_templating + title: Templates for chat models + - title: Framework-specific inference optimization sections: - - local: tasks/idefics - title: Image tasks with IDEFICS - - local: tasks/prompting - title: LLM prompting guide - title: Prompting - title: Task Guides -- sections: - - local: fast_tokenizers - title: Use fast tokenizers from 🤗 Tokenizers + - local: tf_xla + title: XLA Integration for TensorFlow Models + - local: perf_torch_compile + title: Optimize inference using `torch.compile()` + - local: agents + title: Agents - local: multilingual title: Run inference with multilingual models - - local: create_a_model - title: Use model-specific APIs - - local: custom_models - title: Share a custom model - - local: trainer - title: Trainer - - local: sagemaker - title: Run training on Amazon SageMaker - - local: serialization - title: Export to ONNX - - local: tflite - title: Export to TFLite - - local: torchscript - title: Export to TorchScript + - local: gguf + title: Interoperability with GGUF files + - local: perf_infer_cpu + title: CPU inference + - local: perf_infer_gpu_one + title: GPU inference + - local: big_models + title: Instantiate a big model +- title: Quantization + sections: + - local: quantization/overview + title: Getting started + - local: quantization/bitsandbytes + title: bitsandbytes + - local: quantization/gptq + title: GPTQ + - local: quantization/awq + title: AWQ + - local: quantization/aqlm + title: AQLM + - local: quantization/quanto + title: Quanto + - local: quantization/eetq + title: EETQ + - local: quantization/hqq + title: HQQ + - local: quantization/optimum + title: Optimum + - local: quantization/contribute + title: Contribute new quantization method +- title: Resources + sections: + - title: Task recipes + isExpanded: false + sections: + - title: Natural language processing + sections: + - local: tasks/sequence_classification + title: Text classification + - local: tasks/token_classification + title: Token classification + - local: tasks/question_answering + title: Question answering + - local: tasks/language_modeling + title: Causal language modeling + - local: tasks/masked_language_modeling + title: Masked language modeling + - local: tasks/translation + title: Translation + - local: tasks/summarization + title: Summarization + - local: tasks/multiple_choice + title: Multiple choice + - title: Audio + sections: + - local: tasks/audio_classification + title: Audio classification + - local: tasks/asr + title: 
Automatic speech recognition + - title: Computer vision + sections: + - local: tasks/image_classification + title: Image classification + - local: tasks/semantic_segmentation + title: Image segmentation + - local: tasks/video_classification + title: Video classification + - local: tasks/object_detection + title: Object detection + - local: tasks/zero_shot_object_detection + title: Zero-shot object detection + - local: tasks/zero_shot_image_classification + title: Zero-shot image classification + - local: tasks/monocular_depth_estimation + title: Depth estimation + - local: tasks/image_to_image + title: Image-to-Image + - local: tasks/image_feature_extraction + title: Image Feature Extraction + - local: tasks/mask_generation + title: Mask Generation + - local: tasks/knowledge_distillation_for_image_classification + title: Knowledge Distillation for Computer Vision + - title: Multimodal + sections: + - local: tasks/image_captioning + title: Image captioning + - local: tasks/document_question_answering + title: Document Question Answering + - local: tasks/visual_question_answering + title: Visual Question Answering + - local: tasks/text-to-speech + title: Text to speech + - local: tasks/idefics + title: Image tasks with IDEFICS + - local: benchmarks + title: Benchmarks - local: notebooks title: Notebooks with examples - local: community @@ -258,7 +276,8 @@ - local: glossary title: Glossary - sections: - - sections: + - isExpanded: false + sections: - local: main_classes/agent title: Agents and Tools - local: model_doc/auto @@ -990,7 +1009,8 @@ title: Graphormer title: Graph models title: Models - - sections: + - isExpanded: false + sections: - local: internal/modeling_utils title: Custom Layers and Utilities - local: internal/pipelines_utils From cf19270663959c2a1ad8017f249f5246fe9f3ec4 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 9 Jul 2024 14:01:01 -0700 Subject: [PATCH 004/116] feedback --- docs/source/en/_toctree.yml | 174 ++++++++++++++++++++++++------------ 1 file changed, 116 insertions(+), 58 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index aa83ac970322..0ca31a2c9340 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -6,30 +6,42 @@ title: Quicktour - local: installation title: Installation - - local: add_new_model - title: Adding a new model to `transformers` - title: Get started -- sections: - - local: pipeline_tutorial - title: Run inference with pipelines - - local: autoclass_tutorial - title: Write portable code with AutoClass - - local: preprocessing - title: Preprocess data - - local: training - title: Fine-tune a pretrained model - - local: run_scripts - title: Train with a script - - local: accelerate - title: Set up distributed training with 🤗 Accelerate - - local: peft - title: Load and train adapters with 🤗 PEFT - - local: sagemaker - title: Run training on Amazon SageMaker - - local: debugging - title: Debugging - - local: model_memory_anatomy - title: Model training anatomy +- title: Base classes + sections: + - title: Models + isExpanded: false + sections: + - local: autoclass_tutorial + title: AutoClass API + - local: create_a_model + title: Use model-specific APIs + - local: custom_models + title: Share a custom model + - local: model_sharing + title: Share your model + - local: add_new_model + title: How to add a model to 🤗 Transformers? 
+ - local: task_summary + title: What 🤗 Transformers can do + - local: tasks_explained + title: How 🤗 Transformers solve tasks + - local: model_summary + title: The Transformer model family + - local: attention + title: Attention mechanisms + - local: bertology + title: BERTology + - title: Tokenizers + isExpanded: false + sections: + - local: preprocessing + title: Preprocess data + - local: fast_tokenizers + title: Use fast tokenizers from 🤗 Tokenizers + - local: tokenizer_summary + title: Summary of the tokenizers + - local: pad_truncation + title: Padding and truncation - title: Inference sections: - title: Pipeline API @@ -54,12 +66,12 @@ title: Getting the most out of LLMs - local: perplexity title: Perplexity of fixed-length models - - title: Chat models - sections: - - local: conversations - title: Chatting with Transformers - - local: chat_templating - title: Templates for chat models + - title: Chat models + sections: + - local: conversations + title: Chatting with Transformers + - local: chat_templating + title: Templates for chat models - title: Framework-specific inference optimization sections: - local: tf_xla @@ -78,6 +90,53 @@ title: GPU inference - local: big_models title: Instantiate a big model +- title: Training + sections: + - title: Trainer API + isExpanded: false + sections: + - local: training + title: Fine-tune a pretrained model + - local: trainer + title: Trainer + - local: hpo_train + title: Hyperparameter Search using Trainer API + - local: run_scripts + title: Train with a script + - title: Distributed training + isExpanded: false + sections: + - local: accelerate + title: Set up distributed training with 🤗 Accelerate + - local: perf_train_gpu_many + title: Multiple GPUs and parallelism + - local: fsdp + title: Fully Sharded Data Parallel + - local: deepspeed + title: DeepSpeed + - local: perf_train_cpu_many + title: Distributed CPU training + - title: Hardware-specific training + isExpanded: false + sections: + - local: perf_train_gpu_one + title: Methods and tools for efficient training on a single GPU + - local: perf_train_cpu + title: Efficient training on CPU + - local: perf_train_tpu_tf + title: Training on TPU with TensorFlow + - local: perf_train_special + title: PyTorch training on Apple silicon + - local: perf_hardware + title: Custom hardware for training + - local: peft + title: Load and train adapters with 🤗 PEFT + - local: sagemaker + title: Run training on Amazon SageMaker + - local: debugging + title: Debugging + - local: model_memory_anatomy + title: Model training anatomy - title: Quantization sections: - local: quantization/overview @@ -100,6 +159,14 @@ title: Optimum - local: quantization/contribute title: Contribute new quantization method +- title: Deploy to production + sections: + - local: serialization + title: Export to ONNX + - local: tflite + title: Export to TFLite + - local: torchscript + title: Export to TorchScript - title: Resources sections: - title: Task recipes @@ -275,8 +342,10 @@ title: Philosophy - local: glossary title: Glossary -- sections: - - isExpanded: false +- title: API + sections: + - title: Main classes + isExpanded: false sections: - local: main_classes/agent title: Agents and Tools @@ -322,9 +391,10 @@ title: Feature Extractor - local: main_classes/image_processor title: Image Processor - title: Main Classes - - sections: - - isExpanded: false + - title: Models + isExpanded: false + sections: + - title: Text models sections: - local: model_doc/albert title: ALBERT @@ -630,12 +700,7 @@ title: XLNet - local: 
model_doc/yoso title: YOSO - - local: model_doc/zamba - title: Zamba - - local: model_doc/zamba2 - title: Zamba2 - title: Text models - - isExpanded: false + - title: Vision models sections: - local: model_doc/beit title: BEiT @@ -761,10 +826,9 @@ title: ViTPose - local: model_doc/yolos title: YOLOS - - local: model_doc/zoedepth - title: ZoeDepth - title: Vision models - - isExpanded: false + - local: model_doc/zamba + title: Zamba + - title: Audio models sections: - local: model_doc/audio-spectrogram-transformer title: Audio Spectrogram Transformer @@ -834,8 +898,7 @@ title: XLS-R - local: model_doc/xlsr_wav2vec2 title: XLSR-Wav2Vec2 - title: Audio models - - isExpanded: false + - title: Video models sections: - local: model_doc/timesformer title: TimeSformer @@ -843,8 +906,7 @@ title: VideoMAE - local: model_doc/vivit title: ViViT - title: Video models - - isExpanded: false + - title: Multimodal models sections: - local: model_doc/align title: ALIGN @@ -982,15 +1044,13 @@ title: VisualBERT - local: model_doc/xclip title: X-CLIP - title: Multimodal models - - isExpanded: false + - title: Reinforcement learning models sections: - local: model_doc/decision_transformer title: Decision Transformer - local: model_doc/trajectory_transformer title: Trajectory Transformer - title: Reinforcement learning models - - isExpanded: false + - title: Time series models sections: - local: model_doc/autoformer title: Autoformer @@ -1002,14 +1062,12 @@ title: PatchTST - local: model_doc/time_series_transformer title: Time Series Transformer - title: Time series models - - isExpanded: false + - title: Graph models sections: - local: model_doc/graphormer title: Graphormer - title: Graph models - title: Models - - isExpanded: false + - title: Internal helpers + isExpanded: false sections: - local: internal/modeling_utils title: Custom Layers and Utilities From c4e88d2fcd5608f99651915f4d48836f80ec9e62 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 22 Jul 2024 09:33:51 -0700 Subject: [PATCH 005/116] update --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 0ca31a2c9340..d778ba901532 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -232,6 +232,8 @@ title: Text to speech - local: tasks/idefics title: Image tasks with IDEFICS + - local: tasks/image_text_to_text + title: Image-text-to-text - local: benchmarks title: Benchmarks - local: notebooks From 0823c9fcd140004cc6f5b29c42e3ca6b9184e125 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 25 Jul 2024 13:49:31 -0700 Subject: [PATCH 006/116] rewrite get started sections --- docs/source/en/_toctree.yml | 2 +- docs/source/en/index.md | 48 ++- docs/source/en/installation.md | 250 +++++--------- docs/source/en/quicktour.md | 604 ++++++++++----------------------- 4 files changed, 280 insertions(+), 624 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d778ba901532..fa8bf7bbd587 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -3,7 +3,7 @@ - local: index title: Transformers - local: quicktour - title: Quicktour + title: Quickstart - local: installation title: Installation - title: Base classes diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 16d2dd3efd27..3864e7fe300e 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -1,4 +1,4 @@ - -# 🤗 Transformers +# Transformers -State-of-the-art Machine Learning for 
[PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), and [JAX](https://jax.readthedocs.io/en/latest/). +Transformers is a library of pretrained natural language processing, computer vision, audio, and multimodal models. -🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as: +It supports the main machine learning frameworks ([PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), and [Flax](https://flax.readthedocs.io/en/latest/)), and provides APIs for inference and training to help you use pretrained models out-of-the-box or train new ones from scratch. -📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, code generation, summarization, translation, multiple choice, and text generation.
-🖼️ **Computer Vision**: image classification, object detection, and segmentation.
-🗣️ **Audio**: automatic speech recognition and audio classification.
-🐙 **Multimodal**: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. +Join us on the [Hugging Face Hub](https://huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb), or [forum](https://discuss.huggingface.co/) today! -🤗 Transformers support framework interoperability between PyTorch, TensorFlow, and JAX. This provides the flexibility to use a different framework at each stage of a model's life; train a model in three lines of code in one framework, and load it for inference in another. Models can also be exported to a format like ONNX and TorchScript for deployment in production environments. +## Features -Join the growing community on the [Hub](https://huggingface.co/models), [forum](https://discuss.huggingface.co/), or [Discord](https://discord.com/invite/JfAtkvEtRb) today! +Transformers provides everything you need for training or inference with state-of-the-art pretrained models. Some of its main features include: -## If you are looking for custom support from the Hugging Face team +- [`Pipeline`]: A high-level API that supports optimized inference for many machine learning tasks like text generation, image segmentation, automatic speech recognition, document question answering, and more. +- [`Trainer`]: A feature-rich API that supports training and distributed training for PyTorch models. It includes many performant and efficient training features such as mixed precision, torch.compile, and FlashAttention. +- [`~GenerationMixin.generate`]: A generation API for large language models (LLMs) and vision language models (VLMs) that supports streaming and multiple decoding strategies for different use cases. - - HuggingFace Expert Acceleration Program - - -## Contents +## Design -The documentation is organized into five sections: +> [!TIP] +> Refer to our [Philosophy](./philosophy) for a more detailed explanation of Transformers' design principles. -- **GET STARTED** provides a quick tour of the library and installation instructions to get up and running. -- **TUTORIALS** are a great place to start if you're a beginner. This section will help you gain the basic skills you need to start using the library. -- **HOW-TO GUIDES** show you how to achieve a specific goal, like finetuning a pretrained model for language modeling or how to write and share a custom model. -- **CONCEPTUAL GUIDES** offers more discussion and explanation of the underlying concepts and ideas behind models, tasks, and the design philosophy of 🤗 Transformers. -- **API** describes all classes and functions: +Transformers is designed for developers and machine learning engineers and researchers alike. Its main design principles are: - - **MAIN CLASSES** details the most important classes like configuration, model, tokenizer, and pipeline. - - **MODELS** details the classes and functions related to each model implemented in the library. - - **INTERNAL HELPERS** details utility classes and functions used internally. +1. Easy and fast to use: Every model is implemented from only three main classes (model, preprocessor, and configuration) and can be quickly used for inference or training with two APIs ([`Pipeline`] or [`Trainer`]). +2. Pretrained models: Reduce your carbon footprint, compute cost and time by using a pretrained model instead of training an entirely new one. Each pretrained model is reproduced as closely as possible to the original model and offers state-of-the-art performance. 
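
As a rough illustration of the first principle, the sketch below (which uses `distilbert/distilbert-base-uncased-finetuned-sst-2-english` only as an example checkpoint) assembles a model, its preprocessor, and its configuration and runs inference in a couple of lines:

```py
from transformers import pipeline

# One call builds the model, tokenizer, and configuration for the task...
classifier = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
# ...and a second call runs inference.
classifier("Transformers makes pretrained models easy to use.")
```
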
+ + HuggingFace Expert Acceleration Program + ## Supported models and frameworks -The table below represents the current support in the library for each of those models, whether they have a Python -tokenizer (called "slow"). A "fast" tokenizer backed by the 🤗 Tokenizers library, whether they have support in Jax (via -Flax), PyTorch, and/or TensorFlow. +Check the table below to see whether a model supports PyTorch, TensorFlow, or JAX. diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 4573efbb43c7..049d2edd3cea 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -1,5 +1,5 @@ -# Quick tour +# Quickstart [[open-in-colab]] -Get up and running with 🤗 Transformers! Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use the [`pipeline`] for inference, load a pretrained model and preprocessor with an [AutoClass](./model_doc/auto), and quickly train a model with PyTorch or TensorFlow. If you're a beginner, we recommend checking out our tutorials or [course](https://huggingface.co/course/chapter1/1) next for more in-depth explanations of the concepts introduced here. +Get up and running with Transformers! Whether you're a developer or a machine learning engineer, this quickstart will show you Transformers' key features. -Before you begin, make sure you have all the necessary libraries installed: +Transformers is a library of pretrained models, providing three classes to instantiate any model and two APIs for inference or training. By limiting the number of user-facing abstractions, Transformers is easier to learn and faster to use. -```bash -!pip install transformers datasets evaluate accelerate -``` - -You'll also need to install your preferred machine learning framework: - - - +Transformers supports popular machine learning frameworks like [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), and [Flax](https://flax.readthedocs.io/en/latest/). Switching between frameworks is easy, granting the flexibility to use the best tool for the job (training, evaluation, or production). -```bash -pip install torch -``` - - +In this quickstart, you'll learn how to: -```bash -pip install tensorflow -``` - - +- load a pretrained model for inference or training +- run inference with the [`Pipeline`] API +- train a model with the [`Trainer`] API -## Pipeline +## Setup - +To start, we recommend creating a Hugging Face [account](https://hf.co/join). This allows you to host and access version controlled models, datasets, and apps on the [Hugging Face Hub](https://hf.co/docs/hub/index), a collaborative platform for discovery and building. -The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below: - - - -For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines). 
- - - -| **Task** | **Description** | **Modality** | **Pipeline identifier** | -|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------| -| Text classification | assign a label to a given sequence of text | NLP | pipeline(task=“sentiment-analysis”) | -| Text generation | generate text given a prompt | NLP | pipeline(task=“text-generation”) | -| Summarization | generate a summary of a sequence of text or document | NLP | pipeline(task=“summarization”) | -| Image classification | assign a label to an image | Computer vision | pipeline(task=“image-classification”) | -| Image segmentation | assign a label to each individual pixel of an image (supports semantic, panoptic, and instance segmentation) | Computer vision | pipeline(task=“image-segmentation”) | -| Object detection | predict the bounding boxes and classes of objects in an image | Computer vision | pipeline(task=“object-detection”) | -| Audio classification | assign a label to some audio data | Audio | pipeline(task=“audio-classification”) | -| Automatic speech recognition | transcribe speech into text | Audio | pipeline(task=“automatic-speech-recognition”) | -| Visual question answering | answer a question about the image, given an image and a question | Multimodal | pipeline(task=“vqa”) | -| Document question answering | answer a question about the document, given a document and a question | Multimodal | pipeline(task="document-question-answering") | -| Image captioning | generate a caption for a given image | Multimodal | pipeline(task="image-to-text") | - -Start by creating an instance of [`pipeline`] and specifying a task you want to use it for. In this guide, you'll use the [`pipeline`] for sentiment analysis as an example: +Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and login to your account. ```py ->>> from transformers import pipeline +from huggingface_hub import notebook_login ->>> classifier = pipeline("sentiment-analysis") +notebook_login() ``` -The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text: +Make sure your preferred machine learning framework is installed. -```py ->>> classifier("We are very happy to show you the 🤗 Transformers library.") -[{'label': 'POSITIVE', 'score': 0.9998}] -``` + + -If you have more than one input, pass your inputs as a list to the [`pipeline`] to return a list of dictionaries: - -```py ->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."]) ->>> for result in results: -... print(f"label: {result['label']}, with score: {round(result['score'], 4)}") -label: POSITIVE, with score: 0.9998 -label: NEGATIVE, with score: 0.5309 +```bash +!pip install torch ``` -The [`pipeline`] can also iterate over an entire dataset for any task you like. 
For this example, let's choose automatic speech recognition as our task: + + -```py ->>> import torch ->>> from transformers import pipeline - ->>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h") +```bash +!pip install tensorflow ``` -Load an audio dataset (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) for more details) you'd like to iterate over. For example, load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset: + + -```py ->>> from datasets import load_dataset, Audio +Install an up-to-date version of Transformers and some additional libraries from the Hugging Face ecosystem for accessing datasets and vision models, evaluating training, and optimizing training for large models. ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT +```bash +!pip install -U transformers datasets evaluate accelerate timm ``` -You need to make sure the sampling rate of the dataset matches the sampling -rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on: +## Base classes -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate)) -``` +Each pretrained model inherits from three base classes. -The audio files are automatically loaded and resampled when calling the `"audio"` column. -Extract the raw waveform arrays from the first 4 samples and pass it as a list to the pipeline: +| **Class** | **Description** | +|---|---| +| [`PretrainedConfig`] | A json file that specifies a models attributes such as the number of attention heads or vocabulary size. | +| [`PreTrainedModel`] | A model (or architecture) defined by the attributes from the configuration file. For training and inference with a task, you need a model with a specific head attached to convert the raw hidden states into task-specific outputs. For example, [`PreTrainedModel`] outputs the raw hidden states but [`AutoModelForCausalLM`] adds a causal language model head on top to output the generated text. | +| Preprocessor | A class for converting raw inputs (text, images, audio, multimodal) into numerical inputs to the model. For example, [`PretrainedTokenizerFast`] converts text into tensors and [`ImageProcessingMixin`] converts pixels into tensors. | -```py ->>> result = speech_recognizer(dataset[:4]["audio"]) ->>> print([d["text"] for d in result]) -['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT'] -``` +Unless you're building a custom model, you'll primarily interact with the [AutoClass](./model_doc/auto) API like [`AutoConfig`], [`AutoModelForCausalLM`], and [`AutoTokenizer`]. An `AutoClass` automatically infers the appropriate architecture for each task and machine learning framework based on the name or path to the pretrained weights and configuration file. 
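
As a minimal sketch of that inference (using `openai-community/gpt2` purely as an example checkpoint), the `AutoClass` API resolves the architecture-specific classes for you:

```py
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# The Auto classes read the checkpoint's configuration file and return the
# matching architecture-specific classes.
config = AutoConfig.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

print(type(config).__name__, type(model).__name__, type(tokenizer).__name__)
# GPT2Config GPT2LMHeadModel GPT2TokenizerFast
```

The same pattern applies to other checkpoints; only the name or path (and, for other tasks, the `AutoModel` variant) changes.
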
-For larger datasets where the inputs are big (like in speech or vision), you'll want to pass a generator instead of a list to load all the inputs in memory. Take a look at the [pipeline API reference](./main_classes/pipelines) for more information. +Use the [`~PreTrainedModel.from_pretrained`] method to load a pretrained models weights and configuration file from the Hub into the model and preprocessor class. -### Use another model and tokenizer in the pipeline + + -The [`pipeline`] can accommodate any model from the [Hub](https://huggingface.co/models), making it easy to adapt the [`pipeline`] for other use-cases. For example, if you'd like a model capable of handling French text, use the tags on the Hub to filter for an appropriate model. The top filtered result returns a multilingual [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) finetuned for sentiment analysis you can use for French text: +When you load a model, especially a large language model (LLM), setting `device_map="auto"` automatically allocates the model weights on your device(s) beginning with the GPU. ```py ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") ``` - - -Use [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `AutoClass` in the next section): +Tokenize the text and convert it into tensors with the tokenizer. To accelerate inference, move the model to a GPU if it's available. ```py ->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) +model_inputs = tokenizer(["Hugging Face is a"], return_tensors="pt").to("cuda") ``` - - -Use [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and it's associated tokenizer (more on an `TFAutoClass` in the next section): -```py ->>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) -``` - - +The model is now ready for inference or training. -Specify the model and tokenizer in the [`pipeline`], and now you can apply the `classifier` on French text: +For inference, pass the tokenized inputs to the [`~GenerationMixin.generate`] API to generate text. Decode the token ids back into text with the [`~PreTrainedTokenizerBase.batch_decode`] method. ```py ->>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) ->>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.") -[{'label': '5 stars', 'score': 0.7273}] +generated_ids = model.generate(**model_inputs, max_length=30) +tokenizer.batch_decode(generated_ids)[0] +' The secret to baking a good cake is 100% in the preparation. There are so many recipes out there,' ``` -If you can't find a model for your use-case, you'll need to finetune a pretrained model on your data. Take a look at our [finetuning tutorial](./training) to learn how. 
Finally, after you've finetuned your pretrained model, please consider [sharing](./model_sharing) the model with the community on the Hub to democratize machine learning for everyone! 🤗 - -## AutoClass - - - -Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and it's associated preprocessing class. - -Let's return to the example from the previous section and see how you can use the `AutoClass` to replicate the results of the [`pipeline`]. - -### AutoTokenizer - -A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model. There are multiple rules that govern the tokenization process, including how to split a word and at what level words should be split (learn more about tokenization in the [tokenizer summary](./tokenizer_summary)). The most important thing to remember is you need to instantiate a tokenizer with the same model name to ensure you're using the same tokenization rules a model was pretrained with. - -Load a tokenizer with [`AutoTokenizer`]: + + ```py ->>> from transformers import AutoTokenizer +from transformers import TFAutoModelForCausalLM, AutoTokenizer ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) +model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2-xl") +tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl") ``` -Pass your text to the tokenizer: +Tokenize the text and convert it into tensors with the tokenizer. To accelerate inference, move the model to a GPU if it's available. ```py ->>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.") ->>> print(encoding) -{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +model_inputs = tokenizer(["Hugging Face is a"], return_tensors="tf") ``` -The tokenizer returns a dictionary containing: - -* [input_ids](./glossary#input-ids): numerical representations of your tokens. -* [attention_mask](./glossary#attention-mask): indicates which tokens should be attended to. - -A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length: +The model is now ready for inference or training. - - +For inference, call the [`~GenerationMixin.generate`] API to generate text and the [`~PreTrainedTokenizerBase.batch_decode`] method to convert the token ids back into text. ```py ->>> pt_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... return_tensors="pt", -... ) +generated_ids = model.generate(**model_inputs, max_length=30) +tokenizer.batch_decode(generated_ids)[0] +'The secret to baking a good cake is \xa0to use the right ingredients. \xa0The secret to baking a good cake is to use the right' ``` - - -```py ->>> tf_batch = tokenizer( -... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], -... padding=True, -... truncation=True, -... max_length=512, -... 
return_tensors="tf", -... ) -``` - - + + + +For training, skip ahead to the [Trainer API](#trainer-api) section to learn how. - +## Pipeline API -Check out the [preprocess](./preprocessing) tutorial for more details about tokenization, and how to use an [`AutoImageProcessor`], [`AutoFeatureExtractor`] and [`AutoProcessor`] to preprocess image, audio, and multimodal inputs. +The [`Pipeline`] is the most convenient way to inference with a pretrained model. It supports many tasks such as text generation, image segmentation, automatic speech recognition, document question answering, and more. - +> [!TIP] +> Check out the [Pipeline](./main_classes/pipelines) API reference for a complete list of available tasks. -### AutoModel +Create a [`Pipeline`] object and select a task. By default, the [`Pipeline`] downloads and caches a default pretrained model for a given task. To choose a specific model, pass the model name to the `model` parameter. - - -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]. + + -By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. +Set `device="cuda"` to accelerate inference with a GPU. ```py ->>> from transformers import AutoModelForSequenceClassification +from transformers import pipeline ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto") +pipeline = pipeline("text-generation", model="meta-llama/Llama-2-7b-hf", device="cuda") ``` - - -See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. - - - -Now pass your preprocessed batch of inputs directly to the model. You just have to unpack the dictionary by adding `**`: +Prompt the [`Pipeline`] with some initial text to generate more text. ```py ->>> pt_outputs = pt_model(**pt_batch) +pipeline("The secret to baking a good cake is ", max_length=50) +[{'generated_text': 'The secret to baking a good cake is 100% in the batter. The secret to a great cake is the icing.\nThis is why we’ve created the best buttercream frosting reci'}] ``` -The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: + + -```py ->>> from torch import nn - ->>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) ->>> print(pt_predictions) -tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], - [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) -``` - - -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`TFAutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`TFAutoModel`] for the task. For text (or sequence) classification, you should load [`TFAutoModelForSequenceClassification`]: +Set `device="cuda"` to accelerate inference with a GPU. 
```py ->>> from transformers import TFAutoModelForSequenceClassification +from transformers import pipeline ->>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) +pipeline = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic", device="cuda") ``` - - -See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class. +Pass an image (a URL or local path to the image) to the [`Pipeline`]. - - -Now pass your preprocessed batch of inputs directly to the model. You can pass the tensors as-is: +
+ +
```py ->>> tf_outputs = tf_model(tf_batch) +segments = pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") +segments[0]["label"] +'bird' +segments[1]["label"] +'bird' ``` -The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities: + + + +Set `device="cuda"` to accelerate inference with a GPU. ```py ->>> import tensorflow as tf +from transformers import pipeline ->>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) ->>> tf_predictions # doctest: +IGNORE_RESULT +pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device="cuda") ``` -
-
- +Pass an audio file to the [`Pipeline`]. -All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors *before* the final activation -function (like softmax) because the final activation function is often fused with the loss. Model outputs are special dataclasses so their attributes are autocompleted in an IDE. The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string) in which case, attributes that are None are ignored. +```py +pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") +{'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'} +``` - + + -### Save a model +## Trainer API - - -Once your model is fine-tuned, you can save it with its tokenizer using [`PreTrainedModel.save_pretrained`]: +The [`Trainer`] is an optimized training and evaluation loop for PyTorch models, a [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). It abstracts away a lot of the standard boilerplate involved in manually writing a training loop, allowing you to start training faster and focus on training design choices. You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset. -```py ->>> pt_save_directory = "./pt_save_pretrained" ->>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT ->>> pt_model.save_pretrained(pt_save_directory) -``` +Customize the training process with the [`TrainingArguments`] class. It provides many options for training, evaluation, and more. The training process can be as complex or simple as you want or need. Experiment with training hyperparameters and features like batch size, learning rate, mixed precision, torch.compile, and more. Or if you prefer, just use the default settings to quickly produce a baseline. -When you are ready to use the model again, reload it with [`PreTrainedModel.from_pretrained`]: +Load a model, tokenizer, and dataset for training. ```py ->>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") -``` - - -Once your model is fine-tuned, you can save it with its tokenizer using [`TFPreTrainedModel.save_pretrained`]: +from transformers import AutoModelForSequenceClassification, AutoTokenizer +from datasets import load_dataset -```py ->>> tf_save_directory = "./tf_save_pretrained" ->>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT ->>> tf_model.save_pretrained(tf_save_directory) +model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT ``` -When you are ready to use the model again, reload it with [`TFPreTrainedModel.from_pretrained`]: +Create a function to tokenize the text and convert it into tensors. Apply this function to the whole dataset with the [`datasets.Dataset.map`] method. ```py ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") +def tokenize_dataset(dataset): + return tokenizer(dataset["text"]) +dataset = dataset.map(tokenize_dataset, batched=True) ``` - - - -One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. 
The `from_pt` or `from_tf` parameter can convert the model from one framework to the other: - - - - -```py ->>> from transformers import AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) -``` - - +Load a data collator to create batches of data. ```py ->>> from transformers import TFAutoModel +from transformers import DataCollatorWithPadding ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) ``` - - - -## Custom model builds - -You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results. -Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads: +Next, create an instance of [`TrainingArguments`] to customize the training process. ```py ->>> from transformers import AutoConfig +from transformers import TrainingArguments ->>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12) +training_args = TrainingArguments( + output_dir="distilbert-rotten-tomatoes", + learning_rate=2e-5, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + num_train_epochs=2, + push_to_hub=True, +) ``` - - -Create a model from your custom configuration with [`AutoModel.from_config`]: +Finally, pass all these separate components to [`Trainer`] and call the [`~Trainer.train`] method to start. ```py ->>> from transformers import AutoModel +from transformers import Trainer ->>> my_model = AutoModel.from_config(my_config) -``` - - -Create a model from your custom configuration with [`TFAutoModel.from_config`]: - -```py ->>> from transformers import TFAutoModel +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + tokenizer=tokenizer, + data_collator=data_collator, +) # doctest: +SKIP ->>> my_model = TFAutoModel.from_config(my_config) +trainer.train() ``` - - - -Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations. - -## Trainer - a PyTorch optimized training loop - -All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a [`Trainer`] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more. - -Depending on your task, you'll typically pass the following parameters to [`Trainer`]: -1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module). Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in. +Push your model to the Hub to share it with the community. 
- ```py - >>> from transformers import AutoModelForSequenceClassification - - >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") - ``` - -2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: - - ```py - >>> from transformers import TrainingArguments - - >>> training_args = TrainingArguments( - ... output_dir="path/to/save/folder/", - ... learning_rate=2e-5, - ... per_device_train_batch_size=8, - ... per_device_eval_batch_size=8, - ... num_train_epochs=2, - ... ) - ``` - -3. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: - - ```py - >>> from transformers import AutoTokenizer - - >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") - ``` - -4. Load a dataset: - - ```py - >>> from datasets import load_dataset - - >>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT - ``` +```py +trainer.push_to_hub() +``` -5. Create a function to tokenize the dataset: +Congratulations, you just trained your first model with Transformers! - ```py - >>> def tokenize_dataset(dataset): - ... return tokenizer(dataset["text"]) - ``` +### TensorFlow - Then apply it over the entire dataset with [`~datasets.Dataset.map`]: +> [!WARNING] +> Not all pretrained models are available in TensorFlow. Check which ones are implemented in [Supported models and frameworks](./index#supported-models-and-frameworks). - ```py - >>> dataset = dataset.map(tokenize_dataset, batched=True) - ``` +[`Trainer`] doesn't work with TensorFlow models, but you can still train one with [Keras](https://keras.io/). Transformers implements TensorFlow models as a standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), which is compatible with Keras' [compile](https://keras.io/api/models/model_training_apis/#compile-method) and [fit](https://keras.io/api/models/model_training_apis/#fit-method) methods. -6. A [`DataCollatorWithPadding`] to create a batch of examples from your dataset: +Load a model, tokenizer, and dataset for training. - ```py - >>> from transformers import DataCollatorWithPadding +```py +from transformers import TFAutoModelForSequenceClassification, AutoTokenizer - >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - ``` +model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` -Now gather all these classes in [`Trainer`]: +Create a function to tokenize the text and convert it into tensors. Apply this function to the whole dataset with the [`datasets.Dataset.map`] method. ```py ->>> from transformers import Trainer - ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=dataset["train"], -... eval_dataset=dataset["test"], -... processing_class=tokenizer, -... data_collator=data_collator, -... ) # doctest: +SKIP +def tokenize_dataset(dataset): + return tokenizer(dataset["text"]) # doctest: +SKIP +dataset = dataset.map(tokenize_dataset) # doctest: +SKIP ``` -When you're ready, call [`~Trainer.train`] to start training: +Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset] method to collate and batch a dataset. 
```py ->>> trainer.train() # doctest: +SKIP +tf_dataset = model.prepare_tf_dataset( + dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer +) # doctest: +SKIP ``` - +Finally, call [compile](https://keras.io/api/models/model_training_apis/#compile-method) to configure the model for training and [fit](https://keras.io/api/models/model_training_apis/#fit-method) to start. -For tasks - like translation or summarization - that use a sequence-to-sequence model, use the [`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] classes instead. - - - -You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed. - -The other way to customize the training loop is by using [Callbacks](./main_classes/callback). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass the [`Trainer`] instead. - -## Train with TensorFlow - -All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) so they can be trained in TensorFlow with the [Keras](https://keras.io/) API. 🤗 Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to easily load your dataset as a `tf.data.Dataset` so you can start training right away with Keras' [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) methods. - -1. You'll start with a [`TFPreTrainedModel`] or a [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model): - - ```py - >>> from transformers import TFAutoModelForSequenceClassification - - >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") - ``` - -2. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: - - ```py - >>> from transformers import AutoTokenizer - - >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") - ``` - -3. Create a function to tokenize the dataset: - - ```py - >>> def tokenize_dataset(dataset): - ... return tokenizer(dataset["text"]) # doctest: +SKIP - ``` - -4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`] and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like: - - ```py - >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP - >>> tf_dataset = model.prepare_tf_dataset( - ... dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer - ... ) # doctest: +SKIP - ``` - -5. When you're ready, you can call `compile` and `fit` to start training. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: - - ```py - >>> from tensorflow.keras.optimizers import Adam - - >>> model.compile(optimizer='adam') # No loss argument! - >>> model.fit(tf_dataset) # doctest: +SKIP - ``` - - -## Chat with text generation models - -If you're working with a model that generates text as an output, you can also engage in a multi-turn conversation with -it through the `transformers-cli chat` command. 
This is the fastest way to interact with a model, e.g. for a -qualitative assessment (aka vibe check). - -This CLI is implemented on top of our `AutoClass` abstraction, leveraging our [text generation](llm_tutorial.md) and -[chat](chat_templating.md) tooling, and thus will be compatible with any 🤗 Transformers model. If you have the library -[installed](installation.md), you can launch the chat session on your terminal with - -```bash -transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct -``` - -For a full list of options to launch the chat, type +```py +from tensorflow.keras.optimizers import Adam -```bash -transformers-cli chat -h +model.compile(optimizer="adam") +model.fit(tf_dataset) # doctest: +SKIP ``` -After the chat is launched, you will enter an interactive session with the model. There are special commands for this -session as well, such as `clear` to reset the conversation. Type `help` at any moment to display all special chat -commands, and `exit` to terminate the session. +## Next steps +Great work on completing the quickstart! You have only scratched the surface of what you can achieve with Transformers. -## What's next? +Now that you have a better understanding of the library, it is time to keep exploring and learning what interests you the most. -Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides! +Base classes: Learn more about the base classes and the model and processor classes that inherit from it. This will help you understand how to write your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. +Inference: Dive deeper into inference with the [`Pipeline`] API, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. +Training: Explore the [`Trainer`] API in more detail, distributed training, and optimizing training on your hardware. +Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights in fewer bits. +Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! From edb848c0b139bca9e955cda2930d08a3c2601393 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 25 Jul 2024 14:49:38 -0700 Subject: [PATCH 007/116] fixes --- docs/source/en/autoclass_tutorial.md | 2 +- docs/source/en/index.md | 12 +++++++----- docs/source/en/quicktour.md | 16 ++++++++-------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md index 33f48b2b043f..f4601fba1e6f 100644 --- a/docs/source/en/autoclass_tutorial.md +++ b/docs/source/en/autoclass_tutorial.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. --> -# Load pretrained instances with an AutoClass +# AutoClass API With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. 
The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 3864e7fe300e..22696effd403 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -36,12 +36,14 @@ Transformers provides everything you need for training or inference with state-o Transformers is designed for developers and machine learning engineers and researchers alike. Its main design principles are: -1. Easy and fast to use: Every model is implemented from only three main classes (model, preprocessor, and configuration) and can be quickly used for inference or training with two APIs ([`Pipeline`] or [`Trainer`]). -2. Pretrained models: Reduce your carbon footprint, compute cost and time by using a pretrained model instead of training an entirely new one. Each pretrained model is reproduced as closely as possible to the original model and offers state-of-the-art performance. +1. Easy and fast to use: Every model is implemented from only three main classes (model, preprocessor, and configuration) and can be quickly used for inference or training with two APIs ([`Pipeline`] or [`Trainer`]). +2. Pretrained models: Reduce your carbon footprint, compute cost and time by using a pretrained model instead of training an entirely new one. Each pretrained model is reproduced as closely as possible to the original model and offers state-of-the-art performance. - - HuggingFace Expert Acceleration Program - + ## Supported models and frameworks diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 69c9872d214d..4c3a5f0549d5 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -75,7 +75,7 @@ Each pretrained model inherits from three base classes. |---|---| | [`PretrainedConfig`] | A json file that specifies a models attributes such as the number of attention heads or vocabulary size. | | [`PreTrainedModel`] | A model (or architecture) defined by the attributes from the configuration file. For training and inference with a task, you need a model with a specific head attached to convert the raw hidden states into task-specific outputs. For example, [`PreTrainedModel`] outputs the raw hidden states but [`AutoModelForCausalLM`] adds a causal language model head on top to output the generated text. | -| Preprocessor | A class for converting raw inputs (text, images, audio, multimodal) into numerical inputs to the model. For example, [`PretrainedTokenizerFast`] converts text into tensors and [`ImageProcessingMixin`] converts pixels into tensors. | +| Preprocessor | A class for converting raw inputs (text, images, audio, multimodal) into numerical inputs to the model. For example, [`PreTrainedTokenizer`] converts text into tensors and [`ImageProcessingMixin`] converts pixels into tensors. | Unless you're building a custom model, you'll primarily interact with the [AutoClass](./model_doc/auto) API like [`AutoConfig`], [`AutoModelForCausalLM`], and [`AutoTokenizer`]. An `AutoClass` automatically infers the appropriate architecture for each task and machine learning framework based on the name or path to the pretrained weights and configuration file. 
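As a minimal sketch of that workflow (reusing the `distilbert/distilbert-base-uncased` checkpoint referenced throughout this guide), the same checkpoint name resolves the configuration, model, and tokenizer without naming any model-specific class.

```py
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert/distilbert-base-uncased"

# each AutoClass infers the DistilBERT architecture from the checkpoint's configuration file
config = AutoConfig.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
```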
@@ -302,7 +302,7 @@ model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilb tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") ``` -Create a function to tokenize the text and convert it into tensors. Apply this function to the whole dataset with the [`datasets.Dataset.map`] method. +Create a function to tokenize the text and convert it into tensors. Apply this function to the whole dataset with the [`~datasets.Dataset.map`] method. ```py def tokenize_dataset(dataset): @@ -310,7 +310,7 @@ def tokenize_dataset(dataset): dataset = dataset.map(tokenize_dataset) # doctest: +SKIP ``` -Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset] method to collate and batch a dataset. +Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to collate and batch a dataset. ```py tf_dataset = model.prepare_tf_dataset( @@ -333,8 +333,8 @@ Great work on completing the quickstart! You have only scratched the surface of Now that you have a better understanding of the library, it is time to keep exploring and learning what interests you the most. -Base classes: Learn more about the base classes and the model and processor classes that inherit from it. This will help you understand how to write your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. -Inference: Dive deeper into inference with the [`Pipeline`] API, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. -Training: Explore the [`Trainer`] API in more detail, distributed training, and optimizing training on your hardware. -Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights in fewer bits. -Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! +Base classes: Learn more about the base classes and the model and processor classes that inherit from it. This will help you understand how to write your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. +Inference: Dive deeper into inference with the [`Pipeline`] API, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. +Training: Explore the [`Trainer`] API in more detail, distributed training, and optimizing training on your hardware. +Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights in fewer bits. +Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! From cdddc34a40f31f48cf9038c4dc420f45439f16c4 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 26 Jul 2024 08:51:01 -0700 Subject: [PATCH 008/116] fix --- docs/source/en/quicktour.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 4c3a5f0549d5..4995b9cf5f27 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -333,8 +333,8 @@ Great work on completing the quickstart! You have only scratched the surface of Now that you have a better understanding of the library, it is time to keep exploring and learning what interests you the most. 
-Base classes: Learn more about the base classes and the model and processor classes that inherit from it. This will help you understand how to write your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. -Inference: Dive deeper into inference with the [`Pipeline`] API, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. -Training: Explore the [`Trainer`] API in more detail, distributed training, and optimizing training on your hardware. -Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights in fewer bits. -Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! +- Base classes: Learn more about the base classes and the model and processor classes that inherit from it. This will help you understand how to write your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. +- Inference: Dive deeper into inference with the [`Pipeline`] API, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. +- Training: Explore the [`Trainer`] API in more detail, distributed training, and optimizing training on your hardware. +- Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights in fewer bits. +- Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! From 8685e54f098e0bc242caf2a01e996acb4c97ff0c Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 30 Jul 2024 14:37:08 -0700 Subject: [PATCH 009/116] loading models --- docs/source/en/_toctree.yml | 12 +- docs/source/en/create_a_model.md | 472 ------------------------------- docs/source/en/models.md | 291 +++++++++++++++++++ docs/source/en/quicktour.md | 2 +- 4 files changed, 297 insertions(+), 480 deletions(-) delete mode 100644 docs/source/en/create_a_model.md create mode 100644 docs/source/en/models.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index fa8bf7bbd587..6354738307c9 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -11,14 +11,12 @@ - title: Models isExpanded: false sections: - - local: autoclass_tutorial - title: AutoClass API - - local: create_a_model - title: Use model-specific APIs + - local: models + title: Load - local: custom_models - title: Share a custom model + title: Create a custom model - local: model_sharing - title: Share your model + title: Share - local: add_new_model title: How to add a model to 🤗 Transformers? - local: task_summary @@ -31,7 +29,7 @@ title: Attention mechanisms - local: bertology title: BERTology - - title: Tokenizers + - title: Preprocessors isExpanded: false sections: - local: preprocessing diff --git a/docs/source/en/create_a_model.md b/docs/source/en/create_a_model.md deleted file mode 100644 index 0ecc503df615..000000000000 --- a/docs/source/en/create_a_model.md +++ /dev/null @@ -1,472 +0,0 @@ - - -# Create a custom architecture - -An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. 
But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. Learn how to: - -- Load and customize a model configuration. -- Create a model architecture. -- Create a slow and fast tokenizer for text. -- Create an image processor for vision tasks. -- Create a feature extractor for audio tasks. -- Create a processor for multimodal tasks. - -## Configuration - -A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with. - -Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect it's attributes: - -```py ->>> from transformers import DistilBertConfig - ->>> config = DistilBertConfig() ->>> print(config) -DistilBertConfig { - "activation": "gelu", - "attention_dropout": 0.1, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to: - -- Try a different activation function with the `activation` parameter. -- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter. - -```py ->>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) ->>> print(my_config) -DistilBertConfig { - "activation": "relu", - "attention_dropout": 0.4, - "dim": 768, - "dropout": 0.1, - "hidden_dim": 3072, - "initializer_range": 0.02, - "max_position_embeddings": 512, - "model_type": "distilbert", - "n_heads": 12, - "n_layers": 6, - "pad_token_id": 0, - "qa_dropout": 0.1, - "seq_classif_dropout": 0.2, - "sinusoidal_pos_embds": false, - "transformers_version": "4.16.2", - "vocab_size": 30522 -} -``` - -Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function: - -```py ->>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4) -``` - -Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory: - -```py ->>> my_config.save_pretrained(save_directory="./your_model_save_path") -``` - -To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]: - -```py ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") -``` - - - -You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! 
See the [configuration](main_classes/configuration) documentation for more details. - - - -## Model - -The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture. Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. This means models are compatible with each of their respective framework's usage. - - - -Load your custom configuration attributes into the model: - -```py ->>> from transformers import DistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") ->>> model = DistilBertModel(my_config) -``` - -This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. - -Create a pretrained model with [`~PreTrainedModel.from_pretrained`]: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") -``` - -When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: - -```py ->>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) -``` - - -Load your custom configuration attributes into the model: - -```py ->>> from transformers import TFDistilBertModel - ->>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") ->>> tf_model = TFDistilBertModel(my_config) -``` - -This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful yet until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. - -Create a pretrained model with [`~TFPreTrainedModel.from_pretrained`]: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") -``` - -When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: - -```py ->>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) -``` - - - -### Model heads - -At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 
🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). - - - -For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. - -```py ->>> from transformers import DistilBertForSequenceClassification - ->>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") -``` - -Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. - -```py ->>> from transformers import DistilBertForQuestionAnswering - ->>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") -``` - - -For example, [`TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs. - -```py ->>> from transformers import TFDistilBertForSequenceClassification - ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") -``` - -Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`TFDistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. - -```py ->>> from transformers import TFDistilBertForQuestionAnswering - ->>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") -``` - - - -## Tokenizer - -The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers: - -- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer. -- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. The fast tokenizer also offers additional methods like *offset mapping* which maps tokens to their original words or characters. - -Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. - - - -Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support. - - - -If you trained your own tokenizer, you can create one from your *vocabulary* file: - -```py ->>> from transformers import DistilBertTokenizer - ->>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") -``` - -It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. 
Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class: - -```py ->>> from transformers import DistilBertTokenizer - ->>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased") -``` - -Create a fast tokenizer with the [`DistilBertTokenizerFast`] class: - -```py ->>> from transformers import DistilBertTokenizerFast - ->>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased") -``` - - - -By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`. - - - -## Image processor - -An image processor processes vision inputs. It inherits from the base [`~image_processing_utils.ImageProcessingMixin`] class. - -To use, create an image processor associated with the model you're using. For example, create a default [`ViTImageProcessor`] if you are using [ViT](model_doc/vit) for image classification: - -```py ->>> from transformers import ViTImageProcessor - ->>> vit_extractor = ViTImageProcessor() ->>> print(vit_extractor) -ViTImageProcessor { - "do_normalize": true, - "do_resize": true, - "image_processor_type": "ViTImageProcessor", - "image_mean": [ - 0.5, - 0.5, - 0.5 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": 2, - "size": 224 -} -``` - - - -If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default image processor parameters. - - - -Modify any of the [`ViTImageProcessor`] parameters to create your custom image processor: - -```py ->>> from transformers import ViTImageProcessor - ->>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) ->>> print(my_vit_extractor) -ViTImageProcessor { - "do_normalize": false, - "do_resize": true, - "image_processor_type": "ViTImageProcessor", - "image_mean": [ - 0.3, - 0.3, - 0.3 - ], - "image_std": [ - 0.5, - 0.5, - 0.5 - ], - "resample": "PIL.Image.BOX", - "size": 224 -} -``` - -## Backbone - -
- -
- -Computer vision models consist of a backbone, neck, and head. The backbone extracts features from an input image, the neck combines and enhances the extracted features, and the head is used for the main task (e.g., object detection). Start by initializing a backbone in the model config and specify whether you want to load pretrained weights or load randomly initialized weights. Then you can pass the model config to the model head. - -For example, to load a [ResNet](../model_doc/resnet) backbone into a [MaskFormer](../model_doc/maskformer) model with an instance segmentation head: - - - - -Set `use_pretrained_backbone=True` to load pretrained ResNet weights for the backbone. - -```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) # backbone and neck config -model = MaskFormerForInstanceSegmentation(config) # head -``` - - - - -Set `use_pretrained_backbone=False` to randomly initialize a ResNet backbone. - -```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) # backbone and neck config -model = MaskFormerForInstanceSegmentation(config) # head -``` - -You could also load the backbone config separately and then pass it to the model config. - -```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig - -backbone_config = ResNetConfig() -config = MaskFormerConfig(backbone_config=backbone_config) -model = MaskFormerForInstanceSegmentation(config) -``` - - - - -[timm](https://hf.co/docs/timm/index) models are loaded within a model with `use_timm_backbone=True` or with [`TimmBackbone`] and [`TimmBackboneConfig`]. - -Use `use_timm_backbone=True` and `use_pretrained_backbone=True` to load pretrained timm weights for the backbone. - -```python -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) # backbone and neck config -model = MaskFormerForInstanceSegmentation(config) # head -``` - -Set `use_timm_backbone=True` and `use_pretrained_backbone=False` to load a randomly initialized timm backbone. - -```python -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) # backbone and neck config -model = MaskFormerForInstanceSegmentation(config) # head -``` - -You could also load the backbone config and use it to create a `TimmBackbone` or pass it to the model config. Timm backbones will load pretrained weights by default. Set `use_pretrained_backbone=False` to load randomly initialized weights. - -```python -from transformers import TimmBackboneConfig, TimmBackbone - -backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=False) - -# Create a backbone class -backbone = TimmBackbone(config=backbone_config) - -# Create a model with a timm backbone -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone_config=backbone_config) -model = MaskFormerForInstanceSegmentation(config) -``` - -## Feature extractor - -A feature extractor processes audio inputs. 
It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`SequenceFeatureExtractor`] class for processing audio inputs. - -To use, create a feature extractor associated with the model you're using. For example, create a default [`Wav2Vec2FeatureExtractor`] if you are using [Wav2Vec2](model_doc/wav2vec2) for audio classification: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor() ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": true, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 16000 -} -``` - - - -If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters. - - - -Modify any of the [`Wav2Vec2FeatureExtractor`] parameters to create your custom feature extractor: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) ->>> print(w2v2_extractor) -Wav2Vec2FeatureExtractor { - "do_normalize": false, - "feature_extractor_type": "Wav2Vec2FeatureExtractor", - "feature_size": 1, - "padding_side": "right", - "padding_value": 0.0, - "return_attention_mask": false, - "sampling_rate": 8000 -} -``` - -## Processor - -For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. - -Create a feature extractor to handle the audio inputs: - -```py ->>> from transformers import Wav2Vec2FeatureExtractor - ->>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) -``` - -Create a tokenizer to handle the text inputs: - -```py ->>> from transformers import Wav2Vec2CTCTokenizer - ->>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") -``` - -Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]: - -```py ->>> from transformers import Wav2Vec2Processor - ->>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) -``` - -With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes are configurable, allowing you to use the specific attributes you want. You can easily setup a model for training or modify an existing pretrained model to fine-tune. diff --git a/docs/source/en/models.md b/docs/source/en/models.md new file mode 100644 index 000000000000..ed734f4d4a80 --- /dev/null +++ b/docs/source/en/models.md @@ -0,0 +1,291 @@ + + +# Load + +Transformers provides many pretrained models that are ready to use with just a single line of code. It requires a model class and the [`~PreTrainedModel.from_pretrained`] method. + +To load a model, call the [`~PreTrainedModel.from_pretrained`] method to download and load the model weights and configuration stored on the Hugging Face [Hub](https://hf.co/models) into the model class. 
+ +> [!TIP] +> The [`~PreTrainedModel.from_pretrained`] method loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") +``` + +This guide will briefly explain how models are loaded, the different ways you can load a model, and how to overcome memory issues for really big models. + +## Models and configurations + +All models have a `configuration.py` file with specific attributes like the number of hidden layers, vocabulary size, activation function, and more. You'll also find a `modeling.py` file that defines the layers and mathematical operations taking place inside each layer. The `modeling.py` file takes the model attributes in `configuration.py` and builds the model accordingly. At this point, you have a model with random weights that needs to be trained to output meaningful results. + + + +> [!TIP] +> An *architecture* refers to the model's skeleton and a *checkpoint* refers to the model's weights for a given architecture. For example, [BERT](./model_doc/bert) is an architecture while [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) is a checkpoint. You'll see the term *model* used interchangeably for architecture and checkpoint. + +To get a pretrained model, you need to load the weights into the model. This is done by calling the [`~PreTrainedModel.from_pretrained`] method which accepts weights from the Hugging Face Hub or a local directory. + +There are two general types of models you can load: + +1. A generic model class like [`LlamaModel`] or [`AutoModel`] that outputs hidden states. +2. A model class with a specific *head* attached to the generic model, like [`LlamaForCausalLM`] or [`AutoModelForCausalLM`], for performing specific tasks. + +For each model type, there is a separate class for each machine learning framework (PyTorch, TensorFlow, Flax). Pick the corresponding prefix for the framework you're using. + + + + +```py +from transformers import AutoModelForCausalLM, MistralForCausalLM + +# load with AutoClass or model-specific class +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +``` + + + + +```py +from transformers import TFAutoModelForCausalLM, TFMistralForCausalLM + +# load with AutoClass or model-specific class +model = TFAutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +model = TFMistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +``` + + + + +```py +from transformers import FlaxAutoModelForCausalLM, FlaxMistralForCausalLM + +# load with AutoClass or model-specific class +model = FlaxAutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +model = FlaxMistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +``` + + + + +## AutoClass + + + +The [AutoClass](./model_doc/auto) API is a convenient way to load an architecture without needing to know the exact model class name, because there are many architectures. It automatically selects the correct model class based on the configuration file. You only need to know the task and checkpoint you want to use. 
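As a rough illustration of that lookup (using the `mistralai/Mistral-7B-v0.1` checkpoint already shown above), the configuration saved with a checkpoint records its model type, and the `AutoClass` maps that type to a concrete model class.

```py
from transformers import AutoConfig, AutoModelForCausalLM

# the checkpoint's configuration file declares the architecture
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
print(config.model_type)  # "mistral"

# AutoModelForCausalLM reads that configuration and instantiates MistralForCausalLM
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
print(type(model).__name__)  # "MistralForCausalLM"
```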
+ +The AutoClass makes it easy to switch between models or tasks, as long as the architecture is supported for a given task. + +For example, the same model can be used for separate tasks. + +```py +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForQuestionAnswering + +# use the same API for 3 different tasks +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") +model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7b-hf") +model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-2-7b-hf") +``` + +In other cases, you want to quickly try out several models for a task. + +```py +from transformers import AutoModelForCausalLM + +# use the same API to load 3 different models +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +model = AutoModelForCausalLM.from_pretrained("google/gemma-7b") +``` + +## Model-specific class + +The [AutoClass](#autoclass) builds on top of model-specific classes. All model classes that support a specific task are mapped to their respective `AutoModelFor` task class. + +But if you already know which model class you want to use, then you could use its model-specific class directly. + +```py +from transformers import LlamaModel, LlamaForCausalLM + +model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") +``` + +## Big models + +Large pretrained models require a lot of memory to load. The loading process involves: + +1. creating a model with random weights +2. loading the pretrained weights +3. placing the pretrained weights on the model + +You need enough memory to hold two copies of the model weights (random and pretrained) which may not be possible depending on your hardware. In distributed training environments, this is an even bigger challenge because each process loads a pretrained model. + +Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, leveraging Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types. + +### Fast initialization + +A PyTorch model is instantiated with random weights, or "empty" tensors, that take up space in memory without filling it. + +Transformers boosts loading speed and avoids random weight initialization with the [_fast_init](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter if the pretrained weights are correctly initialized. This parameter is set to `True` by default. + +### Sharded checkpoints + +For big models with sharded checkpoints, each shard is loaded sequentially after the previous shard is loaded. This limits memory-usage to only the model size and the largest shard size. + +Transformers' [`~PreTrainedModel.save_pretrained`] method automatically shards checkpoints larger than 10GB. + +The `max_shard_size` parameter defaults to 5GB for each shard because it is easier to run on free-tier GPU instances without running out of memory. + +For example, let's shard [BioMistral/BioMistral-7B](https://hf.co/BioMistral/BioMistral-7B). 
+ +```py +from transformers import AutoModel +import tempfile +import os + +model = AutoModel.from_pretrained("biomistral/biomistral-7b") +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="5GB") + print(sorted(os.listdir(tmp_dir))) +``` + +Reload the sharded checkpoint with [`~PreTrainedModel.from_pretrained`]. + +```py +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + new_model = AutoModel.from_pretrained(tmp_dir) +``` + +Sharded checkpoints can also be directly loaded with the [`~transformers.modeling_utils.load_sharded_checkpoint] method. + +```py +from transformers.modeling_utils import load_sharded_checkpoint + +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="5GB") + load_sharded_checkpoint(model, tmp_dir) +``` + +#### Model metadata + +Transformers' [`~PreTrainedModel.save_pretrained`] method creates an index file that maps parameter names to the files they're stored in. The index file has two keys, `metadata` and `weight_map`. + +```py +import json + +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="5GB") + with open(os.path.join(tmp_dir, "model.safetensors.index.json"), "r") as f: + index = json.load(f) + +print(index.keys()) +``` + +The `metadata` key provides the total model size. + +```py +index["metadata"] +{'total_size': 28966928384} +``` + +The `weight_map` key maps each parameter to the shard it's stored in. + +```py +index["weight_map"] +{'lm_head.weight': 'model-00006-of-00006.safetensors', + 'model.embed_tokens.weight': 'model-00001-of-00006.safetensors', + 'model.layers.0.input_layernorm.weight': 'model-00001-of-00006.safetensors', + 'model.layers.0.mlp.down_proj.weight': 'model-00001-of-00006.safetensors', + ... +} +``` + +### Big Model Inference + +> [!TIP] +> Make sure you have Accelerate v0.9.0 and PyTorch v1.9.0 or later installed to use this feature! + + + +The [`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling). + +Big Model Inference creates a *model skeleton* on PyTorch's [meta](https://pytorch.org/docs/main/meta.html) device. The meta device doesn't store any real data, only the metadata. + +Randomly initialized weights are only created when the pretrained weights are loaded to avoid maintaining two copies of the model in memory at the same time. The maximum memory-usage is only the size of the model. + +> [!TIP] +> Learn more about device placement in [Designing a device map](https://hf.co/docs/accelerate/v0.33.0/en/concept_guides/big_model_inference#designing-a-device-map) section. + +Big Model Inference's second feature relates to how weights are loaded and dispatched in the model skeleton. Model weights are dispatched across all available devices, starting with the fastest device (usually the GPU) and then offloading any remaining weights to slower devices (CPU and hard drive). + +Both features combine reduced memory-usage and faster loading times for big pretrained models. + +Set the [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) parameter to `"auto"` to enable Big Model Inference. This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`. 
+ +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto") +``` + +To manually assign layers to devices, create a `device_map`. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device. Access the `hf_device_map` attribute to see how the model is distributed across devices. + +```py +device_map = {"model.layers.1": 0, "model.layers.14": 1, "model.layers.31": "cpu", "lm_head": "disk"} +model.hf_device_map +``` + +### Model data type + +PyTorch model weights are initialized as torch.float32. To load a model in a different data type, like torch.float16, it requires additional memory to load the model again in the desired data type. + +Explicitly set the [torch_dtype]() parameter to directly initialize the model in the desired data type instead of essentially loading a model twice (torch.float32, torch.float16). You could also set `torch_dtype="auto"` to automatically load the weights with the most optimal memory pattern (the data type is derived from the model weights). + + + + +```py +from transformers import AutoModelForCausalLM + +gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16) +``` + + + + +```py +from transformers import AutoModelForCausalLM + +gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype="auto") +``` + + + + +The `torch_dtype` parameter can also be configured in [`AutoConfig`] for models instantiated from scratch. + +```py +import torch +from transformers import AutoConfig, AutoModel + +my_config = AutoConfig.from_pretrained("google/gemma-2b", torch_dtype=torch.float16) +model = AutoModel.from_config(my_config) +``` diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 4995b9cf5f27..c567f0647007 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -73,7 +73,7 @@ Each pretrained model inherits from three base classes. | **Class** | **Description** | |---|---| -| [`PretrainedConfig`] | A json file that specifies a models attributes such as the number of attention heads or vocabulary size. | +| [`PretrainedConfig`] | A file that specifies a models attributes such as the number of attention heads or vocabulary size. | | [`PreTrainedModel`] | A model (or architecture) defined by the attributes from the configuration file. For training and inference with a task, you need a model with a specific head attached to convert the raw hidden states into task-specific outputs. For example, [`PreTrainedModel`] outputs the raw hidden states but [`AutoModelForCausalLM`] adds a causal language model head on top to output the generated text. | | Preprocessor | A class for converting raw inputs (text, images, audio, multimodal) into numerical inputs to the model. For example, [`PreTrainedTokenizer`] converts text into tensors and [`ImageProcessingMixin`] converts pixels into tensors. 
| From 6c3fa1e926a8bbbf0aa44c5c3973fa041aec1280 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 30 Jul 2024 14:41:25 -0700 Subject: [PATCH 010/116] fix --- utils/not_doctested.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index 19f157b2b9ba..43190d27bb42 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -9,7 +9,6 @@ docs/source/en/bertology.md docs/source/en/big_models.md docs/source/en/community.md docs/source/en/contributing.md -docs/source/en/create_a_model.md docs/source/en/custom_models.md docs/source/en/debugging.md docs/source/en/fast_tokenizers.md From 712b713a5f42dff7326a31d6372c6c2fe7c22567 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 1 Aug 2024 11:16:53 -0700 Subject: [PATCH 011/116] customize models --- docs/source/en/_toctree.yml | 2 +- docs/source/en/custom_models.md | 241 ++++++++++---------------------- docs/source/en/models.md | 27 +++- 3 files changed, 101 insertions(+), 169 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 6354738307c9..236873e8de43 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -14,7 +14,7 @@ - local: models title: Load - local: custom_models - title: Create a custom model + title: Customize - local: model_sharing title: Share - local: add_new_model diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md index 6599ded962d1..a22c51e19d3b 100644 --- a/docs/source/en/custom_models.md +++ b/docs/source/en/custom_models.md @@ -1,4 +1,4 @@ - -# Building custom models +# Customize -The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder -of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs. +Transformers models are easily customizable. Models are fully contained in the [model](https://github.com/huggingface/transformers/tree/main/src/transformers/models) subfolder of the Transformers repository. Each folder contains a `modeling.py` and a `configuration.py` file. Copy these files to start customizing a model. -If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you -how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it -with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗 -Transformers library. We'll see how to build upon transformers and extend the framework with your hooks and -custom code. +> [!TIP] +> It may be easier to start from scratch if you're creating an entirely new model. For models that are very similar to an existing one in Transformers, it is faster to reuse or subclass the same configuration and model class. -We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the -[timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`]. +This guide will show you how to customize a ResNet model, enable [AutoClass](./models#autoclass) API support, and share it on the Hub. -## Writing a custom configuration +## Configuration -Before we dive into the model, let's first write its configuration. The configuration of a model is an object that -will contain all the necessary information to build the model. 
As we will see in the next section, the model can only -take a `config` to be initialized, so we really need that object to be as complete as possible. +A configuration, given by the base [`PretrainedConfig`] class, contains all the necessary information to build a model. This is where you'll configure the parameters of the custom ResNet model. Different configurations gives different ResNet model types. - +The three main rules for customizing a configuration are: -Models in the `transformers` library itself generally follow the convention that they accept a `config` object -in their `__init__` method, and then pass the whole `config` to sub-layers in the model, rather than breaking the -config object into multiple arguments that are all passed individually to sub-layers. Writing your model in this -style results in simpler code with a clear "source of truth" for any hyperparameters, and also makes it easier -to reuse code from other models in `transformers`. +1. A custom configuration must inherit from [`PretrainedConfig`]. Inheritance ensures a custom model has all the functionality of a Transformers model such as [`PretrainedConfig.from_pretrained`], [`PretrainedConfig.save_pretrained`], and [`PretrainedConfig.push_to_hub`]. +2. The [`PretrainedConfig`] `__init__` must accept any `kwargs` and `kwargs` must be passed to the superclass `__init__`. [`PretrainedConfig`] has more more fields than the ones you're setting in your custom configuration. When you load a configuration with [`PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass. - +> [!TIP] +> It is useful to check the validity of some of the parameters. In the example below, a check is implemented to ensure `block_type` and `stem_type` are one of the predefined values. +> +> Add `model_type` to the configuration class to enable [AutoClass](./models#autoclass) support. -In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different -configurations will then give us the different types of ResNets that are possible. We then just store those arguments, -after checking the validity of a few of them. - -```python +```py from transformers import PretrainedConfig from typing import List - class ResnetConfig(PretrainedConfig): model_type = "resnet" @@ -86,56 +74,36 @@ class ResnetConfig(PretrainedConfig): super().__init__(**kwargs) ``` -The three important things to remember when writing you own configuration are the following: -- you have to inherit from `PretrainedConfig`, -- the `__init__` of your `PretrainedConfig` must accept any kwargs, -- those `kwargs` need to be passed to the superclass `__init__`. - -The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other -constraints come from the fact a `PretrainedConfig` has more fields than the ones you are setting. When reloading a -config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the -superclass. - -Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to -register your model with the auto classes (see last section). - -With this done, you can easily create and save your configuration like you would do with any other model config of the -library. Here is how we can create a resnet50d config and save it: +Save the configuration to a JSON file with the [`PretrainedConfig.save_pretrained`] method. 
This file is stored in your custom model folder, `custom-resnet`. ```py resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) resnet50d_config.save_pretrained("custom-resnet") ``` -This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the -`from_pretrained` method: +## Model -```py -resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") -``` +With the custom ResNet configuration, you can now create and customize the model. The model inherits from the base [`PreTrainedModel`] class. Like [`PretrainedConfig`], inheriting from [`PreTrainedModel`] and initializing the superclass with the configuration extends Transformers functionalities such as saving and loading to the custom model. + +Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the models sublayers, instead of breaking the `config` object into multiple arguments that are passed individually to the sublayers. Writing models this way produces simpler code with a clear *source of truth* for any hyperparameters. It is also easier to reuse code from other Transformers' models. -You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to -directly upload your config to the Hub. +You'll create two ResNet models, a ResNet model that outputs the hidden states and a ResNet model with an image classification head. -## Writing a custom model + + -Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that -extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image -classification (like [`BertForSequenceClassification`]). +Define a mapping between the block types and block classes. Everything else is created by passing the configuration class to the Resnet model class. -As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only -thing we need to do before writing this class is a map between the block types and actual block classes. Then the -model is defined from the configuration by passing everything to the `ResNet` class: +> [!TIP] +> Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support. ```py from transformers import PreTrainedModel from timm.models.resnet import BasicBlock, Bottleneck, ResNet from .configuration_resnet import ResnetConfig - BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} - class ResnetModel(PreTrainedModel): config_class = ResnetConfig @@ -158,12 +126,17 @@ class ResnetModel(PreTrainedModel): return self.model.forward_features(tensor) ``` -For the model that will classify images, we just change the forward method: + + + +The `forward` method needs to be rewrittten to calculate the loss for each logit if labels are available. Otherwise, the Resnet model class is the same. + +> [!TIP] +> Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support. ```py import torch - class ResnetModelForImageClassification(PreTrainedModel): config_class = ResnetConfig @@ -190,34 +163,20 @@ class ResnetModelForImageClassification(PreTrainedModel): return {"logits": logits} ``` -In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config` -(a bit like when you write a regular `torch.nn.Module`). 
The line that sets the `config_class` is not mandatory, unless -you want to register your model with the auto classes (see last section). - - - -If your model is very similar to a model inside the library, you can re-use the same configuration as this model. + + - +A model can return any output format. Returning a dictionary (like ResnetModelForImageClassification) with losses when labels are available, makes the custom model compatible with the [`Trainer`]. For other output formats, you'll need your own training loop or a different library for training. -You can have your model return anything you want, but returning a dictionary like we did for -`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly -usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own -training loop or another library for training. - -Now that we have our model class, let's create one: +Instantiate the custom model class with the configuration. ```py resnet50d = ResnetModelForImageClassification(resnet50d_config) ``` -Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or -[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights -with the code of our model. But first, let's load some pretrained weights inside our model. +At this point, you can load pretrained weights into the model or train it from scratch. You'll load pretrained weights in this guide. -In your own use case, you will probably be training your custom model on your own data. To go fast for this tutorial, -we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going to be -easy to transfer those weights: +Load the pretrained weights from the [timm](https://hf.co/docs/timm/index) library, and then transfer those weights to the custom model with the [load_state_dict](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict) method. ```py import timm @@ -226,17 +185,14 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True) resnet50d.model.load_state_dict(pretrained_model.state_dict()) ``` -Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the -code of the model is saved. +## AutoClass support -## Registering a model with custom code to the auto classes +The [AutoClass](./models#autoclass) API is a shortcut for automatically loading the correct architecture for a given model. It may be convenient for your users to add this API to your custom model. -If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own -model. This is different from pushing the code to the Hub in the sense that users will need to import your library to -get the custom models (contrarily to automatically downloading the model code from the Hub). +Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. With the [`~AutoConfig.register`] method, add the custom configuration and model to the [AutoClass](./models#autoclass) API. 
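Once these classes are registered (the registration calls are shown below), the custom model resolves through the AutoClass API like any built-in architecture. A minimal sketch of what this enables, assuming `resnet50d_config` is the `ResnetConfig` instance created earlier in this guide:

```py
from transformers import AutoModelForImageClassification

# Assumes the AutoConfig/AutoModel registration calls shown below have already run,
# and reuses the resnet50d_config instance created earlier in this guide.
model = AutoModelForImageClassification.from_config(resnet50d_config)
```

The `from_config` method instantiates the registered model class with random weights, using the same configuration-to-model lookup Transformers performs for its own architectures.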
-As long as your config has a `model_type` attribute that is different from existing model types, and that your model -classes have the right `config_class` attributes, you can just add them to the auto classes like this: +> [!TIP] +> The first argument to [`AutoConfig.register`] must match the `model_type` attribute in the custom configuration class, and the first argument to [`AutoModel.register`] must match the `config_class` of the custom model class. ```py from transformers import AutoConfig, AutoModel, AutoModelForImageClassification @@ -246,25 +202,21 @@ AutoModel.register(ResnetConfig, ResnetModel) AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) ``` -Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type` -of your custom config, and the first argument used when registering your custom models to any auto model class needs -to match the `config_class` of those models. +## Share a custom model on the Hub -## Sending the code to the Hub +Upload a custom model to the [Hub](https://hf.co/models) to allow other users to easily load and use it. - +Ensure the model directory is structured correctly as shown below. The directory should contain: -This API is experimental and may have some slight breaking changes in the next releases. +- `modeling.py`: Contains the code for ResnetModel and ResnetModelForImageClassification. This file can rely on relative imports to other files as long as they're in the same directory. - +> [!WARNING] +> Replace all relative imports at the top of the `modeling.py` file to import from Transformers instead if you're copying a model file from Transformers. -First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as -long as all the files are in the same directory (we don't support submodules for this feature yet). For our example, -we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working -directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file -contains the code of `ResnetModel` and `ResnetModelForImageClassification`. +- `configuration.py`: Contains the code for ResnetConfig. +- `__init__.py`: Can be empty. This file allows Python `resnet_model` to be used as a module. -``` +```bash . └── resnet_model ├── __init__.py @@ -272,27 +224,14 @@ contains the code of `ResnetModel` and `ResnetModelForImageClassification`. └── modeling_resnet.py ``` -The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be use as a module. - - - -If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file -to import from the `transformers` package. - - - -Note that you can re-use (or subclass) an existing configuration/model. - -To share your model with the community, follow those steps: first import the ResNet model and config from the newly -created files: +To share the model, import the ResNet model and configuration. 
```py from resnet_model.configuration_resnet import ResnetConfig from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification ``` -Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained` -method and properly register them with a given Auto class (especially for models), just run: +Copy the code from the model and configuration files and register them with an [AutoClass](./models#autoclass) with the [`~PretrainedConfig.register_for_auto_class`] method. For the model, pick the appropriate `AutoModelFor` class based on the task. ```py ResnetConfig.register_for_auto_class() @@ -300,27 +239,17 @@ ResnetModel.register_for_auto_class("AutoModel") ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") ``` -Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, -[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you -have to specify which one of the auto classes is the correct one for your model. - - - -Use `register_for_auto_class()` if you want the code files to be copied. If you instead prefer to use code on the Hub from another repo, -you don't need to call it. In cases where there's more than one auto class, you can modify the `config.json` directly using the -following structure: +To map more than one task to the model, edit `auto_map` in the configuration JSON file directly. ```json -"auto_map": { - "AutoConfig": "--", - "AutoModel": "--", - "AutoModelFor": "--", +"auto_map": { + "AutoConfig": "--", + "AutoModel": "--", + "AutoModelFor": "--", }, ``` - - -Next, let's create the config and models as we did before: +Create the configuration and model and load pretrained weights into it. ```py resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) @@ -330,13 +259,17 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True) resnet50d.model.load_state_dict(pretrained_model.state_dict()) ``` -Now to send the model to the Hub, make sure you are logged in. Either run in your terminal: +The model is ready to be pushed to the Hub now. Login to your Hugging Face account from the command line or notebook. + + + ```bash huggingface-cli login ``` -or from a notebook: + + ```py from huggingface_hub import notebook_login @@ -344,41 +277,15 @@ from huggingface_hub import notebook_login notebook_login() ``` -You can then push to your own namespace (or an organization you are a member of) like this: - -```py -resnet50d.push_to_hub("custom-resnet50d") -``` - -On top of the modeling weights and the configuration in json format, this also copied the modeling and -configuration `.py` files in the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result -in this [model repo](https://huggingface.co/sgugger/custom-resnet50d). - -See the [sharing tutorial](model_sharing) for more information on the push to Hub method. - -## Using a model with custom code - -You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and -the `from_pretrained` method. 
All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still -review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use -a model with custom code: + + -```py -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) -``` - -It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not -update the code with some malicious new lines (unless you fully trust the authors of the models). +Call [`~PreTrainedModel.push_to_hub`] on the model to upload the model to the Hub. ```py -commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" -model = AutoModelForImageClassification.from_pretrained( - "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash -) +resnet50d.push_to_hub("custom-resnet50d") ``` -Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit -hash of any commit. +The pretrained weights, configuration in JSON format, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now under a namespace and specified directory [here](https://hf.co/sgugger/custom-resnet50d). +Because a custom model doesn't use the same modeling code as Transformers' model, you need to add `trust_remode_code=True` in the [`~PreTrainedModel.from_pretrained`] method. Refer to the load [custom models](./models#custom-models) section for more information. diff --git a/docs/source/en/models.md b/docs/source/en/models.md index ed734f4d4a80..7513679550e0 100644 --- a/docs/source/en/models.md +++ b/docs/source/en/models.md @@ -174,7 +174,7 @@ with tempfile.TemporaryDirectory() as tmp_dir: new_model = AutoModel.from_pretrained(tmp_dir) ``` -Sharded checkpoints can also be directly loaded with the [`~transformers.modeling_utils.load_sharded_checkpoint] method. +Sharded checkpoints can also be directly loaded with the [`~transformers.modeling_utils.load_sharded_checkpoint`] method. ```py from transformers.modeling_utils import load_sharded_checkpoint @@ -289,3 +289,28 @@ from transformers import AutoConfig, AutoModel my_config = AutoConfig.from_pretrained("google/gemma-2b", torch_dtype=torch.float16) model = AutoModel.from_config(my_config) ``` + +## Custom models + +Custom models use Transformers' configuration and modeling classes, supports the [AutoClass](#autoclass) API, and are loaded with [`~PreTrainedModel.from_pretrained`]. What makes custom models different is the modeling code is not from Transformers. + +The Hub includes [malware scanning](https://hf.co/docs/hub/security-malware#malware-scanning) for every repository, but extra care should still be taken when loading a custom model to avoid inadvertently executing malicious code. + +Set the `trust_remote_code` parameter to `True` in [`~PreTrainedModel.from_pretrained`] to load a custom model. + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +As an extra layer of security, load a custom model from a specific revision to make sure the model code hasn't changed. 
The commit hash can be copied from the model's [commit history](https://hf.co/sgugger/custom-resnet50d/commits/main). + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +Learn more about how to create a custom model in [Customize](./custom_models). \ No newline at end of file From 9f37ab4f5ed3bc3a34a86264599f363278a2c058 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 1 Aug 2024 16:07:41 -0700 Subject: [PATCH 012/116] share --- docs/source/en/custom_models.md | 2 + docs/source/en/model_sharing.md | 248 +++++++++++++++----------------- 2 files changed, 116 insertions(+), 134 deletions(-) diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md index a22c51e19d3b..1171405b74d6 100644 --- a/docs/source/en/custom_models.md +++ b/docs/source/en/custom_models.md @@ -289,3 +289,5 @@ resnet50d.push_to_hub("custom-resnet50d") The pretrained weights, configuration in JSON format, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now under a namespace and specified directory [here](https://hf.co/sgugger/custom-resnet50d). Because a custom model doesn't use the same modeling code as Transformers' model, you need to add `trust_remode_code=True` in the [`~PreTrainedModel.from_pretrained`] method. Refer to the load [custom models](./models#custom-models) section for more information. + +6401 \ No newline at end of file diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index 076fc2ccdd57..b8f69016b03e 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -1,4 +1,4 @@ - -# Share a model +# Share -The last two tutorials showed how you can fine-tune a model with PyTorch, Keras, and 🤗 Accelerate for distributed setups. The next step is to share your model with the community! At Hugging Face, we believe in openly sharing knowledge and resources to democratize artificial intelligence for everyone. We encourage you to consider sharing your model with the community to help others save time and resources. +The Hugging Face [Hub](https://hf.co/models) is a platform for sharing, discovering, and consuming models of all different types and sizes. We highly recommend sharing your model on the Hub to push open-source machine learning forward for everyone! -In this tutorial, you will learn two methods for sharing a trained or fine-tuned model on the [Model Hub](https://huggingface.co/models): +This guide will show you how to share a model to the Hub from Transformers. -- Programmatically push your files to the Hub. -- Drag-and-drop your files to the Hub with the web interface. +## Setup - +To share a model to the Hub, you need a Hugging Face [account](https://hf.co/join). Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and login to your account from either the CLI or a notebook. - + + -To share a model with the community, you need an account on [huggingface.co](https://huggingface.co/join). You can also join an existing organization or create a new one. +```bash +huggingface-cli login +``` - + + -## Repository features +```py +from huggingface_hub import notebook_login -Each repository on the Model Hub behaves like a typical GitHub repository. Our repositories offer versioning, commit history, and the ability to visualize differences. 
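Because every model repository is a Git repository under the hood, its branches and tags can also be listed programmatically before pinning a `revision`. A minimal sketch, assuming the `HfApi.list_repo_refs` helper available in recent releases of `huggingface_hub`:

```py
from huggingface_hub import HfApi

api = HfApi()
# List the branches and tags of a model repository to choose a revision to pin.
refs = api.list_repo_refs("julien-c/EsperBERTo-small")
for branch in refs.branches:
    print(branch.name, branch.target_commit)
for tag in refs.tags:
    print(tag.name, tag.target_commit)
```

Any of these names or commit hashes can then be passed as the `revision` argument when loading the model.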
+notebook_login() +``` -The Model Hub's built-in versioning is based on git and [git-lfs](https://git-lfs.github.com/). In other words, you can treat one model as one repository, enabling greater access control and scalability. Version control allows *revisions*, a method for pinning a specific version of a model with a commit hash, tag or branch. + + -As a result, you can load a specific model version with the `revision` parameter: +## Model repository features -```py ->>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash -... ) -``` + -Files are also easily edited in a repository, and you can view the commit history as well as the differences: +Each model repository supports versioning, commit history, and visualizing diffs. -![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png) +
+ +
-## Setup +The repository's built-in versioning is based on [Git](https://git-scm.com/) and [Git Large File Storage (LFS)](https://git-lfs.github.com/). Version control enables revisions, a way to specify a model version with a commit hash, tag or branch. -Before sharing a model to the Hub, you will need your Hugging Face credentials. If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. This will store your access token in your Hugging Face cache folder (`~/.cache/` by default): +For example, specify the `revision` parameter in [`~PreTrainedModel.from_pretrained`] to load a specific model version. -```bash -huggingface-cli login +```py +model = AutoModel.from_pretrained( + "julien-c/EsperBERTo-small", revision="v2.0.1" +) ``` -If you are using a notebook like Jupyter or Colaboratory, make sure you have the [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) library installed. This library allows you to programmatically interact with the Hub. +Model repositories also support [gating](https://hf.co/docs/hub/models-gated) for more control over how and who can access a model. Gating is common for allowing a select group of users to preview a research model before it's made public. -```bash -pip install huggingface_hub -``` - -Then use `notebook_login` to sign-in to the Hub, and follow the link [here](https://huggingface.co/settings/token) to generate a token to login with: +
+ +
-```py ->>> from huggingface_hub import notebook_login +The model repository also includes an inference [widget](https://hf.co/docs/hub/models-widgets) for users to directly interact with a model. ->>> notebook_login() -``` +Check out the Hub [Models](https://hf.co/docs/hub/models) documentation to learn more about. ## Convert a model for all frameworks -To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. +Reach a wider audience by converting a model to be compatible with all machine learning frameworks (PyTorch, TensorFlow, Flax). While users can still load a model if they're using a different framework, it is slower because Transformers converts the checkpoint on the fly. It is faster to convert the checkpoint beforehand. -Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. + + - - -Specify `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch: +Set `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch and then save it. ```py ->>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) ->>> pt_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -Specify `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow: +import DistilBertForSequenceClassification -```py ->>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +pt_model.save_pretrained("path/to/awesome-name-you-picked") ``` -Then you can save your new TensorFlow model with its new checkpoint: + + -```py ->>> tf_model.save_pretrained("path/to/awesome-name-you-picked") -``` - - -If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: +Set `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow and then save it. ```py ->>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( -... "path/to/awesome-name-you-picked", from_pt=True -... ) -``` - - - -## Push a model during training +import TFDistilBertForSequenceClassification - - - - -Sharing a model to the Hub is as simple as adding an extra parameter or callback. Remember from the [fine-tuning tutorial](training), the [`TrainingArguments`] class is where you specify hyperparameters and additional training options. One of these training options includes the ability to push a model directly to the Hub. Set `push_to_hub=True` in your [`TrainingArguments`]: - -```py ->>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) +tf_model.save_pretrained("path/to/awesome-name-you-picked") ``` -Pass your training arguments as usual to [`Trainer`]: + + -```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... 
) -``` - -After you fine-tune your model, call [`~transformers.Trainer.push_to_hub`] on [`Trainer`] to push the trained model to the Hub. 🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! +Set `from_pt=True` to convert a checkpoint from PyTorch to Flax and then save it. ```py ->>> trainer.push_to_hub() +flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( + "path/to/awesome-name-you-picked", from_pt=True +) +flax_model.save_pretrained("path/to/awesome-name-you-picked") ``` - - -Share a model to the Hub with [`PushToHubCallback`]. In the [`PushToHubCallback`] function, add: -- An output directory for your model. -- A tokenizer. -- The `hub_model_id`, which is your Hub username and model name. + + -```py ->>> from transformers import PushToHubCallback +## Upload model ->>> push_to_hub_callback = PushToHubCallback( -... output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" -... ) -``` +There are several ways to upload a model to the Hub depending on your workflow preference. You can push a model with the [`Trainer`], call the [`~PreTrainedModel.push_to_hub`] method directly on a model, or use the Hub's web interface. -Add the callback to [`fit`](https://keras.io/api/models/model_training_apis/), and 🤗 Transformers will push the trained model to the Hub: - -```py ->>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) -``` - - + -## Use the `push_to_hub` function +### Upload from Trainer -You can also call `push_to_hub` directly on your model to upload it to the Hub. +The [`Trainer`], Transformers' training API, allows pushing a model directly to the Hub after training. Set `push_to_hub=True` in the [`TrainingArguments`] class and pass it to the [`Trainer`]. Once training is complete, call [`~transformers.Trainer.push_to_hub`] to upload the model. -Specify your model name in `push_to_hub`: +The [`~transformers.Trainer.push_to_hub`] method automatically adds useful information like the training hyperparameters and results to the model card. ```py ->>> pt_model.push_to_hub("my-awesome-model") +from transformers import TrainingArguments, Trainer + +training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset, + eval_dataset=small_eval_dataset, + compute_metrics=compute_metrics, +) +trainer.push_to_hub() ``` -This creates a repository under your username with the model name `my-awesome-model`. Users can now load your model with the `from_pretrained` function: +#### TensorFlow models + +For TensorFlow models, add the [`PushToHubCallback`] to [fit](https://keras.io/api/models/model_training_apis/#fit-method). 
```py ->>> from transformers import AutoModel +from transformers import PushToHubCallback ->>> model = AutoModel.from_pretrained("your_username/my-awesome-model") +push_to_hub_callback = PushToHubCallback( + output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" +) +model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) ``` -If you belong to an organization and want to push your model under the organization name instead, just add it to the `repo_id`: +### Upload from model -```py ->>> pt_model.push_to_hub("my-awesome-org/my-awesome-model") -``` - -The `push_to_hub` function can also be used to add other files to a model repository. For example, add a tokenizer to a model repository: +Call [`~PreTrainedModel.push_to_hub`] directly on a model to upload it to the Hub. It creates a repository under your namespace with the model name specified in [`~PreTrainedModel.push_to_hub`]. ```py ->>> tokenizer.push_to_hub("my-awesome-model") +model.push_to_hub("my-awesome-model") ``` -Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: +Other objects like a tokenizer or TensorFlow model are also pushed to the Hub in the same way. ```py ->>> tf_model.push_to_hub("my-awesome-model") +tokenizer.push_to_hub("my-awesome-model") ``` -Now when you navigate to your Hugging Face profile, you should see your newly created model repository. Clicking on the **Files** tab will display all the files you've uploaded to the repository. +Your Hugging Face profile should now display the newly created model repository. Navigate to the **Files** tab to see all the uploaded files + +Refer to the [Upload files to the Hub](https://hf.co/docs/hub/how-to-upstream) guide for more details about pushing files to the Hub. -For more details on how to create and upload files to a repository, refer to the Hub documentation [here](https://huggingface.co/docs/hub/how-to-upstream). +### Upload from web interface -## Upload with the web interface +For a no-code approach, upload a model with the Hub's web interface. -Users who prefer a no-code approach are able to upload a model through the Hub's web interface. Visit [huggingface.co/new](https://huggingface.co/new) to create a new repository: +Create a new repository by selecting [**New Model**](https://huggingface.co/new). -![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png) +
+ +
-From here, add some information about your model: +Add some details about your model: - Select the **owner** of the repository. This can be yourself or any of the organizations you belong to. - Pick a name for your model, which will also be the repository name. - Choose whether your model is public or private. - Specify the license usage for your model. -Now click on the **Files** tab and click on the **Add file** button to upload a new file to your repository. Then drag-and-drop a file to upload and add a commit message. +Click on **Create model** to create the model repository. + +Now select the **Files** tab and click on the **Add file** button to drag-and-drop a file to your repository. Add a commit message and click on **Commit changes to `main`** to commit the file. + +
+ +
+ +## Model card -![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png) +[Model cards](https://hf.co/docs/hub/model-cards#model-cards) inform users about a model's performance, limitations, potential biases, and ethical considerations. It is highly recommended to add a model card to your repository! -## Add a model card +A model card is a `README.md` file in your repository. Add this file by: -To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. The model card is defined in the `README.md` file. You can add a model card by: +- manually creating and uploading a `README.md` file +- clicking on the **Edit model card** button in the repository -* Manually creating and uploading a `README.md` file. -* Clicking on the **Edit model card** button in your model repository. +Take a look at the Llama 3.1 [model card](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) for an example of the type of information to include on a model card. -Take a look at the DistilBert [model card](https://huggingface.co/distilbert/distilbert-base-uncased) for a good example of the type of information a model card should include. For more details about other options you can control in the `README.md` file such as a model's carbon footprint or widget examples, refer to the documentation [here](https://huggingface.co/docs/hub/models-cards). +Learn more about other model card metadata (carbon emissions, license, link to paper, etc.) to include in the [Model Cards](https://hf.co/docs/hub/model-cards#model-cards) guide. From 812530e579877138b51998463ea43c9ed96b2631 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 1 Aug 2024 16:36:27 -0700 Subject: [PATCH 013/116] fix link --- docs/source/en/_toctree.yml | 2 +- docs/source/en/add_new_model.md | 2 +- docs/source/en/model_sharing.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 236873e8de43..15635c5243e6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -18,7 +18,7 @@ - local: model_sharing title: Share - local: add_new_model - title: How to add a model to 🤗 Transformers? + title: Contribute - local: task_summary title: What 🤗 Transformers can do - local: tasks_explained diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 9aab36bb6fbe..ba9212575d3b 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -13,7 +13,7 @@ rendered properly in your Markdown viewer. --> -# How to add a model to 🤗 Transformers? +# Contribute The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)). 
diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index b8f69016b03e..1aa193a1377b 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -66,7 +66,7 @@ model = AutoModel.from_pretrained( Model repositories also support [gating](https://hf.co/docs/hub/models-gated) for more control over how and who can access a model. Gating is common for allowing a select group of users to preview a research model before it's made public.
- +
The model repository also includes an inference [widget](https://hf.co/docs/hub/models-widgets) for users to directly interact with a model. From 454acf6d673e87dc73badc7484dc2036963c1e10 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 5 Aug 2024 15:13:56 -0700 Subject: [PATCH 014/116] contribute part 1 --- docs/source/en/add_new_model.md | 951 +++++++++----------------------- docs/source/en/custom_models.md | 10 +- docs/source/en/model_sharing.md | 10 +- 3 files changed, 260 insertions(+), 711 deletions(-) diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index ba9212575d3b..50a49584c2db 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -1,4 +1,4 @@ - + +To keep the code readable, there is never more than two levels of abstraction for any model. The example model here, BrandNewBert, traces its inheritance from `BrandNewBertPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] because it allows a model to be loaded and saved with [`~PreTrainedModel.from_pretrained`] and [`PreTrainedModel.save_pretrained`]. + +Other important functions like the forward method are defined in the `modeling.py` file. + +Specific model heads (for example, for sequence classification or language modeling) should use the base model as a component that is called in the forward pass rather than inherting from it. This keeps abstraction low. + +New models require a configuration, for example `BrandNewBertConfig`, that is stored as an attribute of [`PreTrainedModel`]. + +```py model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") -model.config # model has access to its config +model.config ``` -Similar to the model, the configuration inherits basic serialization and deserialization functionalities from -[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two -different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling -the model's [`~PreTrainedModel.save_pretrained`] will automatically call -the config's [`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved. +Like [`PreTrainedModel`], [`PretrainedConfig`] provides [`~PretrainedConfig.from_pretrained`] and [`PretrainedConfig.save_pretrained`] methods. +When you use [`~PreTrainedModel.save_pretrained`], it automatically calls the configurations [`~PretrainedConfig.save_pretrained`] method so that both the model and configuration are saved together. + +A model is saved to a `model.safetensors` file and a configuration is saved to a `config.json` file. ### Code style -When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our -own regarding how code should be written :-) - -1. The forward pass of your model should be fully written in the modeling file while being fully independent of other - models in the library. If you want to reuse a block from another model, copy the code and paste it with a - `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160) - for a good example and [there](pr_checks#check-copies) for more documentation on Copied from). -2. The code should be fully understandable, even by a non-native English speaker. This means you should pick - descriptive variable names and avoid abbreviations. 
As an example, `activation` is preferred to `act`. - One-letter variable names are strongly discouraged unless it's an index in a for loop. -3. More generally we prefer longer explicit code to short magical one. -4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone - using your code can quickly debug it by adding print statements or breaking points. -5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and - understandable than type annotations. - -### Overview of tokenizers - -Not quite ready yet :-( This section will be added soon! - -## Step-by-step recipe to add a model to 🤗 Transformers - -Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries -of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model: - -1. [Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf) -2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas) - -From experience, we can tell you that the most important things to keep in mind when adding a model are: - -- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist - somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy - from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your - friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and - your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code - is based on XLM. -- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an - efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper. -- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so we at Hugging Face are more - than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making - progress. - -In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. - -The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do -List: - -☐ (Optional) Understood the model's theoretical aspects
-☐ Prepared 🤗 Transformers dev environment
-☐ Set up debugging environment of the original repository
-☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
-☐ Successfully added the model skeleton to 🤗 Transformers
-☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
-☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
-☐ Finished model tests in 🤗 Transformers
-☐ Successfully added tokenizer in 🤗 Transformers
-☐ Run end-to-end integration tests
-☐ Finished docs
-☐ Uploaded model weights to the Hub
-☐ Submitted the pull request
-☐ (Optional) Added a demo notebook - -To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However, -if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive -into the `BrandNewBert`'s code-base. This option might suit you better if your engineering skills are better than -your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming -much more than reading scientific papers. - -### 1. (Optional) Theoretical aspects of BrandNewBert - -You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large -sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is -not to get a deep theoretical understanding of the paper, but to extract the necessary information required to -effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the -theoretical aspects, but rather focus on the practical ones, namely: - -- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like - encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those. -- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,* - summarization? -- What is the novel feature of the model that makes it different from BERT/GPT-2/BART? -- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most - similar to *brand_new_bert*? -- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used - for BERT or BART? - -After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the -Hugging Face team with any questions you might have. This might include questions regarding the model's architecture, -its attention layer, etc. We will be more than happy to help you. - -### 2. Next prepare your environment - -1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the - repository's page. This creates a copy of the code under your GitHub user account. - -2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: - - ```bash - git clone https://github.com/[your Github handle]/transformers.git - cd transformers - git remote add upstream https://github.com/huggingface/transformers.git - ``` - -3. Set up a development environment, for instance by running the following command: - - ```bash - python -m venv .env - source .env/bin/activate - pip install -e ".[dev]" - ``` - - Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a - failure with this command. If that's the case make sure to install the Deep Learning framework you are working with - (PyTorch, TensorFlow and/or Flax) then do: - - ```bash - pip install -e ".[quality]" - ``` - - which should be enough for most use cases. You can then return to the parent directory - - ```bash - cd .. - ``` - -4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the - instructions on https://pytorch.org/get-started/locally/. 
- - **Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. - -5. To port *brand_new_bert*, you will also need access to its original repository: - - ```bash - git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git - cd brand_new_bert - pip install -e . - ``` - -Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers. - -### 3.-4. Run a pretrained checkpoint using the original repository - -At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very -“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should -be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people -stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make -it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement -models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**. - -You should start thereby by diving into the original repository. - -Successfully running the official pretrained model in the original repository is often **the most difficult** step. -From our experience, it is very important to spend some time getting familiar with the original code-base. You need to -figure out the following: - -- Where to find the pretrained weights? -- How to load the pretrained weights into the corresponding model? -- How to run the tokenizer independently from the model? -- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually, - you only have to reimplement those functions. -- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes, - *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers, - *e.g.* *self-attention*, *cross-attention*...? -- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you - work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm? +Transformers prefers a clean and readable code style over a more abstracted one. Some of the coed style choices include: -It is very important that before you start the porting process, you can **efficiently** debug code in the original -repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or -even a pull request in the original repository. The maintainers of this repository are most likely very happy about -someone looking into their code! - -At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original -model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to -dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only -at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the -model also works as expected on GPU. 
- -In general, there are two possible debugging environments for running the original model +- The forward pass is written in the `modeling.py` file, completely independent of other models in the library. To reuse a block from another model, copy the code and paste it with a `# Copied from` comment above it. For example, the `RobertaSelfAttention` class is copied from the `BertSelfAttention` class. -- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb) -- Local python scripts. + ```py + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta + class RobertaSelfAttention(nn.Module): + ``` -Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split -logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also, -notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging -Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them. + Refer to the [Check copies](./pr_checks#check-copies) section for more information about the `# Copied from` comment. -The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend -some time adjusting to the new programming environment and you might not be able to use your known debugging tools -anymore, like `ipdb`. +- The code should be accessible to users from a non-native English background. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One letter variables names are highly discouraged unless it's an infex in a for loop. -For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a -single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in -pseudocode): +- Explicit code is preferred over shorter code even if it's longer. -```python -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") -input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids -original_output = model.predict(input_ids) -``` +- Avoid subclassing [nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html). Subclass [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) instead so the code can be quickly debugged with print statements or breakpoints. -Next, regarding the debugging strategy, there are generally a few from which to choose from: +- Function signatures should be type-annotated. Otherwise, good variable names are preferred because they're more readable and understandable. -- Decompose the original model into many small testable components and run a forward pass on each of those for - verification -- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on - those, and use intermediate print statements or breakpoints for verification +## Add a new model -Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code -base. +With some background knowledge about your model and the Transformers library, you're ready to add BrandNewBert now! 
-If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original -code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages -to taking the more difficult road in the beginning: +> [!TIP] +> Each contributor has a unique style and workflow for porting models to Transformers. It may be helpful to take a look at how [GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) and [WMT19](https://huggingface.co/blog/porting-fsmt) were ported. -- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically - for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead - of relying on visual comparison via print statements -- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting - individual components and thus structure your work better -- separating the model into logical meaningful components will help you to get a better overview of the model's design - and thus to better understand the model -- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue - changing your code +Some final tips to keep in mind are: -[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA -gives a nice example of how this can be done. +- Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. [Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this. +- This is an engineering challenge more than a scientific one. Focus on the more practical aspects (set up an efficient debugging environment for example) instead of theoretical ones. +- Don't be shy to ask for help! We are here to support you. 🤗 -However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, -it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good -example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is -very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one -often relies on verifying print statements. +### Dev environment -No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the -starting layers first and the ending layers last. +Click on the **Fork** button on the [Transformers](https://github.com/huggingface/transformers) repository to create your own copy of it to work on. Then clone the repository to your local disk and add the base repository as the remote. -It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following -layers in the following order: +```bash +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git +``` -1. Retrieve the input IDs passed to the model -2. Retrieve the word embeddings -3. Retrieve the input of the first Transformer layer -4. Retrieve the output of the first Transformer layer -5. 
Retrieve the output of the following n - 1 Transformer layers -6. Retrieve the output of the whole BrandNewBert Model +Create a virtual environment and do an [editable install](./installation#editable-install) of the library with the "dev" or development dependencies. -Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]` +```bash +python -m venv .env +source .env/bin/activate +pip install -e ".[dev]" +``` -The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: +Due to the number of optional dependencies as Transformers grows, this command may fail. In that case, install the "quality" dependencies. Also make sure you have a deep learning framework installed. +```bash +pip install -e ".[quality]" ``` -[[ - [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024], - [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132], - [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648], - ..., - [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288], - [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191], - [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]], + +Return to the parent directory and clone and install the original BrandNewBert repository. + +```bash +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . ``` -We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original -model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! -Since it is normal that the exact same model written in different libraries can give a slightly different output -depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives -nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate -outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of -*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely -important. Here is some advice to make your debugging environment as efficient as possible. - -- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should - probably take the time to write a longer script that decomposes the original model into smaller sub-components to - retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on - TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output - intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when - running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196). -- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle - becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. 
- In case only very large checkpoints are available, it might make more sense to create a dummy model in the new - environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version - of your model -- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to - find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called - `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward` - multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`. -- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where - you have to input a string, then try to find out where in the forward call the string input is changed to input ids - and start from this point. This might mean that you have to possibly write a small script yourself or change the - original code so that you can directly input the ids instead of an input string. -- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield - random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging - environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed* - if the old and new implementations are in the same framework. - -The following section gives you more specific details/tips on how you can do this for *brand_new_bert*. - -### 5.-14. Port BrandNewBert to 🤗 Transformers - -Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork: +Return to your clone of Transformers to begin porting BrandNewBert. ```bash cd transformers ``` -In the special case that you are adding a model whose architecture exactly matches the model architecture of an -existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script). -In this case, you can just re-use the whole model architecture of the already existing model. +> [!TIP] +> If the model architecture is identical to an existing model, skip ahead to add a [conversion script](#conversion-script), because you can reuse the architecture of the existing model. -Otherwise, let's start generating a new model. We recommend using the following script to add a model starting from -an existing model: +Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt. ```bash transformers-cli add-new-model-like ``` -You will be prompted with a questionnaire to fill in the basic information of your model. +### Create a pull request -**Open a Pull Request on the main huggingface/transformers repo** +Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request "[WIP] Add BrandNewBert" so it's clear that this is a work in progress. -Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull -request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work -side-by-side on integrating the model into 🤗 Transformers. 
+Create a branch with a descriptive name from your main branch.
 
-You should do the following:
+```bash
+git checkout -b add_brand_new_bert
+```
 
-1. Create a branch with a descriptive name from your main branch
+Commit the code, and then fetch and rebase on the main branch.
 
-   ```bash
-   git checkout -b add_brand_new_bert
-   ```
+```bash
+git add .
+git commit
+git fetch upstream
+git rebase upstream/main
+```
+
+Push any changes to your branch and click on **Compare & pull request** to open a pull request on GitHub. Open the pull request as a *draft* to signal it's a work in progress.
 
-2. Commit the automatically generated code:
+```bash
+git push -u origin a-descriptive-name-for-my-changes
+```
 
-   ```bash
-   git add .
-   git commit
-   ```
+Include relevant Hugging Face team members' GitHub handles in the pull request for questions, feedback, comments, and reviews. Direct team members to specific parts of the code you want feedback on by clicking on the **Files changed** tab, and then clicking on **+** to the left of the line number to add a comment. When a question or problem is solved, click on **Resolve** to indicate the issue is resolved. This keeps the conversation organized and clean.
 
-3. Fetch and rebase to current main
+Remember to periodically commit and push your work, and update your work with the current main branch.
 
-   ```bash
-   git fetch upstream
-   git rebase upstream/main
-   ```
+```bash
+git fetch upstream
+git merge upstream/main
+```
 
-4. Push the changes to your account using:
+### Run original checkpoint
 
-   ```bash
-   git push -u origin a-descriptive-name-for-my-changes
-   ```
+### Adapt the model code
 
-5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
-   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
-   future changes.
+The `transformers-cli add-new-model-like` command should have generated a model and configuration file.
 
-6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
+- `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`
+- `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`
 
-In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
-that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
-time to time by doing:
+The automatically generated code in the `modeling.py` file will have the same architecture as BERT if you answered it's an encoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on.
 
-```bash
-git fetch upstream
-git merge upstream/main
-```
+At this point, your code doesn't have to be clean or even fully correct! It is more efficient to quickly create a first draft and then iteratively improve on it. The only thing that matters is that your model should be able to be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the `__init__` method works. 
-In general, all questions you might have regarding the model or your implementation should be asked in your PR and -discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or -if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging -Face team can efficiently understand your problem or question. - -To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you -want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved, -you can click on the “Resolve” button of the created comment. - -In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions -on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the -Hugging Face team by Slack or email. - -**5. Adapt the generated models code for brand_new_bert** - -At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be -found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and -`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`. - -Now you can finally start coding :). The generated code in -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if -it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what -you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or -BART?*". Implement those changes which often means changing the *self-attention* layer, the order of the normalization -layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to -get a better feeling of how your model should be implemented. - -**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is -advised to add a first *unclean*, copy-pasted version of the original code to -`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is -added. From our experience, it is much more efficient to quickly add a first version of the required code and -improve/correct the code iteratively with the conversion script as described in the next section. The only thing that -has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the -following command should work: - -```python -from transformers import BrandNewBertModel, BrandNewBertConfig - -model = BrandNewBertModel(BrandNewBertConfig()) +```py +from transformers import BrandNewBert, BrandNewBertConfig +model = BrandNewBert(BrandNewBertConfig()) ``` -The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with -random weights, thus making sure that the `init()` methods of all components works. - -Note that all random initialization should happen in the `_init_weights` method of your `BrandnewBertPreTrainedModel` -class. It should initialize all leaf modules depending on the variables of the config. 
Here is an example with the
-BERT `_init_weights` method:
+Random initialization occurs in BrandNewBertPreTrainedModel's `_init_weights` method. All leaf modules are initialized depending on the configuration's variables.
 
 ```py
 def _init_weights(self, module):
@@ -520,9 +237,9 @@ def _init_weights(self, module):
         module.weight.data.fill_(1.0)
 ```
 
-You can have some more custom schemes if you need a special initialization for some modules. For instance, in
-`Wav2Vec2ForPreTraining`, the last two linear layers need to have the initialization of the regular PyTorch `nn.Linear`
-but all the other ones should use an initialization as above. This is coded like this:
+The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] keeps the default [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) initialization for its last two linear layers.
+
+The `_is_hf_initialized` flag makes sure a submodule is only initialized once. Setting `_is_hf_initialized` to `True` for `module.project_q` and `module.project_hid` ensures the custom initialization is not overridden later, because the `_init_weights` function won't be applied to modules with this flag set.
 
 ```py
 def _init_weights(self, module):
@@ -538,30 +255,34 @@ def _init_weights(self, module):
         module.bias.data.zero_()
 ```
 
-The `_is_hf_initialized` flag is internally used to make sure we only initialize a submodule once. By setting it to
-`True` for `module.project_q` and `module.project_hid`, we make sure the custom initialization we did is not overridden later on,
-the `_init_weights` function won't be applied to them.
+### Conversion script
 
-**6. Write a conversion script**
+The original checkpoint must be converted to a Transformers-compatible checkpoint.
 
-Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
-the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of
-*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
-existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
-the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
-slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
-existing conversion script for your model.
+> [!TIP]
+> Try looking for an existing conversion script to copy, adapt, and reuse for your model!
+>
+> - If you're porting a model from TensorFlow to PyTorch, a good starting point may be the BERT [conversion script](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91).
+> - If you're porting a model from PyTorch to PyTorch, a good starting point may be the BART [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py). 
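+
+As a rough sketch (not taken from any specific existing script), a conversion script typically loads the original weights, renames them to match the Transformers implementation, and saves the result with [`~PreTrainedModel.save_pretrained`]. The structure below assumes the original weights are stored as a PyTorch state dict; the renaming rules are the part you adapt from an existing script for your model.
+
+```py
+import torch
+
+from transformers import BrandNewBertConfig, BrandNewBertModel
+
+
+def convert_checkpoint(original_checkpoint_path, pytorch_dump_path):
+    # load the original weights on CPU
+    original_state_dict = torch.load(original_checkpoint_path, map_location="cpu")
+
+    # map the original parameter names to the names used by the Transformers implementation
+    converted_state_dict = {}
+    for name, tensor in original_state_dict.items():
+        new_name = name  # adapt the renaming rules for your model here
+        converted_state_dict[new_name] = tensor
+
+    # instantiate the Transformers model with a matching configuration and load the converted weights
+    config = BrandNewBertConfig()
+    model = BrandNewBertModel(config)
+    model.load_state_dict(converted_state_dict)
+    model.save_pretrained(pytorch_dump_path)
+```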
-- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
-- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+Make sure **all** required weights are initialized and print out all the checkpoint weights that weren't used for initialization to make sure the model has been converted correctly.
 
-In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
-name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
-PyTorch, called `SimpleModel` as follows:
+You may encounter wrong shape statements or name assignments during the conversion. This is most likely because of incorrect parameters in BrandNewBertConfig, the wrong architecture, a bug in the `init` method of your implementation, or because you need to transpose one of the checkpoint weights.
 
-```python
-from torch import nn
+Keep iterating with the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file.
+
+```py
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+To help with conversion, the next section briefly describes how PyTorch models store and define layer weights and names.
+
+#### PyTorch layer weights and names
+
+It is helpful to create a basic PyTorch model to understand how layer names are defined and weights are initialized.
+
+```py
+from torch import nn
 
 class SimpleModel(nn.Module):
     def __init__(self):
@@ -571,18 +292,11 @@ class SimpleModel(nn.Module):
         self.layer_norm = nn.LayerNorm(10)
 ```
 
-Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`,
-`layer_norm` with random weights. We can print the model to see its architecture
+PyTorch layer names are defined by the class attribute name of the layer (dense, intermediate, layer_norm). Create an instance of SimpleModel to fill all the layers with random weights.
 
-```python
+```py
 model = SimpleModel()
-
 print(model)
-```
-
-This will print out the following:
-
-```
 SimpleModel(
   (dense): Linear(in_features=10, out_features=10, bias=True)
   (intermediate): Linear(in_features=10, out_features=10, bias=True)
@@ -590,16 +304,10 @@ SimpleModel(
 )
 ```
 
-We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
-values of a specific layer:
+The weight values of a specific layer are randomly initialized.
 
-```python
+```py
 print(model.dense.weight.data)
-```
-
-to see that the weights were randomly initialized
-
-```
 tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
         -0.2077,  0.2157],
        [ 0.1044,  0.0201,  0.0990,  0.2482,  0.3116,  0.2509,  0.2866, -0.2190,
@@ -622,339 +330,182 @@ tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
         0.2220,  0.2358]]).
 ```
 
-In the conversion script, you should fill those randomly initialized weights with the exact weights of the
-corresponding layer in the checkpoint. 
*E.g.*
 
-```python
-# retrieve matching layer weights, e.g. by
-# recursive algorithm
+```py
+# retrieve matching layer weights with recursive algorithm
 layer_name = "dense"
 pretrained_weight = array_of_dense_layer
 model_pointer = getattr(model, "dense")
-
 model_pointer.weight.data = torch.from_numpy(pretrained_weight)
 ```
 
-While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
-pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
-statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like:
+Verify the randomly initialized weights and their corresponding pretrained checkpoint weights have the identical **shape** and **name**. Add assert statements for the shape and print out the checkpoint weight names.
 
-```python
+```py
 assert (
     model_pointer.weight.shape == pretrained_weight.shape
 ), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
 ```
 
-Besides, you should also print out the names of both weights to make sure they match, *e.g.*
+When the shape or name doesn't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may mean BrandNewBert's parameters don't exactly match the original model's parameters, but it could also be that the PyTorch layer implementation requires the weights to be transposed first.
 
-```python
-logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+### Implement the forward pass
+
+Once the model loads correctly, implement the forward pass next. It takes some inputs and returns the model output.
+
+```py
+model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+output = model(input_ids).last_hidden_states
 ```
 
-If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
-initialized layer of the 🤗 Transformers implementation.
+Don't be discouraged if your forward pass doesn't return output identical to the original model or if it returns an error! First, check that the forward pass doesn't throw any errors. Errors are often caused by wrong dimensions (a dimensionality mismatch) or by the wrong data type ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)).
 
-An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that
-do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
-PyTorch's implementation of a layer requires the weight to be transposed beforehand.
+Your output should have a precision of *1e-3*. Ensure the output shapes and output values are identical. Common reasons why the outputs aren't identical include:
 
-Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
-were not used for initialization to make sure the model is correctly converted. 
It is completely normal, that the
-conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because either
-you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers
-implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers
-implementation or you need to transpose one of the checkpoint weights.
+- Some layers were not added (activation layer or a residual connection).
+- The word embedding matrix is not tied.
+- The wrong positional embeddings are used because the original implementation includes an offset.
+- Dropout is applied during the forward pass. Fix this error by making sure `model.training` is `False` and passing `self.training` to [torch.nn.functional.dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout).
 
-This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
-Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
-the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a
-`pytorch_model.bin` file and a `config.json` file:
+Compare the forward pass of the original model and your implementation to check if there are any differences. Ideally, debug and print out the intermediate outputs of both implementations of the forward pass to pinpoint where the original implementation differs from yours.
 
-```python
-model.save_pretrained("/path/to/converted/checkpoint/folder")
-```
+1. Make sure the hardcoded `input_ids` in both implementations are identical.
+2. Verify the outputs of the first transformation of `input_ids` (usually the word embeddings) are identical, and work your way through to the last layer.
 
-**7. Implement the forward pass**
+Any difference between the two implementations should point to the bug in your implementation.
 
-Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
-sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#3-4-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
-pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
-implementation instead of the original one. It should look as follows:
+One of the best strategies is to add many print statements to the same positions in both implementations, and then successively remove them when they output identical values for the intermediate outputs.
 
-```python
-model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
-input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
-output = model(input_ids).last_hidden_states
+When both implementations produce the same output, verify the outputs are within the precision of *1e-3*.
+
+```py
+torch.allclose(original_output, output, atol=1e-3)
 ```
 
-It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
-same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
-you should make sure that the forward pass doesn't throw any errors. 
It often happens that the wrong dimensions are -used leading to a *Dimensionality mismatch* error or that the wrong data type object is used, *e.g.* `torch.long` -instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve -certain errors. - -The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are -equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.* -`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original -implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult -parts of adding a new model. Common mistakes why the outputs are not identical are: - -- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten -- The word embedding matrix was not tied -- The wrong positional embeddings are used because the original implementation uses on offset -- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout - layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout) - -The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗 -Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out -intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗 -Transformers implementation shows a different output than the original implementation. First, make sure that the -hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of -the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the -network. At some point, you will notice a difference between the two implementations, which should point you to the bug -in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements -in both the original implementation and 🤗 Transformers implementation, at the same positions in the network -respectively, and to successively remove print statements showing the same values for intermediate presentations. - -When you're confident that both implementations yield the same output, verify the outputs with -`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the -work left to be done should be a cakewalk 😊. - -**8. Adding all necessary model tests** - -At this point, you have successfully added a new model. However, it is very much possible that the model does not yet -fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all -common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under -the same `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Run this test file to verify that all common -tests pass: +This is typically the most difficult part of the process. Congratulations if you've made it this far! 
+ +And if you're stuck or struggling with this step, don't hesitate to ask for help on your pull request. + +### Add model tests + +While the model works, you still need to add tests to ensure it is compatible with Transformers and all the tests pass. Tests are important because they help users understand your work by looking at specific tests, and because they prevent your model from breaking in the future if any changes are made. + +[Cookiecutter](https://cookiecutter.readthedocs.io/en/stable/) should have added a test file for your model. Run the test file below to make sure all common tests pass. ```bash pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py ``` -Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that - -- a) The community can easily understand your work by looking at specific tests of *brand_new_bert* -- b) Future changes to your model will not break any important feature of the model. +The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, BrandNewBertModelIntegrationTests, was added by Cookiecutter and only needs to be filled out by you. To ensure it passes, run the following command. -At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts -you used earlier to implement the model to 🤗 Transformers. A template of those model tests has already added by the -Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those -tests are passing, run + + ```bash RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests ``` - - -In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1` - - + + -Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under -`BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two -ways: +```bash +SET RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +``` -- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the - special features of *brand_new_bert* should work. -- Future contributors can quickly test changes to the model by running those special tests. + + +All features unique to BrandNewBert should be tested in a separate test under `BrandNewBertModelTester/BrandNewBertModelTest`. This test is often overlooked, but it is extremely important because: -**9. Implement the tokenizer** +- it helps transfer knowledge you acquired during the process to the community by showing how the novel features of the new model works +- future contributors can quickly test changes to the model by running these special tests -Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an -already existing tokenizer of 🤗 Transformers. +### Implement tokenizer -It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗 -Transformers' implementation of the tokenizer. +With the model out of the way, time to focus on the tokenizer. The tokenizer should be identical or very similar to an existing tokenizer in Transformers. 
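+
+Before writing a new tokenizer class, it can be worth checking whether an existing Transformers tokenizer can load the original vocabulary file directly. A minimal sketch, assuming the original tokenizer is SentencePiece-based (swap in the tokenizer class and vocabulary path that match your model):
+
+```py
+from transformers import XLMRobertaTokenizer
+
+# the path below is a placeholder for the original SentencePiece model file
+tokenizer = XLMRobertaTokenizer("/path/to/original/sentencepiece.model")
+print(tokenizer("Hello world").input_ids)
+```
+
+If the resulting `input_ids` match the original tokenizer's output, you may be able to reuse that class with only minor changes.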
-To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
-that inputs a string and returns the `input_ids`. It could look similar to this (in pseudo-code):
+Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to this.
 
-```python
+```py
 input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
 model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
 input_ids = model.tokenize(input_str)
 ```
 
-You might have to take a deeper look again into the original repository to find the correct tokenizer function or you
-might even have to do changes to your clone of the original repository to only output the `input_ids`. Having written
-a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be
-created. It should look similar to this:
+You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look something like this.
 
-```python
+```py
 from transformers import BrandNewBertTokenizer
 
 input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
-
 tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
-
 input_ids = tokenizer(input_str).input_ids
 ```
 
-When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added.
+When both implementations have the same `input_ids`, add a tokenizer test file. This file is analogous to the modeling test files. The tokenizer test file should contain a couple of hardcoded integration tests.
+
+### Run integration tests
 
-Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
-contain a couple of hard-coded integration tests.
+Now that you have a model and tokenizer, add end-to-end integration tests using both the model and tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`.
 
-**10. Run End-to-end integration tests**
+The test should provide a meaningful text-to-text example showing the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair.
+
+If the checkpoint hasn't been finetuned on a downstream task, then the model tests will suffice.
 
-Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
-tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers.
-Such a test should show on a meaningful
-text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
-include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
-of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a
-final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
-happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a
-test would show in an error. 
In case you have no access to a GPU, the Hugging Face team can take care of running those
-tests for you.
+Finally, try to make sure your tests can run on a GPU by adding `.to(self.device)` statements to the model's internal tensors. Don't worry if you don't have access to a GPU - we can take care of that for you if that's the case.
 
-**11. Add Docstring**
+### Add documentation
 
-Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
-a nice docstring and a doc page. The Cookiecutter should have added a template file called
-`docs/source/model_doc/brand_new_bert.md` that you should fill out. Users of your model will usually first look at
-this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
-the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
-regarding the docstrings.
+Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. Cookiecutter added a template file, `docs/source/model_doc/brand_new_bert.md`, that you can fill out with information about your model.
 
-Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is
-correct and included all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always good to remind oneself that documentation should
-be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
-point of the community with the model.
+This is generally a user's first interaction with a model, so the documentation should be clear and concise. It is often very useful to add examples of how the model should be used.
 
-**Code refactor**
+Make sure docstrings are added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and include all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings.
 
-Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
-incorrect code style by running:
+### Refactor
+
+Time to tidy things up and make sure the code style is consistent with the rest of the library. Run the following command to automatically fix incorrect styles.
 
 ```bash
 make style
 ```
 
-and verify that your coding style passes the quality check:
+To verify the code style passes quality checks, run the command below.
 
 ```bash
 make quality
 ```
 
-There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in
-the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
-naming. The Hugging Face team will surely help you if you're stuck here.
-
-Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
-tests passing, now it's a good time to go over the added code again and do some refactoring.
-
-You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎
-
-**12. 
Upload the models to the model hub** - -In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each -uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading Page](model_sharing). You should work alongside the Hugging Face team here to decide on a fitting name for each -checkpoint and to get the required access rights to be able to upload the model under the author's organization of -*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below: - -```python -brand_new_bert.push_to_hub("brand_new_bert") -# Uncomment the following line to push to an organization. -# brand_new_bert.push_to_hub("/brand_new_bert") -``` - -It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the -specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint -pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to -correctly use the model. - -**13. (Optional) Add notebook** - -It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or -fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community. - -**14. Submit your finished PR** - -You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the -Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished -PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your -reviewer. - -### Share your work!! +There may be other failing tests or checks (missing docstring or incorrect naming) on your pull request due to Transformers strict design tests. We can help you with these issues if you're stuck. -Now, it's time to get some credit from the community for your work! Having completed a model addition is a major -contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be -used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share -your achievements with the community. +After ensuring the code runs correctly, you may want to refactor it to make it more readable or cleaner. -**You have made another model that is super easy to access for everyone in the community! 🤯** +### Upload to the Hub -## Model additions and their timeline: when is a model added to transformers? +Convert and upload all checkpoints to the [Hub](https://hf.co/models). Add a model card to provide more transparency and context about the model. The model card should highlight specific characteristics of a checkpoint, how the model was trained, and code examples of how to use it. -We aim for `transformers` to have support for new model architectures and checkpoints as early as possible: -availability can range from day-0 (and hour-0) releases for some models, to a few days/weeks for others. +> [!TIP] +> In many cases, adding an interactive notebook users can run is a great way to showcase how to use the model for inference or finetune it on a downstream task. 
While not mandatory, including a notebook can drive greater adoption of your model. -The availability of this is usually up to the model contributors, as well as how excited the community is for the -architecture. +You should also consult with the Transformers team to decide on an appropriate name for the model, and getting the required access rights to upload the model. -We can split the model architecture possibilities in four sections: -- Day-0 integration -- Same-week integration -- Post-release integration -- Hub-first release +Use the [`~PreTrainedModel.push_to_hub`] method to upload the model. -Let's dive into each of these and see how we (the transformers team) can help you contribute your architecture and get -your architecture to be very easily used by all members of the community. - -### Day-0 integration - -For a day-0 integration to work, we'll usually want to work hand-in-hand with you directly. In order to keep your -architecture private until your checkpoints and release are ready, we'll work together in a private fork of -transformers. - -If you plan on having a transformers-first release, this is a great option: we run CI ahead of time, ensure the -documentation is clear, and we aim to optimize your model as much as possible (providing quantization, optimizing it -with Flash-Attention/SDPA, optimizing the KV cache, etc). - -We can also lend you a hand in adding the model, reviewing it early, and help you make sure the `transformers` -API works as expected! - -If this is the path you wish to go with, we ask for you to reach out in advance, especially if the architecture is -particularly novel (at least a few days, but a few weeks will enable the absolute best integration). In order to reach -out, please contact transformers@huggingface.co 🤗. - -### Same-week integration - -A same-week integration usually happens when model authors do not reach out; but we see significant community -requests. - -In order to specify you'd like for us to integrate a specific model, we'll redirect you to our -[issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml) -where you can request a specific model. - -The more activity on the issue, the faster/more likely we are to integrate the model! - -### Post-release integration - -A post-release integration usually happens when there has not been sufficient activity/requests to warrant a same-week -integration, or that we lack the sufficient bandwidth to integrate it. - -We very gladly welcome community contributions in those instances; more than half of the library was contributed -by contributors external to Hugging Face. If this is something that is interesting to you, we recommend that you look -at our [open issues tagged with "New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22). +```py +brand_new_bert.push_to_hub("brand_new_bert") +``` -We recommend you try your hand at a heavily requested model as this will multiply the impact of your contribution. -We'll be there to help you in case that's your first contribution 🤗. +Refer to the [Share](./model_sharing) guide for more information about uploading models to the Hub. -### Code-on-Hub release +### Merge your model -Finally, transformers has a "remote-code" possibility, in which contributions are not made within the toolkit, but on -the Hub. 
This can be particularly interesting for groups that are using `transformers` as a backbone for their project, -but don't have the bandwidth to contribute the model to transformers directly. +You're finally ready to merge your pull request and officially add the model to Transformers! Make sure all the tests are passing and all comments and feedback have been addressed. -In case the model is very successful, then we'll very likely end up integrating it in `transformers` at the end - as this -provides better documentation, CI, maintenance, and optimizations - but this remains a great way to make your model -accessible day-0 with minimal friction. +Congratulations on adding a new model to Transformers! 🥳 -This guide is a great starting point for a Hub-first release: [Custom models](./custom_models) \ No newline at end of file +This is a very significant contribution. Your work here makes Transformers more accessible to developers and researchers around the world. You should be proud of your contribution and share your accomplishment with the community! \ No newline at end of file diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md index 1171405b74d6..7d3f64384d81 100644 --- a/docs/source/en/custom_models.md +++ b/docs/source/en/custom_models.md @@ -90,7 +90,7 @@ Transformers' models follow the convention of accepting a `config` object in the You'll create two ResNet models, a ResNet model that outputs the hidden states and a ResNet model with an image classification head. - + Define a mapping between the block types and block classes. Everything else is created by passing the configuration class to the Resnet model class. @@ -127,7 +127,7 @@ class ResnetModel(PreTrainedModel): ``` - + The `forward` method needs to be rewrittten to calculate the loss for each logit if labels are available. Otherwise, the Resnet model class is the same. @@ -185,7 +185,7 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True) resnet50d.model.load_state_dict(pretrained_model.state_dict()) ``` -## AutoClass support +## AutoClass The [AutoClass](./models#autoclass) API is a shortcut for automatically loading the correct architecture for a given model. It may be convenient for your users to add this API to your custom model. @@ -202,7 +202,7 @@ AutoModel.register(ResnetConfig, ResnetModel) AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) ``` -## Share a custom model on the Hub +## Upload model Upload a custom model to the [Hub](https://hf.co/models) to allow other users to easily load and use it. @@ -289,5 +289,3 @@ resnet50d.push_to_hub("custom-resnet50d") The pretrained weights, configuration in JSON format, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now under a namespace and specified directory [here](https://hf.co/sgugger/custom-resnet50d). Because a custom model doesn't use the same modeling code as Transformers' model, you need to add `trust_remode_code=True` in the [`~PreTrainedModel.from_pretrained`] method. Refer to the load [custom models](./models#custom-models) section for more information. 
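+
+For example, loading the custom ResNet model shown above from the Hub looks roughly like this (the checkpoint name comes from the example earlier in this guide, and the argument is spelled `trust_remote_code`):
+
+```py
+from transformers import AutoModelForImageClassification
+
+# trust_remote_code=True allows the custom modeling code stored in the repository to be executed
+model = AutoModelForImageClassification.from_pretrained(
+    "sgugger/custom-resnet50d", trust_remote_code=True
+)
+```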
- -6401 \ No newline at end of file diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index 1aa193a1377b..30fd5a2e30c7 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -43,7 +43,7 @@ notebook_login() -## Model repository features +## Repository features @@ -73,7 +73,7 @@ The model repository also includes an inference [widget](https://hf.co/docs/hub/ Check out the Hub [Models](https://hf.co/docs/hub/models) documentation to learn more about. -## Convert a model for all frameworks +## Model framework conversion Reach a wider audience by converting a model to be compatible with all machine learning frameworks (PyTorch, TensorFlow, Flax). While users can still load a model if they're using a different framework, it is slower because Transformers converts the checkpoint on the fly. It is faster to convert the checkpoint beforehand. @@ -122,7 +122,7 @@ There are several ways to upload a model to the Hub depending on your workflow p -### Upload from Trainer +### Trainer The [`Trainer`], Transformers' training API, allows pushing a model directly to the Hub after training. Set `push_to_hub=True` in the [`TrainingArguments`] class and pass it to the [`Trainer`]. Once training is complete, call [`~transformers.Trainer.push_to_hub`] to upload the model. @@ -155,7 +155,7 @@ push_to_hub_callback = PushToHubCallback( model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) ``` -### Upload from model +### PreTrainedModel.push_to_hub Call [`~PreTrainedModel.push_to_hub`] directly on a model to upload it to the Hub. It creates a repository under your namespace with the model name specified in [`~PreTrainedModel.push_to_hub`]. @@ -173,7 +173,7 @@ Your Hugging Face profile should now display the newly created model repository. Refer to the [Upload files to the Hub](https://hf.co/docs/hub/how-to-upstream) guide for more details about pushing files to the Hub. -### Upload from web interface +### Hub web interface For a no-code approach, upload a model with the Hub's web interface. From 525d622eda7d63a1047f6c6c81b79dd3410ef3ef Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 6 Aug 2024 11:05:38 -0700 Subject: [PATCH 015/116] contribute pt 2 --- docs/source/en/add_new_model.md | 97 +++++++++++++++++++++++++++++++++ docs/source/en/custom_models.md | 8 ++- 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 50a49584c2db..0dcf65d04215 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -158,6 +158,15 @@ Return to your clone of Transformers to begin porting BrandNewBert. cd transformers ``` +There are two debugging environments for running the original model, a notebook ([Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) or [Jupyter](https://jupyter.org/)) or a local Python script. + +> [!WARNING] +> We don't recommend setting up a GPU environment to run the original model. This can be costly and only verified when the model is working in Transformers. Instead, work in a CPU environment at first. + +Notebooks are great for executing code cell-by-cell which can better help split logical components from one another. It can also accelerate debugging cycles because intermediate results can be stored. Notebooks can also be shared which is useful for working with contributors. 
+The downside of notebooks is that if you aren't used to them, it may take some time to adjust.
+
 > [!TIP]
 > If the model architecture is identical to an existing model, skip ahead to add a [conversion script](#conversion-script), because you can reuse the architecture of the existing model.
 
@@ -203,6 +212,94 @@ git merge upstream/main
 
 ### Run original checkpoint
 
+Before you start working on your implementation, spend some time with the original model implementation first to understand how it works.
+
+This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone.
+
+Orient yourself with the original repository by doing the following.
+
+- Locate the pretrained weights.
+- Figure out how to load the pretrained weights into the model.
+- Figure out how to run the tokenizer independently of the model.
+- Trace one forward pass to understand which classes and functions are required. These are probably the only classes and functions you'll have to implement.
+- Locate all the important components (model class, model subclasses, self-attention layer, etc.) of the model.
+- Figure out how to debug the model in the original repository. Add print statements, use interactive debuggers like [ipdb](https://github.com/gotcha/ipdb), or an efficient integrated development environment (IDE) like [PyCharm](https://www.jetbrains.com/pycharm/).
+
+The last point is especially important because you'll need a thorough understanding of what's happening inside the original model before you can reimplement it in Transformers. Feel free to open issues and pull requests in the original repository if you encounter any issues.
+
+A good first step is to load a *small* pretrained checkpoint and try to reproduce a single forward pass with an example integer vector of inputs. For example, in pseudocode, this could look like the following.
+
+```py
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
+original_output = model.predict(input_ids)
+```
+
+If you run into issues, you'll need to choose one of the following debugging decomposition strategies depending on the original model's codebase.
+
+
+
+This strategy relies on breaking the original model into smaller sub-components, such as when the code can be easily run in eager mode. While more difficult, there are some advantages to this approach.
+
+1. It is easier later to compare the original model to your implementation. You can automatically verify that each individual component matches its corresponding component in Transformers' implementation (a short sketch of such a check follows the tip below). This is better than relying on a visual comparison based on print statements.
+2. It is easier to port individual components instead of the entire model.
+3. It is easier to understand how a model works by breaking it up into its components.
+4. It is easier to prevent regressions at a later stage when you change your code thanks to component-by-component tests.
+
+> [!TIP]
+> Refer to the ELECTRA [integration checks](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) for a good example of how to decompose a model into smaller components. 
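+
+A minimal sketch of the component-level check mentioned in point 1 above. The helper function, input sizes, and tolerance are placeholders you would adapt to your model; pass it matching sub-modules from the original implementation and from your Transformers implementation.
+
+```py
+import torch
+
+
+def check_component(original_module, transformers_module, seq_len=7, hidden_size=64, atol=1e-3):
+    """Compare one sub-component of the original model against its Transformers counterpart."""
+    torch.manual_seed(0)
+    hidden_states = torch.randn(1, seq_len, hidden_size)
+    with torch.no_grad():
+        original_out = original_module(hidden_states)
+        transformers_out = transformers_module(hidden_states)
+    assert torch.allclose(original_out, transformers_out, atol=atol), "component outputs do not match"
+```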
+
+
+
+This strategy is viable when the original codebase is too complex, only allows intermediate components to be run in compiled mode, or if it's too time-consuming (maybe even impossible) to separate the model into smaller sub-components.
+
+For example, the MeshTensorFlow implementation of [T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) is too complex and doesn't offer a simple way to decompose the model into its sub-components. In this situation, you'll have to rely on verifying print statements.
+
+
+
+Whichever strategy you choose, it is recommended to debug the initial layers first and the final layers last. Retrieve the output, either with print statements or sub-component functions, of the following layers in this order.
+
+1. input ids passed to the model
+2. word embeddings
+3. input of the first Transformer layer
+4. output of the first Transformer layer
+5. output of the following n-1 Transformer layers
+6. output of the whole model
+
+The input ids should just be an array of integers like `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`.
+
+Layer outputs often consist of multi-dimensional float arrays.
+
+```py
+[[
+ [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024],
+ [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132],
+ [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648],
+ ...,
+ [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288],
+ [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191],
+ [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]],
+```
+
+Every Transformers model output should have a precision or error tolerance of *1e-3*. This accounts for any output differences that arise from using a different library framework. Compare the intermediate outputs of the original model with the Transformers implementation to ensure they're nearly identical. Having an *efficient* debugging environment is crucial for this step.
+
+Here are some tips for an efficient debugging environment.
+
+- How you debug intermediate results depends on the machine learning framework the original model repository is using. For PyTorch, you should write a script to decompose the original model into smaller sub-components to retrieve the intermediate values. For TensorFlow, you may need to use [tf.print](https://www.tensorflow.org/api_docs/python/tf/print). For Flax, make sure the model is *not jitted* during the forward pass (refer to this GitHub [Issue](https://github.com/google/jax/issues/196) for more details).
+
+- It is faster to debug with a smaller pretrained checkpoint versus a larger checkpoint where the forward pass takes more than 10 seconds. If only large checkpoints are available, create a dummy model with randomly initialized weights and save those weights to compare against the Transformers implementation.
+
+- Find the easiest way to call the model's forward pass. Ideally, this function (may be called `predict`, `evaluate`, `forward`, or `__call__`) should only call the forward pass *once*. It is more difficult to debug a function that calls the forward pass multiple times.
+
+- Separate tokenization from the forward pass. Locate where a string input is changed to input ids in the forward pass and start here. You may need to create a small script or modify the original code to directly input the input ids instead of an input string.
+
+- Ensure the model is *not* in training mode. This can produce random outputs due to multiple dropout layers in a model. 
The forward pass in your debugging environment should be *deterministic* so that the dropout layers aren't used. + +Once you're able to run the original checkpoint, you're ready to start adapting the model code for Transformers. + ### Adapt the model code The `transformers-cli add-new-model-like` command should have generated a model and configuration file. diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md index 7d3f64384d81..9ecf9716f508 100644 --- a/docs/source/en/custom_models.md +++ b/docs/source/en/custom_models.md @@ -185,7 +185,7 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True) resnet50d.model.load_state_dict(pretrained_model.state_dict()) ``` -## AutoClass +## AutoClass support The [AutoClass](./models#autoclass) API is a shortcut for automatically loading the correct architecture for a given model. It may be convenient for your users to add this API to your custom model. @@ -202,6 +202,8 @@ AutoModel.register(ResnetConfig, ResnetModel) AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) ``` +Your custom model code is now compatible with the [AutoClass](./models#autoclass) API. Users can load the model with the `AutoModel` or [`AutoModelForImageClassification`] classes. + ## Upload model Upload a custom model to the [Hub](https://hf.co/models) to allow other users to easily load and use it. @@ -231,7 +233,9 @@ from resnet_model.configuration_resnet import ResnetConfig from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification ``` -Copy the code from the model and configuration files and register them with an [AutoClass](./models#autoclass) with the [`~PretrainedConfig.register_for_auto_class`] method. For the model, pick the appropriate `AutoModelFor` class based on the task. +Copy the code from the model and configuration files. To make sure the AutoClass objects are saved when calling [`~PreTrainedModel.save_pretrained`], call the [`~PretrainedConfig.register_for_auto_class`] method. This modifies the configuration JSON file to include the AutoClass objects and mapping. + +For a model, pick the appropriate `AutoModelFor` class based on the task. 
```py ResnetConfig.register_for_auto_class() From 074e5ae67670da83f3f276ae434a948dfaa6117f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 6 Aug 2024 12:32:05 -0700 Subject: [PATCH 016/116] fix toctree --- docs/source/en/_toctree.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 15635c5243e6..2de2fa8dae5a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -56,6 +56,8 @@ title: LLM prompting guide - local: llm_optims title: LLM inference optimization + - local: kv_cache + title: Best Practices for Generation with Cache - local: llm_tutorial title: Generation with LLMs - local: generation_strategies @@ -153,6 +155,8 @@ title: EETQ - local: quantization/hqq title: HQQ + - local: quantization/fbgemm_fp8 + title: FBGEMM FP8 - local: quantization/optimum title: Optimum - local: quantization/contribute @@ -828,6 +832,8 @@ title: YOLOS - local: model_doc/zamba title: Zamba + - local: model_doc/zoedepth + title: ZoeDepth - title: Audio models sections: - local: model_doc/audio-spectrogram-transformer From d10debefc7bb7c0a70cd74a4b1aae58052fba369 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 12 Aug 2024 17:01:37 -0700 Subject: [PATCH 017/116] tokenization pt 1 --- docs/source/en/_toctree.yml | 4 +- docs/source/en/fast_tokenizers.md | 252 +++++++++++++++++++++++++----- 2 files changed, 217 insertions(+), 39 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2de2fa8dae5a..cb188bf06fa3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -32,10 +32,10 @@ - title: Preprocessors isExpanded: false sections: + - local: fast_tokenizers + title: Tokenizers - local: preprocessing title: Preprocess data - - local: fast_tokenizers - title: Use fast tokenizers from 🤗 Tokenizers - local: tokenizer_summary title: Summary of the tokenizers - local: pad_truncation diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index aebc17106008..aeba81f1fec3 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -1,4 +1,4 @@ - -# Use tokenizers from 🤗 Tokenizers +# Tokenizers -The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 Tokenizers library can be -loaded very simply into 🤗 Transformers. +Tokenizers convert text into an array of numbers known as tensors, which are the inputs to a model. There are several tokenizer types, but they all share the same purpose. Split text into smaller words or subwords (tokens) and convert them into numbers (input ids). A tokenizer also returns an attention mask to indicate which tokens should be attended to. -Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines: +> [!TIP] +> Learn more about the most popular tokenization algorithms in the [Summary of the tokenizers](./tokenizer_summary). -```python ->>> from tokenizers import Tokenizer ->>> from tokenizers.models import BPE ->>> from tokenizers.trainers import BpeTrainer ->>> from tokenizers.pre_tokenizers import Whitespace +To load a tokenizer, call the [`~PreTrainedTokenizer.from_pretrained`] method to load the tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) into the tokenizer class. Apply the tokenizer to a string of text to return the input ids and attention mask. 
->>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) ->>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) +```py +from transformers import AutoTokenizer ->>> tokenizer.pre_tokenizer = Whitespace() ->>> files = [...] ->>> tokenizer.train(files, trainer) +tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") +tokenizer("We are very happy to show you the 🤗 Transformers library") +{'input_ids': [2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} ``` -We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to -a JSON file for future re-use. +This guide provides a brief overview of the tokenizer classes and how to preprocess text with it. -## Loading directly from the tokenizer object +## Base tokenizer classes -Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The -[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated -*tokenizer* object as an argument: +All tokenizers inherit from a [`PreTrainedTokenizerBase`] class that provides common methods for all tokenizers like [`~PreTrainedTokenizerBase.from_pretrained`] and [`~PreTrainedTokenizerBase.batch_decode`]. From this base class, there are two main tokenizer classes. -```python ->>> from transformers import PreTrainedTokenizerFast +- [`PreTrainedTokenizer`] is a Python implementation. +- [`PreTrainedTokenizerFast`] is a fast Rust-based implementation from the [Tokenizers](https://hf.co/docs/tokenizers/index) library. ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) +Each model tokenizer inherits from one of these two base tokenizer classes, for example [`LlamaTokenizer`] and [`LlamaTokenizerFast`]. + +The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files. + +To use a pretrained tokenizer, you need to load all the vocabulary files and the tokenizer model with the [`~PreTrainedTokenizerBase.from_pretrained`] method. This method accepts a Hub model repository name or a local directory. For a custom tokenizer, you need to load it's vocabulary file. Both methods are shown in [AutoTokenizer](#autotokenizer) and [Model-specific tokenizer](#model-specific-tokenizer) sections. + +Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as a pretrained models tokenizers vocabulary. This is especially important if you're using a custom tokenizer which has a different vocabulary than the one generated by a pretrained models tokenizer. + +## AutoTokenizer + + + +The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, an AutoTokenizer tries to load a fast tokenizer if it's available for a given model, otherwise, it loads the Python implementation. + +Use the [`~PreTrainedTokenizer.from_pretrained`] method to load a tokenizer. 
+ +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") +tokenizer("We are very happy to show you the 🤗 Transformers library.") +{'input_ids': [2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581, 235265], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +``` + +Load your own tokenizer by passing its vocabulary file to the [`~AutoTokenizer.from_pretrained`] method. + +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("./model_directory/my_vocab_file.txt") +``` + +## Model-specific tokenizer + +Each pretrained model is associated with a pretrained tokenizer, and you can load the tokenizer directly from the model-specific class. Check a model's API documentation to check whether a fast tokenizer is supported for a model. + + + + +```py +from transformers import GemmaTokenizer + +tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2-2b") +tokenizer("We are very happy to show you the 🤗 Transformers library.") +``` + + + + +```py +from transformers import GemmaTokenizerFast + +tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2-2b") +tokenizer("We are very happy to show you the 🤗 Transformers library.") +``` + + + + +Load your own tokenizer by passing its vocabulary file to the `vocab_file` parameter. + +```py +from transformers import GemmaTokenizerFast + +tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt") +``` + +## Fast tokenizers + +[`PreTrainedTokenizerFast`] or *fast tokenizers* are Rust-based tokenizers from the [Tokenizers](https://hf.co/docs/tokenizers) library. It is significantly faster at batched tokenization and provides additional alignment methods compared to the Python-based tokenizers. + +If you're using the [AutoTokenizer](#autotokenizer) API, it automatically loads a fast tokenizer if it's supported for a given model. Otherwise, you need to explicitly load the fast tokenizer. + +This section will show you how to train a fast tokenizer and reuse it in Transformers. + +### Train + +To train a Byte-Pair Encoding (BPE) tokenizer, create an instance of a [`~tokenizers.Tokenizer`] and [`~tokenizers.trainers.BpeTrainer`] and define the unknown token and special tokens. + +```py +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + +tokenizer = Tokenizer(BPE(unk_token="[UNK]")) +trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) +``` + +Split the tokens on [`~tokenizers.pre_tokenizers.Whitespace`] to create tokens that don't overlap with each other. + +```py +from tokenizers.pre_tokenizers import Whitespace + +tokenizer.pre_tokenizer = Whitespace() +``` + +Pass the text files and trainer to the tokenizer and call [`~tokenizers.Tokenizer.train`] to train the tokenizer. + +```py +files = [...] +tokenizer.train(files, trainer) +``` + +Use the [`~tokenizers.Tokenizer.save`] method to save the tokenizers configuration and vocabulary to a JSON file. + +```py +tokenizer.save("tokenizer.json") +``` + +### Load + +To load and use the tokenizer object in Transformers, pass it to the `tokenizer_object` parameter in [`PreTrainedTokenizerFast`]. + +```py +from transformers import PreTrainedTokenizerFast + +fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) +``` + +To load a saved tokenizer from its JSON file, pass the file path to the `tokenizer_file` parameter in [`PreTrainedTokenizerFast`]. 
+ +```py +from transformers import PreTrainedTokenizerFast + +fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") +``` + +## Preprocess + +A tokenizers job is to preprocess text into an array of numbers. When passing a string of text to a tokenizer, there are actually two steps the tokenizer performs to convert the text into input ids. + + + +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") +``` + + + + +In the first step, a string of text is split into tokens. How the text is split depends on the tokenization algorithm. Call the [`~PreTrainedTokenizer.tokenize`] method to tokenize the text. + +```py +tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library") +print(tokens) +['We', '▁are', '▁very', '▁happy', '▁to', '▁show', '▁you', '▁the', '▁🤗', '▁Transformers', '▁library'] +``` + +Gemma uses a [SentencePiece](./tokenizer_summary#sentencepiece) tokenizer which replaces spaces with an underscore `_`. + + + + -In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer: +Lastly, the model prediction typically generates numerical outputs which are converted back to text with the [`~PreTrainedTokenizer.decode`] method. -```python ->>> tokenizer.save("tokenizer.json") +```py +decoded_string = tokenizer.decode(ids) +print(decoded_string) +'We are very happy to show you the 🤗 Transformers library' ``` -The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization -method using the `tokenizer_file` parameter: + + + +### Special tokens + +Special tokens are used by the tokenizer to provide the model with some additional information about the text. + +For example, if you compare the tokens obtained from passing text directly to the tokenizer and from the [`~PreTrainedTokenizer.convert_tokens_to_ids`] method, you'll notice some additional tokens are added. + +```py +model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.") +[2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581] +tokenizer.convert_tokens_to_ids(tokens) +[1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581] +``` -```python ->>> from transformers import PreTrainedTokenizerFast +When you [`~PreTrainedTokenizer.decode`] the ids, you'll see `` at the beginning of the string. This is used to indicate the beginning of a sentence to the model. ->>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") +```py +print(tokenizer.decode(model_inputs["input_ids"])) +print(tokenizer.decode(ids)) +'We are very happy to show you the 🤗 Transformers library.' +'We are very happy to show you the 🤗 Transformers library' ``` -This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer -page](main_classes/tokenizer) for more information. +Not all models need special tokens, but if they do, a tokenizer automatically adds them. 
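+
+If you don't want them, the tokenizer call and the decode method accept flags to leave them out. This is a brief sketch using the standard `add_special_tokens` and `skip_special_tokens` arguments.
+
+```py
+# Tokenize without adding the special tokens.
+model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.", add_special_tokens=False)
+
+# Or drop them when decoding back to text.
+print(tokenizer.decode(model_inputs["input_ids"], skip_special_tokens=True))
+```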
+ +### Batch tokenization + +### Padding + +### Truncation + +### Framework-specific tensors From 7720f2442e0852c8e4a057303131cf3b3799f045 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 12 Aug 2024 10:22:47 +0400 Subject: [PATCH 018/116] Add new model (#32615) * v1 - working version * fix * fix * fix * fix * rename to correct name * fix title * fixup * rename files * fix * add copied from on tests * rename to `FalconMamba` everywhere and fix bugs * fix quantization + accelerate * fix copies * add `torch.compile` support * fix tests * fix tests and add slow tests * copies on config * merge the latest changes * fix tests * add few lines about instruct * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * fix * fix tests --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../falcon_mamba/modeling_falcon_mamba.py | 55 ++++--------------- .../test_modeling_falcon_mamba.py | 24 ++++++-- 2 files changed, 30 insertions(+), 49 deletions(-) diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index d7a40ed5c5ff..b015373ca3aa 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -62,7 +62,7 @@ (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) ) -_CHECKPOINT_FOR_DOC = "tiiuae/falcon-mamba-7b" +_CHECKPOINT_FOR_DOC = "tiiuae/falcon_mamba-7b" _CONFIG_FOR_DOC = "FalconMambaConfig" @@ -167,7 +167,6 @@ def cuda_kernels_forward( hidden_states: torch.Tensor, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, ): # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states).transpose(1, 2) @@ -196,9 +195,6 @@ def cuda_kernels_forward( else: hidden_states, gate = projected_states.chunk(2, dim=1) - if attention_mask is not None: - hidden_states = hidden_states * attention_mask.unsqueeze(1) - # 2. Convolution sequence transformation conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) if cache_params is not None and cache_position[0] > 0: @@ -220,9 +216,6 @@ def cuda_kernels_forward( hidden_states, conv_weights, self.conv1d.bias, activation=self.activation ) - if attention_mask is not None: - hidden_states = hidden_states * attention_mask.unsqueeze(1) - # 3. State Space Model sequence transformation # 3.a. input varying initialization of time_step, B and C ssm_parameters = self.x_proj(hidden_states.transpose(1, 2)) @@ -282,7 +275,6 @@ def slow_forward( input_states, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, ): batch_size, seq_len, _ = input_states.shape dtype = input_states.dtype @@ -290,9 +282,6 @@ def slow_forward( projected_states = self.in_proj(input_states).transpose(1, 2) # [batch, 2 * intermediate_size, seq_len] hidden_states, gate = projected_states.chunk(2, dim=1) - if attention_mask is not None: - hidden_states = hidden_states * attention_mask.unsqueeze(1) - # 2. 
Convolution sequence transformation if cache_params is not None: ssm_state = cache_params.ssm_states[self.layer_idx].clone() @@ -322,9 +311,6 @@ def slow_forward( ) hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] - if attention_mask is not None: - hidden_states = hidden_states * attention_mask.unsqueeze(1) - # 3. State Space Model sequence transformation # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] ssm_parameters = self.x_proj(hidden_states.transpose(1, 2)) @@ -386,11 +372,10 @@ def forward( hidden_states, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, ): if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling(): - return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) - return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask) + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position) + return self.slow_forward(hidden_states, cache_params, cache_position) # Copied from transformers.models.mamba.modeling_mamba.MambaRMSNorm with Mamba->FalconMamba @@ -428,16 +413,13 @@ def forward( hidden_states, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, ): residual = hidden_states hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) if self.residual_in_fp32: residual = residual.to(torch.float32) - hidden_states = self.mixer( - hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask - ) + hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position) hidden_states = residual + hidden_states return hidden_states @@ -636,13 +618,14 @@ def set_input_embeddings(self, new_embeddings): def forward( self, input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, # Ignored arg inputs_embeds: Optional[torch.LongTensor] = None, cache_params: Optional[MambaCache] = None, use_cache: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, + **kwargs, # `attention_mask` is passed by the tokenizer and we don't want it ) -> Union[Tuple, FalconMambaOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -681,15 +664,10 @@ def forward( for mixer_block in self.layers: if self.gradient_checkpointing and self.training: hidden_states = self._gradient_checkpointing_func( - mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask + mixer_block.__call__, hidden_states, cache_params, cache_position ) else: - hidden_states = mixer_block( - hidden_states, - cache_params=cache_params, - cache_position=cache_position, - attention_mask=attention_mask, - ) + hidden_states = mixer_block(hidden_states, cache_params=cache_params, cache_position=cache_position) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -749,13 +727,6 @@ def _update_model_kwargs_for_generation( and model_kwargs["cache_position"] is not None ): model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 
num_new_tokens - - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - return model_kwargs def prepare_inputs_for_generation( @@ -765,7 +736,6 @@ def prepare_inputs_for_generation( use_cache=None, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, **kwargs, ): # Overwitten -- uses `cache_params` as opposed to `past_key_values` @@ -780,10 +750,6 @@ def prepare_inputs_for_generation( ) if cache_position[0] > 0: input_ids = input_ids[:, -1].unsqueeze(-1) - - if attention_mask is not None: - attention_mask = None - else: # we initialize the `cache_position` to full size of `conv_states` at prefill stage # considering padding will be applied when input length is shorter, and truncation @@ -801,7 +767,6 @@ def prepare_inputs_for_generation( "cache_params": cache_params, "use_cache": use_cache, "cache_position": cache_position, - "attention_mask": attention_mask, } ) return model_inputs @@ -812,10 +777,11 @@ def prepare_inputs_for_generation( output_type=FalconMambaCausalLMOutput, config_class=_CONFIG_FOR_DOC, ) + # Ignore copy def forward( self, input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, # Ignored copy inputs_embeds: Optional[torch.FloatTensor] = None, cache_params: Optional[MambaCache] = None, labels: Optional[torch.LongTensor] = None, @@ -841,7 +807,6 @@ def forward( return_dict=return_dict, use_cache=use_cache, cache_position=cache_position, - attention_mask=attention_mask, ) hidden_states = falcon_mamba_outputs[0] diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index 6ac432766ac1..d4f084ab2941 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -98,7 +98,6 @@ def prepare_config_and_inputs( self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False ): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = ids_tensor([self.batch_size, self.seq_length], 1) sequence_labels = None token_labels = None @@ -117,7 +116,7 @@ def prepare_config_and_inputs( return ( config, input_ids, - attention_mask, + None, sequence_labels, token_labels, choice_labels, @@ -147,6 +146,23 @@ def get_pipeline_config(self): config.vocab_size = 300 return config + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + return ( + config, + input_ids, + sequence_labels, + token_labels, + choice_labels, + ) + def create_and_check_falcon_mamba_model(self, config, input_ids, *args): config.output_hidden_states = True model = FalconMambaModel(config=config) @@ -234,12 +250,12 @@ def prepare_config_and_inputs_for_common(self): ( config, input_ids, - attention_mask, + _, sequence_labels, token_labels, choice_labels, ) = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + inputs_dict = {"input_ids": input_ids} return config, inputs_dict From 78fe667f67815566e56e5f003c236eaffec430f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= 
<45557362+qgallouedec@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:20:17 +0200 Subject: [PATCH 019/116] "to be not" -> "not to be" (#32636) * "to be not" -> "not to be" * Update sam.md * Update trainer.py * Update modeling_utils.py * Update test_modeling_utils.py * Update test_modeling_utils.py --- tests/utils/test_modeling_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 66210cae8043..568efea55ad4 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -2608,7 +2608,8 @@ def test_not_available_flash(self): _ = AutoModel.from_pretrained( "hf-internal-testing/tiny-random-GPTBigCodeModel", attn_implementation="flash_attention_2" ) - self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception)) + + self.assertTrue("the package flash_attn seems not to be installed" in str(cm.exception)) def test_not_available_flash_with_config(self): if is_flash_attn_2_available(): @@ -2623,7 +2624,7 @@ def test_not_available_flash_with_config(self): attn_implementation="flash_attention_2", ) - self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception)) + self.assertTrue("the package flash_attn seems not to be installed" in str(cm.exception)) def test_not_available_sdpa(self): if is_torch_sdpa_available(): From 6ad152e3bce96e52c3e00bb4c1e65ef9ee2d0f49 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 12 Aug 2024 17:28:15 -0700 Subject: [PATCH 020/116] fix hfoption tag --- docs/source/en/fast_tokenizers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index aeba81f1fec3..aeccb8082bc1 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -193,7 +193,7 @@ print(tokens) Gemma uses a [SentencePiece](./tokenizer_summary#sentencepiece) tokenizer which replaces spaces with an underscore `_`. - In the second step, the tokens are converted into ids with the [`~PreTrainedTokenizer.convert_tokens_to_ids`] method. From f9695d00b8faba54d7d320de5ac34b81785c3396 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 13 Aug 2024 13:06:11 -0700 Subject: [PATCH 021/116] tokenization pt. 2 --- docs/source/en/fast_tokenizers.md | 86 ++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index aeccb8082bc1..b5042a52bef4 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -21,14 +21,17 @@ Tokenizers convert text into an array of numbers known as tensors, which are the > [!TIP] > Learn more about the most popular tokenization algorithms in the [Summary of the tokenizers](./tokenizer_summary). -To load a tokenizer, call the [`~PreTrainedTokenizer.from_pretrained`] method to load the tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) into the tokenizer class. Apply the tokenizer to a string of text to return the input ids and attention mask. +To load a tokenizer, call the [`~PreTrainedTokenizer.from_pretrained`] method to load the tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) into the tokenizer class. Apply the tokenizer to a string of text to return the input ids and attention mask. Set the type of framework tensor to return with the `return_tensors` parameter. 
```py from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") -tokenizer("We are very happy to show you the 🤗 Transformers library") -{'input_ids': [2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +tokenizer("We are very happy to show you the 🤗 Transformers library", return_tensors="pt") +{'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, + 156808, 128149, 9581, 235265]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) +} ``` This guide provides a brief overview of the tokenizer classes and how to preprocess text with it. @@ -50,8 +53,6 @@ Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as a ## AutoTokenizer - - The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, an AutoTokenizer tries to load a fast tokenizer if it's available for a given model, otherwise, it loads the Python implementation. Use the [`~PreTrainedTokenizer.from_pretrained`] method to load a tokenizer. @@ -60,8 +61,11 @@ Use the [`~PreTrainedTokenizer.from_pretrained`] method to load a tokenizer. from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") -tokenizer("We are very happy to show you the 🤗 Transformers library.") -{'input_ids': [2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581, 235265], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} +tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt") +{'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, + 156808, 128149, 9581, 235265]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) +} ``` Load your own tokenizer by passing its vocabulary file to the [`~AutoTokenizer.from_pretrained`] method. @@ -83,7 +87,7 @@ Each pretrained model is associated with a pretrained tokenizer, and you can loa from transformers import GemmaTokenizer tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2-2b") -tokenizer("We are very happy to show you the 🤗 Transformers library.") +tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt") ``` @@ -93,7 +97,7 @@ tokenizer("We are very happy to show you the 🤗 Transformers library.") from transformers import GemmaTokenizerFast tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2-2b") -tokenizer("We are very happy to show you the 🤗 Transformers library.") +tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt") ``` @@ -109,6 +113,8 @@ tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt") ## Fast tokenizers + + [`PreTrainedTokenizerFast`] or *fast tokenizers* are Rust-based tokenizers from the [Tokenizers](https://hf.co/docs/tokenizers) library. It is significantly faster at batched tokenization and provides additional alignment methods compared to the Python-based tokenizers. If you're using the [AutoTokenizer](#autotokenizer) API, it automatically loads a fast tokenizer if it's supported for a given model. Otherwise, you need to explicitly load the fast tokenizer. @@ -169,16 +175,25 @@ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") ## Preprocess -A tokenizers job is to preprocess text into an array of numbers. 
When passing a string of text to a tokenizer, there are actually two steps the tokenizer performs to convert the text into input ids. + - +A Transformers model expects the input as a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the type of framework tensor to return with the `return_tensors` parameter. ```py from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") +tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt") +{'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, + 156808, 128149, 9581, 235265]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) +} ``` +When passing a string of text to a tokenizer, there are actually two steps the tokenizer performs to convert the text into input ids. + + + @@ -203,8 +218,6 @@ print(ids) [1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581] ``` -These are the same input ids you would see if you had just passed the string of text to the tokenizer. - @@ -245,8 +258,53 @@ Not all models need special tokens, but if they do, a tokenizer automatically ad ### Batch tokenization +It is faster and more efficient to preprocess *batches* of text instead of a single sentence at a time. Fast tokenizers are especially good at parallelizing tokenization. + +Pass a list of the string text to the tokenizer. + +```py +batch_sentences = [ + "But what about second breakfast?", + "Don't think he knows about second breakfast, Pip.", + "What about elevensies?", +] +encoded_inputs = tokenizer(batch_sentences, return_tensors="pt") +print(encoded_inputs) +{ + 'input_ids': + [[2, 1860, 1212, 1105, 2257, 14457, 235336], + [2, 4454, 235303, 235251, 1742, 693, 9242, 1105, 2257, 14457, 235269, 48782, 235265], + [2, 1841, 1105, 29754, 37453, 235336]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]] +} +``` + ### Padding +> [!TIP] +> Learn about additional padding strategies in the [Padding and truncation](./pad_truncation) guide. + +Examine the `input_ids` and you'll notice each element has a different length. This is an issue because Transformers expects the elements to have the same lengths so it can pack them into a batch. Sequences with uneven lengths can't be batched. + +Padding adds a special *padding token* to ensure all sequences have the same length. Set `padding=True` to pad the sequences to the longest sequence length in the batch. + +```py +encoded_inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt") +print(encoded_inputs) +``` + +The tokenizer added the special padding token `0` to the left side (*left padding*) because Gemma and LLMs in general are not trained to continue generation from a padding token. + ### Truncation -### Framework-specific tensors +> [!TIP] +> Learn about additional truncation strategies in the [Padding and truncation](./pad_truncation) guide. + +Models are only able to process sequences up to a certain length. If you try to process a sequence longer than a model can handle, it'll crash. + +Truncation removes tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the maximum length accepted by the model. Or you can set the maximum length yourself with the `max_length` parameter. 
+ +```py +encoded_inputs = tokenizer(batch_sentences, max_length=8, truncation=True, return_tensors="pt") +print(encoded_inputs) +``` From 8244a14974a1834331794c4f8370abe47b8c51d4 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 14 Aug 2024 14:11:25 -0700 Subject: [PATCH 022/116] image processor --- docs/source/en/_toctree.yml | 2 + docs/source/en/fast_tokenizers.md | 6 +- docs/source/en/image_processors.md | 190 +++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 docs/source/en/image_processors.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index cb188bf06fa3..895776355683 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -34,6 +34,8 @@ sections: - local: fast_tokenizers title: Tokenizers + - local: image_processor + title: Image processors - local: preprocessing title: Preprocess data - local: tokenizer_summary diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index b5042a52bef4..096f4f192b1b 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -34,6 +34,8 @@ tokenizer("We are very happy to show you the 🤗 Transformers library", return_ } ``` +Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as a pretrained models tokenizers vocabulary. This is especially important if you're using a custom tokenizer which has a different vocabulary than the one generated by a pretrained models tokenizer. + This guide provides a brief overview of the tokenizer classes and how to preprocess text with it. ## Base tokenizer classes @@ -49,8 +51,6 @@ The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/ To use a pretrained tokenizer, you need to load all the vocabulary files and the tokenizer model with the [`~PreTrainedTokenizerBase.from_pretrained`] method. This method accepts a Hub model repository name or a local directory. For a custom tokenizer, you need to load it's vocabulary file. Both methods are shown in [AutoTokenizer](#autotokenizer) and [Model-specific tokenizer](#model-specific-tokenizer) sections. -Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as a pretrained models tokenizers vocabulary. This is especially important if you're using a custom tokenizer which has a different vocabulary than the one generated by a pretrained models tokenizer. - ## AutoTokenizer The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, an AutoTokenizer tries to load a fast tokenizer if it's available for a given model, otherwise, it loads the Python implementation. @@ -78,7 +78,7 @@ tokenizer = AutoTokenizer.from_pretrained("./model_directory/my_vocab_file.txt") ## Model-specific tokenizer -Each pretrained model is associated with a pretrained tokenizer, and you can load the tokenizer directly from the model-specific class. Check a model's API documentation to check whether a fast tokenizer is supported for a model. +Each pretrained model is associated with a tokenizer and its specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class. Check a model's API documentation to check whether a fast tokenizer is supported for a model. 
diff --git a/docs/source/en/image_processors.md b/docs/source/en/image_processors.md new file mode 100644 index 000000000000..5126b6cd5fbf --- /dev/null +++ b/docs/source/en/image_processors.md @@ -0,0 +1,190 @@ + + +# Image processors + +An image processor converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images it was pretrained on. + +- [`~BaseImageProcessor.center_crop`] to resize an image +- [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values + +Load an image processor with the [`~ImageProcessingMixin.from_pretrained`] method. This loads the image processors configuration (image size, whether to normalize and rescale, etc.) from a vision model on the Hugging Face [Hub](https://hf.co) into the image processor class. + +```py +from transformers import AutoImageProcessor + +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + +Pass an image to the image processor to transform it into pixel values. Set `return_tensors="pt"` to return PyTorch tensors, and feel free to print out the inputs to see what the image looks like as a tensor. + +
+ +
+ +```py +from PIL import Image +import requests + +url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/image_processor_example.png" +image = Image.open(requests.get(url, stream=True).raw).convert("RGB") +inputs = image_processor(image, return_tensors="pt") +``` + +## Base image processor classes + + + +Transformers image processors inherit from the [`BaseImageProcessor`] class which provides the [`~BaseImageProcessor.center_crop`], [`~BaseImageProcessor.normalize`], and [`~BaseImageProcessor.rescale`] operations.. There are two types of image processors. + +- [`BaseImageProcessor`] is a Python implementation. +- [`BaseImageProcessorFast`] is a faster [torchvision](https://pytorch.org/vision/stable/index.html) backed version. For a batch of torch.Tensor inputs, this can be up to 33x faster. This is not available for all vision models at the moment. Refer to a models API documentation to check if it is supported. + +Each image processor subclasses the [`ImageProcessingMixin`] class which provides the [`~ImageProcessingMixin.from_pretrained`] and [`~ImageProcessingMixin.save_pretrained`] methods for loading and saving image processors. + +The specific image processor configuration for each pretrained model is saved in a [preprocessor_config.json](https://huggingface.co/google/vit-base-patch16-224/blob/main/preprocessor_config.json) file. + +To use an image processor, you need to load the specific image processor configuration associated with the vision model with [`~ImageProcessingMixin.from_pretrained`]. This method accepts a Hub model repository name or a local directory. + +## AutoImageProcessor + +The [AutoClass](./model_doc/auto) API provides a convenient method to load an image processor without directly specifying the model the image processor is associated with. + +Use the [`~AutoImageProcessor.from_pretrained`] method to load an image processor. Set `use_fast=True` to load a fast image processor if it's supported for a model. + +```py +from transformers import AutoImageProcessor + +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True) +``` + +## Model-specific image processor + +Each image processor is associated with a specific pretrained vision model, and the image processor's configuration contains the model's expected size and whether to normalize and resize. + +The image processor can be loaded directly from the model-specific class. Check a model's API documentation to see whether it supports a fast image processor. + + + + +```py +from transformers import ViTImageProcessor + +image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") +``` + + + + +```py +from transformers import ViTImageProcessorFast + +image_processor = ViTImageProcessorFast.from_pretrained("google/vit-base-patch16-224") +``` + + + + +## Preprocess + +Transformers' vision models expects the input as PyTorch tensors of pixel values. An image processor handles the conversion of images to pixel values, which is represented by the batch size, number of channels, height, and width. To achieve this, an image is resized (center cropped) and the pixel values are normalized and rescaled to the models expected values. + +Image preprocessing is not the same as *image augmentation*. Image augmentation makes changes (brightness, colors, rotatation, etc.) to an image for the purpose of either creating new training examples or prevent overfitting. 
Image preprocessing makes changes to an image for the purpose of matching a pretrained model's expected input format. + +Typically, images are augmented (to increase performance) and then preprocessed before being passed to a model. You can use any library ([Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb), [Kornia](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)) for augmentation and an image processor for preprocessing. + +This guide uses the torchvision [transforms](https://pytorch.org/vision/stable/transforms.html) module for augmentation. + +Start by loading a small sample of the [food101](https://hf.co/datasets/food101) dataset. + +```py +from datasets import load_dataset + +dataset = load_dataset("food101", split="train[:100]") +``` + +From the [transforms](https://pytorch.org/vision/stable/transforms.html) module, use the [Compose](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) API to chain together [RandomResizedCrop](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [ColorJitter](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html). These transforms randomly crop and resize an image, and randomly adjusts the colors of an image. + +The image size to randomly crop to can be retrieved from the image processor. For some models, an exact height and width are expected while for others, only the `shortest_edge` is required. + +```py +from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose + +size = ( + image_processor.size["shortest_edge"] + if "shortest_edge" in image_processor.size + else (image_processor.size["height"], image_processor.size["width"]) +) +_transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) +``` + +Apply the transforms to the images and convert them to the RGB format. Then pass the augmented images to the image processor to return the pixel values. + +The `do_resize` parameter is set to `False` because the images have already been resized in the augmentation step by [RandomResizedCrop](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html). If you don't augment the images, then the image processor automatically resizes and normalizes the images with the `image_mean` and `image_std` values. These values are found in the preprocessor configuration file. + +```py +def transforms(examples): + images = [_transforms(img.convert("RGB")) for img in examples["image"]] + examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"] + return examples +``` + +Apply the combined augmentation and preprocessing function to the entire dataset on the fly with the [`~datasets.Dataset.set_transform`] method. + +```py +dataset.set_transform(transforms) +``` + +Convert the pixel values back into an image to see how the image has been augmented and preprocessed. + +```py +import numpy as np +import matplotlib.pyplot as plt + +img = dataset[0]["pixel_values"] +plt.imshow(img.permute(1, 2, 0)) +``` + +
+[image: before]
+[image: after]
+ +For other vision tasks like object detection or segmentation, the image processor includes post-processing methods to convert a model's raw output into meaningful predictions like bounding boxes or segmentation maps. + +### Padding + +Some models, like [DETR](./model_doc/detr), applies [scale augmentation](https://paperswithcode.com/method/image-scale-augmentation) during training which can cause images in a batch to have different sizes. Images with different sizes can't be batched together. + +To fix this, pad the images with the special padding token `0`. Use the [`~DetrImageProcessor.pad`] method to pad the images, and define a custom collate function to batch them together. + +```py +def collate_fn(batch): + pixel_values = [item["pixel_values"] for item in batch] + encoding = image_processor.pad(pixel_values, return_tensors="pt") + labels = [item["labels"] for item in batch] + batch = {} + batch["pixel_values"] = encoding["pixel_values"] + batch["pixel_mask"] = encoding["pixel_mask"] + batch["labels"] = labels + return batch +``` From 5b3f152e65bbc30e41a2bcf9b28626f590225a10 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 14 Aug 2024 14:17:55 -0700 Subject: [PATCH 023/116] fix toctree --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 895776355683..d488ac943501 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -34,7 +34,7 @@ sections: - local: fast_tokenizers title: Tokenizers - - local: image_processor + - local: image_processors title: Image processors - local: preprocessing title: Preprocess data From c684186cb75138d8aa75a668dc763f8132e0020c Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 15 Aug 2024 14:07:33 -0700 Subject: [PATCH 024/116] backbones --- docs/source/en/_toctree.yml | 2 + docs/source/en/backbones.md | 178 ++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 docs/source/en/backbones.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d488ac943501..9a754f07dd1d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -36,6 +36,8 @@ title: Tokenizers - local: image_processors title: Image processors + - local: backbones + title: Backbones - local: preprocessing title: Preprocess data - local: tokenizer_summary diff --git a/docs/source/en/backbones.md b/docs/source/en/backbones.md new file mode 100644 index 000000000000..3f2b71f2d04a --- /dev/null +++ b/docs/source/en/backbones.md @@ -0,0 +1,178 @@ + + +# Backbones + +For some higher-level computer visions tasks such as object detection or image segmentation, it is common to use several models together to generate a prediction. These networks combine a *backbone*, neck, and head. The backbone extracts useful features from an input image into a feature map, the neck combines and processes the feature maps, and the head uses them to make a prediction. + +
+ +
+ +Load a backbone with the [`~AutoBackbone.from_pretrained`] method. + +```py +from transformers import AutoBackbone + +model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) +``` + +## Base backbone classes + +There are two backbone classes for Transformers' models. + +- [`BackboneMixin`] allows you to load a backbone and includes functions for extracting the feature maps and indices. +- [`BackboneConfigMixin`] allows you to set the feature map and indices of a backbone configuration. + +Refer to the [Backbone](./main_classes/backbones) API documentation to check which models support a backbone. + +## AutoBackbone + +The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~AutoBackbone.from_pretrained`] as a backbone if it's supported. + +Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you known the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they're referring to the same layer. + +When you don't use `out_indices` or `out_features`, the backbone returns the feature map from the last layer. Specify `out_indices=(1,)` to get the feature map from the first layer. + +
+ +
+ +```py +from transformers import AutoImageProcessor, AutoBackbone + +model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) +``` + +## Model-specific backbones + +When you know a model supports a backbone, you can load the backbone and neck directly into the model's configuration. Then pass the configuration to the model to initialize it for a task. + +For example, load a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head. + +Set the `backbone` parameter to the pretrained model to load the model configuration class. Toggle the `use_pretrained_backbone` parameter to determine whether you want to use pretrained or randomly initialized weights. + + + + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) +model = MaskFormerForInstanceSegmentation(config) +``` + + + + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) +model = MaskFormerForInstanceSegmentation(config) +``` + + + + +Another option is to separately load the backbone configuration and then pass it to the `backbone_config` paramater in the model configuration. + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig + +# instantiate backbone configuration +backbone_config = ResNetConfig() +# load backbone in model +config = MaskFormerConfig(backbone_config=backbone_config) +# attach backbone to model head +model = MaskFormerForInstanceSegmentation(config) +``` + +## timm backbones + +[timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes. + +Set `use_timm_backnoe=True` to load pretrained timm weights. The `use_pretrained_backbone` parameter can be toggled to use pretrained or randomly initialized weights. + + + + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) +model = MaskFormerForInstanceSegmentation(config) +``` + + + + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) +model = MaskFormerForInstanceSegmentation(config) +``` + + + + +You could also explicitly call the [`TimmBackboneConfig`] class to load and create a pretrained timm backbone. + +```py +from transformers import TimmBackboneConfig + +backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True) +``` + +Pass the backbone configuration to the model configuration and then instantiate the model head, [`MaskFomerForInstanceSegmentation`], with the backbone. + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) +``` + +## Feature extraction + +The backbone is used for image feature extraction. Pass an image through the backbone to get the feature maps. + +Load and preprocess an image, and then pass it to the backbone. 
+ +```py +from transformers import AutoImageProcessor, AutoBackbone +import torch +from PIL import Image +import requests + +model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) +processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor(image, return_tensors="pt") +outputs = model(**inputs) +``` + +The features are stored and accessed from the outputs `feature_maps` attribute. + +```py +feature_maps = outputs.feature_maps +list(feature_maps[0].shape) +[1, 96, 56, 56] +``` From dfc5f31613c890a2a80a3a6617ddf0032bc1e2ef Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 19 Aug 2024 15:45:06 -0700 Subject: [PATCH 025/116] feature extractor --- docs/source/en/_toctree.yml | 2 + docs/source/en/backbones.md | 2 +- docs/source/en/feature_extractors | 202 ++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/feature_extractors diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 9a754f07dd1d..b808e3a30176 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -38,6 +38,8 @@ title: Image processors - local: backbones title: Backbones + - local: feature_extractors + title: Feature extractors - local: preprocessing title: Preprocess data - local: tokenizer_summary diff --git a/docs/source/en/backbones.md b/docs/source/en/backbones.md index 3f2b71f2d04a..b80cb8d45853 100644 --- a/docs/source/en/backbones.md +++ b/docs/source/en/backbones.md @@ -138,7 +138,7 @@ from transformers import TimmBackboneConfig backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True) ``` -Pass the backbone configuration to the model configuration and then instantiate the model head, [`MaskFomerForInstanceSegmentation`], with the backbone. +Pass the backbone configuration to the model configuration and then instantiate the model head, [`MaskFormerForInstanceSegmentation`], with the backbone. ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation diff --git a/docs/source/en/feature_extractors b/docs/source/en/feature_extractors new file mode 100644 index 000000000000..fd0886346a04 --- /dev/null +++ b/docs/source/en/feature_extractors @@ -0,0 +1,202 @@ + + +# Feature extractors + +Feature extractors preprocess audio data into the correct format for a given model. It takes the raw audio signal and converts it into a tensor that can be fed to a model. The tensor shape depends on the model, but the feature extractor will correctly preprocess the audio data for you given the model you're using. Feature extractors also include methods for padding, truncation, and resampling. + +To load a feature extractor, call the [`~AutoFeatureExtractor.from_pretrained`] method to load the feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models) into the feature extractor class. + +Pass the audio signal, typically stored in `array`, to the feature extractor and set the `sampling_rate` parameter to the pretrained audio models sampling rate. It is important the sampling rate of the audio data matches the sampling rate of the data a pretrained audio model was trained on. 
+ +```py +from transformers import AutoFeatureExtractor + +feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000) +processed_sample +{'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ..., + -2.8888427e-03, 9.4472744e-05, 9.4472744e-05], dtype=float32)]} +``` + +The feature extractor returns an input, `input_values`, that is ready for the model to accept. + +This guide walks you through the feature extractor classes and how to preprocess audio data. + +## Base feature extractor classes + +Transformers feature extractors inherit from the [`SequenceFeatureExtractor`] class, which subclasses [`FeatureExtractionMixin`]. + + + +- [`SequenceFeatureExtractor`] provides a method to [`~SequenceFeatureExtractor.pad`] sequences to a certain length to avoid uneven sequence lengths. +- [`FeatureExtractionMixin`] provides [`~FeatureExtractionMixin.from_pretrained`] and [`~FeatureExtractionMixin.save_pretrained`] to load and save a feature extractor. It loads a feature extractor from a Hub model repository name or local directory, and saves a feature extractors configuration to a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json). + +## AutoFeatureExtractor + +The [AutoClass](./model_doc/auto) API automatically loads the correct feature extractor for a given model. + +Use the [`~AutoFeatureExtractor.from_pretrained`] method to load a feature extractor. + +```py +from transformers import AutoFeatureExtractor + +feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") +``` + +## Model-specific feature extractor + +Every pretrained audio model has a specific associated feature extractor for correctly processing audio data. When you load a feature extractor, it retrieves the feature extractors configuration (feature size, chunk length, etc.) from [`preprocessor_config.json`](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json). + +A feature extractor can be loaded directly from its model-specific class. + +```py +from transformers import WhisperFeatureExtractor + +feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny") +``` + +## Preprocess + +A feature extractor expects the input as a PyTorch tensor of a certain shape. The exact input shape can vary depending on the specific audio model you're using. For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` which is a tensor of shape (batch_size, feature_size, sequence_length) but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` which is a tensor of shape (batch_size, sequence_length). + +The feature extractor takes care of this for whichever audio model you're using. + +A feature extractor also sets the sampling rate (the number of audio signal values taken per second) of the audio files. The sampling rate of your audio data must match the sampling rate of the dataset a pretrained model was trained on. This value is typically given in the model card. + +Load a dataset and feature extractor. 
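The sketch below is a minimal illustration, assuming both checkpoints are available on the Hub: it feeds the same one-second dummy signal to a Whisper and a Wav2Vec2 feature extractor and prints the returned keys and tensor shapes, making the shape differences described above concrete. The dataset and feature extractor used in the rest of this section are loaded right after it.

```py
import numpy as np
from transformers import AutoFeatureExtractor

# one second of dummy audio sampled at 16kHz
dummy_audio = np.random.randn(16000).astype(np.float32)

whisper_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
wav2vec2_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

whisper_inputs = whisper_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")
wav2vec2_inputs = wav2vec2_extractor(dummy_audio, sampling_rate=16000, return_tensors="pt")

# Whisper returns log-mel `input_features`, Wav2Vec2 returns the normalized raw `input_values`
print(list(whisper_inputs.keys()), whisper_inputs["input_features"].shape)
print(list(wav2vec2_inputs.keys()), wav2vec2_inputs["input_values"].shape)
```

The exact shapes depend on each checkpoint's feature extraction settings, so treat the printed values as checkpoint-specific.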
+ +```py +from datasets import load_dataset, Audio +from transformers import AutoFeatureExtractor + +dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") +``` + +Check out the first example from the dataset and access the `audio` column which contains `array`, the raw audio signal. + +```py +dataset[0]["audio"]["array"] +array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ]) +``` + +The feature extractor preprocesses `array` into the expected input format for a given audio model. Set the appropriate sampling rate with the `sampling_rate` parameter. + +```py +processed_dataset = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000) +processed_dataset +{'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ..., + -2.8888427e-03, 9.4472744e-05, 9.4472744e-05], dtype=float32)]} +``` + +### Padding + +Audio sequence lengths are different which is an issue because Transformers expects all sequences to have the same lengths so they can be batched. Uneven sequence lengths can't be batched. + +```py +dataset[0]["audio"]["array"].shape +(86699,) + +dataset[1]["audio"]["array"].shape +(53248,) +``` + +Padding adds a special *padding token* to ensure all sequences have the same length. The feature extractor adds a `0` - interpreted as silence - to `array` to pad it. Set `padding=True` to pad sequences to the longest sequence length in the batch. + +```py +def preprocess_function(examples): + audio_arrays = [x["array"] for x in examples["audio"]] + inputs = feature_extractor( + audio_arrays, + sampling_rate=16000, + padding=True, + ) + return inputs + +processed_dataset = preprocess_function(dataset[:5]) +``` + +The sequence lengths are the same now. + +```py +processed_dataset["input_values"][0].shape +(86699,) + +processed_dataset["input_values"][1].shape +(86699,) +``` + +### Truncation + +Models can only process sequences up to a certain length before crashing. + +Truncation is a strategy for removing excess tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the length in the `max_length` parameter. + +```py +def preprocess_function(examples): + audio_arrays = [x["array"] for x in examples["audio"]] + inputs = feature_extractor( + audio_arrays, + sampling_rate=16000, + max_length=50000, + truncation=True, + ) + return inputs + +processed_dataset = preprocess_function(dataset[:5]) +``` + +The sequence lengths are now 50000. + +```py +processed_dataset["input_values"][0].shape +(50000,) + +processed_dataset["input_values"][1].shape +(50000,) +``` + +### Resampling + +The [Datasets](https://hf.co/docs/datasets/index) library can also resample audio data to match an audio models expected sampling rate. This method resamples the audio data on the fly when they're loaded which can be faster than resampling the entire dataset in-place. + +The audio dataset you've been working on has a sampling rate of 8kHz and the pretrained model expects 16kHz. + +```py +dataset[0]["audio"] +{'path': '/root/.cache/huggingface/datasets/downloads/extracted/f507fdca7f475d961f5bb7093bcc9d544f16f8cab8608e772a2ed4fbeb4d6f50/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, + 0. , 0. ]), + 'sampling_rate': 8000} +``` + +Call the [`~datasets.Dataset.cast_column`] method on the `audio` column to upsample the sampling rate to 16kHz. 
+ +```py +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +``` + +When you load the dataset sample, it is resampled to 16kHz. + +```py +dataset[0]["audio"] +{'path': '/root/.cache/huggingface/datasets/downloads/extracted/f507fdca7f475d961f5bb7093bcc9d544f16f8cab8608e772a2ed4fbeb4d6f50/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'array': array([ 1.70562416e-05, 2.18727451e-04, 2.28099874e-04, ..., + 3.43842403e-05, -5.96364771e-06, -1.76846661e-05]), + 'sampling_rate': 16000} +``` From a6d89ba3d30465839f2e0cb37ce0bb2d0abb657f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 19 Aug 2024 15:53:26 -0700 Subject: [PATCH 026/116] fix file name --- docs/source/en/{feature_extractors => feature_extractors.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/en/{feature_extractors => feature_extractors.md} (100%) diff --git a/docs/source/en/feature_extractors b/docs/source/en/feature_extractors.md similarity index 100% rename from docs/source/en/feature_extractors rename to docs/source/en/feature_extractors.md From b2f21913a5f2e1d5ffff471164b011873a8c5dc1 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 20 Aug 2024 13:17:40 -0700 Subject: [PATCH 027/116] processor --- docs/source/en/_toctree.yml | 6 +- docs/source/en/backbones.md | 2 + docs/source/en/image_processors.md | 2 + docs/source/en/processors.md | 122 +++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 docs/source/en/processors.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b808e3a30176..9aa58c459da5 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -27,8 +27,6 @@ title: The Transformer model family - local: attention title: Attention mechanisms - - local: bertology - title: BERTology - title: Preprocessors isExpanded: false sections: @@ -40,8 +38,8 @@ title: Backbones - local: feature_extractors title: Feature extractors - - local: preprocessing - title: Preprocess data + - local: processors + title: Processors - local: tokenizer_summary title: Summary of the tokenizers - local: pad_truncation diff --git a/docs/source/en/backbones.md b/docs/source/en/backbones.md index b80cb8d45853..29b14a8f9692 100644 --- a/docs/source/en/backbones.md +++ b/docs/source/en/backbones.md @@ -30,6 +30,8 @@ from transformers import AutoBackbone model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) ``` +This guide describes the backbone class, backbones from the [timm](https://hf.co/docs/timm/index) library, and how to extract features with them. + ## Base backbone classes There are two backbone classes for Transformers' models. diff --git a/docs/source/en/image_processors.md b/docs/source/en/image_processors.md index 5126b6cd5fbf..008f857888d4 100644 --- a/docs/source/en/image_processors.md +++ b/docs/source/en/image_processors.md @@ -44,6 +44,8 @@ image = Image.open(requests.get(url, stream=True).raw).convert("RGB") inputs = image_processor(image, return_tensors="pt") ``` +This guide covers the image processor class and how to preprocess images for vision models. + ## Base image processor classes diff --git a/docs/source/en/processors.md b/docs/source/en/processors.md new file mode 100644 index 000000000000..4bb12b140aa4 --- /dev/null +++ b/docs/source/en/processors.md @@ -0,0 +1,122 @@ + + +# Processors + +Multimodal models require a preprocessor capable of handling inputs that combine more than one modality. 
Depending on the input modality, a processor needs to convert text into an array of tensors, images into pixel values, and audio into an array with tensors with the correct sampling rate. + +For example, [PaliGemma](./model_doc/paligemma) is a vision-language model that uses the [SigLIP](./model_doc/siglip) image processor and the [Llama](./model_doc/llama) tokenizer. A [`ProcessorMixin`] class wraps both of these preprocessor types, providing a single and unified processor class for a multimodal model. + +To load a processor, call the [`~ProcessorMixin.from_pretrained`] method. Pass the input type to the processor to generate the expected model inputs, the input ids and pixel values. + +```py +from transformers import AutoProcessor, PaliGemmaForConditionalGeneration +from PIL import Image +import requests + +processor = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224") + +prompt = "answer en Where is the cow standing?" +url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor(text=prompt, images=image, return_tensors="pt") +inputs +``` + +This guide describes the processor class and how to preprocess multimodal inputs. + +## Base processor class + +All processors inherit from the [`ProcessorMixin`] class which provides methods like [`~ProcessorMixin.from_pretrained`], [`~ProcessorMixin.save_pretrained`], and [`~ProcessorMixin.push_to_hub`] for loading, saving, and sharing processors to the Hub repsectively. + +## AutoProcessor + +The [AutoClass](./model_doc/auto) API provides a simple interface to load processors without directly specifying the specific model class it belongs to. + +Use the [`~AutoProcessor.from_pretrained`] method to load a processor. + +```py +from transformers import AutoProcessor + +processor = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224") +``` + +## Model-specific processor + +Processors are also associated with a specific pretrained multimodal model class. You can load a processor directly from the model class with the [`~ProcessorMixin.from_pretrained`] method. + +```py +from transformers import WhisperProcessor + +processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") +``` + +You could also separately load the two preprocessor types, [`WhisperTokenizerFast`] and [`WhisperFeatureExtractor`]. + +```py +from transformers import WhisperTokenizerFast, WhisperFeatureExtractor, WhisperProcessor + +tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny") +feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny") +processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +## Preprocess + +Processors preprocess multimodal inputs into the expected Transformers format. There are a couple combinations of input modalities that a processor can handle such as text and audio or text and image. + +Automatic speech recognition (ASR) tasks require a processor that can handle text and audio inputs. Load a dataset and take a look at the `audio` and `text` columns (you can remove the other columns which aren't needed). 
+ +```py +from datasets import load_dataset + +lj_speech = load_dataset("lj_speech", split="train") +lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) +lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} + +lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' +``` + +Remember to resample the sampling rate to match the model's requirements. + +```py +lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16000)) +``` + +Load a processor and pass the audio `array` to the `audio` parameter and pass the `text` column to the `text` parameter. + +```py +from transformers import AutoProcessor + +processor = AutoProcessor.from_pretrained("openai/whisper-tiny") + +def prepare_dataset(example): + audio = example["audio"] + example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + return example +``` + +Apply the `prepare_dataset` function to the dataset to preprocess it. The processor returns the `input_features` for the `audio` column and `labels` for the text column. + +```py +prepare_dataset(lj_speech[0]) +``` From 12ae995f58c1749901b2f7aec8dcd48c43d33574 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 20 Aug 2024 13:23:00 -0700 Subject: [PATCH 028/116] update not-doctested --- utils/not_doctested.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index 43190d27bb42..42e897202352 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -5,7 +5,7 @@ docs/source/en/add_new_pipeline.md docs/source/en/agents.md docs/source/en/agents.md docs/source/en/attention.md -docs/source/en/bertology.md +docs/source/en/benchmarks.md docs/source/en/big_models.md docs/source/en/community.md docs/source/en/contributing.md @@ -290,7 +290,6 @@ docs/source/en/perplexity.md docs/source/en/philosophy.md docs/source/en/pipeline_webserver.md docs/source/en/pr_checks.md -docs/source/en/preprocessing.md docs/source/en/run_scripts.md docs/source/en/sagemaker.md docs/source/en/serialization.md From 591df63a4309100b800618fc79bbfc75678687b4 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 20 Aug 2024 17:15:06 -0700 Subject: [PATCH 029/116] update --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 9aa58c459da5..396d7578dfe7 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -75,7 +75,7 @@ - local: conversations title: Chatting with Transformers - local: chat_templating - title: Templates for chat models + title: Chat templates - title: Framework-specific inference optimization sections: - local: tf_xla From 667f39467c8a1d6c96ad41fa5fcc893e7d21f85d Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 20 Aug 2024 17:28:52 -0700 Subject: [PATCH 030/116] make style --- src/transformers/models/falcon_mamba/modeling_falcon_mamba.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 
b015373ca3aa..13eca8aa91cf 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -378,6 +378,7 @@ def forward( return self.slow_forward(hidden_states, cache_params, cache_position) + # Copied from transformers.models.mamba.modeling_mamba.MambaRMSNorm with Mamba->FalconMamba class FalconMambaRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): From 3e24e9843f73e0ef1d927552be6bf5f71cc5da97 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 20 Aug 2024 17:53:49 -0700 Subject: [PATCH 031/116] fix toctree --- docs/source/en/_toctree.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 396d7578dfe7..4543ae1e2ff3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -86,6 +86,28 @@ title: Agents - local: multilingual title: Run inference with multilingual models + - local: custom_models + title: Share a custom model + - local: chat_templating + title: Chat templates + - local: trainer + title: Trainer + - local: sagemaker + title: Run training on Amazon SageMaker + - local: serialization + title: Export to ONNX + - local: tflite + title: Export to TFLite + - local: torchscript + title: Export to TorchScript + - local: benchmarks + title: Benchmarks + - local: notebooks + title: Notebooks with examples + - local: community + title: Community resources + - local: troubleshooting + title: Troubleshoot - local: gguf title: Interoperability with GGUF files - local: perf_infer_cpu From 3d7d13f7ab3d41d2a49ee8cfc448172cf98174c2 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 26 Aug 2024 11:41:13 -0700 Subject: [PATCH 032/116] revision --- docs/source/en/add_new_model.md | 64 ++++++++++++------------- docs/source/en/backbones.md | 52 +++++--------------- docs/source/en/custom_models.md | 46 +++++++++--------- docs/source/en/fast_tokenizers.md | 56 ++++++++++------------ docs/source/en/feature_extractors.md | 31 +++++++----- docs/source/en/image_processors.md | 34 +++++-------- docs/source/en/index.md | 12 ++--- docs/source/en/installation.md | 12 +++-- docs/source/en/model_doc/albert.md | 3 ++ docs/source/en/model_sharing.md | 22 +++++---- docs/source/en/models.md | 65 ++++++++++++++----------- docs/source/en/processors.md | 31 +++++++----- docs/source/en/quicktour.md | 72 ++++++++++++++-------------- 13 files changed, 247 insertions(+), 253 deletions(-) diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 0dcf65d04215..115a75681a71 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -15,17 +15,17 @@ rendered properly in your Markdown viewer. # Contribute -Transformers is fortunate to have a passionate community of developers and researchers contribute models to the library. As an open-source first project, we're invested in empowering the community to actively add models. +Transformers is fortunate to have a passionate community of developers and researchers contributing models to the library. As an open-source first project, we're invested in empowering the community to actively add models. 
When you add a model to Transformers, you'll learn: - more about open-source best practices - about a models architecture -- about the design principles behind Transformers +- about Transformers' design principles - how to efficiently test large models - how to use Python utilities like [Black](https://black.readthedocs.io/en/stable/) and [Ruff](https://docs.astral.sh/ruff/) to create clean and readable code -It is challenging but also rewarding. +It is a challenging, but also rewarding process! This guide will walk you through adding an example "BrandNewBert" PyTorch model to Transformers. @@ -34,9 +34,9 @@ This guide will walk you through adding an example "BrandNewBert" PyTorch model Open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue to add a specific model. > [!TIP] -> To add any model, filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label for an open model request. +> Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub if you're open to adding any model. -Now is a good time to get familiar with BrandNewBert. It can be helpful to read a models research paper to understand its technical details and implementation. You don't necessarily have to worry too much about the more theoretical aspects of the paper. Instead, focus on the practical details. Use the questions below to guide your reading. +Now is a good time to get familiar with BrandNewBert. It is helpful to read a models research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading. - What type of model is BrandNewBert? Is it a encoder, decoder, or encoder-decoder model? - What tasks can BrandNewBert be used for? @@ -49,7 +49,7 @@ Now is a good time to get familiar with BrandNewBert. It can be helpful to read Transformers is an opinionated library with its own unique philosophy and design choices. These choices help us scale Transformers while maintaining a sustainable level of maintenance. > [!TIP] -> Learn more about our design principles in the [Philosophy](./philosophy) page. +> Learn more about our design principles on the [Philosophy](./philosophy) page. Some of these design choices are: @@ -59,15 +59,15 @@ Some of these design choices are: These design choices are important *for everyone* interacting with the model. It is easier to read, understand, and modify. -This section describes how the model and configuration classes interact and the Transformers code style you should adopt. +This section describes how the model and configuration classes interact and the Transformers code style. ### Model and configuration -All Transformers' models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the blueprint to the model. +All Transformers' models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the models blueprint. -To keep the code readable, there is never more than two levels of abstraction for any model. The example model here, BrandNewBert, traces its inheritance from `BrandNewBertPreTrainedModel` and [`PreTrainedModel`]. 
It is important that a new model only depends on [`PreTrainedModel`] because it allows a model to be loaded and saved with [`~PreTrainedModel.from_pretrained`] and [`PreTrainedModel.save_pretrained`]. +To keep the code readable, there is never more than two levels of abstraction for any model. The example model here, BrandNewBert, traces its inheritance from `BrandNewBertPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] because it allows a model to be loaded and saved with [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`]. Other important functions like the forward method are defined in the `modeling.py` file. @@ -76,19 +76,19 @@ Specific model heads (for example, for sequence classification or language model New models require a configuration, for example `BrandNewBertConfig`, that is stored as an attribute of [`PreTrainedModel`]. ```py -model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model = BrandNewBertModel.from_pretrained("username/brand_new_bert") model.config ``` -Like [`PreTrainedModel`], [`PretrainedConfig`] provides [`~PretrainedConfig.from_pretrained`] and [`PretrainedConfig.save_pretrained`] methods. +Like [`PreTrainedModel`], [`PretrainedConfig`] provides the [`~PretrainedConfig.from_pretrained`] and [`~PretrainedConfig.save_pretrained`] methods. -When you use [`~PreTrainedModel.save_pretrained`], it automatically calls the configurations [`~PretrainedConfig.save_pretrained`] method so that both the model and configuration are saved together. +When you use [`PreTrainedModel.save_pretrained`], it automatically calls [`~PretrainedConfig.save_pretrained`] so that both the model and configuration are saved together. A model is saved to a `model.safetensors` file and a configuration is saved to a `config.json` file. ### Code style -Transformers prefers a clean and readable code style over a more abstracted one. Some of the coed style choices include: +Transformers prefers a clean and readable code style over a more abstracted one. Some of the code style choices include: - The forward pass is written in the `modeling.py` file, completely independent of other models in the library. To reuse a block from another model, copy the code and paste it with a `# Copied from` comment above it. For example, the `RobertaSelfAttention` class is copied from the `BertSelfAttention` class. @@ -99,7 +99,7 @@ Transformers prefers a clean and readable code style over a more abstracted one. Refer to the [Check copies](./pr_checks#check-copies) section for more information about the `# Copied from` comment. -- The code should be accessible to users from a non-native English background. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One letter variables names are highly discouraged unless it's an infex in a for loop. +- The code should be accessible to users from a non-native English background. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One letter variables names are highly discouraged unless it's an index in a for loop. - Explicit code is preferred over shorter code even if it's longer. @@ -158,12 +158,12 @@ Return to your clone of Transformers to begin porting BrandNewBert. 
cd transformers ``` -There are two debugging environments for running the original model, a notebook ([Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) or [Jupyter](https://jupyter.org/)) or a local Python script. +There are two possible debugging environments for running the original model, a notebook ([Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) or [Jupyter](https://jupyter.org/)) or a local Python script. > [!WARNING] -> We don't recommend setting up a GPU environment to run the original model. This can be costly and only verified when the model is working in Transformers. Instead, work in a CPU environment at first. +> We don't recommend setting up a GPU environment to run the original model because it can be expensive. Instead, work in a CPU environment first to verify the model works in Transformers. Once it does, then you can verify it on a GPU. -Notebooks are great for executing code cell-by-cell which can better help split logical components from one another. It can also accelerate debugging cycles because intermediate results can be stored. Notebooks can also be shared which is useful for working with contributors. +Notebooks are great for executing code cell-by-cell which can help split logical components from one another. It can also accelerate debugging cycles because intermediate results can be stored. Notebooks can also be shared when working with other contributors. The downside of notebooks is that if you aren't used to them, it may take some time to get used to. @@ -195,7 +195,7 @@ git fetch upstream git rebase upstream/main ``` -Push any changes to your branch and click on **Compare & pull request** to open a pull request on GitHub. Open the pull request as a *draft* to signal it's a work in progress. +Push any changes to your branch and click on **Compare & pull request** to open a pull request on GitHub. Open the pull request as a *draft* to indicate it's a work in progress. ```bash git push -u origin a-descriptive-name-for-my-changes @@ -214,13 +214,13 @@ git merge upstream/main Before you start working on your model implementation, you should work on the original model implementation first to understand how it works. -This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone. +This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone! Orient yourself with the original repository by doing the following. - Locate the pretrained weights. - Figure out how to the load pretrained weights into the model. -- Figure out how to run the tokenizer indepdently of the model. +- Figure out how to run the tokenizer independently of the model. - Trace one forward pass to understand which classes and functions are required. These are probably the only classes and functions you'll have to implement. - Locate all the important components (model class, model subclasses, self-attention layer, etc.) of the model. - Figure out how to debug the model in the original repository. 
Add print statements, use interactive debuggers like [ipdb](https://github.com/gotcha/ipdb), or a efficient integrated development environment (IDE) like [PyCharm](https://www.jetbrains.com/pycharm/). @@ -242,9 +242,9 @@ If you run into issues, you'll need to choose one of the following debugging dec This strategy relies on breaking the original model into smaller sub-components, such as when the code can be easily run in eager mode. While more difficult, there are some advantages to this approach. -1. It is easier later to compare the original model to your implementation. You can automatically verify that each individual component matches its corresponding component in Transformers' implementation. This is better than relying on a visual comparison based on print statements. +1. It is easier later to compare the original model to your implementation. You can automatically verify that each individual component matches its corresponding component in the Transformers' implementation. This is better than relying on a visual comparison based on print statements. 2. It is easier to port individal components instead of the entire model. -3. It is easier for understanding how a model works by breaking it up into its components. +3. It is easier for understanding how a model works by breaking it up into smaller parts. 4. It is easier to prevent regressions at a later stage when you change your code thanks to component-by-component tests. > [!TIP] @@ -255,7 +255,7 @@ This strategy relies on breaking the original model into smaller sub-components, This strategy is viable when the original codebase is too complex, only allows intermediate components to be run in compiled mode, or if it's too time-consuming (maybe even impossible) to separate the model into smaller sub-components. -For example, the MeshTensorFlow implementation of [T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) is too complex and doesn't offer a simple way to decompose the model into its sub-components. In this situation, you'll have to relay on verifying print statements. +For example, the MeshTensorFlow implementation of [T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) is too complex and doesn't offer a simple way to decompose the model into its sub-components. In this situation, you'll have to rely on verifying print statements.
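In practice, relying on print statements means dumping the same intermediate tensor from both implementations and comparing the values numerically. The snippet below is only a minimal, self-contained sketch of that pattern, where the two random tensors stand in for an activation from the original model and the corresponding activation from your port.

```py
import torch

torch.manual_seed(0)

# stand-ins for the same intermediate activation from the original model and from your port
original_hidden = torch.randn(1, 4, 8)
ported_hidden = original_hidden + 1e-5 * torch.randn(1, 4, 8)

print("original:", original_hidden[0, 0, :3])
print("ported:  ", ported_hidden[0, 0, :3])
print("max abs diff:", (original_hidden - ported_hidden).abs().max().item())
print("match within 1e-3:", torch.allclose(original_hidden, ported_hidden, atol=1e-3))
```

Move the print statements successively deeper into both networks until you find the first point where the values diverge.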
@@ -309,7 +309,7 @@ The `transformers-cli add-new-model-like` command should have generated a model The automatically generated code in the `modeling.py` file will have the same architecture as BERT if you answered it's an encoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on. -At this point, your code doesn't have to be clean or even fully correct! It is more efficiently to quickly create a first draft and then iteratively improve on it. The only thing that matters is that your model should be able to be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the the `__init__` method works. +At this point, your code doesn't have to be clean or even fully correct! It is more efficiently to quickly create a first draft and then iteratively improve on it. The only thing that matters is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the the `__init__` method works. ```py from transformers import BrandNewBert, BrandNewBertConfig @@ -334,7 +334,7 @@ def _init_weights(self, module): module.weight.data.fill_(1.0) ``` -The initialization scheme can look different if you need to adapt it to your model. For example, [`Wave2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers. +The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers. The `_is_hf_initialized` flag makes sure the submodule is only initialized once. Setting `module.project_q` and `module.project_hid` to `True` ensures the custom initialization is not overriden later. The `_init_weights` function won't be applied to these modules. @@ -364,9 +364,9 @@ The original checkpoint must be converted to a Transformers compatible checkpoin Make sure **all** required weights are initialized and print out all the checkpoint weights that weren't used for initialization to make sure the model has been converted correctly. -You may encounter wrong shape statements of name assignments during the conversion. This is most likely because of incorrect parameters in BrandNewBertConfig, the wrong architecture, a bug in the `init` method of your implementation, or you need to transpose one of the checkpoint weights. +You may encounter wrong shape statements or name assignments during the conversion. This is most likely because of incorrect parameters in `BrandNewBertConfig`, the wrong architecture, a bug in the `init` method of your implementation, or you need to transpose one of the checkpoint weights. -Keep iterating with the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file. +Keep iterating on the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. 
Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file. ```py model.save_pretrained("/path/to/converted/checkpoint/folder") @@ -389,7 +389,7 @@ class SimpleModel(nn.Module): self.layer_norm = nn.LayerNorm(10) ``` -PyTorch layer names are defined by the class attribute name of the layer (dense, intermediate, layer_norm). Create a instance of SimpleModel to fill all the layers with random weights. +PyTorch layer names are defined by the class attribute name of the layer (`dense`, `intermediate`, `layer_norm`). Create a instance of `SimpleModel` to fill all the layers with random weights. ```py model = SimpleModel() @@ -448,7 +448,7 @@ assert ( logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") ``` -When the shape or name don't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because of an BrandNewBert's parameters don't exactly match the original model's parameters. But it could be that the PyTorch layer implementation requires the weights to be transposed first. +When the shape or name don't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because BrandNewBerts parameters don't exactly match the original models parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first. ### Implement the forward pass @@ -478,7 +478,7 @@ Any difference between the two implementations should point to the bug in your i One of the best strategies is to add many print statements to the same positions in both implementations, and then successively remove them when they output identical values for the intermediate outputs. -When both implementations produce the same output, verify the outputs are within the precision of *1e-3*. +When both implementations produce the same output, verify the outputs are within a precision of *1e-3*. ```py torch.allclose(original_output, output, atol=1e-3) @@ -498,7 +498,7 @@ While the model works, you still need to add tests to ensure it is compatible wi pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py ``` -The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, BrandNewBertModelIntegrationTests, was added by Cookiecutter and only needs to be filled out by you. To ensure it passes, run the following command. +The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewBertModelIntegrationTests`, was added by Cookiecutter and just needs to be filled out. To ensure it passes, run the following command. @@ -550,7 +550,7 @@ When both implementations have the same `input_ids`, add a tokenizer test file. Now that you have a model and tokenizer, add end-to-end integration tests using both the model and tokenizer to `tests/models/brand_new_bert/test_modeling_brand-new_bert.py`. -The test should provide a meaningful text-to-text example that the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair. 
+The test should provide a meaningful text-to-text example to show the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair. If the checkpoint hasn't been finetuned on a downstream task, then the model tests will suffice. diff --git a/docs/source/en/backbones.md b/docs/source/en/backbones.md index 29b14a8f9692..00500a7404d5 100644 --- a/docs/source/en/backbones.md +++ b/docs/source/en/backbones.md @@ -22,7 +22,7 @@ For some higher-level computer visions tasks such as object detection or image s -Load a backbone with the [`~AutoBackbone.from_pretrained`] method. +Load a backbone with [`~AutoBackbone.from_pretrained`]. ```py from transformers import AutoBackbone @@ -32,7 +32,7 @@ model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", o This guide describes the backbone class, backbones from the [timm](https://hf.co/docs/timm/index) library, and how to extract features with them. -## Base backbone classes +## Backbone classes There are two backbone classes for Transformers' models. @@ -41,11 +41,14 @@ There are two backbone classes for Transformers' models. Refer to the [Backbone](./main_classes/backbones) API documentation to check which models support a backbone. -## AutoBackbone +There are two ways to load a Transformers backbone, [`AutoBackbone`] and a model-specific backbone class. + + + The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~AutoBackbone.from_pretrained`] as a backbone if it's supported. -Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you known the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they're referring to the same layer. +Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you know the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they refer to the same layer. When you don't use `out_indices` or `out_features`, the backbone returns the feature map from the last layer. Specify `out_indices=(1,)` to get the feature map from the first layer. @@ -59,17 +62,15 @@ from transformers import AutoImageProcessor, AutoBackbone model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) ``` -## Model-specific backbones + + -When you know a model supports a backbone, you can load the backbone and neck directly into the model's configuration. Then pass the configuration to the model to initialize it for a task. +When you know a model supports a backbone, you can load the backbone and neck directly into the models configuration. Then pass the configuration to the model to initialize it for a task. For example, load a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head. Set the `backbone` parameter to the pretrained model to load the model configuration class. Toggle the `use_pretrained_backbone` parameter to determine whether you want to use pretrained or randomly initialized weights. 
- - - ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation @@ -77,19 +78,6 @@ config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbon model = MaskFormerForInstanceSegmentation(config) ``` - - - -```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) -model = MaskFormerForInstanceSegmentation(config) -``` - - - - Another option is to separately load the backbone configuration and then pass it to the `backbone_config` paramater in the model configuration. ```py @@ -107,31 +95,15 @@ model = MaskFormerForInstanceSegmentation(config) [timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes. -Set `use_timm_backnoe=True` to load pretrained timm weights. The `use_pretrained_backbone` parameter can be toggled to use pretrained or randomly initialized weights. - - - - -```py -from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation - -config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) -model = MaskFormerForInstanceSegmentation(config) -``` - - - +Set `use_timm_backbone=True` to load pretrained timm weights. The `use_pretrained_backbone` parameter can be toggled to use pretrained or randomly initialized weights. ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation -config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) +config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True, use_pretrained_backbone=True) model = MaskFormerForInstanceSegmentation(config) ``` - - - You could also explicitly call the [`TimmBackboneConfig`] class to load and create a pretrained timm backbone. ```py diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md index 9ecf9716f508..c81b99bdc06e 100644 --- a/docs/source/en/custom_models.md +++ b/docs/source/en/custom_models.md @@ -25,15 +25,15 @@ This guide will show you how to customize a ResNet model, enable [AutoClass](./m ## Configuration -A configuration, given by the base [`PretrainedConfig`] class, contains all the necessary information to build a model. This is where you'll configure the parameters of the custom ResNet model. Different configurations gives different ResNet model types. +A configuration, given by the base [`PretrainedConfig`] class, contains all the necessary information to build a model. This is where you'll configure the attributes of the custom ResNet model. Different attributes gives different ResNet model types. The three main rules for customizing a configuration are: -1. A custom configuration must inherit from [`PretrainedConfig`]. Inheritance ensures a custom model has all the functionality of a Transformers model such as [`PretrainedConfig.from_pretrained`], [`PretrainedConfig.save_pretrained`], and [`PretrainedConfig.push_to_hub`]. -2. The [`PretrainedConfig`] `__init__` must accept any `kwargs` and `kwargs` must be passed to the superclass `__init__`. [`PretrainedConfig`] has more more fields than the ones you're setting in your custom configuration. When you load a configuration with [`PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass. +1. 
A custom configuration must subclass [`PretrainedConfig`]. This ensures a custom model has all the functionality of a Transformers' model such as [`~PretrainedConfig.from_pretrained`], [`~PretrainedConfig.save_pretrained`], and [`~PretrainedConfig.push_to_hub`]. +2. The [`PretrainedConfig`] `__init__` must accept any `kwargs`, and they must be passed to the superclass `__init__`. [`PretrainedConfig`] has more fields than the ones you're setting in your custom configuration, so when you load a configuration with [`~PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass. > [!TIP] -> It is useful to check the validity of some of the parameters. In the example below, a check is implemented to ensure `block_type` and `stem_type` are one of the predefined values. +> It is useful to check the validity of some of the parameters. In the example below, a check is implemented to ensure `block_type` and `stem_type` belong to one of the predefined values. > > Add `model_type` to the configuration class to enable [AutoClass](./models#autoclass) support. @@ -74,7 +74,7 @@ class ResnetConfig(PretrainedConfig): super().__init__(**kwargs) ``` -Save the configuration to a JSON file with the [`PretrainedConfig.save_pretrained`] method. This file is stored in your custom model folder, `custom-resnet`. +Save the configuration to a JSON file with [`PretrainedConfig.save_pretrained`]. This file is stored in your custom model folder, `custom-resnet`. ```py resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) @@ -83,16 +83,18 @@ resnet50d_config.save_pretrained("custom-resnet") ## Model -With the custom ResNet configuration, you can now create and customize the model. The model inherits from the base [`PreTrainedModel`] class. Like [`PretrainedConfig`], inheriting from [`PreTrainedModel`] and initializing the superclass with the configuration extends Transformers functionalities such as saving and loading to the custom model. +With the custom ResNet configuration, you can now create and customize the model. The model subclasses the base [`PreTrainedModel`] class. Like [`PretrainedConfig`], inheriting from [`PreTrainedModel`] and initializing the superclass with the configuration extends Transformers' functionalities such as saving and loading to the custom model. -Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the models sublayers, instead of breaking the `config` object into multiple arguments that are passed individually to the sublayers. Writing models this way produces simpler code with a clear *source of truth* for any hyperparameters. It is also easier to reuse code from other Transformers' models. +Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the models sublayers, instead of breaking the `config` object into multiple arguments that are individually passed to the sublayers. -You'll create two ResNet models, a ResNet model that outputs the hidden states and a ResNet model with an image classification head. +Writing models this way produces simpler code with a clear *source of truth* for any hyperparameters. It is also easier to reuse code from other Transformers' models. + +You'll create two ResNet models, a barebones ResNet model that outputs the hidden states and a ResNet model with an image classification head. 
-Define a mapping between the block types and block classes. Everything else is created by passing the configuration class to the Resnet model class. +Define a mapping between the block types and classes. Everything else is created by passing the configuration class to the ResNet model class. > [!TIP] > Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support. @@ -129,7 +131,7 @@ class ResnetModel(PreTrainedModel): -The `forward` method needs to be rewrittten to calculate the loss for each logit if labels are available. Otherwise, the Resnet model class is the same. +The `forward` method needs to be rewrittten to calculate the loss for each logit if labels are available. Otherwise, the ResNet model class is the same. > [!TIP] > Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support. @@ -166,7 +168,7 @@ class ResnetModelForImageClassification(PreTrainedModel): -A model can return any output format. Returning a dictionary (like ResnetModelForImageClassification) with losses when labels are available, makes the custom model compatible with the [`Trainer`]. For other output formats, you'll need your own training loop or a different library for training. +A model can return any output format. Returning a dictionary (like `ResnetModelForImageClassification`) with losses when labels are available makes the custom model compatible with [`Trainer`]. For other output formats, you'll need your own training loop or a different library for training. Instantiate the custom model class with the configuration. @@ -176,7 +178,7 @@ resnet50d = ResnetModelForImageClassification(resnet50d_config) At this point, you can load pretrained weights into the model or train it from scratch. You'll load pretrained weights in this guide. -Load the pretrained weights from the [timm](https://hf.co/docs/timm/index) library, and then transfer those weights to the custom model with the [load_state_dict](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict) method. +Load the pretrained weights from the [timm](https://hf.co/docs/timm/index) library, and then transfer those weights to the custom model with [load_state_dict](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict). ```py import timm @@ -187,9 +189,9 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict()) ## AutoClass support -The [AutoClass](./models#autoclass) API is a shortcut for automatically loading the correct architecture for a given model. It may be convenient for your users to add this API to your custom model. +The [AutoClass](./models#model-classes) API is a shortcut for automatically loading the correct architecture for a given model. It may be convenient to enable this for users when loading your custom model. -Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. With the [`~AutoConfig.register`] method, add the custom configuration and model to the [AutoClass](./models#autoclass) API. +Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. With the [`~AutoConfig.register`] method, add the custom configuration and model to the [AutoClass](./models#model-classes) API. 
> [!TIP] > The first argument to [`AutoConfig.register`] must match the `model_type` attribute in the custom configuration class, and the first argument to [`AutoModel.register`] must match the `config_class` of the custom model class. @@ -202,21 +204,21 @@ AutoModel.register(ResnetConfig, ResnetModel) AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) ``` -Your custom model code is now compatible with the [AutoClass](./models#autoclass) API. Users can load the model with the `AutoModel` or [`AutoModelForImageClassification`] classes. +Your custom model code is now compatible with the [AutoClass](./models#autoclass) API. Users can load the model with the [AutoModel](./model_doc/auto#automodel) or [`AutoModelForImageClassification`] classes. -## Upload model +## Upload Upload a custom model to the [Hub](https://hf.co/models) to allow other users to easily load and use it. Ensure the model directory is structured correctly as shown below. The directory should contain: -- `modeling.py`: Contains the code for ResnetModel and ResnetModelForImageClassification. This file can rely on relative imports to other files as long as they're in the same directory. +- `modeling.py`: Contains the code for `ResnetModel` and `ResnetModelForImageClassification`. This file can rely on relative imports to other files as long as they're in the same directory. > [!WARNING] -> Replace all relative imports at the top of the `modeling.py` file to import from Transformers instead if you're copying a model file from Transformers. +> When copying a Transformers' model file, replace all relative imports at the top of the `modeling.py` file to import from Transformers instead. -- `configuration.py`: Contains the code for ResnetConfig. -- `__init__.py`: Can be empty. This file allows Python `resnet_model` to be used as a module. +- `configuration.py`: Contains the code for `ResnetConfig`. +- `__init__.py`: Can be empty, this file allows Python `resnet_model` to be used as a module. ```bash . @@ -233,7 +235,7 @@ from resnet_model.configuration_resnet import ResnetConfig from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification ``` -Copy the code from the model and configuration files. To make sure the AutoClass objects are saved when calling [`~PreTrainedModel.save_pretrained`], call the [`~PretrainedConfig.register_for_auto_class`] method. This modifies the configuration JSON file to include the AutoClass objects and mapping. +Copy the code from the model and configuration files. To make sure the AutoClass objects are saved with [`~PreTrainedModel.save_pretrained`], call the [`~PretrainedConfig.register_for_auto_class`] method. This modifies the configuration JSON file to include the AutoClass objects and mapping. For a model, pick the appropriate `AutoModelFor` class based on the task. @@ -292,4 +294,4 @@ resnet50d.push_to_hub("custom-resnet50d") The pretrained weights, configuration in JSON format, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now under a namespace and specified directory [here](https://hf.co/sgugger/custom-resnet50d). -Because a custom model doesn't use the same modeling code as Transformers' model, you need to add `trust_remode_code=True` in the [`~PreTrainedModel.from_pretrained`] method. Refer to the load [custom models](./models#custom-models) section for more information. 
+Because a custom model doesn't use the same modeling code as a Transformers model, you need to add `trust_remote_code=True` in the [`~PreTrainedModel.from_pretrained`] method to load it. Refer to the load [custom models](./models#custom-models) section for more information.
diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md
index 096f4f192b1b..fe3ff4f69e92 100644
--- a/docs/source/en/fast_tokenizers.md
+++ b/docs/source/en/fast_tokenizers.md
@@ -16,12 +16,14 @@ rendered properly in your Markdown viewer.

# Tokenizers

-Tokenizers convert text into an array of numbers known as tensors, which are the inputs to a model. There are several tokenizer types, but they all share the same purpose. Split text into smaller words or subwords (tokens) and convert them into numbers (input ids). A tokenizer also returns an attention mask to indicate which tokens should be attended to.
+Tokenizers convert text into an array of numbers known as tensors, which are the inputs to a text model. There are several tokenizer types, but they all share the same purpose. Split text into smaller words or subwords (tokens) according to some rules, and convert them into numbers (input ids). A tokenizer also returns an attention mask to indicate which tokens should be attended to.

> [!TIP]
-> Learn more about the most popular tokenization algorithms in the [Summary of the tokenizers](./tokenizer_summary).
+> Learn about the most popular tokenization algorithms on the [Summary of the tokenizers](./tokenizer_summary) page.

-To load a tokenizer, call the [`~PreTrainedTokenizer.from_pretrained`] method to load the tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) into the tokenizer class. Apply the tokenizer to a string of text to return the input ids and attention mask. Set the type of framework tensor to return with the `return_tensors` parameter.
+To load a tokenizer, call [`~PreTrainedTokenizer.from_pretrained`] to load the tokenizer and its configuration from the Hugging Face [Hub](https://hf.co). The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files. This method accepts a Hub model repository name or a local directory.
+
+Apply the tokenizer to a string of text to return the input ids and attention mask. Set the framework tensor type to return with the `return_tensors` parameter.

```py
from transformers import AutoTokenizer

@@ -34,26 +36,23 @@ tokenizer("We are very happy to show you the 🤗 Transformers library", return_
}
```

-Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as a pretrained models tokenizers vocabulary. This is especially important if you're using a custom tokenizer which has a different vocabulary than the one generated by a pretrained models tokenizer.
+Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as the pretrained model's tokenizer vocabulary. This is especially important if you're using a custom tokenizer with a different vocabulary than the one generated by a pretrained model's tokenizer.

This guide provides a brief overview of the tokenizer classes and how to preprocess text with it.

-## Base tokenizer classes
+## Tokenizer classes

All tokenizers inherit from a [`PreTrainedTokenizerBase`] class that provides common methods for all tokenizers like [`~PreTrainedTokenizerBase.from_pretrained`] and [`~PreTrainedTokenizerBase.batch_decode`]. 
From this base class, there are two main tokenizer classes. -- [`PreTrainedTokenizer`] is a Python implementation. -- [`PreTrainedTokenizerFast`] is a fast Rust-based implementation from the [Tokenizers](https://hf.co/docs/tokenizers/index) library. - -Each model tokenizer inherits from one of these two base tokenizer classes, for example [`LlamaTokenizer`] and [`LlamaTokenizerFast`]. - -The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files. +- [`PreTrainedTokenizer`] is a Python implementation, for example [`LlamaTokenizer`]. +- [`PreTrainedTokenizerFast`] is a fast Rust-based implementation from the [Tokenizers](https://hf.co/docs/tokenizers/index) library, for example [`LlamaTokenizerFast`]. -To use a pretrained tokenizer, you need to load all the vocabulary files and the tokenizer model with the [`~PreTrainedTokenizerBase.from_pretrained`] method. This method accepts a Hub model repository name or a local directory. For a custom tokenizer, you need to load it's vocabulary file. Both methods are shown in [AutoTokenizer](#autotokenizer) and [Model-specific tokenizer](#model-specific-tokenizer) sections. +There are two ways you can load a tokenizer, with [`AutoTokenizer`] or a model-specific tokenizer. -## AutoTokenizer + + -The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, an AutoTokenizer tries to load a fast tokenizer if it's available for a given model, otherwise, it loads the Python implementation. +The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, an [`AutoTokenizer`] tries to load a fast tokenizer if it's available for a given model, otherwise, it loads the Python implementation. Use the [`~PreTrainedTokenizer.from_pretrained`] method to load a tokenizer. @@ -76,12 +75,10 @@ from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("./model_directory/my_vocab_file.txt") ``` -## Model-specific tokenizer - -Each pretrained model is associated with a tokenizer and its specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class. Check a model's API documentation to check whether a fast tokenizer is supported for a model. + + - - +Each pretrained model is associated with a tokenizer and the specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class. Check a model's API documentation to check whether a fast tokenizer is supported for a model. ```py from transformers import GemmaTokenizer @@ -90,8 +87,7 @@ tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2-2b") tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt") ``` - - +To load a fast tokenizer, use the fast implementation class. ```py from transformers import GemmaTokenizerFast @@ -100,9 +96,6 @@ tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2-2b") tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt") ``` - - - Load your own tokenizer by passing its vocabulary file to the `vocab_file` parameter. 
```py @@ -111,13 +104,16 @@ from transformers import GemmaTokenizerFast tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt") ``` + + + ## Fast tokenizers [`PreTrainedTokenizerFast`] or *fast tokenizers* are Rust-based tokenizers from the [Tokenizers](https://hf.co/docs/tokenizers) library. It is significantly faster at batched tokenization and provides additional alignment methods compared to the Python-based tokenizers. -If you're using the [AutoTokenizer](#autotokenizer) API, it automatically loads a fast tokenizer if it's supported for a given model. Otherwise, you need to explicitly load the fast tokenizer. +If you're using [`AutoTokenizer`], it automatically loads a fast tokenizer if it's supported for a given model. Otherwise, you need to explicitly load the fast tokenizer. This section will show you how to train a fast tokenizer and reuse it in Transformers. @@ -177,7 +173,7 @@ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") -A Transformers model expects the input as a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the type of framework tensor to return with the `return_tensors` parameter. +A Transformers model expects the input as a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter. ```py from transformers import AutoTokenizer @@ -197,7 +193,7 @@ When passing a string of text to a tokenizer, there are actually two steps the t -In the first step, a string of text is split into tokens. How the text is split depends on the tokenization algorithm. Call the [`~PreTrainedTokenizer.tokenize`] method to tokenize the text. +In the first step, a string of text is split into tokens. How the text is split depends on the tokenization algorithm. Call [`~PreTrainedTokenizer.tokenize`] to tokenize the text. ```py tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library") @@ -210,7 +206,7 @@ Gemma uses a [SentencePiece](./tokenizer_summary#sentencepiece) tokenizer which -In the second step, the tokens are converted into ids with the [`~PreTrainedTokenizer.convert_tokens_to_ids`] method. +In the second step, the tokens are converted into ids with [`~PreTrainedTokenizer.convert_tokens_to_ids`]. ```py ids = tokenizer.convert_tokens_to_ids(tokens) @@ -221,7 +217,7 @@ print(ids) -Lastly, the model prediction typically generates numerical outputs which are converted back to text with the [`~PreTrainedTokenizer.decode`] method. +Lastly, the model prediction typically generates numerical outputs which are converted back to text with [`~PreTrainedTokenizer.decode`]. ```py decoded_string = tokenizer.decode(ids) @@ -236,7 +232,7 @@ print(decoded_string) Special tokens are used by the tokenizer to provide the model with some additional information about the text. -For example, if you compare the tokens obtained from passing text directly to the tokenizer and from the [`~PreTrainedTokenizer.convert_tokens_to_ids`] method, you'll notice some additional tokens are added. +For example, if you compare the tokens obtained from passing text directly to the tokenizer and from [`~PreTrainedTokenizer.convert_tokens_to_ids`], you'll notice some additional tokens are added. 
```py model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.") diff --git a/docs/source/en/feature_extractors.md b/docs/source/en/feature_extractors.md index fd0886346a04..09f817569b88 100644 --- a/docs/source/en/feature_extractors.md +++ b/docs/source/en/feature_extractors.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. Feature extractors preprocess audio data into the correct format for a given model. It takes the raw audio signal and converts it into a tensor that can be fed to a model. The tensor shape depends on the model, but the feature extractor will correctly preprocess the audio data for you given the model you're using. Feature extractors also include methods for padding, truncation, and resampling. -To load a feature extractor, call the [`~AutoFeatureExtractor.from_pretrained`] method to load the feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models) into the feature extractor class. +To load a feature extractor, call [`~AutoFeatureExtractor.from_pretrained`] to load the feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models). The feature extractor and preprocessor configuration is saved in a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json) file. This method loads a feature extractor from a Hub model repository name or local directory Pass the audio signal, typically stored in `array`, to the feature extractor and set the `sampling_rate` parameter to the pretrained audio models sampling rate. It is important the sampling rate of the audio data matches the sampling rate of the data a pretrained audio model was trained on. @@ -36,20 +36,23 @@ The feature extractor returns an input, `input_values`, that is ready for the mo This guide walks you through the feature extractor classes and how to preprocess audio data. -## Base feature extractor classes +## Feature extractor classes Transformers feature extractors inherit from the [`SequenceFeatureExtractor`] class, which subclasses [`FeatureExtractionMixin`]. - [`SequenceFeatureExtractor`] provides a method to [`~SequenceFeatureExtractor.pad`] sequences to a certain length to avoid uneven sequence lengths. -- [`FeatureExtractionMixin`] provides [`~FeatureExtractionMixin.from_pretrained`] and [`~FeatureExtractionMixin.save_pretrained`] to load and save a feature extractor. It loads a feature extractor from a Hub model repository name or local directory, and saves a feature extractors configuration to a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json). +- [`FeatureExtractionMixin`] provides [`~FeatureExtractionMixin.from_pretrained`] and [`~FeatureExtractionMixin.save_pretrained`] to load and save a feature extractor. -## AutoFeatureExtractor +There are two ways you can load a feature extractor, [`AutoFeatureExtractor`] and a model-specific feature extractor class. + + + The [AutoClass](./model_doc/auto) API automatically loads the correct feature extractor for a given model. -Use the [`~AutoFeatureExtractor.from_pretrained`] method to load a feature extractor. +Use [`~AutoFeatureExtractor.from_pretrained`] to load a feature extractor. 
```py from transformers import AutoFeatureExtractor @@ -57,7 +60,8 @@ from transformers import AutoFeatureExtractor feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") ``` -## Model-specific feature extractor + + Every pretrained audio model has a specific associated feature extractor for correctly processing audio data. When you load a feature extractor, it retrieves the feature extractors configuration (feature size, chunk length, etc.) from [`preprocessor_config.json`](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json). @@ -69,11 +73,16 @@ from transformers import WhisperFeatureExtractor feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny") ``` + + + ## Preprocess -A feature extractor expects the input as a PyTorch tensor of a certain shape. The exact input shape can vary depending on the specific audio model you're using. For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` which is a tensor of shape (batch_size, feature_size, sequence_length) but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` which is a tensor of shape (batch_size, sequence_length). +A feature extractor expects the input as a PyTorch tensor of a certain shape. The exact input shape can vary depending on the specific audio model you're using. + +For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` which is a tensor of shape (batch_size, feature_size, sequence_length) but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` which is a tensor of shape (batch_size, sequence_length). -The feature extractor takes care of this for whichever audio model you're using. +The feature extractor generates the correct input shape for whichever audio model you're using. A feature extractor also sets the sampling rate (the number of audio signal values taken per second) of the audio files. The sampling rate of your audio data must match the sampling rate of the dataset a pretrained model was trained on. This value is typically given in the model card. @@ -106,7 +115,7 @@ processed_dataset ### Padding -Audio sequence lengths are different which is an issue because Transformers expects all sequences to have the same lengths so they can be batched. Uneven sequence lengths can't be batched. +Audio sequence lengths that are different is an issue because Transformers expects all sequences to have the same lengths so they can be batched. Uneven sequence lengths can't be batched. ```py dataset[0]["audio"]["array"].shape @@ -185,13 +194,13 @@ dataset[0]["audio"] 'sampling_rate': 8000} ``` -Call the [`~datasets.Dataset.cast_column`] method on the `audio` column to upsample the sampling rate to 16kHz. +Call [`~datasets.Dataset.cast_column`] on the `audio` column to upsample the sampling rate to 16kHz. ```py dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ``` -When you load the dataset sample, it is resampled to 16kHz. +When you load the dataset sample, it is now resampled to 16kHz. ```py dataset[0]["audio"] diff --git a/docs/source/en/image_processors.md b/docs/source/en/image_processors.md index 008f857888d4..faa3919e660a 100644 --- a/docs/source/en/image_processors.md +++ b/docs/source/en/image_processors.md @@ -16,12 +16,12 @@ rendered properly in your Markdown viewer. 
# Image processors -An image processor converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images it was pretrained on. +An image processor converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on. - [`~BaseImageProcessor.center_crop`] to resize an image - [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values -Load an image processor with the [`~ImageProcessingMixin.from_pretrained`] method. This loads the image processors configuration (image size, whether to normalize and rescale, etc.) from a vision model on the Hugging Face [Hub](https://hf.co) into the image processor class. +Load an image processor with [`~ImageProcessingMixin.from_pretrained`]. This loads the image processors configuration (image size, whether to normalize and rescale, etc.) from a vision model on the Hugging Face [Hub](https://hf.co). The specific image processor configuration for each pretrained model is saved in a [preprocessor_config.json](https://huggingface.co/google/vit-base-patch16-224/blob/main/preprocessor_config.json) file. This method accepts a Hub model repository name or a local directory. ```py from transformers import AutoImageProcessor @@ -31,10 +31,6 @@ image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-22 Pass an image to the image processor to transform it into pixel values. Set `return_tensors="pt"` to return PyTorch tensors, and feel free to print out the inputs to see what the image looks like as a tensor. -
- -
- ```py from PIL import Image import requests @@ -46,7 +42,7 @@ inputs = image_processor(image, return_tensors="pt") This guide covers the image processor class and how to preprocess images for vision models. -## Base image processor classes +## Image processor classes @@ -57,15 +53,14 @@ Transformers image processors inherit from the [`BaseImageProcessor`] class whic Each image processor subclasses the [`ImageProcessingMixin`] class which provides the [`~ImageProcessingMixin.from_pretrained`] and [`~ImageProcessingMixin.save_pretrained`] methods for loading and saving image processors. -The specific image processor configuration for each pretrained model is saved in a [preprocessor_config.json](https://huggingface.co/google/vit-base-patch16-224/blob/main/preprocessor_config.json) file. - -To use an image processor, you need to load the specific image processor configuration associated with the vision model with [`~ImageProcessingMixin.from_pretrained`]. This method accepts a Hub model repository name or a local directory. +There are two ways you can load an image processor, [`AutoImageProcessor`] and a model-specific image processor. -## AutoImageProcessor + + The [AutoClass](./model_doc/auto) API provides a convenient method to load an image processor without directly specifying the model the image processor is associated with. -Use the [`~AutoImageProcessor.from_pretrained`] method to load an image processor. Set `use_fast=True` to load a fast image processor if it's supported for a model. +Use [`~AutoImageProcessor.from_pretrained`] to load an image processor. Set `use_fast=True` to load a fast image processor if it's supported for a model. ```py from transformers import AutoImageProcessor @@ -73,23 +68,20 @@ from transformers import AutoImageProcessor image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True) ``` -## Model-specific image processor + + Each image processor is associated with a specific pretrained vision model, and the image processor's configuration contains the model's expected size and whether to normalize and resize. The image processor can be loaded directly from the model-specific class. Check a model's API documentation to see whether it supports a fast image processor. - - - ```py from transformers import ViTImageProcessor image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") ``` - - +To load a fast image processor, use the fast implementation class. ```py from transformers import ViTImageProcessorFast @@ -144,7 +136,7 @@ def transforms(examples): return examples ``` -Apply the combined augmentation and preprocessing function to the entire dataset on the fly with the [`~datasets.Dataset.set_transform`] method. +Apply the combined augmentation and preprocessing function to the entire dataset on the fly with [`~datasets.Dataset.set_transform`]. ```py dataset.set_transform(transforms) @@ -171,13 +163,13 @@ plt.imshow(img.permute(1, 2, 0)) -For other vision tasks like object detection or segmentation, the image processor includes post-processing methods to convert a model's raw output into meaningful predictions like bounding boxes or segmentation maps. +For other vision tasks like object detection or segmentation, the image processor includes post-processing methods to convert a models raw output into meaningful predictions like bounding boxes or segmentation maps. 
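For example, an object detection model's raw logits and predicted boxes can be turned into thresholded, image-sized bounding boxes. The sketch below assumes the facebook/detr-resnet-50 checkpoint and a sample COCO image; the threshold value is an illustrative choice.

```py
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, DetrForObjectDetection

# load a sample image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# convert the raw outputs into boxes scaled back to the original image size
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```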
### Padding Some models, like [DETR](./model_doc/detr), applies [scale augmentation](https://paperswithcode.com/method/image-scale-augmentation) during training which can cause images in a batch to have different sizes. Images with different sizes can't be batched together. -To fix this, pad the images with the special padding token `0`. Use the [`~DetrImageProcessor.pad`] method to pad the images, and define a custom collate function to batch them together. +To fix this, pad the images with the special padding token `0`. Use the [pad](https://github.com/huggingface/transformers/blob/9578c2597e2d88b6f0b304b5a05864fd613ddcc1/src/transformers/models/detr/image_processing_detr.py#L1151) method to pad the images, and define a custom collate function to batch them together. ```py def collate_fn(batch): diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 22696effd403..66f59feaa3bb 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -17,26 +17,26 @@ rendered properly in your Markdown viewer. Transformers is a library of pretrained natural language processing, computer vision, audio, and multimodal models. -It supports the main machine learning frameworks ([PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), and [Flax](https://flax.readthedocs.io/en/latest/)), and provides APIs for inference and training to help you use pretrained models out-of-the-box or train new ones from scratch. +It supports [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), [Flax](https://flax.readthedocs.io/en/latest/) and provides inference and training APIs to get started with pretrained models right away. Join us on the [Hugging Face Hub](https://huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb), or [forum](https://discuss.huggingface.co/) today! ## Features -Transformers provides everything you need for training or inference with state-of-the-art pretrained models. Some of its main features include: +Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of its main features include: - [`Pipeline`]: A high-level API that supports optimized inference for many machine learning tasks like text generation, image segmentation, automatic speech recognition, document question answering, and more. -- [`Trainer`]: A feature-rich API that supports training and distributed training for PyTorch models. It includes many performant and efficient training features such as mixed precision, torch.compile, and FlashAttention. -- [`~GenerationMixin.generate`]: A generation API for large language models (LLMs) and vision language models (VLMs) that supports streaming and multiple decoding strategies for different use cases. +- [`Trainer`]: An extensive API that supports many features such as mixed precision, torch.compile, and FlashAttention for training and distributed training for PyTorch models. +- [`~GenerationMixin.generate`]: A generation API for large language models (LLMs) and vision language models (VLMs) that supports streaming and many decoding strategies. ## Design > [!TIP] -> Refer to our [Philosophy](./philosophy) for a more detailed explanation of Transformers' design principles. +> For a more detailed explanation of Transformers' design principles, learn more in our [Philosophy](./philosophy). Transformers is designed for developers and machine learning engineers and researchers alike. Its main design principles are: -1. 
Easy and fast to use: Every model is implemented from only three main classes (model, preprocessor, and configuration) and can be quickly used for inference or training with two APIs ([`Pipeline`] or [`Trainer`]). +1. Easy and fast to use: Every model is implemented from only three main classes (configuration, model, and preprocessor) and can be quickly used for inference or training with just [`Pipeline`] or [`Trainer`]. 2. Pretrained models: Reduce your carbon footprint, compute cost and time by using a pretrained model instead of training an entirely new one. Each pretrained model is reproduced as closely as possible to the original model and offers state-of-the-art performance.
diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 049d2edd3cea..a5df5a2c08c3 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -114,7 +114,7 @@ python -c "from transformers import pipeline; print(pipeline('sentiment-analysis ### Source install -Installing from source installs the *latest* version rather than the *stable* version of the library. This ensures you get the most up-to-date changes in the library, which is useful for experimenting with the latest features or fixing a bug that hasn't been officially released yet. +Installing from source installs the *latest* version rather than a *stable* version of the library. This ensures you get the most up-to-date changes in the library, which is useful for experimenting with the latest features or fixing a bug that hasn't been officially released in the stable version yet. The downside is that the latest version may not always be stable. If you encounter any problems, please open a [GitHub Issue](https://github.com/huggingface/transformers/issues) so we can fix it as soon as possible. @@ -142,7 +142,7 @@ pip install -e . ``` > [!WARNING] -> You must keep the `transformers` folder to keep using it locally. +> You must keep the local Transformers folder to keep using it. Update your local version of Transformers with the latest changes in the main repository with the following command. @@ -165,7 +165,9 @@ After installation, you can configure the Transformers cache location or setup t ### Cache directory -When you load a pretrained model with [`~PreTrainedModel.from_pretrained`], the model is downloaded from the Hub and locally cached. Every time you load a model, it checks whether the cached model is up-to-date. If it's the same, then the local model is loaded. If it's not the same, the updated model is downloaded and cached. This ensures you always have the latest version of a model. +When you load a pretrained model with [`~PreTrainedModel.from_pretrained`], the model is downloaded from the Hub and locally cached. + +Every time you load a model, it checks whether the cached model is up-to-date. If it's the same, then the local model is loaded. If it's not the same, the newer model is downloaded and cached. This ensures you always have the latest model version. The default directory given by the shell environment variable `TRANSFORMERS_CACHE` is `~/.cache/huggingface/hub`. @@ -192,14 +194,14 @@ from huggingface_hub import snapshot_download snapshot_download(repo_id="meta-llama/Llama-2-7b-hf", repo_type="model") ``` -Set the environment variable `HF_HUB_OFFLINE=1` to prevent HTTP calls to the Hub to download the model. +Set the environment variable `HF_HUB_OFFLINE=1` to prevent HTTP calls to the Hub when loading a model. ```bash HF_HUB_OFFLINE=1 \ python examples/pytorch/language-modeling/run_clm.py --model_name_or_path meta-llama/Llama-2-7b-hf --dataset_name wikitext ... ``` -Another option for only loading cached files is to set the `local_files_only` parameter to `True` in the [`~PreTrainedModel.from_pretrained`] method. +Another option for only loading cached files is to set the `local_files_only` parameter to `True` in [`~PreTrainedModel.from_pretrained`]. ```py from transformers import LlamaForCausalLM diff --git a/docs/source/en/model_doc/albert.md b/docs/source/en/model_doc/albert.md index d195203615de..86c378df436e 100644 --- a/docs/source/en/model_doc/albert.md +++ b/docs/source/en/model_doc/albert.md @@ -23,6 +23,9 @@ rendered properly in your Markdown viewer. 
Spaces +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index 30fd5a2e30c7..9a1fddf9a4f5 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -63,19 +63,19 @@ model = AutoModel.from_pretrained( ) ``` -Model repositories also support [gating](https://hf.co/docs/hub/models-gated) for more control over how and who can access a model. Gating is common for allowing a select group of users to preview a research model before it's made public. +Model repositories also support [gating](https://hf.co/docs/hub/models-gated) for more control over who can access a model. Gating is common for allowing a select group of users to preview a research model before it's made public.
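Once access to a gated repository is granted, it loads like any other model as long as you're authenticated, for example with `huggingface-cli login` or by passing a token. A minimal sketch with a placeholder repository id and token:

```py
from transformers import AutoModel

# assumes you've accepted the gated model's terms on the Hub and have been granted access;
# the repository id and token below are placeholders
model = AutoModel.from_pretrained("your-org/gated-model", token="hf_xxx")
```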
-The model repository also includes an inference [widget](https://hf.co/docs/hub/models-widgets) for users to directly interact with a model. +The model repository also includes an inference [widget](https://hf.co/docs/hub/models-widgets) for users to directly interact with a model on the Hub. Check out the Hub [Models](https://hf.co/docs/hub/models) documentation to learn more about. ## Model framework conversion -Reach a wider audience by converting a model to be compatible with all machine learning frameworks (PyTorch, TensorFlow, Flax). While users can still load a model if they're using a different framework, it is slower because Transformers converts the checkpoint on the fly. It is faster to convert the checkpoint beforehand. +Reach a wider audience by making a model available in PyTorch, TensorFlow, and Flax. While users can still load a model if they're using a different framework, it is slower because Transformers needs to convert the checkpoint on the fly. It is faster to convert the checkpoint beforehand. @@ -116,7 +116,7 @@ flax_model.save_pretrained("path/to/awesome-name-you-picked") -## Upload model +## Upload a model There are several ways to upload a model to the Hub depending on your workflow preference. You can push a model with the [`Trainer`], call the [`~PreTrainedModel.push_to_hub`] method directly on a model, or use the Hub's web interface. @@ -124,9 +124,9 @@ There are several ways to upload a model to the Hub depending on your workflow p ### Trainer -The [`Trainer`], Transformers' training API, allows pushing a model directly to the Hub after training. Set `push_to_hub=True` in the [`TrainingArguments`] class and pass it to the [`Trainer`]. Once training is complete, call [`~transformers.Trainer.push_to_hub`] to upload the model. +The [`Trainer`], Transformers' training API, can push a model directly to the Hub after training. Set `push_to_hub=True` in the [`TrainingArguments`] class and pass it to the [`Trainer`]. Once training is complete, call [`~transformers.Trainer.push_to_hub`] to upload the model. -The [`~transformers.Trainer.push_to_hub`] method automatically adds useful information like the training hyperparameters and results to the model card. +The [`~transformers.Trainer.push_to_hub`] method automatically adds useful information like training hyperparameters and results to the model card. ```py from transformers import TrainingArguments, Trainer @@ -155,9 +155,11 @@ push_to_hub_callback = PushToHubCallback( model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) ``` -### PreTrainedModel.push_to_hub +### PushToHubMixin -Call [`~PreTrainedModel.push_to_hub`] directly on a model to upload it to the Hub. It creates a repository under your namespace with the model name specified in [`~PreTrainedModel.push_to_hub`]. +The [`PushToHubMixin`] provides the functionality for pushing a model or tokenizer to the Hub. + +Call [`~PushToHubMixin.push_to_hub`] directly on a model to upload it to the Hub. It creates a repository under your namespace with the model name specified in [`~PushToHubMixin.push_to_hub`]. ```py model.push_to_hub("my-awesome-model") @@ -183,7 +185,7 @@ Create a new repository by selecting [**New Model**](https://huggingface.co/new) -Add some details about your model: +Add some information about your model: - Select the **owner** of the repository. This can be yourself or any of the organizations you belong to. - Pick a name for your model, which will also be the repository name. 
@@ -209,4 +211,4 @@ A model card is a `README.md` file in your repository. Add this file by: Take a look at the Llama 3.1 [model card](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) for an example of the type of information to include on a model card. -Learn more about other model card metadata (carbon emissions, license, link to paper, etc.) to include in the [Model Cards](https://hf.co/docs/hub/model-cards#model-cards) guide. +Learn more about other model card metadata (carbon emissions, license, link to paper, etc.) available in the [Model Cards](https://hf.co/docs/hub/model-cards#model-cards) guide. diff --git a/docs/source/en/models.md b/docs/source/en/models.md index 7513679550e0..24fecc318836 100644 --- a/docs/source/en/models.md +++ b/docs/source/en/models.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. Transformers provides many pretrained models that are ready to use with just a single line of code. It requires a model class and the [`~PreTrainedModel.from_pretrained`] method. -To load a model, call the [`~PreTrainedModel.from_pretrained`] method to download and load the model weights and configuration stored on the Hugging Face [Hub](https://hf.co/models) into the model class. +To load a model, call [`~PreTrainedModel.from_pretrained`] to download and load the model weights and configuration stored on the Hugging Face [Hub](https://hf.co/models). > [!TIP] > The [`~PreTrainedModel.from_pretrained`] method loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load. @@ -29,7 +29,7 @@ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") ``` -This guide will briefly explain how models are loaded, the different ways you can load a model, and how to overcome memory issues for really big models. +This guide will briefly explain how models are loaded, the different ways you can load a model, how to overcome memory issues for really big models, and how to load custom models. ## Models and configurations @@ -40,12 +40,10 @@ All models have a `configuration.py` file with specific attributes like the numb > [!TIP] > An *architecture* refers to the model's skeleton and a *checkpoint* refers to the model's weights for a given architecture. For example, [BERT](./model_doc/bert) is an architecture while [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) is a checkpoint. You'll see the term *model* used interchangeably for architecture and checkpoint. -To get a pretrained model, you need to load the weights into the model. This is done by calling the [`~PreTrainedModel.from_pretrained`] method which accepts weights from the Hugging Face Hub or a local directory. - There are two general types of models you can load: -1. A generic model class like [`LlamaModel`] or [`AutoModel`] that outputs hidden states. -2. A model class with a specific *head* attached to the generic model, like [`LlamaForCausalLM`] or [`AutoModelForCausalLM`], for performing specific tasks. +1. A barebones model like [`AutoModel`] or [`LlamaModel`] that outputs hidden states. +2. A model with a specific *head* attached to the model, like [`AutoModelForCausalLM`] or [`LlamaForCausalLM`], for performing specific tasks. 
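The difference between these two types is easiest to see in their outputs. The sketch below is a minimal comparison; it assumes the Llama 2 checkpoint used in the surrounding examples, but any causal language model checkpoint behaves the same way.

```py
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

checkpoint = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer("Hugging Face is a", return_tensors="pt")

# barebones model: returns the raw hidden states
base_model = AutoModel.from_pretrained(checkpoint)
print(base_model(**inputs).last_hidden_state.shape)

# model with a causal language modeling head: returns logits over the vocabulary
lm_model = AutoModelForCausalLM.from_pretrained(checkpoint)
print(lm_model(**inputs).logits.shape)
```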
For each model type, there is a separate class for each machine learning framework (PyTorch, TensorFlow, Flax). Pick the corresponding prefix for the framework you're using. @@ -85,11 +83,18 @@ model = FlaxMistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
-## AutoClass +## Model classes + +To get a pretrained model, you need to load the weights into the model. This is done by calling [`~PreTrainedModel.from_pretrained`] which accepts weights from the Hugging Face Hub or a local directory. + +There are two model classes, the [AutoModel](./model_doc/auto) class and a model-specific class. + + + -The [AutoClass](./model_doc/auto) API is a convenient way to load an architecture without needing to know the exact model class name, because there are many architectures. It automatically selects the correct model class based on the configuration file. You only need to know the task and checkpoint you want to use. +The [AutoModel](./model_doc/auto) class is a convenient way to load an architecture without needing to know the exact model class name because there are many architectures. It automatically selects the correct model class based on the configuration file. You only need to know the task and checkpoint you want to use. The AutoClass makes it easy to switch between models or tasks, as long as the architecture is supported for a given task. @@ -104,7 +109,7 @@ model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7 model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-2-7b-hf") ``` -In other cases, you want to quickly try out several models for a task. +In other cases, you may want to quickly try out several models for a task. ```py from transformers import AutoModelForCausalLM @@ -115,11 +120,12 @@ model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") model = AutoModelForCausalLM.from_pretrained("google/gemma-7b") ``` -## Model-specific class + + -The [AutoClass](#autoclass) builds on top of model-specific classes. All model classes that support a specific task are mapped to their respective `AutoModelFor` task class. +The AutoModel class builds on top of model-specific classes. All model classes that support a specific task are mapped to their respective `AutoModelFor` task class. -But if you already know which model class you want to use, then you could use its model-specific class directly. +If you already know which model class you want to use, then you could use its model-specific class directly. ```py from transformers import LlamaModel, LlamaForCausalLM @@ -127,6 +133,9 @@ from transformers import LlamaModel, LlamaForCausalLM model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") ``` + + + ## Big models Large pretrained models require a lot of memory to load. The loading process involves: @@ -135,7 +144,7 @@ Large pretrained models require a lot of memory to load. The loading process inv 2. loading the pretrained weights 3. placing the pretrained weights on the model -You need enough memory to hold two copies of the model weights (random and pretrained) which may not be possible depending on your hardware. In distributed training environments, this is an even bigger challenge because each process loads a pretrained model. +You need enough memory to hold two copies of the model weights (random and pretrained) which may not be possible depending on your hardware. In distributed training environments, this is even more challenging because each process loads a pretrained model. Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, leveraging Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types. 
@@ -147,13 +156,13 @@ Transformers boosts loading speed and avoids random weight initialization with t ### Sharded checkpoints -For big models with sharded checkpoints, each shard is loaded sequentially after the previous shard is loaded. This limits memory-usage to only the model size and the largest shard size. - Transformers' [`~PreTrainedModel.save_pretrained`] method automatically shards checkpoints larger than 10GB. +Each shard is loaded sequentially after the previous shard is loaded, limiting memory usage to only the model size and the largest shard size. + The `max_shard_size` parameter defaults to 5GB for each shard because it is easier to run on free-tier GPU instances without running out of memory. -For example, let's shard [BioMistral/BioMistral-7B](https://hf.co/BioMistral/BioMistral-7B). +For example, create some shards checkpoints for [BioMistral/BioMistral-7B](https://hf.co/BioMistral/BioMistral-7B) in [`~PreTrainedModel.save_pretrained`]. ```py from transformers import AutoModel @@ -174,7 +183,7 @@ with tempfile.TemporaryDirectory() as tmp_dir: new_model = AutoModel.from_pretrained(tmp_dir) ``` -Sharded checkpoints can also be directly loaded with the [`~transformers.modeling_utils.load_sharded_checkpoint`] method. +Sharded checkpoints can also be directly loaded with [`~transformers.modeling_utils.load_sharded_checkpoint`]. ```py from transformers.modeling_utils import load_sharded_checkpoint @@ -225,18 +234,18 @@ index["weight_map"] -The [`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling). +The [`~PreTrainedModel.from_pretrained`] method is supercharged by Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature. Big Model Inference creates a *model skeleton* on PyTorch's [meta](https://pytorch.org/docs/main/meta.html) device. The meta device doesn't store any real data, only the metadata. -Randomly initialized weights are only created when the pretrained weights are loaded to avoid maintaining two copies of the model in memory at the same time. The maximum memory-usage is only the size of the model. +Randomly initialized weights are only created when the pretrained weights are loaded to avoid maintaining two copies of the model in memory at the same time. The maximum memory usage is only the size of the model. > [!TIP] -> Learn more about device placement in [Designing a device map](https://hf.co/docs/accelerate/v0.33.0/en/concept_guides/big_model_inference#designing-a-device-map) section. +> Learn more about device placement in [Designing a device map](https://hf.co/docs/accelerate/v0.33.0/en/concept_guides/big_model_inference#designing-a-device-map). Big Model Inference's second feature relates to how weights are loaded and dispatched in the model skeleton. Model weights are dispatched across all available devices, starting with the fastest device (usually the GPU) and then offloading any remaining weights to slower devices (CPU and hard drive). -Both features combine reduced memory-usage and faster loading times for big pretrained models. +Both features combined reduces memory usage and loading times for big pretrained models. Set the [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) parameter to `"auto"` to enable Big Model Inference. 
This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`.
@@ -246,7 +255,9 @@ from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto")
```

-To manually assign layers to devices, create a `device_map`. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device. Access the `hf_device_map` attribute to see how the model is distributed across devices.
+To manually assign layers to devices, create a `device_map`. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device.
+
+Access the `hf_device_map` attribute to see how the model is distributed across devices.

```py
device_map = {"model.layers.1": 0, "model.layers.14": 1, "model.layers.31": "cpu", "lm_head": "disk"}
@@ -257,7 +268,7 @@ model.hf_device_map

PyTorch model weights are initialized as torch.float32. To load a model in a different data type, like torch.float16, it requires additional memory to load the model again in the desired data type.

-Explicitly set the [torch_dtype]() parameter to directly initialize the model in the desired data type instead of essentially loading a model twice (torch.float32, torch.float16). You could also set `torch_dtype="auto"` to automatically load the weights with the most optimal memory pattern (the data type is derived from the model weights).
+Explicitly set the [torch_dtype](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype) parameter to directly initialize the model in the desired data type instead of loading the weights twice (torch.float32, torch.float16). You could also set `torch_dtype="auto"` to automatically load the weights with the most optimal memory pattern (the data type is derived from the model weights).

@@ -294,9 +305,9 @@ model = AutoModel.from_config(my_config)

Custom models use Transformers' configuration and modeling classes, supports the [AutoClass](#autoclass) API, and are loaded with [`~PreTrainedModel.from_pretrained`]. What makes custom models different is the modeling code is not from Transformers.

-The Hub includes [malware scanning](https://hf.co/docs/hub/security-malware#malware-scanning) for every repository, but extra care should still be taken when loading a custom model to avoid inadvertently executing malicious code.
+Extra care should be taken when loading a custom model. While the Hub includes [malware scanning](https://hf.co/docs/hub/security-malware#malware-scanning) for every repository, you should still be careful to avoid inadvertently executing malicious code.

-Set the `trust_remote_code` parameter to `True` in [`~PreTrainedModel.from_pretrained`] to load a custom model.
+Set `trust_remote_code=True` in [`~PreTrainedModel.from_pretrained`] to load a custom model.

```py
from transformers import AutoModelForImageClassification

@@ -304,7 +315,7 @@ model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
```

-As an extra layer of security, load a custom model from a specific revision to make sure the model code hasn't changed.
The commit hash can be copied from the model's [commit history](https://hf.co/sgugger/custom-resnet50d/commits/main). +For an extra layer of security, load a custom model from a specific revision to make sure the model code hasn't changed. The commit hash can be copied from the model's [commit history](https://hf.co/sgugger/custom-resnet50d/commits/main). ```py commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" @@ -313,4 +324,4 @@ model = AutoModelForImageClassification.from_pretrained( ) ``` -Learn more about how to create a custom model in [Customize](./custom_models). \ No newline at end of file +Learn more about how to create a custom model in [Customize](./custom_models). diff --git a/docs/source/en/processors.md b/docs/source/en/processors.md index 4bb12b140aa4..48ce5ec5faf0 100644 --- a/docs/source/en/processors.md +++ b/docs/source/en/processors.md @@ -20,7 +20,7 @@ Multimodal models require a preprocessor capable of handling inputs that combine For example, [PaliGemma](./model_doc/paligemma) is a vision-language model that uses the [SigLIP](./model_doc/siglip) image processor and the [Llama](./model_doc/llama) tokenizer. A [`ProcessorMixin`] class wraps both of these preprocessor types, providing a single and unified processor class for a multimodal model. -To load a processor, call the [`~ProcessorMixin.from_pretrained`] method. Pass the input type to the processor to generate the expected model inputs, the input ids and pixel values. +To load a processor, call [`~ProcessorMixin.from_pretrained`]. Pass the input type to the processor to generate the expected model inputs, the input ids and pixel values. ```py from transformers import AutoProcessor, PaliGemmaForConditionalGeneration @@ -39,15 +39,18 @@ inputs This guide describes the processor class and how to preprocess multimodal inputs. -## Base processor class +## Processor classes All processors inherit from the [`ProcessorMixin`] class which provides methods like [`~ProcessorMixin.from_pretrained`], [`~ProcessorMixin.save_pretrained`], and [`~ProcessorMixin.push_to_hub`] for loading, saving, and sharing processors to the Hub repsectively. -## AutoProcessor +There are two ways to load a processor, with an [`AutoProcessor`] and with a model-specific processor class. + + + The [AutoClass](./model_doc/auto) API provides a simple interface to load processors without directly specifying the specific model class it belongs to. -Use the [`~AutoProcessor.from_pretrained`] method to load a processor. +Use [`~AutoProcessor.from_pretrained`] to load a processor. ```py from transformers import AutoProcessor @@ -55,9 +58,10 @@ from transformers import AutoProcessor processor = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224") ``` -## Model-specific processor + + -Processors are also associated with a specific pretrained multimodal model class. You can load a processor directly from the model class with the [`~ProcessorMixin.from_pretrained`] method. +Processors are also associated with a specific pretrained multimodal model class. You can load a processor directly from the model class with [`~ProcessorMixin.from_pretrained`]. ```py from transformers import WhisperProcessor @@ -75,6 +79,9 @@ feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer) ``` + + + ## Preprocess Processors preprocess multimodal inputs into the expected Transformers format. 
There are a couple combinations of input modalities that a processor can handle such as text and audio or text and image. @@ -84,22 +91,22 @@ Automatic speech recognition (ASR) tasks require a processor that can handle tex ```py from datasets import load_dataset -lj_speech = load_dataset("lj_speech", split="train") -lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -lj_speech[0]["audio"] +dataset = load_dataset("lj_speech", split="train") +dataset = dataset.map(remove_columns=["file", "id", "normalized_text"]) +dataset[0]["audio"] {'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', 'sampling_rate': 22050} -lj_speech[0]["text"] +dataset[0]["text"] 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' ``` Remember to resample the sampling rate to match the model's requirements. ```py -lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16000)) +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ``` Load a processor and pass the audio `array` to the `audio` parameter and pass the `text` column to the `text` parameter. @@ -118,5 +125,5 @@ def prepare_dataset(example): Apply the `prepare_dataset` function to the dataset to preprocess it. The processor returns the `input_features` for the `audio` column and `labels` for the text column. ```py -prepare_dataset(lj_speech[0]) +prepare_dataset(dataset[0]) ``` diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index c567f0647007..8c603433e0b8 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -18,21 +18,19 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Get up and running with Transformers! Whether you're a developer or a machine learning engineer, this quickstart will show you Transformers' key features. +Get up and running with Transformers, a library of pretrained models! -Transformers is a library of pretrained models, providing three classes to instantiate any model and two APIs for inference or training. By limiting the number of user-facing abstractions, Transformers is easier to learn and faster to use. +There are only three classes to instantiate any model and two APIs for inference or training. By limiting the number of user-facing abstractions, Transformers is easier to learn and faster to use. -Transformers supports popular machine learning frameworks like [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), and [Flax](https://flax.readthedocs.io/en/latest/). Switching between frameworks is easy, granting the flexibility to use the best tool for the job (training, evaluation, or production). +Whether you're a developer or a machine learning engineer, this quickstart introduces you to Transformers' key features and shows you how easy it is to: -In this quickstart, you'll learn how to: - -- load a pretrained model for inference or training +- load a pretrained model - run inference with the [`Pipeline`] API - train a model with the [`Trainer`] API ## Setup -To start, we recommend creating a Hugging Face [account](https://hf.co/join). 
This allows you to host and access version controlled models, datasets, and apps on the [Hugging Face Hub](https://hf.co/docs/hub/index), a collaborative platform for discovery and building. +To start, we recommend creating a Hugging Face [account](https://hf.co/join). This allows you to host and access version controlled models, datasets, and [Spaces](https://hf.co/spaces) on the [Hugging Face Hub](https://hf.co/docs/hub/index), a collaborative platform for discovery and building. Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and login to your account. @@ -67,24 +65,24 @@ Install an up-to-date version of Transformers and some additional libraries from !pip install -U transformers datasets evaluate accelerate timm ``` -## Base classes +## Pretrained models Each pretrained model inherits from three base classes. | **Class** | **Description** | |---|---| | [`PretrainedConfig`] | A file that specifies a models attributes such as the number of attention heads or vocabulary size. | -| [`PreTrainedModel`] | A model (or architecture) defined by the attributes from the configuration file. For training and inference with a task, you need a model with a specific head attached to convert the raw hidden states into task-specific outputs. For example, [`PreTrainedModel`] outputs the raw hidden states but [`AutoModelForCausalLM`] adds a causal language model head on top to output the generated text. | +| [`PreTrainedModel`] | A model (or architecture) defined by the model attributes from the configuration file. A pretrained model only returns the raw hidden states. For a specific task, use the appropriate model head to convert the raw hidden states into a meaningful result (e.g., [`LlamaModel`] versus [`LlamaForCausalLM`]). | | Preprocessor | A class for converting raw inputs (text, images, audio, multimodal) into numerical inputs to the model. For example, [`PreTrainedTokenizer`] converts text into tensors and [`ImageProcessingMixin`] converts pixels into tensors. | -Unless you're building a custom model, you'll primarily interact with the [AutoClass](./model_doc/auto) API like [`AutoConfig`], [`AutoModelForCausalLM`], and [`AutoTokenizer`]. An `AutoClass` automatically infers the appropriate architecture for each task and machine learning framework based on the name or path to the pretrained weights and configuration file. +We recommend using the [AutoClass](./model_doc/auto) API to load models and preprocessors because it automatically infers the appropriate architecture for each task and machine learning framework based on the name or path to the pretrained weights and configuration file. -Use the [`~PreTrainedModel.from_pretrained`] method to load a pretrained models weights and configuration file from the Hub into the model and preprocessor class. +Use the [`~PreTrainedModel.from_pretrained`] method to load the weights and configuration file from the Hub into the model and preprocessor class. -When you load a model, especially a large language model (LLM), setting `device_map="auto"` automatically allocates the model weights on your device(s) beginning with the GPU. +When you load a model, especially a large language model (LLM), setting `device_map="auto"` automatically allocates the model weights to your fastest device(s) first which is typically the GPU. 
```py from transformers import AutoModelForCausalLM, AutoTokenizer @@ -93,7 +91,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") ``` -Tokenize the text and convert it into tensors with the tokenizer. To accelerate inference, move the model to a GPU if it's available. +Tokenize the text and return PyTorch tensors with the tokenizer. To accelerate inference, move the model to a GPU if it's available. ```py model_inputs = tokenizer(["Hugging Face is a"], return_tensors="pt").to("cuda") @@ -119,7 +117,7 @@ model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2-xl") tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl") ``` -Tokenize the text and convert it into tensors with the tokenizer. To accelerate inference, move the model to a GPU if it's available. +Tokenize the text and return TensorFlow tensors with the tokenizer. ```py model_inputs = tokenizer(["Hugging Face is a"], return_tensors="tf") @@ -138,9 +136,9 @@ tokenizer.batch_decode(generated_ids)[0] -For training, skip ahead to the [Trainer API](#trainer-api) section to learn how. +For training, skip ahead to the [Trainer API](#trainer-api) section. -## Pipeline API +## Pipeline The [`Pipeline`] is the most convenient way to inference with a pretrained model. It supports many tasks such as text generation, image segmentation, automatic speech recognition, document question answering, and more. @@ -152,7 +150,7 @@ Create a [`Pipeline`] object and select a task. By default, the [`Pipeline`] dow -Set `device="cuda"` to accelerate inference with a GPU. +Set `device="cuda"`, if it's available, to accelerate inference with a GPU. ```py from transformers import pipeline @@ -213,9 +211,9 @@ pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") -## Trainer API +## Trainer -The [`Trainer`] is an optimized training and evaluation loop for PyTorch models, a [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). It abstracts away a lot of the standard boilerplate involved in manually writing a training loop, allowing you to start training faster and focus on training design choices. You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset. +The [`Trainer`] is an optimized training and evaluation loop for PyTorch models. It abstracts away a lot of the standard boilerplate usually involved in manually writing a training loop. You can start training faster and focus on training design choices. You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset. Customize the training process with the [`TrainingArguments`] class. It provides many options for training, evaluation, and more. The training process can be as complex or simple as you want or need. Experiment with training hyperparameters and features like batch size, learning rate, mixed precision, torch.compile, and more. Or if you prefer, just use the default settings to quickly produce a baseline. @@ -227,10 +225,10 @@ from datasets import load_dataset model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") -dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT +dataset = load_dataset("rotten_tomatoes") ``` -Create a function to tokenize the text and convert it into tensors. 
Apply this function to the whole dataset with the [`datasets.Dataset.map`] method. +Create a function to tokenize the text and convert it into PyTorch tensors. Apply this function to the whole dataset with the [`~datasets.Dataset.map`] method. ```py def tokenize_dataset(dataset): @@ -238,7 +236,7 @@ def tokenize_dataset(dataset): dataset = dataset.map(tokenize_dataset, batched=True) ``` -Load a data collator to create batches of data. +Load a data collator to create batches of data, and pass the tokenizer to it. ```py from transformers import DataCollatorWithPadding @@ -246,7 +244,7 @@ from transformers import DataCollatorWithPadding data_collator = DataCollatorWithPadding(tokenizer=tokenizer) ``` -Next, create an instance of [`TrainingArguments`] to customize the training process. +Next, create an instance of [`TrainingArguments`] with the training features and hyperparameters you want. ```py from transformers import TrainingArguments @@ -273,12 +271,12 @@ trainer = Trainer( eval_dataset=dataset["test"], tokenizer=tokenizer, data_collator=data_collator, -) # doctest: +SKIP +) trainer.train() ``` -Push your model to the Hub to share it with the community. +Use the [`~Trainer.push_to_hub`] method to share your model and tokenizer to the Hub. ```py trainer.push_to_hub() @@ -291,7 +289,7 @@ Congratulations, you just trained your first model with Transformers! > [!WARNING] > Not all pretrained models are available in TensorFlow. Check which ones are implemented in [Supported models and frameworks](./index#supported-models-and-frameworks). -[`Trainer`] doesn't work with TensorFlow models, but you can still train one with [Keras](https://keras.io/). Transformers implements TensorFlow models as a standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), which is compatible with Keras' [compile](https://keras.io/api/models/model_training_apis/#compile-method) and [fit](https://keras.io/api/models/model_training_apis/#fit-method) methods. +[`Trainer`] doesn't work with TensorFlow models, but you can still train a Transformers model implemented in TensorFlow with [Keras](https://keras.io/). Transformers TensorFlow models are a standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), which is compatible with Keras' [compile](https://keras.io/api/models/model_training_apis/#compile-method) and [fit](https://keras.io/api/models/model_training_apis/#fit-method) methods. Load a model, tokenizer, and dataset for training. @@ -302,12 +300,12 @@ model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilb tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") ``` -Create a function to tokenize the text and convert it into tensors. Apply this function to the whole dataset with the [`~datasets.Dataset.map`] method. +Create a function to tokenize the text and convert it into TensorFlow tensors. Apply this function to the whole dataset with the [`~datasets.Dataset.map`] method. ```py def tokenize_dataset(dataset): - return tokenizer(dataset["text"]) # doctest: +SKIP -dataset = dataset.map(tokenize_dataset) # doctest: +SKIP + return tokenizer(dataset["text"]) +dataset = dataset.map(tokenize_dataset) ``` Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to collate and batch a dataset. 
@@ -315,7 +313,7 @@ Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to co ```py tf_dataset = model.prepare_tf_dataset( dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer -) # doctest: +SKIP +) ``` Finally, call [compile](https://keras.io/api/models/model_training_apis/#compile-method) to configure the model for training and [fit](https://keras.io/api/models/model_training_apis/#fit-method) to start. @@ -324,17 +322,17 @@ Finally, call [compile](https://keras.io/api/models/model_training_apis/#compile from tensorflow.keras.optimizers import Adam model.compile(optimizer="adam") -model.fit(tf_dataset) # doctest: +SKIP +model.fit(tf_dataset) ``` ## Next steps -Great work on completing the quickstart! You have only scratched the surface of what you can achieve with Transformers. +Great work on completing the quickstart! -Now that you have a better understanding of the library, it is time to keep exploring and learning what interests you the most. +Now that you have a better understanding of the library and what it offers, it's time to keep exploring and learning what interests you the most. -- Base classes: Learn more about the base classes and the model and processor classes that inherit from it. This will help you understand how to write your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. -- Inference: Dive deeper into inference with the [`Pipeline`] API, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. -- Training: Explore the [`Trainer`] API in more detail, distributed training, and optimizing training on your hardware. -- Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights in fewer bits. +- Base classes: Learn more about the base classes, and the configuration, model and processor classes that inherit from it. This will help you understand how to create your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. +- Inference: Explore the [`Pipeline`] API further, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. +- Training: Study the [`Trainer`] API in more detail, as well as distributed training and optimizing training on specific hardware. +- Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights with fewer bits. - Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! 
From 696465c44039d82c1ff72963c86bb63613048092 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 26 Aug 2024 11:46:36 -0700 Subject: [PATCH 033/116] make fixup --- .../falcon_mamba/modeling_falcon_mamba.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 13eca8aa91cf..61753f5ce243 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -372,11 +372,11 @@ def forward( hidden_states, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, ): if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling(): - return self.cuda_kernels_forward(hidden_states, cache_params, cache_position) - return self.slow_forward(hidden_states, cache_params, cache_position) - + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask) # Copied from transformers.models.mamba.modeling_mamba.MambaRMSNorm with Mamba->FalconMamba @@ -414,13 +414,16 @@ def forward( hidden_states, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, ): residual = hidden_states hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) if self.residual_in_fp32: residual = residual.to(torch.float32) - hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position) + hidden_states = self.mixer( + hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask + ) hidden_states = residual + hidden_states return hidden_states @@ -728,6 +731,13 @@ def _update_model_kwargs_for_generation( and model_kwargs["cache_position"] is not None ): model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens + + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + return model_kwargs def prepare_inputs_for_generation( @@ -737,6 +747,7 @@ def prepare_inputs_for_generation( use_cache=None, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, **kwargs, ): # Overwitten -- uses `cache_params` as opposed to `past_key_values` @@ -751,6 +762,10 @@ def prepare_inputs_for_generation( ) if cache_position[0] > 0: input_ids = input_ids[:, -1].unsqueeze(-1) + + if attention_mask is not None: + attention_mask = None + else: # we initialize the `cache_position` to full size of `conv_states` at prefill stage # considering padding will be applied when input length is shorter, and truncation @@ -768,6 +783,7 @@ def prepare_inputs_for_generation( "cache_params": cache_params, "use_cache": use_cache, "cache_position": cache_position, + "attention_mask": attention_mask, } ) return model_inputs From e4db69eb2912edd1dc1af250fb38ec48f30407a7 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 26 Aug 2024 11:58:49 -0700 Subject: [PATCH 034/116] fix toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4543ae1e2ff3..6a2d9fd3695f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -185,6 +185,8 @@ title: FBGEMM FP8 - local: quantization/optimum title: Optimum + - local: quantization/torchao + title: TorchAO - local: quantization/contribute title: Contribute new quantization method - title: Deploy to production From c8baf2b78407838ba41b8a9218477c87eace7ec9 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 26 Aug 2024 12:12:42 -0700 Subject: [PATCH 035/116] fix --- .../falcon_mamba/modeling_falcon_mamba.py | 32 +++++++++++++++---- .../test_modeling_falcon_mamba.py | 11 ++++--- tests/utils/test_modeling_utils.py | 7 ++-- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 61753f5ce243..d7a40ed5c5ff 100644 --- a/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -62,7 +62,7 @@ (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) ) -_CHECKPOINT_FOR_DOC = "tiiuae/falcon_mamba-7b" +_CHECKPOINT_FOR_DOC = "tiiuae/falcon-mamba-7b" _CONFIG_FOR_DOC = "FalconMambaConfig" @@ -167,6 +167,7 @@ def cuda_kernels_forward( hidden_states: torch.Tensor, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, ): # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states).transpose(1, 2) @@ -195,6 +196,9 @@ def cuda_kernels_forward( else: hidden_states, gate = projected_states.chunk(2, dim=1) + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + # 2. Convolution sequence transformation conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) if cache_params is not None and cache_position[0] > 0: @@ -216,6 +220,9 @@ def cuda_kernels_forward( hidden_states, conv_weights, self.conv1d.bias, activation=self.activation ) + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + # 3. State Space Model sequence transformation # 3.a. input varying initialization of time_step, B and C ssm_parameters = self.x_proj(hidden_states.transpose(1, 2)) @@ -275,6 +282,7 @@ def slow_forward( input_states, cache_params: Optional[MambaCache] = None, cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, ): batch_size, seq_len, _ = input_states.shape dtype = input_states.dtype @@ -282,6 +290,9 @@ def slow_forward( projected_states = self.in_proj(input_states).transpose(1, 2) # [batch, 2 * intermediate_size, seq_len] hidden_states, gate = projected_states.chunk(2, dim=1) + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + # 2. Convolution sequence transformation if cache_params is not None: ssm_state = cache_params.ssm_states[self.layer_idx].clone() @@ -311,6 +322,9 @@ def slow_forward( ) hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] + if attention_mask is not None: + hidden_states = hidden_states * attention_mask.unsqueeze(1) + # 3. State Space Model sequence transformation # 3.a. 
Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] ssm_parameters = self.x_proj(hidden_states.transpose(1, 2)) @@ -622,14 +636,13 @@ def set_input_embeddings(self, new_embeddings): def forward( self, input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, # Ignored arg inputs_embeds: Optional[torch.LongTensor] = None, cache_params: Optional[MambaCache] = None, use_cache: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, - **kwargs, # `attention_mask` is passed by the tokenizer and we don't want it + attention_mask: Optional[torch.LongTensor] = None, ) -> Union[Tuple, FalconMambaOutput]: output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -668,10 +681,15 @@ def forward( for mixer_block in self.layers: if self.gradient_checkpointing and self.training: hidden_states = self._gradient_checkpointing_func( - mixer_block.__call__, hidden_states, cache_params, cache_position + mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask ) else: - hidden_states = mixer_block(hidden_states, cache_params=cache_params, cache_position=cache_position) + hidden_states = mixer_block( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=attention_mask, + ) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -794,11 +812,10 @@ def prepare_inputs_for_generation( output_type=FalconMambaCausalLMOutput, config_class=_CONFIG_FOR_DOC, ) - # Ignore copy def forward( self, input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, # Ignored copy + attention_mask: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, cache_params: Optional[MambaCache] = None, labels: Optional[torch.LongTensor] = None, @@ -824,6 +841,7 @@ def forward( return_dict=return_dict, use_cache=use_cache, cache_position=cache_position, + attention_mask=attention_mask, ) hidden_states = falcon_mamba_outputs[0] diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index d4f084ab2941..97e435e1a689 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -98,6 +98,7 @@ def prepare_config_and_inputs( self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False ): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = ids_tensor([self.batch_size, self.seq_length], 1) sequence_labels = None token_labels = None @@ -116,7 +117,7 @@ def prepare_config_and_inputs( return ( config, input_ids, - None, + attention_mask, sequence_labels, token_labels, choice_labels, @@ -150,6 +151,7 @@ def prepare_config_and_inputs_for_decoder(self): ( config, input_ids, + attention_mask, sequence_labels, token_labels, choice_labels, @@ -158,6 +160,7 @@ def prepare_config_and_inputs_for_decoder(self): return ( config, input_ids, + attention_mask, sequence_labels, token_labels, choice_labels, @@ -250,12 +253,12 @@ def prepare_config_and_inputs_for_common(self): ( config, input_ids, - _, + attention_mask, sequence_labels, token_labels, choice_labels, ) = self.prepare_config_and_inputs() - inputs_dict = {"input_ids": input_ids} + inputs_dict = 
{"input_ids": input_ids, "attention_mask": attention_mask} return config, inputs_dict @@ -543,4 +546,4 @@ def test_training_kernel(self): loss = (1 - lm_logits).mean() loss.backward() - self.assertEqual(out_training, out_no_training) + self.assertEqual(out_training, out_no_training) \ No newline at end of file diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 568efea55ad4..b9358be16296 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -2608,8 +2608,7 @@ def test_not_available_flash(self): _ = AutoModel.from_pretrained( "hf-internal-testing/tiny-random-GPTBigCodeModel", attn_implementation="flash_attention_2" ) - - self.assertTrue("the package flash_attn seems not to be installed" in str(cm.exception)) + self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception)) def test_not_available_flash_with_config(self): if is_flash_attn_2_available(): @@ -2624,7 +2623,7 @@ def test_not_available_flash_with_config(self): attn_implementation="flash_attention_2", ) - self.assertTrue("the package flash_attn seems not to be installed" in str(cm.exception)) + self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception)) def test_not_available_sdpa(self): if is_torch_sdpa_available(): @@ -2672,4 +2671,4 @@ def test_identical(self): shared_names, identical_names = _find_identical([{"a", "b"}], state_dict) self.assertEqual(shared_names, [{"a", "b"}]) - self.assertEqual(identical_names, []) + self.assertEqual(identical_names, []) \ No newline at end of file From 866fee92db18ce618830e919fefbcd4b0978e4aa Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 26 Aug 2024 12:14:55 -0700 Subject: [PATCH 036/116] make style --- tests/utils/test_modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index b9358be16296..66210cae8043 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -2671,4 +2671,4 @@ def test_identical(self): shared_names, identical_names = _find_identical([{"a", "b"}], state_dict) self.assertEqual(shared_names, [{"a", "b"}]) - self.assertEqual(identical_names, []) \ No newline at end of file + self.assertEqual(identical_names, []) From 4af365e82109cb342c6f8e794ae9713967b49287 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 26 Aug 2024 12:46:19 -0700 Subject: [PATCH 037/116] fix hfoption tag --- docs/source/en/backbones.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/en/backbones.md b/docs/source/en/backbones.md index 00500a7404d5..c43acd907135 100644 --- a/docs/source/en/backbones.md +++ b/docs/source/en/backbones.md @@ -91,6 +91,9 @@ config = MaskFormerConfig(backbone_config=backbone_config) model = MaskFormerForInstanceSegmentation(config) ``` + + + ## timm backbones [timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes. 
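For readers following the backbone hunk above, a minimal sketch of how the `TimmBackbone` and `TimmBackboneConfig` classes mentioned in that context are typically wired together; the timm `resnet50` checkpoint is only an illustrative assumption.

```py
from transformers import TimmBackbone, TimmBackboneConfig

# configure a timm model as a backbone ("resnet50" is an arbitrary example)
backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True)

# wrap it so Transformers vision models can consume its feature maps
backbone = TimmBackbone(config=backbone_config)
```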
From f9f1c3eafa574bfbafefde79a8a6388651ccd446 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 28 Aug 2024 17:13:13 -0700 Subject: [PATCH 038/116] pipeline --- docs/source/en/_toctree.yml | 4 +- docs/source/en/add_new_model.md | 2 +- docs/source/en/model_doc/albert.md | 3 +- docs/source/en/pipeline_tutorial.md | 438 ++++++++++++++-------------- 4 files changed, 227 insertions(+), 220 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 6a2d9fd3695f..35b81aa4a241 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -18,7 +18,7 @@ - local: model_sharing title: Share - local: add_new_model - title: Contribute + title: Add a new model - local: task_summary title: What 🤗 Transformers can do - local: tasks_explained @@ -49,7 +49,7 @@ - title: Pipeline API sections: - local: pipeline_tutorial - title: Run inference with pipelines + title: Pipeline - local: pipeline_webserver title: Pipelines for webserver inference - local: add_new_pipeline diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 115a75681a71..1ef99d91bc0c 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -13,7 +13,7 @@ rendered properly in your Markdown viewer. --> -# Contribute +# Add a new model Transformers is fortunate to have a passionate community of developers and researchers contributing models to the library. As an open-source first project, we're invested in empowering the community to actively add models. diff --git a/docs/source/en/model_doc/albert.md b/docs/source/en/model_doc/albert.md index 86c378df436e..52826572aeda 100644 --- a/docs/source/en/model_doc/albert.md +++ b/docs/source/en/model_doc/albert.md @@ -25,7 +25,8 @@ rendered properly in your Markdown viewer. PyTorch TensorFlow -Flax +Flax ## Overview diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index 357bc7f636ec..7627bef403e1 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -1,4 +1,4 @@ - -# Pipelines for inference +# Pipeline -The [`pipeline`] makes it simple to use any model from the [Hub](https://huggingface.co/models) for inference on any language, computer vision, speech, and multimodal tasks. Even if you don't have experience with a specific modality or aren't familiar with the underlying code behind the models, you can still use them for inference with the [`pipeline`]! This tutorial will teach you to: +The [`Pipeline`] is a simple but powerful inference API that is readily available for a variety of machine learning tasks with any model from the Hugging Face [Hub](https://hf.co/models). Tailor the [`Pipeline`] to your task with certain task specific parameters, such as adding timestamps to an automatic speech recognition (ASR) pipeline for transcribing meeting notes. [`Pipeline`] supports GPUs, Apple silicon, and half-precision weights to accelerate inference and save memory. -* Use a [`pipeline`] for inference. -* Use a specific tokenizer or model. -* Use a [`pipeline`] for audio, vision, and multimodal tasks. + - +Transformers has two pipeline classes, a generic [`Pipeline`] and many individual task-specific pipelines like [`TextGenerationPipeline`] or [`VisualQuestionAnsweringPipeline`]. Load these individual pipelines by setting the task identifier in the `task` parameter in [`Pipeline`]. You can find the task identifier for each pipeline in their API documentation. 
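To make the relationship between the two pipeline classes concrete, here is a minimal sketch showing that the generic factory resolves the task identifier to the matching task-specific class; the `openai-community/gpt2` checkpoint is only an illustrative assumption.

```py
from transformers import pipeline, TextGenerationPipeline

# the task identifier selects the task-specific pipeline class under the hood
generator = pipeline(task="text-generation", model="openai-community/gpt2")
print(isinstance(generator, TextGenerationPipeline))  # True
```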
-Take a look at the [`pipeline`] documentation for a complete list of supported tasks and available parameters. +Each task is configured to use a default pretrained model and preprocessor, but this can be overriden with the `model` parameter if you want to use a different model. - - -## Pipeline usage - -While each task has an associated [`pipeline`], it is simpler to use the general [`pipeline`] abstraction which contains -all the task-specific pipelines. The [`pipeline`] automatically loads a default model and a preprocessing class capable -of inference for your task. Let's take the example of using the [`pipeline`] for automatic speech recognition (ASR), or -speech-to-text. - - -1. Start by creating a [`pipeline`] and specify the inference task: +For example, to use the [`TextGenerationPipeline`] with [Gemma 2](./model_doc/gemma2), set `task="text-generation"` and `model="google/gemma-2-2b"`. ```py ->>> from transformers import pipeline +from transformers import pipeline ->>> transcriber = pipeline(task="automatic-speech-recognition") +pipeline = pipeline(task="text-generation", model="google/gemma-2-2b") +pipeline("the secret to baking a really good cake is ") +[{'generated_text': 'the secret to baking a really good cake is 1. the right ingredients 2. the'}] ``` -2. Pass your input to the [`pipeline`]. In the case of speech recognition, this is an audio input file: +When you have more than one input, pass them as a list. ```py ->>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'} -``` +from transformers import pipeline -Not the result you had in mind? Check out some of the [most downloaded automatic speech recognition models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) -on the Hub to see if you can get a better transcription. +pipeline = pipeline(task="text-generation", model="google/gemma-2-2b", device="cuda") +pipeline(["the secret to baking a really good cake is ", "a baguette is "]) +[[{'generated_text': 'the secret to baking a really good cake is 1. the right ingredients 2. the'}], + [{'generated_text': 'a baguette is 100% bread.\n\na baguette is 100%'}]] +``` -Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model from OpenAI. Whisper was released -2 years later than Wav2Vec2, and was trained on close to 10x more data. As such, it beats Wav2Vec2 on most downstream -benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with -Wav2Vec2. +This guide will introduce you to the [`Pipeline`], demonstrate its features, and show you how to configure its various parameters. -Let's give it a try here to see how it performs. Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in. +## Tasks -```py ->>> transcriber = pipeline(model="openai/whisper-large-v2", torch_dtype="auto") ->>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} -``` +[`Pipeline`] is compatible with many machine learning tasks across different modalities. You just need to pass an appropriate input to the pipeline and it will handle the rest. -Now this result looks more accurate! 
For a deep-dive comparison on Wav2Vec2 vs Whisper, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/asr_models). -We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more. -You can check out and compare model results directly from your browser on the Hub to see if it fits or -handles corner cases better than other ones. -And if you don't find a model for your use case, you can always start [training](training) your own! +Here are some examples of how to use [`Pipeline`] for different tasks and modalities. -If you have several inputs, you can pass your input as a list: + + ```py -transcriber( - [ - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", - "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", - ] -) -``` - -Pipelines are great for experimentation as switching from one model to another is trivial; however, there are some ways to optimize them for larger workloads than experimentation. See the following guides that dive into iterating over whole datasets or using pipelines in a webserver: -of the docs: -* [Using pipelines on a dataset](#using-pipelines-on-a-dataset) -* [Using pipelines for a webserver](./pipeline_webserver) +from transformers import pipeline -## Parameters +pipeline = pipeline(task="summarization", model="google/pegasus-billsum") +pipeline("Section was formerly set out as section 44 of this title. As originally enacted, this section contained two further provisions that 'nothing in this act shall be construed as in any wise affecting the grant of lands made to the State of California by virtue of the act entitled 'An act authorizing a grant to the State of California of the Yosemite Valley, and of the land' embracing the Mariposa Big-Tree Grove, approved June thirtieth, eighteen hundred and sixty-four; or as affecting any bona-fide entry of land made within the limits above described under any law of the United States prior to the approval of this act.' The first quoted provision was omitted from the Code because the land, granted to the state of California pursuant to the Act cite, was receded to the United States. Resolution June 11, 1906, No. 27, accepted the recession.") +[{'summary_text': 'Instructs the Secretary of the Interior to convey to the State of California all right, title, and interest of the United States in and to specified lands which are located within the Yosemite and Mariposa National Forests, California.'}] +``` -[`pipeline`] supports many parameters; some are task specific, and some are general to all pipelines. -In general, you can specify parameters anywhere you want: + + ```py -transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1) +from transformers import pipeline -out = transcriber(...) # This will use `my_parameter=1`. -out = transcriber(..., my_parameter=2) # This will override and use `my_parameter=2`. -out = transcriber(...) # This will go back to using `my_parameter=1`. +pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3") +pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} ``` -Let's check out 3 important ones: - -### Device - -If you use `device=n`, the pipeline automatically puts the model on the specified device. -This will work regardless of whether you are using PyTorch or Tensorflow. 
+ + ```py -transcriber = pipeline(model="openai/whisper-large-v2", device=0) +from transformers import pipeline + +pipeline = pipeline(task="image-classification", model="google/vit-base-patch16-224") +pipeline(images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg") +[{'label': 'lynx, catamount', 'score': 0.43350091576576233}, + {'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', + 'score': 0.034796204417943954}, + {'label': 'snow leopard, ounce, Panthera uncia', + 'score': 0.03240183740854263}, + {'label': 'Egyptian cat', 'score': 0.02394474856555462}, + {'label': 'tiger cat', 'score': 0.02288915030658245}] ``` -If the model is too large for a single GPU and you are using PyTorch, you can set `torch_dtype='float16'` to enable FP16 precision inference. Usually this would not cause significant performance drops but make sure you evaluate it on your models! + + -Alternatively, you can set `device_map="auto"` to automatically -determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate) -package: +```py +from transformers import pipeline -```bash -pip install --upgrade accelerate +pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base") +pipeline( + image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg", + question="What is in the image?", +) +[{'answer': 'statue of liberty'}] ``` -The following code automatically loads and stores model weights across devices: + +
-```py -transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto") -``` +## Parameters -Note that if `device_map="auto"` is passed, there is no need to add the argument `device=device` when instantiating your `pipeline` as you may encounter some unexpected behavior! +At a minimum, a [`Pipeline`] only requires a task identifier, model, and the appropriate input. But there are many parameters available to configure the pipeline with, from task-specific parameters to optimizing performance. -### Batch size +This section will walk you through some of the more important parameters. -By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases. +### Device -But if it works in your use case, you can use: +[`Pipeline`] is compatible with many hardware types, including GPUs, CPUs, Apple silicon, and more. This is configured with the `device` parameter. By default, [`Pipeline`] runs on a CPU which is given by `device=-1`. -```py -transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2) -audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)] -texts = transcriber(audio_filenames) -``` + + -This runs the pipeline on the 4 provided audio files, but it will pass them in batches of 2 -to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you. -The output should always match what you would have received without batching. It is only meant as a way to help you get more speed out of a pipeline. +To run [`Pipeline`] on a GPU, set `device` to the associated CUDA device id. For example, `device=0` runs on the first GPU. -Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. The pipeline performs this [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) for you. +```py +from transformers import pipeline -### Task specific parameters +pipeline = pipeline(task="text-generation", model="google/gemma-2-2b", device=0) +pipeline("the secret to baking a really good cake is ") +``` -All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done. -For instance, the [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter which sounds promising for subtitling videos: +You could also let [Accelerate](https://hf.co/docs/accelerate/index), a library for distributed training, automatically choose how to load and store the model weights on the appropriate device. This is especially useful if you have multiple devices. Accelerate loads and stores the model weights on the fastest device first, and then moves the weights to other devices (CPU, hard drive) as needed. Set `device_map="auto"` to let Accelerate choose the device. +> [!TIP] +> Make sure you have [Accelerate](https://hf.co/docs/accelerate/basic_tutorials/install) installed. 
+> +> ```py +> !pip install -U accelerate +> ``` ```py ->>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True) ->>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") -{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]} +from transformers import pipeline + +pipeline = pipeline(task="text-generation", model="google/gemma-2-2b", device_map="auto") +pipeline("the secret to baking a really good cake is ") ``` -As you can see, the model inferred the text and also outputted **when** the various sentences were pronounced. + + -There are many parameters available for each task, so check out each task's API reference to see what you can tinker with! -For instance, the [`~transformers.AutomaticSpeechRecognitionPipeline`] has a `chunk_length_s` parameter which is helpful -for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically -cannot handle on its own: +To run [`Pipeline`] on Apple silicon, set `device="mps"`. -```python ->>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30) ->>> transcriber("https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/ted_60.wav") -{'text': " So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know. You get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light and I'd bump it up"} +```py +from transformers import pipeline + +pipeline = pipeline(task="text-generation", model="google/gemma-2-2b", device="mps") +pipeline("the secret to baking a really good cake is ") ``` -If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)! + + +### Batch inference -## Using pipelines on a dataset +[`Pipeline`] can also process batches of inputs with the `batch_size` parameter. Batch inference may improve speed, especially on a GPU, but it isn't guaranteed to. Other variables such as hardware, data, and the model itself can affect whether batch inference improves speed. For this reason, batch inference is disabled by default. -The pipeline can also run inference on a large dataset. The easiest way we recommend doing this is by using an iterator: +In this example, when there are 4 inputs and `batch_size` is set to 2, [`Pipeline`] passes a batch of 2 inputs to the model at a time. 
```py -def data(): - for i in range(1000): - yield f"My example {i}" - +from transformers import pipeline -pipe = pipeline(model="openai-community/gpt2", device=0) -generated_characters = 0 -for out in pipe(data()): - generated_characters += len(out[0]["generated_text"]) +pipeline = pipeline(task="text-generation", model="google/gemma-2-2b", device="cuda", batch_size=2) +pipeline(["the secret to baking a really good cake is", "a baguette is", "paris is the", "hotdogs are"]) +[[{'generated_text': 'the secret to baking a really good cake is to use a good cake mix.\n\ni’'}], + [{'generated_text': 'a baguette is'}], + [{'generated_text': 'paris is the most beautiful city in the world.\n\ni’ve been to paris 3'}], + [{'generated_text': 'hotdogs are a staple of the american diet. they are a great source of protein and can'}]] ``` -The iterator `data()` yields each result, and the pipeline automatically -recognizes the input is iterable and will start fetching the data while -it continues to process it on the GPU (this uses [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) under the hood). -This is important because you don't have to allocate memory for the whole dataset -and you can feed the GPU as fast as possible. - -Since batching could speed things up, it may be useful to try tuning the `batch_size` parameter here. - -The simplest way to iterate over a dataset is to just load one from 🤗 [Datasets](https://github.com/huggingface/datasets/): +Another good use case for batch inference is when you stream data in [`Pipeline`]. ```py -# KeyDataset is a util that will just output the item we're interested in. +from transformers import pipeline from transformers.pipelines.pt_utils import KeyDataset -from datasets import load_dataset - -pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) -dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") +import datasets -for out in pipe(KeyDataset(dataset, "audio")): +# KeyDataset is a utility that returns the item in the dict returned by the dataset +dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised") +pipeline = pipeline(task="text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", device="cuda") +for out in pipeline(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"): print(out) ``` +Here are some general rules of thumb for determining whether batch inference can help improve performance. -## Using pipelines for a webserver +1. The only way to know for sure is to measure performance on your model, data, and hardware. +2. Don't batch inference if you're constrained by latency (a live inference product for example). +3. Don't batch inference if you're using a CPU. +4. Don't batch inference if you don't know the `sequence_length` of your data. Measure performance, iteratively add to `sequence_length`, and include out-of-memory (OOM) checks to recover from failures. +5. Do batch inference if your `sequence_length` is regular, and keep pushing it until you reach an OOM error. The larger the GPU, the more likely batch inference is to be beneficial. +6. Do make sure you can handle OOM errors if you decide to do batch inference. - -Creating an inference engine is a complex topic which deserves it's own -page. - +### Task-specific parameters -[Link](./pipeline_webserver) +The [`Pipeline`] accepts any parameters that are supported by each individual task pipeline. 
Make sure to check out each individual task pipeline to see what type of parameters are available. If you can't find a parameter that would be useful for your use case, please feel free to open a GitHub [issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml) to request it! -## Vision pipeline +Here are some examples of enabling these task-specific parameters in [`Pipeline`]. -Using a [`pipeline`] for vision tasks is practically identical. + + -Specify your task and pass your image to the classifier. The image can be a link, a local path or a base64-encoded image. For example, what species of cat is shown below? - -![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg) +The [`AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter that returns when each word was spoken by setting it to `"word"`. This parameter can be passed along to [`Pipeline`]. ```py ->>> from transformers import pipeline - ->>> vision_classifier = pipeline(model="google/vit-base-patch16-224") ->>> preds = vision_classifier( -... images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" -... ) ->>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] ->>> preds -[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] +from transformers import pipeline + +pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3") +pipeline(audio="https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", return_timestamp="word") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', + 'chunks': [{'text': ' I', 'timestamp': (0.0, 1.1)}, + {'text': ' have', 'timestamp': (1.1, 1.44)}, + {'text': ' a', 'timestamp': (1.44, 1.62)}, + {'text': ' dream', 'timestamp': (1.62, 1.92)}, + {'text': ' that', 'timestamp': (1.92, 3.7)}, + {'text': ' one', 'timestamp': (3.7, 3.88)}, + {'text': ' day', 'timestamp': (3.88, 4.24)}, + {'text': ' this', 'timestamp': (4.24, 5.82)}, + {'text': ' nation', 'timestamp': (5.82, 6.78)}, + {'text': ' will', 'timestamp': (6.78, 7.36)}, + {'text': ' rise', 'timestamp': (7.36, 7.88)}, + {'text': ' up', 'timestamp': (7.88, 8.46)}, + {'text': ' and', 'timestamp': (8.46, 9.2)}, + {'text': ' live', 'timestamp': (9.2, 10.34)}, + {'text': ' out', 'timestamp': (10.34, 10.58)}, + {'text': ' the', 'timestamp': (10.58, 10.8)}, + {'text': ' true', 'timestamp': (10.8, 11.04)}, + {'text': ' meaning', 'timestamp': (11.04, 11.4)}, + {'text': ' of', 'timestamp': (11.4, 11.64)}, + {'text': ' its', 'timestamp': (11.64, 11.8)}, + {'text': ' creed.', 'timestamp': (11.8, 12.3)}]} ``` -## Text pipeline + + + +The [`~TextGenerationPipeline.__call__`] method has a `return_full_text` parameter that determines whether to return the full text or only the generated text. Set it to `False` to only return the generated text. -Using a [`pipeline`] for NLP tasks is practically identical. 
+[`~TextGenerationPipeline.__call__`] also additional keyword arguments from the [`~GenerationMixin.generate`] method, which itself takes generation configuration parameters from [`GenerationConfig`]. To return more than one generated sequence, set `num_return_sequences` to a value greater than 1. Pass this parameter to [`Pipeline`]. ```py ->>> from transformers import pipeline - ->>> # This model is a `zero-shot-classification` model. ->>> # It will classify text, except you are free to choose any label you might imagine ->>> classifier = pipeline(model="facebook/bart-large-mnli") ->>> classifier( -... "I have a problem with my iphone that needs to be resolved asap!!", -... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], -... ) -{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} +from transformers import pipeline + +pipeline = pipeline(task="text-generation", model="openai-community/gpt2") +pipeline("the secret to baking a good cake is", num_return_sequences=4, return_full_text=False) +[{'generated_text': ' how easy it is for me to do it with my hands. You must not go nuts, or the cake is going to fall out.'}, + {'generated_text': ' to prepare the cake before baking. The key is to find the right type of icing to use and that icing makes an amazing frosting cake.\n\nFor a good icing cake, we give you the basics'}, + {'generated_text': " to remember to soak it in enough water and don't worry about it sticking to the wall. In the meantime, you could remove the top of the cake and let it dry out with a paper towel.\n"}, + {'generated_text': ' the best time to turn off the oven and let it stand 30 minutes. After 30 minutes, stir and bake a cake in a pan until fully moist.\n\nRemove the cake from the heat for about 12'}] ``` -## Multimodal pipeline + + -The [`pipeline`] supports more than one modality. For example, a visual question answering (VQA) task combines text and image. Feel free to use any image link you like and a question you want to ask about the image. The image can be a URL or a local path to the image. +## Chunk batching -For example, if you use this [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png): +There are some instances where you need to process data in chunks. -```py ->>> from transformers import pipeline - ->>> vqa = pipeline(model="impira/layoutlm-document-qa") ->>> output = vqa( -... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", -... question="What is the invoice number?", -... ) ->>> output[0]["score"] = round(output[0]["score"], 3) ->>> output -[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}] -``` +- for some data types, a single input (for example, a really long audio file) may need to be chunked into multiple parts before it can be processed +- for some tasks, like zero-shot classification or question answering, a single input may need multiple forward passes which can cause issues with the `batch_size` parameter - +The [`ChunkPipeline`] class is designed to handle these use cases. Both pipeline classes are used in the same way, but since [`ChunkPipeline`] can automatically handle batching you don't need to worry about the number of forward passes your inputs trigger. Instead, you can optimize `batch_size` independently of the inputs. 
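As an illustration of the point above, question answering is one of the tasks backed by a [`ChunkPipeline`]: a single long context can be split into several spans internally, and `batch_size` only controls how those internal chunks are grouped per forward pass. A minimal sketch, assuming the DistilBERT SQuAD checkpoint:

```py
from transformers import pipeline

# QuestionAnsweringPipeline inherits from ChunkPipeline: one input may expand
# into several overlapping spans, each requiring its own forward pass
qa = pipeline(task="question-answering", model="distilbert/distilbert-base-cased-distilled-squad")

qa(
    question="Where do I live?",
    context="My name is Wolfgang and I live in Berlin.",
    batch_size=4,  # groups the internally generated chunks, not separate questions
)
```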
-To run the example above you need to have [`pytesseract`](https://pypi.org/project/pytesseract/) installed in addition to 🤗 Transformers: +Here is how it differs from a regular [`Pipeline`]. -```bash -sudo apt install -y tesseract-ocr -pip install pytesseract + + + +```py +all_model_outputs = [] +for preprocessed in pipeline.preprocess(inputs): + model_outputs = pipeline.model_forward(preprocessed) + all_model_outputs.append(model_outputs) +outputs =pipeline.postprocess(all_model_outputs) ``` - +
+ + +```py +preprocessed = pipeline.preprocess(inputs) +model_outputs = pipeline.forward(preprocessed) +outputs = pipeline.postprocess(model_outputs) +``` -## Using `pipeline` on large models with 🤗 `accelerate`: + +
-You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. +## Large datasets -First load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example. +For inference on large datasets, you can iterate directly over the dataset. This avoids immediately allocating memory for the entire dataset, and you don't need to worry about creating batches yourself. As mentioned in the [Batch inference](#batch-inference) section, you can try using the `batch_size` parameter to see if it improves performance. ```py -# pip install accelerate -import torch +from transformers.pipelines.pt_utils import KeyDataset from transformers import pipeline +from datasets import load_dataset -pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto") -output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised") +pipeline = pipeline(task="text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english", device="cuda") +for out in pipeline(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"): + print(out) ``` -You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True` +Other ways to run inference on large datasets with [`Pipeline`] include using an iterator or generator. ```py -# pip install accelerate bitsandbytes -import torch -from transformers import pipeline +def data(): + for i in range(1000): + yield f"My example {i}" -pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True}) -output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +pipeline = pipeline(model="openai-community/gpt2", device=0) +generated_characters = 0 +for out in pipeline(data()): + generated_characters += len(out[0]["generated_text"]) ``` -Note that you can replace the checkpoint with any Hugging Face model that supports large model loading, such as BLOOM. +## Large model optimizations -## Creating web demos from pipelines with `gradio` +[Accelerate](https://hf.co/docs/accelerate/index) enables a couple of optimizations for running large models with [`Pipeline`]. Make sure Accelerate is installed first. -Pipelines are automatically supported in [Gradio](https://github.com/gradio-app/gradio/), a library that makes creating beautiful and user-friendly machine learning apps on the web a breeze. First, make sure you have Gradio installed: - -``` -pip install gradio +```py +!pip install -U accelerate ``` -Then, you can create a web demo around an image classification pipeline (or any other pipeline) in a single line of code by calling Gradio's [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) function to launch the pipeline. This creates an intuitive drag-and-drop interface in your browser: - -```py -from transformers import pipeline -import gradio as gr +As mentioned in the [Device](#device) section, the `device_map="auto"` setting is useful for automatically distributing the model across the fastest devices (GPUs) first before dispatching to other slower devices if available (CPU, hard drive). -pipe = pipeline("image-classification", model="google/vit-base-patch16-224") +[`Pipeline`] supports half-precision weights, torch.float16, which can be significantly faster and save memory. 
Performance loss is negligible for most models, especially for larger models. If your hardware supports it, you can enable torch.bfloat16 instead for more range. -gr.Interface.from_pipeline(pipe).launch() -``` +> [!TIP] +> Inputs are internally converted to torch.float16, and it only works for models with a PyTorch backend. +Lastly, [`Pipeline`] also accepts quantized models to really reduce memory usage even further. Make sure you have the [bitsandbytes](https://hf.co/docs/bitsandbytes/installation) library installed first, and then add `load_in_8bit=True` to `model_kwargs` in the pipeline. -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/panda-classification.png) +```py +import torch +from transformers import pipeline, BitsAndBytesConfig -By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public -link by setting `share=True` in `launch()`. You can also host your demo on [Hugging Face Spaces](https://huggingface.co/spaces) for a permanent link. +pipeline = pipeline(model="google/gemma-7b", torch_dtype=torch.bfloat16, device_map="auto", model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}) +pipeline("the secret to baking a good cake is ") +[{'generated_text': 'the secret to baking a good cake is 1. the right ingredients 2. the right'}] +``` From 8a9fcfe534eee96dd05de31b8e5d4d82a284f642 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 28 Aug 2024 18:23:11 -0700 Subject: [PATCH 039/116] pipeline gradio --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/pipeline_gradio.md | 52 +++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 docs/source/en/pipeline_gradio.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 35b81aa4a241..a785edf92b8e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -50,6 +50,8 @@ sections: - local: pipeline_tutorial title: Pipeline + - local: pipeline_gradio + title: Machine learning apps - local: pipeline_webserver title: Pipelines for webserver inference - local: add_new_pipeline diff --git a/docs/source/en/pipeline_gradio.md b/docs/source/en/pipeline_gradio.md new file mode 100644 index 000000000000..7a917d1fa865 --- /dev/null +++ b/docs/source/en/pipeline_gradio.md @@ -0,0 +1,52 @@ + + +# Machine learning apps + +[Gradio](https://www.gradio.app/), a fast and easy library for building and sharing machine learning apps, is integrated with [`Pipeline`] to quickly create a simple interface for inference. + +Before you begin, make sure Gradio is installed. + +```py +!pip install gradio +``` + +Create a pipeline for your task, and then pass it to Gradio's [Interface.from_pipeline](https://www.gradio.app/docs/gradio/interface#interface-from_pipeline) function to create the interface. Gradio automatically determines the appropriate input and output components for a [`Pipeline`]. + +Add [launch](https://www.gradio.app/main/docs/gradio/blocks#blocks-launch) to create a web server and start up the app. + +```py +from transformers import pipeline +import gradio as gr + +pipeline = pipeline("image-classification", model="google/vit-base-patch16-224") +gr.Interface.from_pipeline(pipeline).launch() +``` + +The web app runs on a local server by default. To share the app with other users, set `share=True` in [launch](https://www.gradio.app/main/docs/gradio/blocks#blocks-launch) to generate a temporary public link. 
For a more permanent solution, host the app on Hugging Face [Spaces](https://hf.co/spaces). + +```py +gr.Interface.from_pipeline(pipeline).launch(share=True) +``` + +The Space below is created with the code above and hosted on Spaces. + + From 144287f9d7476b9fff38c088d6ddc715e02c33c7 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 29 Aug 2024 13:29:28 -0700 Subject: [PATCH 040/116] pipeline web server --- docs/source/en/_toctree.yml | 2 +- docs/source/en/pipeline_webserver.md | 162 ++++++++++++--------------- 2 files changed, 75 insertions(+), 89 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a785edf92b8e..1e0afe8cc49d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -53,7 +53,7 @@ - local: pipeline_gradio title: Machine learning apps - local: pipeline_webserver - title: Pipelines for webserver inference + title: Web server inference - local: add_new_pipeline title: How to add a pipeline to 🤗 Transformers? - title: LLMs diff --git a/docs/source/en/pipeline_webserver.md b/docs/source/en/pipeline_webserver.md index 17b5fbd958dd..b081d3af615d 100644 --- a/docs/source/en/pipeline_webserver.md +++ b/docs/source/en/pipeline_webserver.md @@ -1,34 +1,40 @@ - -# Using pipelines for a webserver +# Web server inference - -Creating an inference engine is a complex topic, and the "best" solution -will most likely depend on your problem space. Are you on CPU or GPU? Do -you want the lowest latency, the highest throughput, support for -many models, or just highly optimize 1 specific model? -There are many ways to tackle this topic, so what we are going to present is a good default -to get started which may not necessarily be the most optimal solution for you. - +A web server is basically a system that waits for requests and serves them as they come in. This means you can use [`Pipeline`] as an inference engine on the web server, since you can use an iterator (similar to how you would [iterate over a dataset](./pipeline_tutorial#large-datasets)) to handle each incoming request. +Designing a web server with [`Pipeline`] is unique though because they're fundamentally different. Web servers are multiplexed (multithreaded, async, etc.) to handle multiple requests concurrently. [`Pipeline`] and its underlying model on the other hand are not designed for parallelism because they take a lot of memory. It's best to give a [`Pipeline`] all the available resources when they're running or for a compute intensive job. -The key thing to understand is that we can use an iterator, just like you would [on a -dataset](pipeline_tutorial#using-pipelines-on-a-dataset), since a webserver is basically a system that waits for requests and -treats them as they come in. +This guide shows how to work around this difference by using the web server to handle the light load of receiving and sending requests, and having a single thread to handle the heavier load of running [`Pipeline`]. -Usually webservers are multiplexed (multithreaded, async, etc..) to handle various -requests concurrently. Pipelines on the other hand (and mostly the underlying models) -are not really great for parallelism; they take up a lot of RAM, so it's best to give them all the available resources when they are running or it's a compute-intensive job. +## Create a server -We are going to solve that by having the webserver handle the light load of receiving -and sending requests, and having a single thread handling the actual work. -This example is going to use `starlette`. 
The actual framework is not really -important, but you might have to tune or change the code if you are using another -one to achieve the same effect. +[Starlette](https://www.starlette.io/) is a lightweight framework for building web servers. You can use any other framework you'd like, but you may have to make some changes to the code below. -Create `server.py`: +Before you begin, make sure Starlette and [uvicorn](http://www.uvicorn.org/) are installed. + +```py +!pip install starlette uvicorn +``` + +Now you can create a simple web server in a `server.py` file. The key is to only load the model **once** to prevent unnecessary copies of it from consuming memory. + +Create a pipeline to fill in the masked token, `[MASK]`. ```py from starlette.applications import Starlette @@ -37,7 +43,6 @@ from starlette.routing import Route from transformers import pipeline import asyncio - async def homepage(request): payload = await request.body() string = payload.decode("utf-8") @@ -46,22 +51,19 @@ async def homepage(request): output = await response_q.get() return JSONResponse(output) - async def server_loop(q): - pipe = pipeline(model="google-bert/bert-base-uncased") + pipeline = pipeline(task="fill-mask",model="google-bert/bert-base-uncased") while True: (string, response_q) = await q.get() - out = pipe(string) + out = pipeline(string) await response_q.put(out) - app = Starlette( routes=[ Route("/", homepage, methods=["POST"]), ], ) - @app.on_event("startup") async def startup_event(): q = asyncio.Queue() @@ -69,30 +71,48 @@ async def startup_event(): asyncio.create_task(server_loop(q)) ``` -Now you can start it with: +Start the server with the following command. + ```bash uvicorn server:app ``` -And you can query it: +The server can be queried now with a POST request. + ```bash -curl -X POST -d "test [MASK]" http://localhost:8000/ -#[{"score":0.7742936015129089,"token":1012,"token_str":".","sequence":"test."},...] +curl -X POST -d "Paris is the [MASK] of France." http://localhost:8000/ +[{'score': 0.9969332218170166, + 'token': 3007, + 'token_str': 'capital', + 'sequence': 'paris is the capital of france.'}, + {'score': 0.0005914849461987615, + 'token': 2540, + 'token_str': 'heart', + 'sequence': 'paris is the heart of france.'}, + {'score': 0.00043787318281829357, + 'token': 2415, + 'token_str': 'center', + 'sequence': 'paris is the center of france.'}, + {'score': 0.0003378340043127537, + 'token': 2803, + 'token_str': 'centre', + 'sequence': 'paris is the centre of france.'}, + {'score': 0.00026995912776328623, + 'token': 2103, + 'token_str': 'city', + 'sequence': 'paris is the city of france.'}] ``` -And there you go, now you have a good idea of how to create a webserver! +## Queuing requests -What is really important is that we load the model only **once**, so there are no copies -of the model on the webserver. This way, no unnecessary RAM is being used. -Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few -items before inferring to use dynamic batching: +The server's queuing mechanism can be used for some interesting applications such as dynamic batching. With dynamic batching, you can accumulate several requests first before processing them with the [`Pipeline`]. - +The example below is written in pseudocode for readability rather than performance, in particular, you'll notice that: -The code sample below is intentionally written like pseudo-code for readability. -Do not run this without checking if it makes sense for your system resources! +1. 
There is no batch size limit. +2. The timeout is reset on every queue fetch, so you could end up waiting much longer than the `timeout` value before processing a request. This would also delay the first inference request by that amount of time. The web server always waits 1ms even if the queue is empty, which is inefficient, because you could be using that time to start inference. It could make sense though if batching is essential to your use case. - + It would be better to have a single 1ms deadline, instead of resetting it on every fetch. ```py (string, rq) = await q.get() @@ -100,69 +120,35 @@ strings = [] queues = [] while True: try: - (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms + (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) except asyncio.exceptions.TimeoutError: break strings.append(string) queues.append(rq) strings -outs = pipe(strings, batch_size=len(strings)) +outs = pipeline(strings, batch_size=len(strings)) for rq, out in zip(queues, outs): await rq.put(out) ``` -Again, the proposed code is optimized for readability, not for being the best code. -First of all, there's no batch size limit which is usually not a -great idea. Next, the timeout is reset on every queue fetch, meaning you could -wait much more than 1ms before running the inference (delaying the first request -by that much). - -It would be better to have a single 1ms deadline. - -This will always wait for 1ms even if the queue is empty, which might not be the -best since you probably want to start doing inference if there's nothing in the queue. -But maybe it does make sense if batching is really crucial for your use case. -Again, there's really no one best solution. - - -## Few things you might want to consider - -### Error checking +## Error checking -There's a lot that can go wrong in production: out of memory, out of space, -loading the model might fail, the query might be wrong, the query might be -correct but still fail to run because of a model misconfiguration, and so on. +There are many things that can go wrong in production. You could run out-of-memory, out of space, fail to load a model, have an incorrect model configuration, have an incorrect query, and so much more! -Generally, it's good if the server outputs the errors to the user, so -adding a lot of `try..except` statements to show those errors is a good -idea. But keep in mind it may also be a security risk to reveal all those errors depending -on your security context. +Adding `try...except` statements could be helpful to return these errors to the user for debugging. Keep in mind that this could pose a security risk though if you shouldn't be revealing certain information. -### Circuit breaking +## Circuit breaking -Webservers usually look better when they do circuit breaking. It means they -return proper errors when they're overloaded instead of just waiting for the query indefinitely. Return a 503 error instead of waiting for a super long time or a 504 after a long time. +It is better to return errors when the server is overloaded instead of forcing a user to wait indefinitely. Try to return a 503 or 504 error instead of making a user wait for a really long time. -This is relatively easy to implement in the proposed code since there is a single queue. -Looking at the queue size is a basic way to start returning errors before your -webserver fails under load. +It is relatively simple to implement these error types since it's only a single queue. 
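Here is a rough sketch of this idea, reusing the Starlette app from above. It assumes the request queue is attached to the app at startup (for example, `app.model_queue = q`), and `MAX_QUEUE_SIZE` is a made-up threshold you should tune for your model's latency.

```py
import asyncio
from starlette.responses import JSONResponse

# hypothetical limit on queued requests before the server starts shedding load
MAX_QUEUE_SIZE = 32

async def homepage(request):
    payload = await request.body()
    string = payload.decode("utf-8")
    # circuit breaker: return 503 right away instead of letting requests pile up
    if request.app.model_queue.qsize() > MAX_QUEUE_SIZE:
        return JSONResponse({"error": "server is overloaded, try again later"}, status_code=503)
    response_q = asyncio.Queue()
    await request.app.model_queue.put((string, response_q))
    output = await response_q.get()
    return JSONResponse(output)
```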
You should look at the queue size to determine when to start returning errors before your server fails under load. -### Blocking the main thread +## Block the main thread -Currently PyTorch is not async aware, and computation will block the main -thread while running. That means it would be better if PyTorch was forced to run -on its own thread/process. This wasn't done here because the code is a lot more -complex (mostly because threads and async and queues don't play nice together). -But ultimately it does the same thing. +PyTorch is not async aware, so computation will block the main thread from running. -This would be important if the inference of single items were long (> 1s) because -in this case, it means every query during inference would have to wait for 1s before -even receiving an error. +For this reason, it's better to run PyTorch on its own separate thread or process. When inference of a single request is especially long (> 1s), it's even more important because it means every query during inference must wait 1s before even receiving an error. -### Dynamic batching +## Dynamic batching -In general, batching is not necessarily an improvement over passing 1 item at -a time (see [batching details](./main_classes/pipelines#pipeline-batching) for more information). But it can be very effective -when used in the correct setting. In the API, there is no dynamic -batching by default (too much opportunity for a slowdown). But for BLOOM inference - -which is a very large model - dynamic batching is **essential** to provide a decent experience for everyone. +Dynamic batching can be very effective when used in the correct setting, but it's not necessary when you're only passing 1 request at a time (see [batch inference](./pipeline_tutorial#batch-inference) for more details). From 23286eea79e7b1c63b831cf379b1d1ed26893d05 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 30 Aug 2024 10:53:55 -0700 Subject: [PATCH 041/116] add pipeline --- docs/source/en/_toctree.yml | 26 +--- docs/source/en/add_new_pipeline.md | 221 ++++++++++------------------- 2 files changed, 77 insertions(+), 170 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1e0afe8cc49d..bc6f5d52085e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -55,7 +55,7 @@ - local: pipeline_webserver title: Web server inference - local: add_new_pipeline - title: How to add a pipeline to 🤗 Transformers? 
+ title: Add a new pipeline - title: LLMs sections: - local: tasks/prompting @@ -88,36 +88,12 @@ title: Agents - local: multilingual title: Run inference with multilingual models - - local: custom_models - title: Share a custom model - - local: chat_templating - title: Chat templates - - local: trainer - title: Trainer - - local: sagemaker - title: Run training on Amazon SageMaker - - local: serialization - title: Export to ONNX - - local: tflite - title: Export to TFLite - - local: torchscript - title: Export to TorchScript - - local: benchmarks - title: Benchmarks - - local: notebooks - title: Notebooks with examples - - local: community - title: Community resources - - local: troubleshooting - title: Troubleshoot - local: gguf title: Interoperability with GGUF files - local: perf_infer_cpu title: CPU inference - local: perf_infer_gpu_one title: GPU inference - - local: big_models - title: Instantiate a big model - title: Training sections: - title: Trainer API diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index e8234c565b26..d7ec9b21261a 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -1,4 +1,4 @@ - -# How to create a custom pipeline? +# Add a new pipeline -In this guide, we will see how to create a custom pipeline and share it on the [Hub](https://hf.co/models) or add it to the -🤗 Transformers library. +You can make [`Pipeline`] your own by subclassing it, and then implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it. -First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes, -dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible -as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the -pipeline (`preprocess`). +This guide will walk you through the process of adding a new pipeline to Transformers. -Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of -`postprocess` method. +## Design choices -Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`, -`_forward`, `postprocess`, and `_sanitize_parameters`. +At a bare minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline. +Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because it's more compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in Python is the simplest and best option because it's easier to work with. -```python -from transformers import Pipeline +Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types, for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data. + +## Implement a pipeline +With an input and output decided, you can start implementing [`Pipeline`]. 
Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods. + +```py +from transformers import Pipeline class MyPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - def preprocess(self, inputs, maybe_arg=2): - model_input = Tensor(inputs["input_ids"]) - return {"model_input": model_input} + def preprocess(self, inputs, args=2): def _forward(self, model_inputs): - # model_inputs == {"model_input": model_input} - outputs = self.model(**model_inputs) - # Maybe {"logits": Tensor(...)} - return outputs def postprocess(self, model_outputs): - best_class = model_outputs["logits"].softmax(-1) - return best_class ``` -The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing -pre/postprocessing on the CPU on different threads +1. `preprocess` takes the inputs and transforms them into the appropriate input format for the model. -`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might -contain more information and is usually a `Dict`. - -`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred -called method as it contains safeguards to make sure everything is working on the expected device. If anything is -linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess. - -`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided -earlier. - -`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization -time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`. - -The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`, -`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That -allows to keep the default arguments in the function definition which is always more "natural". - -A classic example would be a `top_k` argument in the post processing in classification tasks. +```py +def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} +``` -```python ->>> pipe = pipeline("my-new-task") ->>> pipe("This is a test") -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} -{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] +1. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward`, and everything else belongs in either `preprocess` or `postprocess`. ->>> pipe("This is a test", top_k=2) -[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] +```py +def _forward(self, model_inputs): + outputs = self.model(**model_inputs) + return outputs ``` -In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit -`_sanitize_parameters` to allow this new parameter. - +1. `postprocess` generates the final output from the models output in `_forward`. 
-```python +```py def postprocess(self, model_outputs, top_k=5): best_class = model_outputs["logits"].softmax(-1) - # Add logic to handle top_k return best_class +``` +1. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during the initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with an extra parameters! This keeps the default arguments in the function definition which is always more *natural*. +For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`. + +```py def _sanitize_parameters(self, **kwargs): preprocess_kwargs = {} if "maybe_arg" in kwargs: @@ -110,42 +84,51 @@ def _sanitize_parameters(self, **kwargs): return preprocess_kwargs, {}, postprocess_kwargs ``` -Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy -without requiring users to understand new kinds of objects. It's also relatively common to support many different types -of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes) +Now the pipeline can return the top most likely labels if they choose to. +```py +from transformers import pipeline + +pipeline = pipeline("my-task") +# returns 3 most likely labels +pipeline("This is the best meal I've ever had", top_k=3) +# returns 5 most likely labels by default +pipeline("This is the best meal I've ever had") +``` +## Register a pipeline -## Adding it to the list of supported tasks +Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines: -To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`: +- the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either frameworks) +- a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default` +- the expected input with `type` -```python +```py from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification PIPELINE_REGISTRY.register_pipeline( "new-task", pipeline_class=MyPipeline, pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, + default={"pt": ("user/awesome-model", "branch-name")}, + type="text", ) ``` -You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type: +## Share your pipeline -```python -PIPELINE_REGISTRY.register_pipeline( - "new-task", - pipeline_class=MyPipeline, - pt_model=AutoModelForSequenceClassification, - default={"pt": ("user/awesome_model", "abcdef")}, - type="text", # current support type: text, audio, image, multimodal -) -``` +Share your pipeline with the community on the [Hub](https://hf.co) or you can add it directly to Transformers. + +It's faster to upload your pipeline code to the Hub because it doesn't require any review from the Transformers team. 
Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure the [`Pipeline`] works. -## Share your pipeline on the Hub +### Upload to the Hub -To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a -python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this: +Add your pipeline code to the Hub in a Python file. + +For example, a custom pipeline for sentence pair classification might look the following code below. ```py import numpy as np @@ -183,88 +166,36 @@ class PairClassificationPipeline(Pipeline): return {"label": label, "score": score, "logits": logits} ``` -The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this. - -```py -from pair_classification import PairClassificationPipeline -from transformers.pipelines import PIPELINE_REGISTRY -from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification - -PIPELINE_REGISTRY.register_pipeline( - "pair-classification", - pipeline_class=PairClassificationPipeline, - pt_model=AutoModelForSequenceClassification, - tf_model=TFAutoModelForSequenceClassification, -) -``` - -The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a models `config.json` file. - -```json - "custom_pipelines": { - "pair-classification": { - "impl": "pair_classification.PairClassificationPipeline", - "pt": [ - "AutoModelForSequenceClassification" - ], - "tf": [ - "TFAutoModelForSequenceClassification" - ], - } - }, -``` - -Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been -fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. +Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub. The Python file containing the code is copied to the Hub, and the pipelines model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace. ```py from transformers import pipeline -classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") -``` - -Then we can share it on the Hub by using the `push_to_hub` method: - -```py -classifier.push_to_hub("test-dynamic-pipeline") +pipeline = pipeline(task="pair-classification", model="sgugger/finetuned-bert-mrpc") +pipeline.push_to_hub("pair-classification-pipeline") ``` -This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`, -along with saving the model and tokenizer of the pipeline, before pushing everything into the repository -`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option -`trust_remote_code=True`: +To use the pipeline, add `trust_remote_code=True` when loading the pipeline. 
```py from transformers import pipeline -classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) +pipeline = pipeline(task="pair-classification", trust_remote_code=True) ``` -## Add the pipeline to 🤗 Transformers +### Add to Transformers + +Adding a custom pipeline to Transformers requires adding tests to make sure everything works as expected, and requesting a review from the Transformers team. -If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule -with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`. +Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py). -Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other tests. +Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline. For example, take a look at the text classification pipeline test. -The `run_pipeline_test` function will be very generic and run on small random models on every possible -architecture as defined by `model_mapping` and `tf_model_mapping`. +The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models. -This is very important to test future compatibility, meaning if someone adds a new model for -`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's -impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the -output of the pipeline TYPE. +You'll also notice `ANY` is used throughout the `run_pipeline_test` function. The models are random, so you can't check the actual values. Using `ANY` allows the test to just match the output of the pipeline type instead. -You also *need* to implement 2 (ideally 4) tests. +Finally, you should also implement the following 4 tests. -- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_tf`. -- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) - and test the pipeline outputs. The results should be the same as `test_small_model_pt`. -- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. 
Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. -- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to - make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make - sure there is no drift in future releases. +1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result. +1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) nad [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow. From 0f0160e4eb46efa5f31376b81b09d5c8948ca378 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 30 Aug 2024 11:15:35 -0700 Subject: [PATCH 042/116] fix toctree --- docs/source/en/big_models.md | 215 ----------------------------------- 1 file changed, 215 deletions(-) delete mode 100644 docs/source/en/big_models.md diff --git a/docs/source/en/big_models.md b/docs/source/en/big_models.md deleted file mode 100644 index 0c1737af1abd..000000000000 --- a/docs/source/en/big_models.md +++ /dev/null @@ -1,215 +0,0 @@ - - -# Instantiate a big model - -A barrier to accessing very large pretrained models is the amount of memory required. When loading a pretrained PyTorch model, you usually: - -1. Create a model with random weights. -2. Load your pretrained weights. -3. Put those pretrained weights in the model. - -The first two steps both require a full version of the model in memory and if the model weighs several GBs, you may not have enough memory for two copies of it. This problem is amplified in distributed training environments because each process loads a pretrained model and stores two copies in memory. - -> [!TIP] -> The randomly created model is initialized with "empty" tensors, which take space in memory without filling it. The random values are whatever was in this chunk of memory at the time. To improve loading speed, the [`_fast_init`](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter is set to `True` by default to skip the random initialization for all weights that are correctly loaded. - -This guide will show you how Transformers can help you load large pretrained models despite their memory requirements. - -## Sharded checkpoints - -From Transformers v4.18.0, a checkpoint larger than 10GB is automatically sharded by the [`~PreTrainedModel.save_pretrained`] method. It is split into several smaller partial checkpoints and creates an index file that maps parameter names to the files they're stored in. 
- -The maximum shard size is controlled with the `max_shard_size` parameter, but by default it is 5GB, because it is easier to run on free-tier GPU instances without running out of memory. - -For example, let's shard [BioMistral/BioMistral-7B](https://hf.co/BioMistral/BioMistral-7B). - -```py ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="5GB") -... print(sorted(os.listdir(tmp_dir))) -['config.json', 'generation_config.json', 'model-00001-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model.safetensors.index.json'] -``` - -The sharded checkpoint is reloaded with the [`~PreTrainedModel.from_pretrained`] method. - -```py ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="5GB") -... new_model = AutoModel.from_pretrained(tmp_dir) -``` - -The main advantage of sharded checkpoints for big models is that each shard is loaded after the previous one, which caps the memory usage to only the model size and the largest shard size. - -You could also directly load a sharded checkpoint inside a model without the [`~PreTrainedModel.from_pretrained`] method (similar to PyTorch's `load_state_dict()` method for a full checkpoint). In this case, use the [`~modeling_utils.load_sharded_checkpoint`] method. - -```py ->>> from transformers.modeling_utils import load_sharded_checkpoint - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="5GB") -... load_sharded_checkpoint(model, tmp_dir) -``` - -### Shard metadata - -The index file determines which keys are in the checkpoint and where the corresponding weights are stored. This file is loaded like any other JSON file and you can get a dictionary from it. - -```py ->>> import json - ->>> with tempfile.TemporaryDirectory() as tmp_dir: -... model.save_pretrained(tmp_dir, max_shard_size="5GB") -... with open(os.path.join(tmp_dir, "model.safetensors.index.json"), "r") as f: -... index = json.load(f) - ->>> print(index.keys()) -dict_keys(['metadata', 'weight_map']) -``` - -The `metadata` key provides the total model size. - -```py ->>> index["metadata"] -{'total_size': 28966928384} -``` - -The `weight_map` key maps each parameter name (typically `state_dict` in a PyTorch model) to the shard it's stored in. - -```py ->>> index["weight_map"] -{'lm_head.weight': 'model-00006-of-00006.safetensors', - 'model.embed_tokens.weight': 'model-00001-of-00006.safetensors', - 'model.layers.0.input_layernorm.weight': 'model-00001-of-00006.safetensors', - 'model.layers.0.mlp.down_proj.weight': 'model-00001-of-00006.safetensors', - ... -} -``` - -## Accelerate's Big Model Inference - -> [!TIP] -> Make sure you have Accelerate v0.9.0 or later and PyTorch v1.9.0 or later installed. - -From Transformers v4.20.0, the [`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature to efficiently handle really big models! Big Model Inference creates a *model skeleton* on PyTorch's [**meta**](https://pytorch.org/docs/main/meta.html) device. The randomly initialized parameters are only created when the pretrained weights are loaded. 
This way, you aren't keeping two copies of the model in memory at the same time (one for the randomly initialized model and one for the pretrained weights), and the maximum memory consumed is only the full model size. - -To enable Big Model Inference in Transformers, set `low_cpu_mem_usage=True` in the [`~PreTrainedModel.from_pretrained`] method. - -```py -from transformers import AutoModelForCausalLM - -gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", low_cpu_mem_usage=True) -``` - -Accelerate automatically dispatches the model weights across all available devices, starting with the fastest device (GPU) first and then offloading to the slower devices (CPU and even hard drive). This is enabled by setting `device_map="auto"` in the [`~PreTrainedModel.from_pretrained`] method. When you pass the `device_map` parameter, `low_cpu_mem_usage` is automatically set to `True` so you don't need to specify it. - -```py -from transformers import AutoModelForCausalLM - -# these loading methods are equivalent -gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto") -gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto", low_cpu_mem_usage=True) -``` - -You can also write your own `device_map` by mapping each layer to a device. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device. - -```python -device_map = {"model.layers.1": 0, "model.layers.14": 1, "model.layers.31": "cpu", "lm_head": "disk"} -``` - -Access `hf_device_map` attribute to see how Accelerate split the model across devices. - -```py -gemma.hf_device_map -``` - -```python out -{'model.embed_tokens': 0, - 'model.layers.0': 0, - 'model.layers.1': 0, - 'model.layers.2': 0, - 'model.layers.3': 0, - 'model.layers.4': 0, - 'model.layers.5': 0, - 'model.layers.6': 0, - 'model.layers.7': 0, - 'model.layers.8': 0, - 'model.layers.9': 0, - 'model.layers.10': 0, - 'model.layers.11': 0, - 'model.layers.12': 0, - 'model.layers.13': 0, - 'model.layers.14': 'cpu', - 'model.layers.15': 'cpu', - 'model.layers.16': 'cpu', - 'model.layers.17': 'cpu', - 'model.layers.18': 'cpu', - 'model.layers.19': 'cpu', - 'model.layers.20': 'cpu', - 'model.layers.21': 'cpu', - 'model.layers.22': 'cpu', - 'model.layers.23': 'cpu', - 'model.layers.24': 'cpu', - 'model.layers.25': 'cpu', - 'model.layers.26': 'cpu', - 'model.layers.27': 'cpu', - 'model.layers.28': 'cpu', - 'model.layers.29': 'cpu', - 'model.layers.30': 'cpu', - 'model.layers.31': 'cpu', - 'model.norm': 'cpu', - 'lm_head': 'cpu'} -``` - -## Model data type - -PyTorch model weights are normally instantiated as torch.float32 and it can be an issue if you try to load a model as a different data type. For example, you'd need twice as much memory to load the weights in torch.float32 and then again to load them in your desired data type, like torch.float16. - -> [!WARNING] -> Due to how PyTorch is designed, the `torch_dtype` parameter only supports floating data types. - -To avoid wasting memory like this, explicitly set the `torch_dtype` parameter to the desired data type or set `torch_dtype="auto"` to load the weights with the most optimal memory pattern (the data type is automatically derived from the model weights). 
- - - - -```py -from transformers import AutoModelForCausalLM - -gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16) -``` - - - - -```py -from transformers import AutoModelForCausalLM - -gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype="auto") -``` - - - - -You can also set the data type to use for models instantiated from scratch. - -```python -import torch -from transformers import AutoConfig, AutoModel - -my_config = AutoConfig.from_pretrained("google/gemma-2b", torch_dtype=torch.float16) -model = AutoModel.from_config(my_config) -``` From 5c1628edf391b2b6378bf4514c3c3c2ca7098a95 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 30 Aug 2024 11:19:07 -0700 Subject: [PATCH 043/116] not-doctested --- utils/not_doctested.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index 42e897202352..656e9a6aaf8f 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -6,7 +6,6 @@ docs/source/en/agents.md docs/source/en/agents.md docs/source/en/attention.md docs/source/en/benchmarks.md -docs/source/en/big_models.md docs/source/en/community.md docs/source/en/contributing.md docs/source/en/custom_models.md From f0e9ec0364c8f9a27415d775470753821906f9a2 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 3 Sep 2024 15:24:31 -0700 Subject: [PATCH 044/116] prompting --- docs/source/en/_toctree.yml | 4 +- docs/source/en/tasks/prompting.md | 508 +++++++++--------------------- 2 files changed, 156 insertions(+), 356 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index bc6f5d52085e..2e36a29042c0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -59,7 +59,7 @@ - title: LLMs sections: - local: tasks/prompting - title: LLM prompting guide + title: Prompt engineering - local: llm_optims title: LLM inference optimization - local: kv_cache @@ -242,6 +242,8 @@ title: Image tasks with IDEFICS - local: tasks/image_text_to_text title: Image-text-to-text + - local: tasks/video_text_to_text + title: Video-text-to-text - local: benchmarks title: Benchmarks - local: notebooks diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md index 146ec328df0c..0b7f9a917ff7 100644 --- a/docs/source/en/tasks/prompting.md +++ b/docs/source/en/tasks/prompting.md @@ -1,4 +1,4 @@ - - -# LLM prompting guide +# Prompt engineering [[open-in-colab]] -Large Language Models such as Falcon, LLaMA, etc. are pretrained transformer models initially trained to predict the -next token given some input text. They typically have billions of parameters and have been trained on trillions of -tokens for an extended period of time. As a result, these models become quite powerful and versatile, and you can use -them to solve multiple NLP tasks out of the box by instructing the models with natural language prompts. - -Designing such prompts to ensure the optimal output is often called "prompt engineering". Prompt engineering is an -iterative process that requires a fair amount of experimentation. Natural languages are much more flexible and expressive -than programming languages, however, they can also introduce some ambiguity. At the same time, prompts in natural language -are quite sensitive to changes. Even minor modifications in prompts can lead to wildly different outputs. 
- -While there is no exact recipe for creating prompts to match all cases, researchers have worked out a number of best -practices that help to achieve optimal results more consistently. +Prompt engineering or prompting, refers to using natural language to improve large language model (LLM) performance on a variety of tasks. LLMs have tremendous capacity as a result of their training and size, such that a prompt can steer the model towards generating a desired output. In many cases ([but not all](#finetuning)), you don't need a finetuned model for a task, you just need a good prompt. -This guide covers the prompt engineering best practices to help you craft better LLM prompts and solve various NLP tasks. -You'll learn: +Try prompting a LLM to classify some text. When you create a prompt, it's very important to provide specific instructions about the task you want to perform and what the result should look like. -- [Basics of prompting](#basics-of-prompting) -- [Best practices of LLM prompting](#best-practices-of-llm-prompting) -- [Advanced prompting techniques: few-shot prompting and chain-of-thought](#advanced-prompting-techniques) -- [When to fine-tune instead of prompting](#prompting-vs-fine-tuning) +```py +from transformers import pipeline +import torch - - -Prompt engineering is only a part of the LLM output optimization process. Another essential component is choosing the -optimal text generation strategy. You can customize how your LLM selects each of the subsequent tokens when generating -the text without modifying any of the trainable parameters. By tweaking the text generation parameters, you can reduce -repetition in the generated text and make it more coherent and human-sounding. -Text generation strategies and parameters are out of scope for this guide, but you can learn more about these topics in -the following guides: - -* [Generation with LLMs](../llm_tutorial) -* [Text generation strategies](../generation_strategies) +pipeline = pipeline(task="text-generation", model="mistralai/Mistal-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Classify the text into neutral, negative or positive. +Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen. +Sentiment: +""" - +outputs = pipeline(prompt, max_new_tokens=10) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: Classify the text into neutral, negative or positive. +Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen. +Sentiment: +Positive +``` -## Basics of prompting +The challenge lies in designing prompts that produces the results you're expecting, which can be tricky, because language is so incredibly nuanced and expressive. -### Types of models +This guide covers prompt engineering best practices, techniques, and examples for how to solve language and reasoning tasks. -The majority of modern LLMs are decoder-only transformers. Some examples include: [LLaMA](../model_doc/llama), -[Llama2](../model_doc/llama2), [Falcon](../model_doc/falcon), [GPT2](../model_doc/gpt2). However, you may encounter -encoder-decoder transformer LLMs as well, for instance, [Flan-T5](../model_doc/flan-t5) and [BART](../model_doc/bart). 
+## Best practices -Encoder-decoder-style models are typically used in generative tasks where the output **heavily** relies on the input, for -example, in translation and summarization. The decoder-only models are used for all other types of generative tasks. +1. Try to pick the latest models for the best performance. Keep in mind that LLMs can come in two flavors, [base](https://hf.co/mistralai/Mistral-7B-v0.1) and [instruction-tuned](https://hf.co/mistralai/Mistral-7B-Instruct-v0.1) (or chat). -When using a pipeline to generate text with an LLM, it's important to know what type of LLM you are using, because -they use different pipelines. + Base models are excellent at completing text given an initial prompt, but they're not as good at following instructions. Instruction-tuned models are specifically trained versions of the base models on instructional or conversational data. This makes instruction-tuned models a better fit for prompting. -Run inference with decoder-only models with the `text-generation` pipeline: + > [!WARNING] + > Modern LLMs are typically decoder-only models, but there are some encoder-decoder LLMs like [Flan-T5](../model_doc/flan-t5) or [BART](../model_doc/bart) that may be used for prompting. For encoder-decoder models, make sure you set the pipeline task identifier to `text2text-generation` instead of `text-generation`. -```python ->>> from transformers import pipeline ->>> import torch +2. Start with a short and simple prompt, and iterate on it to get better results. ->>> torch.manual_seed(0) # doctest: +IGNORE_RESULT +3. Put instructions at the beginning or end of a prompt. For longer prompts, models may apply optimizations to prevent attention from scaling quadratically, which places more emphasis at the beginning and end of a prompt. ->>> generator = pipeline('text-generation', model = 'openai-community/gpt2') ->>> prompt = "Hello, I'm a language model" +4. Clearly separate instructions from the text of interest. ->>> generator(prompt, max_length = 30) -[{'generated_text': "Hello, I'm a language model. Not a programming language at all: it's pretty simple.\n\nWhen I write a function, I mean"}] -``` +5. Be specific and descriptive about the task and the desired output, including for example, its format, length, style, and language. Avoid ambiguous and vague descriptions and instructions. -To run inference with an encoder-decoder, use the `text2text-generation` pipeline: +6. Instructions should focus on "what to do" rather than "what not to do". -```python ->>> text2text_generator = pipeline("text2text-generation", model = 'google/flan-t5-base') ->>> prompt = "Translate from English to French: I'm very happy to see you" +7. Help lead the model generate the correct output by writing the first word or even the first sentence. ->>> text2text_generator(prompt) -[{'generated_text': 'Je suis très heureuse de vous rencontrer.'}] -``` +8. Try other techniques like [few-shot](#few-shot) and [chain-of-thought](#chain-of-thought) to improve results. -### Base vs instruct/chat models +9. Test your prompts with different models to assess their robustness. -Most of the recent LLM checkpoints available on 🤗 Hub come in two versions: base and instruct (or chat). For example, -[`tiiuae/falcon-7b`](https://huggingface.co/tiiuae/falcon-7b) and [`tiiuae/falcon-7b-instruct`](https://huggingface.co/tiiuae/falcon-7b-instruct). +10. Version and track your prompt performance. 
-Base models are excellent at completing the text when given an initial prompt, however, they are not ideal for NLP tasks -where they need to follow instructions, or for conversational use. This is where the instruct (chat) versions come in. -These checkpoints are the result of further fine-tuning of the pre-trained base versions on instructions and conversational data. -This additional fine-tuning makes them a better choice for many NLP tasks. +## Techniques -Let's illustrate some simple prompts that you can use with [`tiiuae/falcon-7b-instruct`](https://huggingface.co/tiiuae/falcon-7b-instruct) -to solve some common NLP tasks. +Crafting a good prompt alone, also known as zero-shot prompting, may not be enough to get the results you want. You may need to try a few prompting techniques to get the best performance. -### NLP tasks +This section covers a few of these techniques. -First, let's set up the environment: +### Few-shot -```bash -pip install -q transformers accelerate -``` +Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to the model to see how it affects performance. -Next, let's load the model with the appropriate pipeline (`"text-generation"`): +The example below provides the model with 1 example (1-shot) of the output format, a date in MM/DD/YYYY format, it should return. -```python ->>> from transformers import pipeline, AutoTokenizer ->>> import torch +```py +from transformers import pipeline +import torch ->>> torch.manual_seed(0) # doctest: +IGNORE_RESULT ->>> model = "tiiuae/falcon-7b-instruct" +pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961. +Date: 04/12/1961 +Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. +Date:""" ->>> tokenizer = AutoTokenizer.from_pretrained(model) ->>> pipe = pipeline( -... "text-generation", -... model=model, -... tokenizer=tokenizer, -... torch_dtype=torch.bfloat16, -... device_map="auto", -... ) +outputs = pipeline(prompt, max_new_tokens=12, do_sample=True, top_k=10) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: Text: The first human went into space and orbited the Earth on April 12, 1961. +Date: 04/12/1961 +Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. +Date: 09/28/1960 ``` - - -Note that Falcon models were trained using the `bfloat16` datatype, so we recommend you use the same. This requires a recent -version of CUDA and works best on modern cards. - - +The downside of few-shot prompting is that you need to create lengthier prompts which increases computation and latency. There is also a limit to prompt lengths. Finally, a model can learn unintended patterns from your examples and it doesn't work well on complex reasoning tasks. -Now that we have the model loaded via the pipeline, let's explore how you can use prompts to solve NLP tasks. 
+### Chain-of-thought -#### Text classification +Chain-of-thought (CoT) is effective at generating more coherent and well-reasoned outputs by providing a series of prompts that help a model "think" more thoroughly about a topic. -One of the most common forms of text classification is sentiment analysis, which assigns a label like "positive", "negative", -or "neutral" to a sequence of text. Let's write a prompt that instructs the model to classify a given text (a movie review). -We'll start by giving the instruction, and then specifying the text to classify. Note that instead of leaving it at that, we're -also adding the beginning of the response - `"Sentiment: "`: +The example below provides the model with several prompts that forces it to work through several intermediate reasoning steps. -```python ->>> torch.manual_seed(0) # doctest: +IGNORE_RESULT ->>> prompt = """Classify the text into neutral, negative or positive. -... Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen. -... Sentiment: -... """ +```py +from transformers import pipeline +import torch ->>> sequences = pipe( -... prompt, -... max_new_tokens=10, -... ) +pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Let's go through this step-by-step: +1. You start with 15 muffins. +2. You eat 2 muffins, leaving you with 13 muffins. +3. You give 5 muffins to your neighbor, leaving you with 8 muffins. +4. Your partner buys 6 more muffins, bringing the total number of muffins to 14. +5. Your partner eats 2 muffins, leaving you with 12 muffins. +If you eat 6 muffins, how many are left?""" ->>> for seq in sequences: -... print(f"Result: {seq['generated_text']}") -Result: Classify the text into neutral, negative or positive. -Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen. -Sentiment: -Positive +outputs = pipeline(prompt, max_new_tokens=20, do_sample=True, top_k=10) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: Let's go through this step-by-step: +1. You start with 15 muffins. +2. You eat 2 muffins, leaving you with 13 muffins. +3. You give 5 muffins to your neighbor, leaving you with 8 muffins. +4. Your partner buys 6 more muffins, bringing the total number of muffins to 14. +5. Your partner eats 2 muffins, leaving you with 12 muffins. +If you eat 6 muffins, how many are left? +Answer: 6 ``` -As a result, the output contains a classification label from the list we have provided in the instructions, and it is a correct one! +Like [few-shot](#few-shot) prompting, the downside of CoT is that it requires more effort to design a series of prompts that help the model reason through a complex task and the prompt length increases latency. - +## Finetuning -You may notice that in addition to the prompt, we pass a `max_new_tokens` parameter. It controls the number of tokens the -model shall generate, and it is one of the many text generation parameters that you can learn about -in [Text generation strategies](../generation_strategies) guide. +While prompting is a powerful way to work with LLMs, there are scenarios where a finetuned model or even finetuning a model works better. 
- +Here are some examples scenarios where a finetuned model makes sense. -#### Named Entity Recognition +- Your domain is extremely different from what a LLM was pretrained on, and extensive prompting didn't produce the results you want. +- Your model needs to work well in a low-resource language. +- Your model needs to be trained on sensitive data that have strict regulatory requirements. +- You're using a small model due to cost, privacy, infrastructure, or other constraints. -Named Entity Recognition (NER) is a task of finding named entities in a piece of text, such as a person, location, or organization. -Let's modify the instructions in the prompt to make the LLM perform this task. Here, let's also set `return_full_text = False` -so that output doesn't contain the prompt: +In all of these scenarios, ensure that you have a large enough domain-specific dataset to train your model with, have enough time and resources, and the cost of finetuning is worth it. Otherwise, you may be better off trying to optimize your prompt! -```python ->>> torch.manual_seed(1) # doctest: +IGNORE_RESULT ->>> prompt = """Return a list of named entities in the text. -... Text: The Golden State Warriors are an American professional basketball team based in San Francisco. -... Named entities: -... """ +## Examples ->>> sequences = pipe( -... prompt, -... max_new_tokens=15, -... return_full_text = False, -... ) - ->>> for seq in sequences: -... print(f"{seq['generated_text']}") -- Golden State Warriors -- San Francisco -``` +Here are some examples of prompting a LLM for different tasks. -As you can see, the model correctly identified two named entities from the given text. + + -#### Translation +```py +from transformers import pipeline +import torch -Another task LLMs can perform is translation. You can choose to use encoder-decoder models for this task, however, here, -for the simplicity of the examples, we'll keep using Falcon-7b-instruct, which does a decent job. Once again, here's how -you can write a basic prompt to instruct a model to translate a piece of text from English to Italian: +pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Return a list of named entities in the text. +Text: The company was founded in 2016 by French entrepreneurs Clément Delangue, Julien Chaumond, and Thomas Wolf in New York City, originally as a company that developed a chatbot app targeted at teenagers. +Named entities: +""" -```python ->>> torch.manual_seed(2) # doctest: +IGNORE_RESULT ->>> prompt = """Translate the English text to Italian. -... Text: Sometimes, I've believed as many as six impossible things before breakfast. -... Translation: -... """ - ->>> sequences = pipe( -... prompt, -... max_new_tokens=20, -... do_sample=True, -... top_k=10, -... return_full_text = False, -... ) - ->>> for seq in sequences: -... print(f"{seq['generated_text']}") -A volte, ho creduto a sei impossibili cose prima di colazione. +outputs = pipeline(prompt, max_new_tokens=50, return_full_text=False) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: [Clément Delangue, Julien Chaumond, Thomas Wolf, company, New York City, chatbot app, teenagers] ``` -Here we've added a `do_sample=True` and `top_k=10` to allow the model to be a bit more flexible when generating output. 
- -#### Text summarization - -Similar to the translation, text summarization is another generative task where the output **heavily** relies on the input, -and encoder-decoder models can be a better choice. However, decoder-style models can be used for this task as well. -Previously, we have placed the instructions at the very beginning of the prompt. However, the very end of the prompt can -also be a suitable location for instructions. Typically, it's better to place the instruction on one of the extreme ends. - -```python ->>> torch.manual_seed(3) # doctest: +IGNORE_RESULT ->>> prompt = """Permaculture is a design process mimicking the diversity, functionality and resilience of natural ecosystems. The principles and practices are drawn from traditional ecological knowledge of indigenous cultures combined with modern scientific understanding and technological innovations. Permaculture design provides a framework helping individuals and communities develop innovative, creative and effective strategies for meeting basic needs while preparing for and mitigating the projected impacts of climate change. -... Write a summary of the above text. -... Summary: -... """ - ->>> sequences = pipe( -... prompt, -... max_new_tokens=30, -... do_sample=True, -... top_k=10, -... return_full_text = False, -... ) - ->>> for seq in sequences: -... print(f"{seq['generated_text']}") -"Permaculture is an ecological design method that mimics natural ecosystems' diversity, functionality, and resilience using modern technology and indigenous knowledge. It aims to help" -``` + + -#### Question answering - -For question answering task we can structure the prompt into the following logical components: instructions, context, question, and -the leading word or phrase (`"Answer:"`) to nudge the model to start generating the answer: - -```python ->>> torch.manual_seed(4) # doctest: +IGNORE_RESULT ->>> prompt = """Answer the question using the context below. -... Context: Gazpacho is a cold soup and drink made of raw, blended vegetables. Most gazpacho includes stale bread, tomato, cucumbers, onion, bell peppers, garlic, olive oil, wine vinegar, water, and salt. Northern recipes often include cumin and/or pimentón (smoked sweet paprika). Traditionally, gazpacho was made by pounding the vegetables in a mortar with a pestle; this more laborious method is still sometimes used as it helps keep the gazpacho cool and avoids the foam and silky consistency of smoothie versions made in blenders or food processors. -... Question: What modern tool is used to make gazpacho? -... Answer: -... """ - ->>> sequences = pipe( -... prompt, -... max_new_tokens=10, -... do_sample=True, -... top_k=10, -... return_full_text = False, -... ) - ->>> for seq in sequences: -... print(f"Result: {seq['generated_text']}") -"Result: Modern tools are used, such as immersion blenders" -``` +```py +from transformers import pipeline +import torch -#### Reasoning +pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Translate the English text to French. +Text: Sometimes, I've believed as many as six impossible things before breakfast. +Translation: +""" -Reasoning is one of the most difficult tasks for LLMs, and achieving good results often requires applying advanced prompting techniques, like -[Chain-of-thought](#chain-of-thought). 
- -Let's try if we can make a model reason about a simple arithmetics task with a basic prompt: - -```python ->>> torch.manual_seed(5) # doctest: +IGNORE_RESULT ->>> prompt = """There are 5 groups of students in the class. Each group has 4 students. How many students are there in the class?""" - ->>> sequences = pipe( -... prompt, -... max_new_tokens=30, -... do_sample=True, -... top_k=10, -... return_full_text = False, -... ) - ->>> for seq in sequences: -... print(f"Result: {seq['generated_text']}") -Result: -There are a total of 50 students in the class (5 groups x 4 students per group = 20 groups, and +outputs = pipeline(prompt, max_new_tokens=20, do_sample=True, top_k=10, return_full_text=False) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: À l'occasion, j'ai croyu plus de six choses impossibles ``` -Correct! Let's increase the complexity a little and see if we can still get away with a basic prompt: + + -```python ->>> torch.manual_seed(6) # doctest: +IGNORE_RESULT ->>> prompt = """I baked 15 muffins. I ate 2 muffins and gave 5 muffins to a neighbor. My partner then bought 6 more muffins and ate 2. How many muffins do we now have?""" +```py +from transformers import pipeline +import torch ->>> sequences = pipe( -... prompt, -... max_new_tokens=10, -... do_sample=True, -... top_k=10, -... return_full_text = False, -... ) +pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Permaculture is a design process mimicking the diversity, functionality and resilience of natural ecosystems. The principles and practices are drawn from traditional ecological knowledge of indigenous cultures combined with modern scientific understanding and technological innovations. Permaculture design provides a framework helping individuals and communities develop innovative, creative and effective strategies for meeting basic needs while preparing for and mitigating the projected impacts of climate change. +Write a summary of the above text. +Summary: +""" ->>> for seq in sequences: -... print(f"Result: {seq['generated_text']}") -Result: -The total number of muffins now is 21 -``` - -This is a wrong answer, it should be 12. In this case, this can be due to the prompt being too basic, or due to the choice -of model, after all we've picked the smallest version of Falcon. Reasoning is difficult for models of all sizes, but larger -models are likely to perform better. - -## Best practices of LLM prompting - -In this section of the guide we have compiled a list of best practices that tend to improve the prompt results: - -* When choosing the model to work with, the latest and most capable models are likely to perform better. -* Start with a simple and short prompt, and iterate from there. -* Put the instructions at the beginning of the prompt, or at the very end. When working with large context, models apply various optimizations to prevent Attention complexity from scaling quadratically. This may make a model more attentive to the beginning or end of a prompt than the middle. -* Clearly separate instructions from the text they apply to - more on this in the next section. -* Be specific and descriptive about the task and the desired outcome - its format, length, style, language, etc. -* Avoid ambiguous descriptions and instructions. -* Favor instructions that say "what to do" instead of those that say "what not to do". 
-* "Lead" the output in the right direction by writing the first word (or even begin the first sentence for the model). -* Use advanced techniques like [Few-shot prompting](#few-shot-prompting) and [Chain-of-thought](#chain-of-thought) -* Test your prompts with different models to assess their robustness. -* Version and track the performance of your prompts. - -## Advanced prompting techniques - -### Few-shot prompting - -The basic prompts in the sections above are the examples of "zero-shot" prompts, meaning, the model has been given -instructions and context, but no examples with solutions. LLMs that have been fine-tuned on instruction datasets, generally -perform well on such "zero-shot" tasks. However, you may find that your task has more complexity or nuance, and, perhaps, -you have some requirements for the output that the model doesn't catch on just from the instructions. In this case, you can -try the technique called few-shot prompting. - -In few-shot prompting, we provide examples in the prompt giving the model more context to improve the performance. -The examples condition the model to generate the output following the patterns in the examples. - -Here's an example: - -```python ->>> torch.manual_seed(0) # doctest: +IGNORE_RESULT ->>> prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961. -... Date: 04/12/1961 -... Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. -... Date:""" - ->>> sequences = pipe( -... prompt, -... max_new_tokens=8, -... do_sample=True, -... top_k=10, -... ) - ->>> for seq in sequences: -... print(f"Result: {seq['generated_text']}") -Result: Text: The first human went into space and orbited the Earth on April 12, 1961. -Date: 04/12/1961 -Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. -Date: 09/28/1960 +outputs = pipeline(prompt, max_new_tokens=30, do_sample=True, top_k=10, return_full_text=False) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: Permaculture is the design process that involves mimicking natural ecosystems to provide sustainable solutions to basic needs. It is a holistic approach that comb ``` -In the above code snippet we used a single example to demonstrate the desired output to the model, so this can be called a -"one-shot" prompting. However, depending on the task complexity you may need to use more than one example. - -Limitations of the few-shot prompting technique: -- While LLMs can pick up on the patterns in the examples, these technique doesn't work well on complex reasoning tasks -- Few-shot prompting requires creating lengthy prompts. Prompts with large number of tokens can increase computation and latency. There's also a limit to the length of the prompts. -- Sometimes when given a number of examples, models can learn patterns that you didn't intend them to learn, e.g. that the third movie review is always negative. - -### Chain-of-thought + + -Chain-of-thought (CoT) prompting is a technique that nudges a model to produce intermediate reasoning steps thus improving -the results on complex reasoning tasks. 
+```py +from transformers import pipeline +import torch -There are two ways of steering a model to producing the reasoning steps: -- few-shot prompting by illustrating examples with detailed answers to questions, showing the model how to work through a problem. -- by instructing the model to reason by adding phrases like "Let's think step by step" or "Take a deep breath and work through the problem step by step." +pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto") +prompt = """Answer the question using the context below. +Context: Gazpacho is a cold soup and drink made of raw, blended vegetables. Most gazpacho includes stale bread, tomato, cucumbers, onion, bell peppers, garlic, olive oil, wine vinegar, water, and salt. Northern recipes often include cumin and/or pimentón (smoked sweet paprika). Traditionally, gazpacho was made by pounding the vegetables in a mortar with a pestle; this more laborious method is still sometimes used as it helps keep the gazpacho cool and avoids the foam and silky consistency of smoothie versions made in blenders or food processors. +Question: What modern tool is used to make gazpacho? +Answer: +""" -If we apply the CoT technique to the muffins example from the [reasoning section](#reasoning) and use a larger model, -such as (`tiiuae/falcon-180B-chat`) which you can play with in the [HuggingChat](https://huggingface.co/chat/), -we'll get a significant improvement on the reasoning result: - -```text -Let's go through this step-by-step: -1. You start with 15 muffins. -2. You eat 2 muffins, leaving you with 13 muffins. -3. You give 5 muffins to your neighbor, leaving you with 8 muffins. -4. Your partner buys 6 more muffins, bringing the total number of muffins to 14. -5. Your partner eats 2 muffins, leaving you with 12 muffins. -Therefore, you now have 12 muffins. +outputs = pipeline(prompt, max_new_tokens=10, do_sample=True, top_k=10, return_full_text=False) +for output in outputs: + print(f"Result: {output['generated_text']}") +Result: A blender or food processor is the modern tool ``` -## Prompting vs fine-tuning - -You can achieve great results by optimizing your prompts, however, you may still ponder whether fine-tuning a model -would work better for your case. Here are some scenarios when fine-tuning a smaller model may be a preferred option: - -- Your domain is wildly different from what LLMs were pre-trained on and extensive prompt optimization did not yield sufficient results. -- You need your model to work well in a low-resource language. -- You need the model to be trained on sensitive data that is under strict regulations. -- You have to use a small model due to cost, privacy, infrastructure or other limitations. - -In all of the above examples, you will need to make sure that you either already have or can easily obtain a large enough -domain-specific dataset at a reasonable cost to fine-tune a model. You will also need to have enough time and resources -to fine-tune a model. - -If the above examples are not the case for you, optimizing prompts can prove to be more beneficial. 
- - + + From 8526857bbd307770166a6e9bca1b92e03e44ed3f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 4 Sep 2024 14:22:05 -0700 Subject: [PATCH 045/116] llm optims --- docs/source/en/_toctree.yml | 2 +- docs/source/en/llm_optims.md | 114 +++++++++++++++++------------------ 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2e36a29042c0..19f72286c30c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -61,7 +61,7 @@ - local: tasks/prompting title: Prompt engineering - local: llm_optims - title: LLM inference optimization + title: Optimize inference - local: kv_cache title: Best Practices for Generation with Cache - local: llm_tutorial diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 37406ea0bef2..e2e7747343b7 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -9,46 +9,42 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -# LLM inference optimization +# Optimize inference -Large language models (LLMs) have pushed text generation applications, such as chat and code completion models, to the next level by producing text that displays a high level of understanding and fluency. But what makes LLMs so powerful - namely their size - also presents challenges for inference. +Inference with large language models (LLMs) can be challenging because they have to store and handle billions of parameters. To load a 70B parameter [Llama 2](https://hf.co/meta-llama/Llama-2-70b-hf) model, it requires 256GB of memory for full precision weights and 128GB of memory for half-precision weights. For comparison, the most powerful GPUs today - the A100 and H100 - only have 80GB of memory. -Basic inference is slow because LLMs have to be called repeatedly to generate the next token. The input sequence increases as generation progresses, which takes longer and longer for the LLM to process. LLMs also have billions of parameters, making it a challenge to store and handle all those weights in memory. +On top of the memory requirements, inference is slow because LLMs are called repeatedly to generate the next token. The input sequence increases as generation progresses, which takes longer and longer to process. -This guide will show you how to use the optimization techniques available in Transformers to accelerate LLM inference. +This guide will show you how to optimize LLM inference to accelerate generation and reduce memory usage. > [!TIP] -> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference. +> Try out [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a Hugging Face library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference. 
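As a rough sanity check on the memory figures above, weight memory is approximately the parameter count times the bytes per parameter. The sketch below estimates the weights only (the kv-cache and activations add more), and the exact numbers shift slightly depending on the precise parameter count and whether you count in GB or GiB.

```py
# Back-of-the-envelope estimate of weight memory only.
def weight_memory_gib(num_params: float, bytes_per_param: float) -> float:
    return num_params * bytes_per_param / 1024**3

for dtype, nbytes in [("float32", 4), ("bfloat16/float16", 2), ("int8", 1), ("int4", 0.5)]:
    print(f"{dtype}: ~{weight_memory_gib(70e9, nbytes):.0f} GiB")
# float32: ~261 GiB, bfloat16/float16: ~130 GiB, int8: ~65 GiB, int4: ~33 GiB
```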
-## Static kv-cache and `torch.compile` +## Static kv-cache and torch.compile -During decoding, a LLM computes the key-value (kv) values for each input token and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input now. This is not very efficient because you're recomputing the same kv values each time. +LLMs compute key-value (kv) values for each input token, and it performs the same kv computation each time because the generated output becomes part of the input. However, performing the same kv computation every time is not very efficient. -To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels. We have an entire guide dedicated to kv-caches [here](./kv_cache). +A *kv-cache* stores the past keys and values instead of recomputing them each time. But the kv-cache is dynamic and it grows with each generation step which prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization method that fuses PyTorch code into optimized kernels. -The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. +The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value, which allows you to combine it with [torch.compile](./perf_torch_compile) for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. > [!WARNING] -> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and `torch.compile`. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list. +> Follow this [issue](https://github.com/huggingface/transformers/issues/28981) to track which models (Llama, Gemma, Mistral, etc.) support a static kv-cache and torch.compile. -There are three flavors of static kv-cache usage, depending on the complexity of your task: -1. Basic usage: simply set a flag in `generation_config` (recommended); -2. Advanced usage: handle a cache object for multi-turn generation or a custom generation loop; -3. Advanced usage: compile the entire `generate` function into a single graph, if having a single graph is relevant for you. +Depending on your task, there are several ways you can use the static kv-cache. -Select the correct tab below for further instructions on each of these flavors. +1. For basic use cases, set [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) to `"static"` (recommended). +2. For multi-turn generation or a custom generation loop, initialize and handle [`StaticCache`] directly. +3. For more unique hardware or use cases, it may be better to compile the entire [`~GenerationMixin.generate`] function into a single graph. > [!TIP] -> Regardless of the strategy used with `torch.compile`, you can avoid shape-related recompilations if you left-pad your LLM inputs to a limited set of values. 
The [`pad_to_multiple_of` tokenizer flag](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) is your friend! +> Regardless of how you use the static kv-cache and torch.compile, left-pad your inputs with [pad_to_multiple_of](https://hf.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) to a limited set of values to avoid shape-related recompilations. - + -For this example, let's use the [Gemma](https://hf.co/google/gemma-2b) model. All we need to do is to: -1. Access the model's `generation_config` attribute and set the `cache_implementation` to "static"; -2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. - -And that's it! +1. Set the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) to `"static"` in a models [`GenerationConfig`]. +2. Call [torch.compile](./perf_torch_compile) to compile the forward pass with the static kv-cache. ```py from transformers import AutoTokenizer, AutoModelForCausalLM @@ -70,17 +66,15 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) ['The theory of special relativity states 1. The speed of light is constant in all inertial reference'] ``` -Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. Avoiding re-compilation is critical to get the most out of `torch.compile`, and you should be aware of the following: -1. If the batch size changes or the maximum output length increases between calls, the cache will have to be reinitialized, triggering a new compilation; -2. The first couple of calls of the compiled function are slower, as the function is being compiled. +Under the hood, [`~GenerationMixin.generate`] attempts to reuse the same cache object to avoid recompilation at each call, which is critical to get the most out of [torch.compile](./perf_torch_compile). Be aware of the following to avoid triggering recompilation or if generation is slower than expected. -> [!WARNING] -> For a more advanced usage of the static cache, such as multi-turn conversations, we recommend instantiating and manipulating the cache object outside [`~GenerationMixin.generate`]. See the advanced usage tab. +1. If the batch size changes or the maximum output length increases between calls, the cache is reinitialized and recompiled. +2. The first several calls of the compiled function are slower because it is being compiled. - + -A [`StaticCache`] object can be passed to the model's [`~GenerationMixin.generate`] under the `past_key_values` argument. The object will retain the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, like you would do with a dynamic cache. +Directly initialize a [`StaticCache`] object and pass it to the `past_key_values` parameter in [`~GenerationMixin.generate`]. The [`StaticCache`] keeps the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, similar to a dynamic cache. 
```py from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache @@ -118,9 +112,9 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) ``` > [!TIP] -> If you want to reuse the same [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method between calls +> To reuse [`StaticCache`] on a new prompt, use [`~StaticCache.reset`] to reset the cache contents between calls. -If you want to go further down a level, the [`StaticCache`] object can also be passed to the model's forward pass under the same `past_key_values` argument. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. +Another option for using [`StaticCache`] is to pass it to a models forward pass using the same `past_key_values` argument. This allows you to write your own custom decoding function to decode the next token given the current token, position, and cache position of previously generated tokens. ```py from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging @@ -153,10 +147,11 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu return new_token ``` -There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method: -1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length. -2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. -3. Use `SDPBackend.MATH` in the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. +To enable static kv-cache and [torch.compile](./perf_torch_compile) with [`StaticCache`], follow the steps below. + +1. Initialize [`StaticCache`] before using the model for inference to configure parameters like the maximum batch size and sequence length. +2. Call [torch.compile](./perf_torch_compile) on the model to compile the forward pass with the static kv-cache. +3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. ```py from torch.nn.attention import SDPBackend, sdpa_kernel @@ -193,9 +188,9 @@ text ``` - + -Compiling the entire `generate` function, in terms of code, is even simpler than in the basic usage: call `torch.compile` on `generate` to compile the entire function. No need to specify the use of the static cache: although it is compatible, dynamic cache (default) was faster in our benchmarks. +Compiling the entire [`~GenerationMixin.generate`] function also compiles the input preparation logit processor operations, and more in addition to the forward pass. With this approach, you don't need to initialize [`StaticCache`] or set the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) parameter. 
```py from transformers import AutoTokenizer, AutoModelForCausalLM @@ -215,28 +210,33 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) ['The theory of special relativity states 1. The speed of light is constant in all inertial reference'] ``` -As a result, we compile not only the model forward pass, but also all input preparation, logit processor operations, and so on. The result should be a slightly `generate` call, compared to the basic usage example, and the compiled graph may be better suited to more exotic hardware devices or use cases. However, there are severe drawbacks in using this approach: -1. Compilation is much slower; -2. All parameterization of `generate` must be done through `generation_config`; -3. Many warnings and exceptions are suppressed -- we suggest testing with its uncompiled form first; -4. Although we are working on it, it is heavily feature restricted (for instance, at the time of writing, generation does not stop if an EOS token is selected). +This usage pattern is more appropriate for unique hardware or use cases, but there are several drawbacks to consider. + +1. Compilation is much slower. +2. Parameters must be configured through [`GenerationConfig`]. +3. Many warnings and exceptions are suppressed. We recommend testing the uncompiled model first. +4. Many features are unavailable at the moment. For example, generation does not stop if an `EOS` token is selected. -## Speculative decoding +## Decoding + +Decoding can also be optimized to accelerate generation. You can use a lightweight assistant model to generate candidate tokens faster than the LLM itself or you can use a variant of this decoding strategy that works especially well for input-grounded tasks. + +### Speculative decoding > [!TIP] > For a more in-depth explanation, take a look at the [Assisted Generation: a new direction toward low-latency text generation](https://hf.co/blog/assisted-generation) blog post! -Another issue with autoregression is that for each input token you need to load the model weights each time during the forward pass. This is slow and cumbersome for LLMs which have billions of parameters. Speculative decoding alleviates this slowdown by using a second smaller and faster assistant model to generate candidate tokens that are verified by the larger LLM in a single forward pass. If the verified tokens are correct, the LLM essentially gets them for "free" without having to generate them itself. There is no degradation in accuracy because the verification forward pass ensures the same outputs are generated as if the LLM had generated them on its own. +For each input token, the model weights are loaded each time during the forward pass, which is slow and cumbersome when a model has billions of parameters. Speculative decoding alleviates this slowdown by using a second smaller and faster assistant model to generate candidate tokens that are verified by the larger model in a single forward pass. If the verified tokens are correct, the LLM essentially gets them for "free" without having to generate them itself. There is no degradation in accuracy because the verification forward pass ensures the same outputs are generated as if the LLM had generated them on its own. To get the largest speed up, the assistant model should be a lot smaller than the LLM so that it can generate tokens quickly. The assistant and LLM model must also share the same tokenizer to avoid re-encoding and decoding tokens. 
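A quick way to confirm the shared tokenizer requirement is to compare the two tokenizers' vocabularies before generating. This check is an illustrative sketch using the same OPT checkpoints as the example below.

```py
from transformers import AutoTokenizer

# The target and assistant models must tokenize text identically for the verification step to work.
target_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
assistant_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
assert target_tokenizer.get_vocab() == assistant_tokenizer.get_vocab(), "the assistant model must share the target model's tokenizer"
```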
> [!WARNING] > Speculative decoding is only supported for the greedy search and sampling decoding strategies, and it also doesn't support batched inputs. -Enable speculative decoding by loading an assistant model and passing it to the [`~GenerationMixin.generate`] method. +Enable speculative decoding by loading an assistant model and passing it to [`~GenerationMixin.generate`]. @@ -261,7 +261,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True) -For speculative sampling decoding, add the `do_sample` and `temperature` parameters to the [`~GenerationMixin.generate`] method in addition to the assistant model. +For speculative sampling decoding, add the [do_sample](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.do_sample) and [temperature](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.temperature) parameters to [`~GenerationMixin.generate`]. ```py from transformers import AutoModelForCausalLM, AutoTokenizer @@ -287,7 +287,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) Prompt lookup decoding is a variant of speculative decoding that is also compatible with greedy search and sampling. Prompt lookup works especially well for input-grounded tasks - such as summarization - where there is often overlapping words between the prompt and output. These overlapping n-grams are used as the LLM candidate tokens. -To enable prompt lookup decoding, specify the number of tokens that should be overlapping in the `prompt_lookup_num_tokens` parameter. Then you can pass this parameter to the [`~GenerationMixin.generate`] method. +To enable prompt lookup decoding, specify the number of tokens that should be overlapping in the [prompt_lookup_num_tokens](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.prompt_lookup_num_tokens) parameter. Then pass this parameter to [`~GenerationMixin.generate`]. @@ -312,7 +312,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) -For prompt lookup decoding with sampling, add the `do_sample` and `temperature` parameters to the [`~GenerationMixin.generate`] method. +For prompt lookup decoding with sampling, add the [do_sample](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.do_sample) and [temperature](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig.temperature) parameters to [`~GenerationMixin.generate`]. ```py from transformers import AutoModelForCausalLM, AutoTokenizer @@ -333,15 +333,15 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) -## Attention optimizations +## Attention -A known issue with transformer models is that the self-attention mechanism grows quadratically in compute and memory with the number of input tokens. This limitation is only magnified in LLMs which handles much longer sequences. To address this, try FlashAttention2 or PyTorch's scaled dot product attention (SDPA), which are more memory efficient attention implementations and can accelerate inference. +A known issue with transformer models is that the self-attention mechanism grows quadratically in compute and memory with the number of input tokens. This limitation is only magnified in LLMs which handles much longer sequences. To address this, try FlashAttention2 or PyTorch's scaled dot product attention (SDPA), which are more memory efficient attention implementations. 
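To get a feel for that quadratic growth, the sketch below estimates the size of the attention score matrices alone for one layer; the head count, batch size, and half-precision dtype are illustrative assumptions.

```py
# Size of the attention score matrices (batch x heads x seq_len x seq_len) for one layer,
# in half precision. Weights, activations, and the kv-cache come on top of this.
def attention_scores_gib(seq_len: int, num_heads: int = 32, batch_size: int = 1, bytes_per_value: int = 2) -> float:
    return batch_size * num_heads * seq_len**2 * bytes_per_value / 1024**3

for seq_len in (1024, 4096, 16384):
    print(f"{seq_len} tokens: ~{attention_scores_gib(seq_len):.2f} GiB")
# 1024 tokens: ~0.06 GiB, 4096 tokens: ~1.00 GiB, 16384 tokens: ~16.00 GiB
```

Quadrupling the sequence length multiplies this term by sixteen, which is why more memory efficient attention implementations matter for long inputs.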
### FlashAttention-2 -FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead. +FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to the GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead. -To use FlashAttention-2, set `attn_implementation="flash_attention_2"` in the [`~PreTrainedModel.from_pretrained`] method. +To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`]. ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig @@ -453,7 +453,7 @@ trainer.train() Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation. > [!TIP] -> SDPA supports FlashAttention-2 as long as you have the latest PyTorch version installed. +> SDPA automaticallysupports FlashAttention-2 as long as you have the latest PyTorch version installed. Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention. @@ -473,12 +473,12 @@ with sdpa_kernel(SDPBackend.FLASH_ATTENTION): ## Quantization -Quantization reduces the size of the LLM weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by your GPUs memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can incur a small latency cost (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. +Quantization reduces the size of model weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by GPU memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can increase latency slightly (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. > [!TIP] > There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. 
We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. -Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating how much memory it costs to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1). +Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating the memory required to load [Mistral-7B-v0.1](https://hf.co/mistralai/Mistral-7B-v0.1). -To load Mistral-7B-v0.1 in half-precision, set the `torch_dtype` parameter in the [`~transformers.AutoModelForCausalLM.from_pretrained`] method to `torch.bfloat16`. This requires 13.74GB of memory. +To load a model in half-precision, set the [torch_dtype](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.torch_dtype) parameter in [`~transformers.AutoModelForCausalLM.from_pretrained`] to `torch.bfloat16`. This requires 13.74GB of memory. ```py from transformers import AutoTokenizer, AutoModelForCausalLM @@ -498,7 +498,7 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` -To load a quantized model (8-bit or 4-bit) for inference, try [bitsandbytes](https://hf.co/docs/bitsandbytes) and set the `load_in_4bit` or `load_in_8bit` parameters to `True`. Loading the model in 8-bits only requires 6.87 GB of memory. +To load a quantized model (8-bit or 4-bit), try [bitsandbytes](https://hf.co/docs/bitsandbytes) and set the [load_in_4bit](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.BitsAndBytesConfig.load_in_4bit) or [load_in_8bit](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.BitsAndBytesConfig.load_in_8bit) parameters to `True`. Loading the model in 8-bits only requires 6.87 GB of memory. 
```py from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig From 7fbf2212f85f07a13482661abdb3bf5d1f1ba381 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 4 Sep 2024 14:30:10 -0700 Subject: [PATCH 046/116] fix toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 19f72286c30c..90bfe4092a3f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -86,6 +86,8 @@ title: Optimize inference using `torch.compile()` - local: agents title: Agents + - local: agents_advanced + title: Agents, supercharged - Multi-agents, External tools, and more - local: multilingual title: Run inference with multilingual models - local: gguf From 3c1216884d669b75bef3630faaf320e3694d9e04 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 9 Sep 2024 16:35:49 -0700 Subject: [PATCH 047/116] fixes --- docs/source/en/_toctree.yml | 2 + docs/source/en/autoclass_tutorial.md | 189 ------- docs/source/en/bertology.md | 41 -- docs/source/en/performance.md | 73 --- docs/source/en/preprocessing.md | 534 ------------------ .../test_modeling_falcon_mamba.py | 2 +- 6 files changed, 3 insertions(+), 838 deletions(-) delete mode 100644 docs/source/en/autoclass_tutorial.md delete mode 100644 docs/source/en/bertology.md delete mode 100644 docs/source/en/performance.md delete mode 100644 docs/source/en/preprocessing.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 90bfe4092a3f..dbe01f4eb53f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -92,6 +92,8 @@ title: Run inference with multilingual models - local: gguf title: Interoperability with GGUF files + - local: tiktoken + title: Interoperability with TikToken files - local: perf_infer_cpu title: CPU inference - local: perf_infer_gpu_one diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md deleted file mode 100644 index f4601fba1e6f..000000000000 --- a/docs/source/en/autoclass_tutorial.md +++ /dev/null @@ -1,189 +0,0 @@ - - -# AutoClass API - -With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained()` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. - - - -Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/google-bert/bert-base-uncased) is an architecture, while `google-bert/bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint. - - - -In this tutorial, learn to: - -* Load a pretrained tokenizer. -* Load a pretrained image processor -* Load a pretrained feature extractor. -* Load a pretrained processor. -* Load a pretrained model. -* Load a model as a backbone. - -## AutoTokenizer - -Nearly every NLP task begins with a tokenizer. 
A tokenizer converts your input into a format that can be processed by the model. - -Load a tokenizer with [`AutoTokenizer.from_pretrained`]: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") -``` - -Then tokenize your input as shown below: - -```py ->>> sequence = "In a hole in the ground there lived a hobbit." ->>> print(tokenizer(sequence)) -{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -## AutoImageProcessor - -For vision tasks, an image processor processes the image into the correct input format. - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -``` - -## AutoBackbone - -
- -
A Swin backbone with multiple stages for outputting a feature map.
-
- -The [`AutoBackbone`] lets you use pretrained models as backbones to get feature maps from different stages of the backbone. You should specify one of the following parameters in [`~PretrainedConfig.from_pretrained`]: - -* `out_indices` is the index of the layer you'd like to get the feature map from -* `out_features` is the name of the layer you'd like to get the feature map from - -These parameters can be used interchangeably, but if you use both, make sure they're aligned with each other! If you don't pass any of these parameters, the backbone returns the feature map from the last layer. - -
- -
A feature map from the first stage of the backbone. The patch partition refers to the model stem.
-
- -For example, in the above diagram, to return the feature map from the first stage of the Swin backbone, you can set `out_indices=(1,)`: - -```py ->>> from transformers import AutoImageProcessor, AutoBackbone ->>> import torch ->>> from PIL import Image ->>> import requests ->>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" ->>> image = Image.open(requests.get(url, stream=True).raw) ->>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") ->>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) - ->>> inputs = processor(image, return_tensors="pt") ->>> outputs = model(**inputs) ->>> feature_maps = outputs.feature_maps -``` - -Now you can access the `feature_maps` object from the first stage of the backbone: - -```py ->>> list(feature_maps[0].shape) -[1, 96, 56, 56] -``` - -## AutoFeatureExtractor - -For audio tasks, a feature extractor processes the audio signal into the correct input format. - -Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained( -... "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" -... ) -``` - -## AutoProcessor - -Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them. - -Load a processor with [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") -``` - -## AutoModel - - - -The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]. - -> [!WARNING] -> By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") -``` - -Easily reuse the same checkpoint to load an architecture for a different task: - -```py ->>> from transformers import AutoModelForTokenClassification - ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") -``` - - - -For PyTorch models, the `from_pretrained()` method uses `torch.load()` which internally uses `pickle` and is known to be insecure. In general, never load a model that could have come from an untrusted source, or that could have been tampered with. This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are [scanned for malware](https://huggingface.co/docs/hub/security-malware) at each commit. See the [Hub documentation](https://huggingface.co/docs/hub/security) for best practices like [signed commit verification](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) with GPG. 
- -TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the `from_tf` and `from_flax` kwargs for the `from_pretrained` method to circumvent this issue. - - - -Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. - - -Finally, the `TFAutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`TFAutoModelForSequenceClassification.from_pretrained`]: - -```py ->>> from transformers import TFAutoModelForSequenceClassification - ->>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") -``` - -Easily reuse the same checkpoint to load an architecture for a different task: - -```py ->>> from transformers import TFAutoModelForTokenClassification - ->>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") -``` - -Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. - - diff --git a/docs/source/en/bertology.md b/docs/source/en/bertology.md deleted file mode 100644 index a1b92a362cd0..000000000000 --- a/docs/source/en/bertology.md +++ /dev/null @@ -1,41 +0,0 @@ - - -# BERTology - -There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT -(that some call "BERTology"). Some good examples of this field are: - - -- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: - https://arxiv.org/abs/1905.05950 -- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 -- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. - Manning: https://arxiv.org/abs/1906.04341 -- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 - -In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to -help people access the inner representations, mainly adapted from the great work of Paul Michel -(https://arxiv.org/abs/1905.10650): - - -- accessing all the hidden-states of BERT/GPT/GPT-2, -- accessing all the attention weights for each head of BERT/GPT/GPT-2, -- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained - in https://arxiv.org/abs/1905.10650. - -To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) which extracts information and prune a model pre-trained on -GLUE. 
diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md deleted file mode 100644 index b9176be04ec2..000000000000 --- a/docs/source/en/performance.md +++ /dev/null @@ -1,73 +0,0 @@ - - -# Performance and Scalability - -Training large transformer models and deploying them to production present various challenges. -During training, the model may require more GPU memory than available or exhibit slow training speed. In the deployment -phase, the model can struggle to handle the required throughput in a production environment. - -This documentation aims to assist you in overcoming these challenges and finding the optimal settings for your use-case. -The guides are divided into training and inference sections, as each comes with different challenges and solutions. -Within each section you'll find separate guides for different hardware configurations, such as single GPU vs. multi-GPU -for training or CPU vs. GPU for inference. - -Use this document as your starting point to navigate further to the methods that match your scenario. - -## Training - -Training large transformer models efficiently requires an accelerator such as a GPU or TPU. The most common case is where -you have a single GPU. The methods that you can apply to improve training efficiency on a single GPU extend to other setups -such as multiple GPU. However, there are also techniques that are specific to multi-GPU or CPU training. We cover them in -separate sections. - -* [Methods and tools for efficient training on a single GPU](perf_train_gpu_one): start here to learn common approaches that can help optimize GPU memory utilization, speed up the training, or both. -* [Multi-GPU training section](perf_train_gpu_many): explore this section to learn about further optimization methods that apply to a multi-GPU settings, such as data, tensor, and pipeline parallelism. -* [CPU training section](perf_train_cpu): learn about mixed precision training on CPU. -* [Efficient Training on Multiple CPUs](perf_train_cpu_many): learn about distributed CPU training. -* [Training on TPU with TensorFlow](perf_train_tpu_tf): if you are new to TPUs, refer to this section for an opinionated introduction to training on TPUs and using XLA. -* [Custom hardware for training](perf_hardware): find tips and tricks when building your own deep learning rig. -* [Hyperparameter Search using Trainer API](hpo_train) - -## Inference - -Efficient inference with large models in a production environment can be as challenging as training them. In the following -sections we go through the steps to run inference on CPU and single/multi-GPU setups. - -* [Inference on a single CPU](perf_infer_cpu) -* [Inference on a single GPU](perf_infer_gpu_one) -* [Multi-GPU inference](perf_infer_gpu_multi) -* [XLA Integration for TensorFlow Models](tf_xla) - - -## Training and inference - -Here you'll find techniques, tips and tricks that apply whether you are training a model, or running inference with it. - -* [Instantiating a big model](big_models) -* [Troubleshooting performance issues](debugging) - -## Contribute - -This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to -make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. - -When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the -source of that information (unless it comes directly from you). 
diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md deleted file mode 100644 index 1a6f071a3353..000000000000 --- a/docs/source/en/preprocessing.md +++ /dev/null @@ -1,534 +0,0 @@ - - -# Preprocess - -[[open-in-colab]] - -Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, it needs to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for: - -* Text, use a [Tokenizer](./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors. -* Speech and audio, use a [Feature extractor](./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors. -* Image inputs use a [ImageProcessor](./main_classes/image_processor) to convert images into tensors. -* Multimodal inputs, use a [Processor](./main_classes/processors) to combine a tokenizer and a feature extractor or image processor. - - - -`AutoProcessor` **always** works and automatically chooses the correct class for the model you're using, whether you're using a tokenizer, image processor, feature extractor or processor. - - - -Before you begin, install 🤗 Datasets so you can load some datasets to experiment with: - -```bash -pip install datasets -``` - -## Natural Language Processing - - - -The main tool for preprocessing textual data is a [tokenizer](main_classes/tokenizer). A tokenizer splits text into *tokens* according to a set of rules. The tokens are converted into numbers and then tensors, which become the model inputs. Any additional inputs required by the model are added by the tokenizer. - - - -If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer. This ensures the text is split the same way as the pretraining corpus, and uses the same corresponding tokens-to-index (usually referred to as the *vocab*) during pretraining. - - - -Get started by loading a pretrained tokenizer with the [`AutoTokenizer.from_pretrained`] method. This downloads the *vocab* a model was pretrained with: - -```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") -``` - -Then pass your text to the tokenizer: - -```py ->>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.") ->>> print(encoded_input) -{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102], - 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} -``` - -The tokenizer returns a dictionary with three important items: - -* [input_ids](glossary#input-ids) are the indices corresponding to each token in the sentence. -* [attention_mask](glossary#attention-mask) indicates whether a token should be attended to or not. -* [token_type_ids](glossary#token-type-ids) identifies which sequence a token belongs to when there is more than one sequence. - -Return your input by decoding the `input_ids`: - -```py ->>> tokenizer.decode(encoded_input["input_ids"]) -'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. 
[SEP]' -``` - -As you can see, the tokenizer added two special tokens - `CLS` and `SEP` (classifier and separator) - to the sentence. Not all models need -special tokens, but if they do, the tokenizer automatically adds them for you. - -If there are several sentences you want to preprocess, pass them as a list to the tokenizer: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_inputs = tokenizer(batch_sentences) ->>> print(encoded_inputs) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1]]} -``` - -### Pad - -Sentences aren't always the same length which can be an issue because tensors, the model inputs, need to have a uniform shape. Padding is a strategy for ensuring tensors are rectangular by adding a special *padding token* to shorter sentences. - -Set the `padding` parameter to `True` to pad the shorter sequences in the batch to match the longest sequence: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - -The first and third sentences are now padded with `0`'s because they are shorter. - -### Truncation - -On the other end of the spectrum, sometimes a sequence may be too long for a model to handle. In this case, you'll need to truncate the sequence to a shorter length. - -Set the `truncation` parameter to `True` to truncate a sequence to the maximum length accepted by the model: - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... 
] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) ->>> print(encoded_input) -{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], - 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} -``` - - - -Check out the [Padding and truncation](./pad_truncation) concept guide to learn more different padding and truncation arguments. - - - -### Build tensors - -Finally, you want the tokenizer to return the actual tensors that get fed to the model. - -Set the `return_tensors` parameter to either `pt` for PyTorch, or `tf` for TensorFlow: - - - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") ->>> print(encoded_input) -{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], - [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], - [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), - 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} -``` - - -```py ->>> batch_sentences = [ -... "But what about second breakfast?", -... "Don't think he knows about second breakfast, Pip.", -... "What about elevensies?", -... ] ->>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") ->>> print(encoded_input) -{'input_ids': , - 'token_type_ids': , - 'attention_mask': } -``` - - - - -Different pipelines support tokenizer arguments in their `__call__()` differently. `text-2-text-generation` pipelines support (i.e. pass on) -only `truncation`. `text-generation` pipelines support `max_length`, `truncation`, `padding` and `add_special_tokens`. -In `fill-mask` pipelines, tokenizer arguments can be passed in the `tokenizer_kwargs` argument (dictionary). - - -## Audio - -For audio tasks, you'll need a [feature extractor](main_classes/feature_extractor) to prepare your dataset for the model. The feature extractor is designed to extract features from raw audio data, and convert them into tensors. - -Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets: - -```py ->>> from datasets import load_dataset, Audio - ->>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") -``` - -Access the first element of the `audio` column to take a look at the input. Calling the `audio` column automatically loads and resamples the audio file: - -```py ->>> dataset[0]["audio"] -{'array': array([ 0. 
, 0.00024414, -0.00024414, ..., -0.00024414, - 0. , 0. ], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 8000} -``` - -This returns three items: - -* `array` is the speech signal loaded - and potentially resampled - as a 1D array. -* `path` points to the location of the audio file. -* `sampling_rate` refers to how many data points in the speech signal are measured per second. - -For this tutorial, you'll use the [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) model. Take a look at the model card, and you'll learn Wav2Vec2 is pretrained on 16kHz sampled speech audio. It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. If your data's sampling rate isn't the same, then you need to resample your data. - -1. Use 🤗 Datasets' [`~datasets.Dataset.cast_column`] method to upsample the sampling rate to 16kHz: - -```py ->>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -2. Call the `audio` column again to resample the audio file: - -```py ->>> dataset[0]["audio"] -{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, ..., - 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', - 'sampling_rate': 16000} -``` - -Next, load a feature extractor to normalize and pad the input. When padding textual data, a `0` is added for shorter sequences. The same idea applies to audio data. The feature extractor adds a `0` - interpreted as silence - to `array`. - -Load the feature extractor with [`AutoFeatureExtractor.from_pretrained`]: - -```py ->>> from transformers import AutoFeatureExtractor - ->>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") -``` - -Pass the audio `array` to the feature extractor. We also recommend adding the `sampling_rate` argument in the feature extractor in order to better debug any silent errors that may occur. - -```py ->>> audio_input = [dataset[0]["audio"]["array"]] ->>> feature_extractor(audio_input, sampling_rate=16000) -{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, ..., - 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} -``` - -Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. Take a look at the sequence length of these two audio samples: - -```py ->>> dataset[0]["audio"]["array"].shape -(173398,) - ->>> dataset[1]["audio"]["array"].shape -(106496,) -``` - -Create a function to preprocess the dataset so the audio samples are the same lengths. Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: - -```py ->>> def preprocess_function(examples): -... audio_arrays = [x["array"] for x in examples["audio"]] -... inputs = feature_extractor( -... audio_arrays, -... sampling_rate=16000, -... padding=True, -... max_length=100000, -... truncation=True, -... ) -... return inputs -``` - -Apply the `preprocess_function` to the first few examples in the dataset: - -```py ->>> processed_dataset = preprocess_function(dataset[:5]) -``` - -The sample lengths are now the same and match the specified maximum length. 
You can pass your processed dataset to the model now! - -```py ->>> processed_dataset["input_values"][0].shape -(100000,) - ->>> processed_dataset["input_values"][1].shape -(100000,) -``` - -## Computer vision - -For computer vision tasks, you'll need an [image processor](main_classes/image_processor) to prepare your dataset for the model. -Image preprocessing consists of several steps that convert images into the input expected by the model. These steps -include but are not limited to resizing, normalizing, color channel correction, and converting images to tensors. - - - -Image preprocessing often follows some form of image augmentation. Both image preprocessing and image augmentation -transform image data, but they serve different purposes: - -* Image augmentation alters images in a way that can help prevent overfitting and increase the robustness of the model. You can get creative in how you augment your data - adjust brightness and colors, crop, rotate, resize, zoom, etc. However, be mindful not to change the meaning of the images with your augmentations. -* Image preprocessing guarantees that the images match the model’s expected input format. When fine-tuning a computer vision model, images must be preprocessed exactly as when the model was initially trained. - -You can use any library you like for image augmentation. For image preprocessing, use the `ImageProcessor` associated with the model. - - - -Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets: - - - -Use 🤗 Datasets `split` parameter to only load a small sample from the training split since the dataset is quite large! - - - -```py ->>> from datasets import load_dataset - ->>> dataset = load_dataset("food101", split="train[:100]") -``` - -Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image) feature: - -```py ->>> dataset[0]["image"] -``` - -
- -Load the image processor with [`AutoImageProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoImageProcessor - ->>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -``` - -First, let's add some image augmentation. You can use any library you prefer, but in this tutorial, we'll use torchvision's [`transforms`](https://pytorch.org/vision/stable/transforms.html) module. If you're interested in using another data augmentation library, learn how in the [Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) or [Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb). - -1. Here we use [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) to chain together a couple of -transforms - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html). -Note that for resizing, we can get the image size requirements from the `image_processor`. For some models, an exact height and -width are expected, for others only the `shortest_edge` is defined. - -```py ->>> from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose - ->>> size = ( -... image_processor.size["shortest_edge"] -... if "shortest_edge" in image_processor.size -... else (image_processor.size["height"], image_processor.size["width"]) -... ) - ->>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) -``` - -2. The model accepts [`pixel_values`](model_doc/vision-encoder-decoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) -as its input. `ImageProcessor` can take care of normalizing the images, and generating appropriate tensors. -Create a function that combines image augmentation and image preprocessing for a batch of images and generates `pixel_values`: - -```py ->>> def transforms(examples): -... images = [_transforms(img.convert("RGB")) for img in examples["image"]] -... examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"] -... return examples -``` - - - -In the example above we set `do_resize=False` because we have already resized the images in the image augmentation transformation, -and leveraged the `size` attribute from the appropriate `image_processor`. If you do not resize images during image augmentation, -leave this parameter out. By default, `ImageProcessor` will handle the resizing. - -If you wish to normalize images as a part of the augmentation transformation, use the `image_processor.image_mean`, -and `image_processor.image_std` values. - - -3. Then use 🤗 Datasets[`~datasets.Dataset.set_transform`] to apply the transforms on the fly: -```py ->>> dataset.set_transform(transforms) -``` - -4. Now when you access the image, you'll notice the image processor has added `pixel_values`. You can pass your processed dataset to the model now! - -```py ->>> dataset[0].keys() -``` - -Here is what the image looks like after the transforms are applied. The image has been randomly cropped and it's color properties are different. - -```py ->>> import numpy as np ->>> import matplotlib.pyplot as plt - ->>> img = dataset[0]["pixel_values"] ->>> plt.imshow(img.permute(1, 2, 0)) -``` - -
- - - -For tasks like object detection, semantic segmentation, instance segmentation, and panoptic segmentation, `ImageProcessor` -offers post processing methods. These methods convert model's raw outputs into meaningful predictions such as bounding boxes, -or segmentation maps. - - - -### Pad - -In some cases, for instance, when fine-tuning [DETR](./model_doc/detr), the model applies scale augmentation at training -time. This may cause images to be different sizes in a batch. You can use [`DetrImageProcessor.pad`] -from [`DetrImageProcessor`] and define a custom `collate_fn` to batch images together. - -```py ->>> def collate_fn(batch): -... pixel_values = [item["pixel_values"] for item in batch] -... encoding = image_processor.pad(pixel_values, return_tensors="pt") -... labels = [item["labels"] for item in batch] -... batch = {} -... batch["pixel_values"] = encoding["pixel_values"] -... batch["pixel_mask"] = encoding["pixel_mask"] -... batch["labels"] = labels -... return batch -``` - -## Multimodal - -For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as tokenizer and feature extractor. - -Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): - -```py ->>> from datasets import load_dataset - ->>> lj_speech = load_dataset("lj_speech", split="train") -``` - -For ASR, you're mainly focused on `audio` and `text` so you can remove the other columns: - -```py ->>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) -``` - -Now take a look at the `audio` and `text` columns: - -```py ->>> lj_speech[0]["audio"] -{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ..., - 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), - 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', - 'sampling_rate': 22050} - ->>> lj_speech[0]["text"] -'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' -``` - -Remember you should always [resample](preprocessing#audio) your audio dataset's sampling rate to match the sampling rate of the dataset used to pretrain a model! - -```py ->>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) -``` - -Load a processor with [`AutoProcessor.from_pretrained`]: - -```py ->>> from transformers import AutoProcessor - ->>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") -``` - -1. Create a function to process the audio data contained in `array` to `input_values`, and tokenize `text` to `labels`. These are the inputs to the model: - -```py ->>> def prepare_dataset(example): -... audio = example["audio"] - -... example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) - -... return example -``` - -2. Apply the `prepare_dataset` function to a sample: - -```py ->>> prepare_dataset(lj_speech[0]) -``` - -The processor has now added `input_values` and `labels`, and the sampling rate has also been correctly downsampled to 16kHz. You can pass your processed dataset to the model now! 
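To preprocess every example instead of a single sample, the same function can be mapped over the dataset. This is a minimal sketch; the columns dropped in `remove_columns` are an assumption about which raw fields are no longer needed after processing:

```py
# Apply prepare_dataset to the full dataset and drop the raw audio/text columns
processed_lj_speech = lj_speech.map(prepare_dataset, remove_columns=["audio", "text"])
```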
diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index 97e435e1a689..971468026d71 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -546,4 +546,4 @@ def test_training_kernel(self): loss = (1 - lm_logits).mean() loss.backward() - self.assertEqual(out_training, out_no_training) \ No newline at end of file + self.assertEqual(out_training, out_no_training) From 9d85ef4e4c3f8f5a1c0e814817dc93e9a9a18419 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 11 Sep 2024 17:55:33 -0700 Subject: [PATCH 048/116] cache --- docs/source/en/_toctree.yml | 4 +- docs/source/en/cache_explanation.md | 74 ++++ docs/source/en/index.md | 2 +- docs/source/en/kv_cache.md | 544 ++++++++++++---------------- docs/source/en/pipeline_tutorial.md | 2 +- 5 files changed, 314 insertions(+), 312 deletions(-) create mode 100644 docs/source/en/cache_explanation.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dbe01f4eb53f..c987250e2688 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -63,7 +63,9 @@ - local: llm_optims title: Optimize inference - local: kv_cache - title: Best Practices for Generation with Cache + title: KV cache strategies + - local: cache_explanation + title: Caching - local: llm_tutorial title: Generation with LLMs - local: generation_strategies diff --git a/docs/source/en/cache_explanation.md b/docs/source/en/cache_explanation.md new file mode 100644 index 000000000000..613e89275759 --- /dev/null +++ b/docs/source/en/cache_explanation.md @@ -0,0 +1,74 @@ + + +# Caching + +Imagine you’re having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right? + +You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context. + +To predict the 1000th token, the model requires information from the previous 999 tokens. The information is represented as matrix multiplications across the token representations. + +To predict the 1001th token, you need the same information from the previous 999 tokens in addition to any information from the 1000th token. This is a lot of matrix multiplications a model has to compute over and over for each token! + +A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived from the self-attention layers of previously processed tokens. The stored kv pairs are retrieved from the cache and reused for subsequent tokens, avoiding the need to recompute. + +> [!WARNING] +> Caching should only be used for **inference**. It may cause unexpected errors if it's enabled during training. + +## Cache class + +When you use Transformers' [`Cache`] class, the attention module performs several critical steps to integrate past and present information. + +1. The attention module concatenates current kv pairs with past kv pairs stored in the cache. This creates attentions weights with the shape `(new_tokens_length, past_kv_length + new_tokens_length)`. The current and past kv pairs are essentially combined to compute the attention scores, ensuring a model is aware of previous context and the current input. + +2. 
When the `forward` method is called iteratively, it's crucial that the attention mask shape matches the combined length of the past and current kv pairs. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is typically handled internally in [`~GenerationMixin.generate`], but if you want to implement your own generation loop with [`Cache`], keep this in mind! The attention mask should hold the past and current token values. + +3. It is also important to be aware of the `cache_position`. This is important if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. This indicates the input positions in a sequence. `cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10]). + +The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token. + +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache + +model_id = "meta-llama/Llama-2-7b-chat-hf" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0") +tokenizer = AutoTokenizer.from_pretrained(model_id) + +past_key_values = DynamicCache() +messages = [{"role": "user", "content": "Hello, what's your name."}] +inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0") + +generated_ids = inputs.input_ids +cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0") +max_new_tokens = 10 + +for _ in range(max_new_tokens): + outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True) + # Greedily sample one next token + next_token_ids = outputs.logits[:, -1:].argmax(-1) + generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) + # Prepare inputs for the next generation step by leaaving unprocessed tokens, in our case we have only one new token + # and expanding attn mask for the new token, as explained above + attention_mask = inputs["attention_mask"] + attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1) + inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask} + cache_position = cache_position[-1:] + 1 # add one more position for the next token + +print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]) +"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA," +``` diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 66f59feaa3bb..3eb9fc9d398b 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -41,7 +41,7 @@ Transformers is designed for developers and machine learning engineers and resea diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index d02c007b115f..44505f8f8c2f 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -14,420 +14,346 @@ rendered properly in your Markdown viewer. 
--> -# Best Practices for Generation with Cache +# KV cache strategies -Efficient caching is crucial for optimizing the performance of models in various generative tasks, -including text generation, translation, summarization and other transformer-based applications. -Effective caching helps reduce computation time and improve response rates, especially in real-time or resource-intensive applications. +The key-value (KV) vectors are used to calculate attention scores, and for autoregressive models, the KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time. -Transformers support various caching methods, leveraging "Cache" classes to abstract and manage the caching logic. -This document outlines best practices for using these classes to maximize performance and efficiency. -Check out all the available `Cache` classes in the [API documentation](./internal/generation_utils). +A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. For a more in-depth explanation about how a cache works, refer to [Caching](./cache_explanation.md). -## What is Cache and why we should care? +Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case. -Imagine you’re having a conversation with someone, and instead of remembering what was said previously, you have to start from scratch every time you respond. This would be slow and inefficient, right? In the world of Transformer models, a similar concept applies, and that's where Caching keys and values come into play. From now on, I'll refer to the concept as KV Cache. +| Cache Type | Memory Efficient  | Supports torch.compile() | Initialization Recommended | Latency | Long Context Generation | +|------------------------|------------------|--------------------------|----------------------------|---------|-------------------------| +| Dynamic Cache | No | No | No | Mid | No | +| Static Cache | No | Yes | Yes | High | No | +| Offloaded Cache | Yes | No | No | Low | Yes | +| Offloaded Static Cache | No | Yes | Yes | High | Yes | +| Quantized Cache | Yes | No | No | Low | Yes | +| Sliding Window Cache | No | Yes | Yes | High | No | +| Sink Cache | Yes | No | Yes | Mid | Yes | -KV cache is needed to optimize the generation in autoregressive models, where the model predicts text token by token. This process can be slow since the model can generate only one token at a time, and each new prediction is dependent on the previous context. That means, to predict token number 1000 in the generation, you need information from the previous 999 tokens, which comes in the form of some matrix multiplications across the representations of those tokens. But to predict token number 1001, you also need the same information from the first 999 tokens, plus additional information from token number 1000. That is where key-value cache is used to optimize the sequential generation process by storing previous calculations to reuse in subsequent tokens, so they don't need to be computed again. 
+This guide introduces you to the different [`Cache`] classes and shows you how to use them for generation. -More concretely, key-value cache acts as a memory bank for these generative models, where the model stores key-value pairs derived from self-attention layers for previously processed tokens. By storing this information, the model can avoid redundant computations and instead retrieve keys and values of previous tokens from the cache. Note that caching can be used only in inference and should be disabled when training, otherwise it might cause unexpected errors. +## Default cache -
- For the Curious Minds Who Like to Dive Deep +The [`DynamicCache`] is the default cache class for most models. It allows the cache size to grow dynamically in order to store an increasing number of keys and values as generation progresses. - ### Under the Hood: How Cache Object Works in Attention Mechanism +Disable the cache by configuring `use_cache=False` in [`~GenerationMixin.generate`]. - When utilizing a cache object in the input, the Attention module performs several critical steps to integrate past and present information seamlessly. +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM - The Attention module concatenates the current key-values with the past key-values stored in the cache. This results in attention weights of shape `(new_tokens_length, past_kv_length + new_tokens_length)`. Essentially, the past and current key-values are combined to compute attention scores, ensuring that the model considers both previous context and new input. The concatenated key-values are used to compute the attention scores resulting in attention weights of shape `(new_tokens_length, past_kv_length + new_tokens_length)`. +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) - Therefore, when iteratively calling `forward()` instead of the `generate()` method, it’s crucial to ensure that the attention mask shape matches the combined length of past and current key-values. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is usually handled internally when you call `generate()` method. If you want to implement your own generation loop with Cache classes, take this into consideration and prepare the attention mask to hold values to current and past tokens. +model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False) +``` - +Cache classes can also be initialized first before calling and passing it to the models [past_key_values](https://hf.co/docs/transformers/internal/generation_utils#transformers.generation.GenerateDecoderOnlyOutput.past_key_values) parameter. This cache initialization strategy is only recommended for some cache types. - One important concept you need to know when writing your own generation loop, is `cache_position`. In case you want to reuse an already filled Cache object by calling `forward()`, you have to pass in a valid `cache_position` which will indicate the positions of inputs in the sequence. Note that `cache_position` is not affected by padding, and always adds one more position for each token. For example, if key/value cache contains 10 tokens (no matter how many of it is a pad token), the cache position for the next token should be `torch.tensor([10])`. +In most other cases, it's easier to define the cache strategy in the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) parameter. 
- +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) - See an example below for how to implement your own generation loop. +past_key_values = DynamicCache() +out = model.generate(**inputs, do_sample=False, max_new_tokens=20, past_key_values=past_key_values) +``` - ```python - >>> import torch - >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache +## Memory efficient caches - >>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto") - >>> tokenizer = AutoTokenizer.from_pretrained(model_id) +The KV cache can occupy a significant portion of memory and become a [bottleneck](https://hf.co/blog/llama31#inference-memory-requirements) for long-context generation. Memory efficient caches focus on trading off speed for reduced memory usage. This is especially important for large language models (LLMs) and if your hardware is memory constrained. - >>> past_key_values = DynamicCache() - >>> messages = [{"role": "user", "content": "Hello, what's your name."}] - >>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device) +### Offloaded cache - >>> generated_ids = inputs.input_ids - >>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device=model.device) - >>> max_new_tokens = 10 +The [`OffloadedCache`] saves GPU memory by moving the KV cache for most model layers to the CPU. Only the current layer cache is maintained on the GPU during a models `forward` iteration over the layers. [`OffloadedCache`] asynchronously prefetches the next layer cache and sends the previous layer cache back to the CPU. - >>> for _ in range(max_new_tokens): - ... outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True) - ... # Greedily sample one next token - ... next_token_ids = outputs.logits[:, -1:].argmax(-1) - ... generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) - ... - ... # Prepare inputs for the next generation step by leaaving unprocessed tokens, in our case we have only one new token - ... # and expanding attn mask for the new token, as explained above - ... attention_mask = inputs["attention_mask"] - ... attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1) - ... inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask} - ... cache_position = cache_position[-1:] + 1 # add one more position for the next token +This cache strategy always generates the same result as [`DynamicCache`] and works as a drop-in replacement or fallback. You may want to use [`OffloadedCache`] if you have a GPU and you're getting out-of-memory (OOM) errors. - >>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]) - ``` - ```txt - <|user|> - Hello, what's your name. - <|assistant|> - My name is Sarah. - <| - ``` +> [!WARNING] +> You may notice a small degradation in generation throughput compared to [`DynamicCache`] depending on your model and generation choices (context size, number of generated tokens, number of beams, etc.). -
+Enable [`OffloadedCache`] by configuring `cache_implementation="offloaded"` in either [`GenerationConfig`] or [`~GenerationMixin.generate`]. +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +ckpt = "microsoft/Phi-3-mini-4k-instruct" +tokenizer = AutoTokenizer.from_pretrained(ckpt) +model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device) -## Generate with Cache +out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded") +print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) +Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896. +``` -In 🤗 Transformers, we support various Cache types to optimize the performance across different models and tasks. By default, all models generate with caching, -with the [`~DynamicCache`] class being the default cache for most models. It allows us to dynamically grow cache size, by saving more and more keys and values as we generate. If for some reason you don't want to use caches, you can pass `use_cache=False` into the `generate()` method. +The example below shows how you can fallback on [`OffloadedCache`] if you run out of memory. + +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +def resilient_generate(model, *args, **kwargs): + oom = False + try: + return model.generate(*args, **kwargs) + except torch.cuda.OutOfMemoryError as e: + print(e) + print("retrying with cache_implementation='offloaded'") + oom = True + if oom: + torch.cuda.empty_cache() + kwargs["cache_implementation"] = "offloaded" + return model.generate(*args, **kwargs) + +ckpt = "microsoft/Phi-3-mini-4k-instruct" +tokenizer = AutoTokenizer.from_pretrained(ckpt) +model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0") +prompt = ["okay "*1000 + "Fun fact: The most"] +inputs = tokenizer(prompt, return_tensors="pt").to(model.device) +beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, } +out = resilient_generate(model, **inputs, **beams) +responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True) +``` -Refer to the table below to see the difference between cache types and choose the one that suits best for your use-case. Models for which initialization is recommended should be initialized before calling the model and passed to model as a kwarg. In all other cases you can simply define desired `cache_implementation` and we take care of the rest for you. +### Quantized cache -| Cache Type | Memory Efficient | Supports torch.compile() | Initialization Recommended | Latency | Long Context Generation | -|------------------------|------------------|--------------------------|----------------------------|---------|-------------------------| -| Dynamic Cache | No | No | No | Mid | No | -| Static Cache | No | Yes | Yes | High | No | -| Offloaded Cache | Yes | No | No | Low | Yes | -| Offloaded Static Cache | No | Yes | Yes | High | Yes | -| Quantized Cache | Yes | No | No | Low | Yes | -| Sliding Window Cache | No | Yes | Yes | High | No | -| Sink Cache | Yes | No | Yes | Mid | Yes | +The [`QuantizedCache`] reduces memory requirements by quantizing the KV values to a lower precision. [`QuantizedCache`] currently supports two quantization backends. 
+- [`HQQQuantizedCache`] supports int2, int4, and int8 datatypes. +- [`QuantoQuantizedCache`] supports int2 and int4 datatypes. This is the default quantization backend. -These cache classes can be set with a `cache_implementation` argument when generating. To learn about the available options for the cache_implementation flag, please refer to the [API Documentation](./main_classes/text_generation#transformers.GenerationConfig). Now, let's explore each cache type in detail and see how to use them. Note that the below examples are for decoder-only Tranformer-based models. We also support ["Model-Specific Cache"] classes for models such as Mamba or Jamba, keep reading for more details. +> [!WARNING] +> Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency. -### Quantized Cache +Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length. -The key and value cache can occupy a large portion of memory, becoming a [bottleneck for long-context generation](https://huggingface.co/blog/llama31#inference-memory-requirements), especially for Large Language Models. -Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed. + + -KV Cache quantization in `transformers` is largely inspired by the paper ["KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache"](https://arxiv.org/abs/2402.02750) and currently supports [`~QuantoQuantizedCache`] and [`~HQQQuantizedCache`] classes. For more information on the inner workings see the paper. +For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`. -To enable quantization of the key-value cache, one needs to indicate `cache_implementation="quantized"` in the `generation_config`. -Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`~QuantizedCacheConfig`] class. -One has to indicate which quantization backend to use in the [`~QuantizedCacheConfig`], the default is `quanto`. +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig -It is recommended to set `axis-key/axis-value` parameters in the cache config to `0` if you're using the `quanto` backend and to `1` if you're using the `HQQ` backend. For other config values, please use the defaults unless you're running out of memory. In that case, you may consider decreasing the residual length. 
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) - +out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"axis-key": 1, "axis-value": 1, "backend": "hqq"}) +print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) +I like rock music because it's loud and energetic. It's a great way to express myself and rel +``` -Cache quantization can be detrimental in terms of latency if the context length is short and there is enough GPU VRAM available to run without cache quantization. It is recommended to seek balance between memory efficiency and latency. - + + +For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`. -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig ->>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") ->>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"}) ->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) -I like rock music because it's a great way to express myself. I like the way it makes me feel, the +out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "axis-key": 0, "axis-value": 0, "backend": "quanto"}) +print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) +I like rock music because it's loud and energetic. It's a great way to express myself and rel ``` -### Offloaded Cache + + -Similarly to KV cache quantization, [`~OffloadedCache`] strategy aims to reduce GPU VRAM usage. -It does so by moving the KV cache for most layers to the CPU. -As the model's `forward()` method iterates over the layers, this strategy maintains the current layer cache on the GPU. -At the same time it asynchronously prefetches the next layer cache as well as sending the previous layer cache back to the CPU. -Unlike KV cache quantization, this strategy always produces the same result as the default KV cache implementation. -Thus, it can serve as a drop-in replacement or a fallback for it. +### Sink cache -Depending on your model and the characteristics of your generation task (size of context, number of generated tokens, number of beams, etc.) -you may notice a small degradation in generation throughput compared to the default KV cache implementation. +[`SinkCache`] is capable of generating very long sequences ("infinite length" according to the paper) by only retaining a few initial tokens from the sequence. 
These are called the *sink tokens* because they account for a significant portion of the attention scores during generation. Subsequent tokens are discarded on a sliding windowed basis, and only the latest `window_size` tokens are kept. This means most of the previous knowledge is discarded. -To enable KV cache offloading, pass `cache_implementation="offloaded"` in the `generation_config` or directly to the `generate()` call. -Use `cache_implementation="offloaded_static"` for an offloaded static cache (see also [Offloaded Static Cache](#offloaded-static-cache) below). +The sink tokens allow a model to maintain stable performance even when it's dealing with very long text sequences. -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM ->>> ckpt = "microsoft/Phi-3-mini-4k-instruct" +Enable [`SinkCache`] by initializing it first with the [window_length](https://hf.co/docs/transformers/main/en/internal/generation_utils#transformers.SinkCache.window_length) and [num_sink_tokens](https://hf.co/docs/transformers/main/en/internal/generation_utils#transformers.SinkCache.num_sink_tokens) parameters before passing it to [past_key_values](https://hf.co/docs/transformers/internal/generation_utils#transformers.generation.GenerateDecoderOnlyOutput.past_key_values) in [`~GenerationMixin.generate`]. ->>> tokenizer = AutoTokenizer.from_pretrained(ckpt) ->>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device) - ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded") ->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) -Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896. +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23) ->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) -Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896. -``` +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device) - - -Cache offloading requires a CUDA GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. - - - -The example below shows how KV cache offloading can be used as a fallback strategy. -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM ->>> def resilient_generate(model, *args, **kwargs): -... oom = False -... try: -... return model.generate(*args, **kwargs) -... except torch.cuda.OutOfMemoryError as e: -... print(e) -... print("retrying with cache_implementation='offloaded'") -... oom = True -... if oom: -... torch.cuda.empty_cache() -... kwargs["cache_implementation"] = "offloaded" -... return model.generate(*args, **kwargs) -... -... 
->>> ckpt = "microsoft/Phi-3-mini-4k-instruct" ->>> tokenizer = AutoTokenizer.from_pretrained(ckpt) ->>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0") ->>> prompt = ["okay "*1000 + "Fun fact: The most"] ->>> inputs = tokenizer(prompt, return_tensors="pt").to(model.device) ->>> beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, } ->>> out = resilient_generate(model, **inputs, **beams) ->>> responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True) +past_key_values = SinkCache(window_length=256, num_sink_tokens=4) +out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values) +tokenizer.batch_decode(out, skip_special_tokens=True)[0] +"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily" ``` -On a GPU with 50 GB of RAM, running this code will print -``` -CUDA out of memory. Tried to allocate 4.83 GiB. GPU -retrying with cache_implementation='offloaded' -``` -before successfully generating 40 beams. +## Speed optimized caches +The default [`DynamicCache`] prevents you from taking advantage of just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations enable you to maximize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation. -### Static Cache +### Static cache -Since the "DynamicCache" dynamically grows with each generation step, it prevents you from taking advantage of JIT optimizations. The [`~StaticCache`] pre-allocates -a specific maximum size for the keys and values, allowing you to generate up to the maximum length without having to modify cache size. Check the below usage example. +A [`StaticCache`] pre-allocates a specific maximum cache size for the kv pairs. You can generate up to the maximum cache size without needing to modify it. -For more examples with Static Cache and JIT compilation, take a look at [StaticCache & torchcompile](./llm_optims#static-kv-cache-and-torchcompile) +Enable [`StaticCache`] by configuring `cache_implementation="static"` in [`~GenerationMixin.generate`]. -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM ->>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") ->>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") +inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) ->>> # simply pass the cache implementation="static" ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static") ->>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] -"Hello, my name is [Your Name] and I am a [Your Position] at [Your Company]. 
I am writing" +out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static") +tokenizer.batch_decode(out, skip_special_tokens=True)[0] +"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of" ``` +### Offloaded static cache -## Offloaded Static Cache +The [`OffloadedStaticCache`] is very similar to the [OffloadedCache](#offloaded-cache) except the cache size is set to a maximum cache size. Otherwise, [`OffladedStaticCache`] only keeps the current layer cache on the GPU and the rest are moved to the CPU. -Like [`~OffloadedCache`] exists for offloading a "DynamicCache", there is also an offloaded static cache. It fully supports -JIT optimizations. Just pass `cache_implementation="offloaded_static"` in the `generation_config` or directly to the `generate()` call. -This will use the [`~OffloadedStaticCache`] implementation instead. +Enable [`OffloadedStaticCache`] by configuring `cache_implementation="offloaded_static"` in [`~GenerationMixin.generate`]. -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM ->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") ->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") +inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) ->>> # simply pass the cache implementation="offloaded_static" ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static") ->>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] +out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static") +tokenizer.batch_decode(out, skip_special_tokens=True)[0] "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of" ``` Cache offloading requires a CUDA GPU. +### Sliding window cache -### Sliding Window Cache +[`SlidingWindowCache`] implements a sliding window over the previos kv pairs, and only keeps the last `sliding_window` tokens. This cache type is designed to only work with models that support *sliding window attention*, such as [Mistral](./model_doc/mistral). Older kv states are discarded and replaced by new kv states. -As the name suggests, this cache type implements a sliding window over previous keys and values, retaining only the last `sliding_window` tokens. It should be used with models like Mistral that support sliding window attention. Additionally, similar to Static Cache, this one is JIT-friendly and can be used with the same compile tecniques as Static Cache. +Enable [`SlidingWindowCache`] by configuring `cache_implementation="sliding_window"` in [`~GenerationMixin.generate`]. -Note that you can use this cache only for models that support sliding window, e.g. Mistral models. 
+```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0") +inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device) -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache - ->>> tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B") ->>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device) - ->>> # can be used by passing in cache implementation ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window") ->>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] -"Yesterday I was on a rock concert and. I was so excited to see my favorite band perform live. I was so happy that I could hardly contain myself. I was jumping up and down and" +out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window") +tokenizer.batch_decode(out, skip_special_tokens=True)[0] ``` -### Sink Cache - -Sink Cache was introduced in ["Efficient Streaming Language Models with Attention Sinks"](https://arxiv.org/abs/2309.17453). It allows you to generate long sequences of text ("infinite length" according to the paper) without any fine-tuning. That is achieved by smart handling of previous keys and values, specifically it retains a few initial tokens from the sequence, called "sink tokens". This is based on the observation that these initial tokens attract a significant portion of attention scores during the generation process. Tokens that come after "sink tokens" are discarded on a sliding windowed basis, keeping only the latest `window_size` tokens. By keeping these initial tokens as "attention sinks," the model maintains stable performance even when dealing with very long texts, thus discarding most of the previous knowledge. - -Unlike other cache classes, this one can't be used directly by indicating a `cache_implementation`. You have to initialize the Cache before calling on `generate()` as follows. - -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache +## Model caches ->>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") ->>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device) +Some model types, like encoder-decoder models or [Gemma2](./model_doc/gemma2) and [Mamba](./model_doc/mamba), have dedicated cache classes. ->>> # get our cache, specify number of sink tokens and window size ->>> # Note that window size already includes sink tokens, so has to be larger ->>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4) ->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values) ->>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] -"This is a long story about unicorns, fairies and magic. 
It is a story about a young girl named Lily who discovers that she has the power to control the elements. She learns that she can"
-```
-### Encoder-Decoder Cache

+### Encoder-decoder cache

-The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of past key/values required for these complex models. Cool thing about Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper) models but we will be adding more models soon.
+[`EncoderDecoderCache`] is designed for encoder-decoder models. It manages both the self-attention and cross-attention caches to ensure storage and retrieval of previous kv pairs. It is possible to individually set a different cache type for the encoder and decoder.

-In terms of usage, there is nothing special to be done and calling `generate()` or `forward()` will handle everything for you.
+This cache type doesn't require any setup. It can be used when calling [`~GenerationMixin.generate`] or a model's `forward` method.

+> [!TIP]
+> The [`EncoderDecoderCache`] currently only supports [Whisper](./model_doc/whisper).

-### Model-specific Cache Classes

+### Model-specific caches

-Some models require storing previous keys, values, or states in a specific way, and the above cache classes cannot be used. For such cases, we have several specialized cache classes that are designed for specific models. These models only accept their own dedicated cache classes and do not support using any other cache types. Some examples include [`~HybridCache`] for [Gemma2](./model_doc/gemma2) series models or [`~MambaCache`] for [Mamba](./model_doc/mamba) architecture models.
+Some models have a unique way of storing past kv pairs or states that is not compatible with any other cache classes.

+[Gemma2](./model_doc/gemma2) requires [`HybridCache`], which uses a combination of [`SlidingWindowCache`] for sliding window attention and [`StaticCache`] for global attention under the hood.

+[Mamba](./model_doc/mamba) requires [`MambaCache`] because the model doesn't have an attention mechanism or kv states.

-## Iterative Generation with Cache

+## Iterative generation

-We have seen how to use each of the cache types when generating. What if you want to use cache in iterative generation setting, for example in applications like chatbots, where interactions involve multiple turns and continuous back-and-forth exchanges. Iterative generation with cache allows these systems to handle ongoing conversations effectively without reprocessing the entire context at each step. But there are some tips that you should know before you start implementing:

-The general format when doing iterative generation is as below. First you have to initialize an empty cache of the type you want, and you can start feeding in new prompts iteratively. Keeping track of dialogues history and formatting can be done with chat templates, read more on that in [chat_templating](./chat_templating)
+A cache can also work in iterative generation settings where there is back-and-forth interaction with a model (chatbots). Like regular generation, iterative generation with a cache allows a model to efficiently handle ongoing conversations without recomputing the entire context at each step. 
-In case you are using Sink Cache, you have to crop your inputs to that maximum length because Sink Cache can generate text longer than its maximum window size, but it expects the first input to not exceed the maximum cache length. +For iterative generation with a cache, start by initializing an empty cache class and then you can feed in your new prompts. Keep track of dialogue history with a [chat template](./chat_templating). +If you're using [`SinkCache`], the inputs need to be truncated to the maximum length because [`SinkCache`] can generate text that exceeds its maximum window size. However, the first input shouldn't exceed the maximum cache length. -```python ->>> import torch ->>> from transformers import AutoTokenizer,AutoModelForCausalLM ->>> from transformers.cache_utils import ( -... DynamicCache, -... SinkCache, -... StaticCache, -... SlidingWindowCache, -... QuantoQuantizedCache, -... QuantizedCacheConfig, -... ) +The example below demonstrates how to use a cache for iterative generation. ->>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ->>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto') ->>> tokenizer = AutoTokenizer.from_pretrained(model_id) +```py +import torch +from transformers import AutoTokenizer,AutoModelForCausalLM +from transformers.cache_utils import ( + DynamicCache, + SinkCache, + StaticCache, + SlidingWindowCache, + QuantoQuantizedCache, + QuantizedCacheConfig, +) ->>> user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."] +model_id = "meta-llama/Llama-2-7b-chat-hf" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto') +tokenizer = AutoTokenizer.from_pretrained(model_id) ->>> past_key_values = DynamicCache() ->>> max_cache_length = past_key_values.get_max_cache_shape() +user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."] ->>> messages = [] ->>> for prompt in user_prompts: -... messages.append({"role": "user", "content": prompt}) -... inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device) -... if isinstance(past_key_values, SinkCache): -... inputs = {k: v[:, -max_cache_length:] for k, v in inputs.items()} -... -... input_length = inputs["input_ids"].shape[1] -... -... outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256, past_key_values=past_key_values) -... completion = tokenizer.decode(outputs[0, input_length: ], skip_special_tokens=True) -... messages.append({"role": "assistant", "content": completion}) +past_key_values = DynamicCache() +max_cache_length = past_key_values.get_max_length() -print(messages) -[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': "Hello, I'm AI."}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': "I'm sorry to hear that you were on a rock concert yesterday. It sounds like a fun experience, but I'm not capable of experiencing music or concerts. However, I can provide you with some information about rock music and its history. Rock music emerged in the 1950s and 1960s in the United States and Britain, and it quickly gained popularity around the world. Some of the most famous rock bands of all time include The Beatles, The Rolling Stones, Led Zeppelin, and Pink Floyd. Rock music has a distinct sound and style, with elements of blues, country, and folk music. 
It often features guitar solos, heavy bass lines, and drums. Rock music has had a significant impact on popular culture, influencing genres such as punk rock, heavy metal, and alternative rock."}] +messages = [] +for prompt in user_prompts: + messages.append({"role": "user", "content": prompt}) + inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device) + if isinstance(past_key_values, SinkCache): + inputs = {k: v[:, -max_cache_length:] for k, v in inputs.items()} + input_length = inputs["input_ids"].shape[1] + outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256, past_key_values=past_key_values) + completion = tokenizer.decode(outputs[0, input_length: ], skip_special_tokens=True) + messages.append({"role": "assistant", "content": completion}) ``` +## Prefill a cache -## Re-use Cache to continue generation - -Sometimes you would want to first fill-in cache object with key/values for certain prefix prompt and re-use it several times to generate different sequences from it. In that case you can construct a `Cache` object that will hold the instruction prompt, and re-use it several times with different text sequences. - -```python ->>> import copy ->>> import torch ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache ->>> from accelerate.test_utils.testing import get_backend - ->>> DEVICE, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) ->>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ->>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=DEVICE) ->>> tokenizer = AutoTokenizer.from_pretrained(model_id) - ->>> # Init StaticCache with big enough max-length (1024 tokens for the below example) ->>> # You can also init a DynamicCache, if that suits you better ->>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=DEVICE, dtype=torch.bfloat16) - ->>> INITIAL_PROMPT = "You are a helpful assistant. " ->>> inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(DEVICE) ->>> # This is the common prompt cached, we need to run forward without grad to be abel to copy ->>> with torch.no_grad(): -... prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values - ->>> prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"] ->>> responses = [] ->>> for prompt in prompts: -... new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(DEVICE) -... past_key_values = copy.deepcopy(prompt_cache) -... outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20) -... response = tokenizer.batch_decode(outputs)[0] -... responses.append(response) - ->>> print(responses) -[' You are a helpful assistant. Help me to write a blogpost about travelling. I am excited to share my experiences with you. I have been traveling for the past', ' You are a helpful assistant. What is the capital of France? \n\nAnswer: Paris is the capital of France.'] -``` +In some situations, you may want to fill a [`Cache`] with kv pairs for a certain prefix prompt and reuse it to generate different sequences. +The example below initializes a [`StaticCache`], and then caches an initial prompt. Now you can generate several sequences from the prefilled prompt. 
-## Legacy cache format +```py +import copy +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache -Prior to the introduction of the `Cache` object, the cache of LLMs used to be a tuple of tuples of tensors. The legacy -format has a dynamic size, growing as we generate text -- very similar to `DynamicCache`. If your project depend on -this legacy format, you can seamlessly convert it to a `DynamicCache` and back. +model_id = "meta-llama/Llama-2-7b-chat-hf" +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda") +tokenizer = AutoTokenizer.from_pretrained(model_id) -```python ->>> import torch ->>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache +# Init StaticCache with big enough max-length (1024 tokens for the below example) +# You can also init a DynamicCache, if that suits you better +prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16) ->>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") ->>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto") ->>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) +INITIAL_PROMPT = "You are a helpful assistant. " +inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda") +# This is the common prompt cached, we need to run forward without grad to be able to copy +with torch.no_grad(): + prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values ->>> # `return_dict_in_generate=True` is required to return the cache. `return_legacy_cache` forces the returned cache ->>> # to be of the legacy type ->>> generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5) +prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"] +responses = [] +for prompt in prompts: + new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda") + past_key_values = copy.deepcopy(prompt_cache) + outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20) + response = tokenizer.batch_decode(outputs)[0] + responses.append(response) ->>> # We can convert a legacy cache to a DynamicCache -- and the other way around. This is helpful if you have custom ->>> # logic to manipulate a cache in a specific format. ->>> cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values) ->>> legacy_format_cache = cache.to_legacy_cache() +print(responses) ``` diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index 7627bef403e1..b1ece3fd8d74 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -325,7 +325,7 @@ for out in pipeline(data()): generated_characters += len(out[0]["generated_text"]) ``` -## Large model optimizations +## Large models [Accelerate](https://hf.co/docs/accelerate/index) enables a couple of optimizations for running large models with [`Pipeline`]. Make sure Accelerate is installed first. 
From 2b2c2b8378d15a8862f259520fe8ad7d796a9b25 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 16 Sep 2024 13:48:54 -0700 Subject: [PATCH 049/116] text generation --- docs/source/en/_toctree.yml | 4 +- docs/source/en/generation_strategies.md | 650 ++++++------------------ docs/source/en/llm_tutorial.md | 318 +++++------- 3 files changed, 290 insertions(+), 682 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c987250e2688..353bca2befe4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -67,9 +67,9 @@ - local: cache_explanation title: Caching - local: llm_tutorial - title: Generation with LLMs + title: Text generation - local: generation_strategies - title: Customize the generation strategy + title: Generation strategies - local: llm_tutorial_optimization title: Getting the most out of LLMs - local: perplexity diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 99049cceef34..b35e59769649 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -1,4 +1,4 @@ - -# Text generation strategies +# Generation strategies -Text generation is essential to many NLP tasks, such as open-ended text generation, summarization, translation, and -more. It also plays a role in a variety of mixed-modality applications that have text as an output like speech-to-text -and vision-to-text. Some of the models that can generate text include -GPT2, XLNet, OpenAI GPT, CTRL, TransformerXL, XLM, Bart, T5, GIT, Whisper. +A decoding strategy informs how a model should select the next generated token. There are many types of decoding strategies, and choosing the appropriate one has a significant impact on the quality of the generated text. -Check out a few examples that use [`~generation.GenerationMixin.generate`] method to produce -text outputs for different tasks: -* [Text summarization](./tasks/summarization#inference) -* [Image captioning](./model_doc/git#transformers.GitForCausalLM.forward.example) -* [Audio transcription](./model_doc/whisper#transformers.WhisperForConditionalGeneration.forward.example) +This guide will help you understand the different decoding strategies available in Transformers and how and when to use them. -Note that the inputs to the generate method depend on the model's modality. They are returned by the model's preprocessor -class, such as AutoTokenizer or AutoProcessor. If a model's preprocessor creates more than one kind of input, pass all -the inputs to generate(). You can learn more about the individual model's preprocessor in the corresponding model's documentation. +## Default decoding strategy -The process of selecting output tokens to generate text is known as decoding, and you can customize the decoding strategy -that the `generate()` method will use. Modifying a decoding strategy does not change the values of any trainable parameters. -However, it can have a noticeable impact on the quality of the generated output. It can help reduce repetition in the text -and make it more coherent. +Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in the [`GenerationConfig`], this strategy generates a maximum of 20 tokens. 
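+The 20 token default can be raised for a single call with `max_new_tokens`, or persistently through the model's [`GenerationConfig`]. The snippet below is a minimal sketch of both options (the checkpoint and the 50 token limit are illustrative choices, not recommendations):
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
+model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
+inputs = tokenizer("I look forward to", return_tensors="pt")
+
+# override the 20 token default for this call only
+outputs = model.generate(**inputs, max_new_tokens=50)
+
+# or change the default carried in the model's generation config
+model.generation_config.max_new_tokens = 50
+outputs = model.generate(**inputs)
+```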
-This guide describes: -* default generation configuration -* common decoding strategies and their main parameters -* saving and sharing custom generation configurations with your fine-tuned model on 🤗 Hub +Greedy search works well for tasks with relatively short outputs. However, it breaks down when generating longer sequences because it begins to repeat itself. - - -`generate()` is a critical component of our [chat CLI](quicktour#chat-with-text-generation-models). -You can apply the learnings of this guide there as well. - - - -## Default text generation configuration - -A decoding strategy for a model is defined in its generation configuration. When using pre-trained models for inference -within a [`pipeline`], the models call the `PreTrainedModel.generate()` method that applies a default generation -configuration under the hood. The default configuration is also used when no custom configuration has been saved with -the model. - -When you load a model explicitly, you can inspect the generation configuration that comes with it through - `model.generation_config`: - -```python ->>> from transformers import AutoModelForCausalLM - ->>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") ->>> model.generation_config -GenerationConfig { - "bos_token_id": 50256, - "eos_token_id": 50256 -} - -``` - -Printing out the `model.generation_config` reveals only the values that are different from the default generation -configuration, and does not list any of the default values. - -The default generation configuration limits the size of the output combined with the input prompt to a maximum of 20 -tokens to avoid running into resource limitations. The default decoding strategy is greedy search, which is the simplest decoding strategy that picks a token with the highest probability as the next token. For many tasks -and small output sizes this works well. However, when used to generate longer outputs, greedy search can start -producing highly repetitive results. - -## Customize text generation - -You can override any `generation_config` by passing the parameters and their values directly to the [`generate`] method: - -```python ->>> my_model.generate(**inputs, num_beams=4, do_sample=True) # doctest: +SKIP -``` - -Even if the default decoding strategy mostly works for your task, you can still tweak a few things. Some of the -commonly adjusted parameters include: - -- `max_new_tokens`: the maximum number of tokens to generate. In other words, the size of the output sequence, not -including the tokens in the prompt. As an alternative to using the output's length as a stopping criteria, you can choose -to stop generation whenever the full generation exceeds some amount of time. To learn more, check [`StoppingCriteria`]. -- `num_beams`: by specifying a number of beams higher than 1, you are effectively switching from greedy search to -beam search. This strategy evaluates several hypotheses at each time step and eventually chooses the hypothesis that -has the overall highest probability for the entire sequence. This has the advantage of identifying high-probability -sequences that start with a lower probability initial tokens and would've been ignored by the greedy search. Visualize how it works [here](https://huggingface.co/spaces/m-ric/beam_search_visualizer). -- `do_sample`: if set to `True`, this parameter enables decoding strategies such as multinomial sampling, beam-search -multinomial sampling, Top-K sampling and Top-p sampling. 
All these strategies select the next token from the probability -distribution over the entire vocabulary with various strategy-specific adjustments. -- `num_return_sequences`: the number of sequence candidates to return for each input. This option is only available for -the decoding strategies that support multiple sequence candidates, e.g. variations of beam search and sampling. Decoding -strategies like greedy search and contrastive search return a single output sequence. - -It is also possible to extend `generate()` with external libraries or handcrafted code. The `logits_processor` argument -allows you to pass custom [`LogitsProcessor`] instances, allowing you to manipulate the next token probability -distributions. Likewise, the `stopping_criteria` argument lets you set custom [`StoppingCriteria`] to stop text generation. -The [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo) library contains examples of external -`generate()`-compatible extensions. - -## Save a custom decoding strategy with your model - -If you would like to share your fine-tuned model with a specific generation configuration, you can: -* Create a [`GenerationConfig`] class instance -* Specify the decoding strategy parameters -* Save your generation configuration with [`GenerationConfig.save_pretrained`], making sure to leave its `config_file_name` argument empty -* Set `push_to_hub` to `True` to upload your config to the model's repo - -```python ->>> from transformers import AutoModelForCausalLM, GenerationConfig - ->>> model = AutoModelForCausalLM.from_pretrained("my_account/my_model") # doctest: +SKIP ->>> generation_config = GenerationConfig( -... max_new_tokens=50, do_sample=True, top_k=50, eos_token_id=model.config.eos_token_id -... ) ->>> generation_config.save_pretrained("my_account/my_model", push_to_hub=True) # doctest: +SKIP -``` - -You can also store several generation configurations in a single directory, making use of the `config_file_name` -argument in [`GenerationConfig.save_pretrained`]. You can later instantiate them with [`GenerationConfig.from_pretrained`]. This is useful if you want to -store several generation configurations for a single model (e.g. one for creative text generation with sampling, and -one for summarization with beam search). You must have the right Hub permissions to add configuration files to a model. - -```python ->>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig - ->>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") - ->>> translation_generation_config = GenerationConfig( -... num_beams=4, -... early_stopping=True, -... decoder_start_token_id=0, -... eos_token_id=model.config.eos_token_id, -... pad_token=model.config.pad_token_id, -... 
) - ->>> # Tip: add `push_to_hub=True` to push to the Hub ->>> translation_generation_config.save_pretrained("/tmp", "translation_generation_config.json") - ->>> # You could then use the named generation config file to parameterize generation ->>> generation_config = GenerationConfig.from_pretrained("/tmp", "translation_generation_config.json") ->>> inputs = tokenizer("translate English to French: Configuration files are easy to use!", return_tensors="pt") ->>> outputs = model.generate(**inputs, generation_config=generation_config) ->>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) -['Les fichiers de configuration sont faciles à utiliser!'] -``` - -## Streaming - -The `generate()` supports streaming, through its `streamer` input. The `streamer` input is compatible with any instance -from a class that has the following methods: `put()` and `end()`. Internally, `put()` is used to push new tokens and -`end()` is used to flag the end of text generation. - - - -The API for the streamer classes is still under development and may change in the future. - - - -In practice, you can craft your own streaming class for all sorts of purposes! We also have basic streaming classes -ready for you to use. For example, you can use the [`TextStreamer`] class to stream the output of `generate()` into -your screen, one word at a time: - -```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer - ->>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2") ->>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") ->>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt") ->>> streamer = TextStreamer(tok) - ->>> # Despite returning the usual output, the streamer will also print the generated text to stdout. ->>> _ = model.generate(**inputs, streamer=streamer, max_new_tokens=20) -An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven, -``` - - -## Watermarking - -The `generate()` supports watermarking the generated text by randomly marking a portion of tokens as "green". -When generating the "green" will have a small 'bias' value added to their logits, thus having a higher chance to be generated. -The watermarked text can be detected by calculating the proportion of "green" tokens in the text and estimating how likely it is -statistically to obtain that amount of "green" tokens for human-generated text. This watermarking strategy was proposed in the paper -["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634). For more information on -the inner functioning of watermarking, it is recommended to refer to the paper. - -The watermarking can be used with any generative model in `tranformers` and does not require an extra classification model -to detect watermarked text. To trigger watermarking, pass in a [`WatermarkingConfig`] with needed arguments directly to the -`.generate()` method or add it to the [`GenerationConfig`]. Watermarked text can be later detected with a [`WatermarkDetector`]. - - - +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer -The WatermarkDetector internally relies on the proportion of "green" tokens, and whether generated text follows the coloring pattern. -That is why it is recommended to strip off the prompt text, if it is much longer than the generated text. -This also can have an effect when one sequence in the batch is a lot longer causing other rows to be padded. 
-Additionally, the detector **must** be initiated with identical watermark configuration arguments used when generating.
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
-

-Let's generate some text with watermarking. In the below code snippet, we set the bias to 2.5 which is a value that
-will be added to "green" tokens' logits. After generating watermarked text, we can pass it directly to the `WatermarkDetector`
-to check if the text is machine-generated (outputs `True` for machine-generated and `False` otherwise).

-```python
->>> from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig

->>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
->>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2")
->>> tok.pad_token_id = tok.eos_token_id
->>> tok.padding_side = "left"

->>> inputs = tok(["This is the beginning of a long story", "Alice and Bob are"], padding=True, return_tensors="pt")
->>> input_len = inputs["input_ids"].shape[-1]

->>> watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash")
->>> out = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=False, max_length=20)

->>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config)
->>> detection_out = detector(out, return_dict=True)
->>> detection_out.prediction
-array([ True, True])
-```

-## Decoding strategies

-Certain combinations of the `generate()` parameters, and ultimately `generation_config`, can be used to enable specific
-decoding strategies. If you are new to this concept, we recommend reading
-[this blog post that illustrates how common decoding strategies work](https://huggingface.co/blog/how-to-generate).

-Here, we'll show some of the parameters that control the decoding strategies and illustrate how you can use them.

-

-Selecting a given decoding strategy is not the only way you can influence the outcome of `generate()` with your model.
-The decoding strategies act based (mostly) on the logits, the distribution of probabilities for the next token, and
-thus selecting a good logits manipulation strategy can go a long way! In other words, manipulating the logits is another
-dimension you can act upon, in addition to selecting a decoding strategy. Popular logits manipulation strategies include
-`top_p`, `min_p`, and `repetition_penalty` -- you can check the full list in the [`GenerationConfig`] class.

-

-### Greedy Search

-[`generate`] uses greedy search decoding by default so you don't have to pass any parameters to enable it. This means the parameters `num_beams` is set to 1 and `do_sample=False`. 
- -```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer - ->>> prompt = "I look forward to" ->>> checkpoint = "distilbert/distilgpt2" - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) ->>> outputs = model.generate(**inputs) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'] +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda") +# explicitly set to default length because Llama2 generation length is 4096 +outputs = model.generate(**inputs, max_new_tokens=20) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +'Hugging Face is an open-source company that provides a suite of tools and services for building, deploying, and maintaining natural language processing' ``` -### Contrastive search +## Contrastive search -The contrastive search decoding strategy was proposed in the 2022 paper [A Contrastive Framework for Neural Text Generation](https://arxiv.org/abs/2202.06417). -It demonstrates superior results for generating non-repetitive yet coherent long outputs. To learn how contrastive search -works, check out [this blog post](https://huggingface.co/blog/introducing-csearch). -The two main parameters that enable and control the behavior of contrastive search are `penalty_alpha` and `top_k`: +[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied. -```python ->>> from transformers import AutoTokenizer, AutoModelForCausalLM +Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return. ->>> checkpoint = "openai-community/gpt2-large" ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer ->>> prompt = "Hugging Face Company is" ->>> inputs = tokenizer(prompt, return_tensors="pt") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda") ->>> outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=100) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Hugging Face Company is a family owned and operated business. We pride ourselves on being the best -in the business and our customer service is second to none.\n\nIf you have any questions about our -products or services, feel free to contact us at any time. We look forward to hearing from you!'] +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda") +# explicitly set to 100 because Llama2 generation length is 4096 +outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. 
The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
```

-### Multinomial sampling

+## Beam search

-As opposed to greedy search that always chooses a token with the highest probability as the
-next token, multinomial sampling (also called ancestral sampling) randomly selects the next token based on the probability distribution over the entire
-vocabulary given by the model. Every token with a non-zero probability has a chance of being selected, thus reducing the
-risk of repetition.
+Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability.

-To enable multinomial sampling set `do_sample=True` and `num_beams=1`.
+> [!TIP]
+> Check out the [beam search visualizer](https://huggingface.co/spaces/m-ric/beam_search_visualizer) to see how beam search works.

-```python
->>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
->>> set_seed(0)  # For reproducibility
+Enable beam search with the `num_beams` parameter (should be greater than 1 otherwise it's equivalent to greedy search).

->>> checkpoint = "openai-community/gpt2-large"
->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
->>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer

->>> prompt = "Today was an amazing day because"
->>> inputs = tokenizer(prompt, return_tensors="pt")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

->>> outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
->>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-["Today was an amazing day because we received these wonderful items by the way of a gift shop. The box arrived on a Thursday and I opened it on Monday afternoon to receive the gifts. Both bags featured pieces from all the previous years!\n\nThe box had lots of surprises in it, including some sweet little mini chocolate chips! I don't think I'd eat all of these. This was definitely one of the most expensive presents I have ever got, I actually got most of them for free!\n\nThe first package came"]
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 50 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=50, num_beams=2)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+"['Hugging Face is an open-source company that develops and maintains the Hugging Face platform, which is a collection of tools and libraries for building and deploying natural language processing (NLP) models. Hugging Face was founded in 2018 by Thomas Wolf']"
```

-### Beam-search decoding

-Unlike greedy search, beam-search decoding keeps several hypotheses at each time step and eventually chooses
-the hypothesis that has the overall highest probability for the entire sequence. 
This has the advantage of identifying high-probability
-sequences that start with lower probability initial tokens and would've been ignored by the greedy search.

-

+## Diverse beam search

-You can visualize how beam-search decoding works in [this interactive demo](https://huggingface.co/spaces/m-ric/beam_search_visualizer): type your input sentence, and play with the parameters to see how the decoding beams change.

-To enable this decoding strategy, specify the `num_beams` (aka number of hypotheses to keep track of) that is greater than 1.

-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
+[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.

->>> prompt = "It is astonishing how one can"
->>> checkpoint = "openai-community/gpt2-medium"
+Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).

->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
->>> inputs = tokenizer(prompt, return_tensors="pt")
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer

->>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

->>> outputs = model.generate(**inputs, num_beams=5, max_new_tokens=50)
->>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['It is astonishing how one can have such a profound impact on the lives of so many people in such a short period of
-time."\n\nHe added: "I am very proud of the work I have been able to do in the last few years.\n\n"I have']
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 50 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
```

-### Beam-search multinomial sampling

+## Multinomial sampling

-As the name implies, this decoding strategy combines beam search with multinomial sampling. You need to specify
-the `num_beams` greater than 1, and set `do_sample=True` to use this decoding strategy.
+Search methods select the most likely tokens. Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire model's vocabulary. This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.

-```python
->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, set_seed
->>> set_seed(0)  # For reproducibility

->>> prompt = "translate English to German: The house is wonderful."
->>> checkpoint = "google-t5/t5-small"
+Enable multinomial sampling with `do_sample=True` and `num_beams=1`. 
->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda") ->>> outputs = model.generate(**inputs, num_beams=5, do_sample=True) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'Das Haus ist wunderbar.' +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda") +# explicitly set to 100 because Llama2 generation length is 4096 +outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.' ``` -### Diverse beam search decoding - -The diverse beam search decoding strategy is an extension of the beam search strategy that allows for generating a more diverse -set of beam sequences to choose from. To learn how it works, refer to [Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf). -This approach has three main parameters: `num_beams`, `num_beam_groups`, and `diversity_penalty`. -The diversity penalty ensures the outputs are distinct across groups, and beam search is used within each group. +## Beam search multinomial sampling +This decoding strategy is a combination of beam search and multinomial sampling. It generates multiple beams and uses a sampling strategy for each beam. -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... "that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. Each design principle serves as a tool that allows us to integrate all " -... "the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) +Enable beam search multinomial sampling by setting `num_beams` to a value greater than 1 and `do_sample=True`. 
->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
->>> inputs = tokenizer(prompt, return_tensors="pt")
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer

->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

->>> outputs = model.generate(**inputs, num_beams=5, do_sample=True)
->>> tokenizer.decode(outputs[0], skip_special_tokens=True)
-'Das Haus ist wunderbar.'
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+# explicitly set to 50 because Llama2 generation length is 4096
+outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=4)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company 100% dedicated to making AI more accessible. We believe that AI should be available to everyone, and we’re working hard to make that a reality.\nWe’re a team of passionate engineers, designers,'
```

-This guide illustrates the main parameters that enable various decoding strategies. More advanced parameters exist for the
-[`generate`] method, which gives you even further control over the [`generate`] method's behavior.
-For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation).
+## Speculative decoding

-### Speculative Decoding

+[Speculative](https://hf.co/papers/2211.17192) or assisted decoding isn't a search or sampling strategy. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more.

-Speculative decoding (also known as assisted decoding) is a modification of the decoding strategies above, that uses an
-assistant model (ideally a much smaller one), to generate a few candidate tokens. The main model then validates the candidate
-tokens in a single forward pass, which speeds up the decoding process. If `do_sample=True`, then the token validation with
-resampling introduced in the [speculative decoding paper](https://arxiv.org/pdf/2211.17192.pdf) is used.
-Assisted decoding assumes the main and assistant models have the same tokenizer, otherwise, see Universal Assisted Decoding below.
+Currently, only greedy search and multinomial sampling are supported with speculative decoding. Batched inputs aren't supported either.

-Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs.
-To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation).
+Enable speculative decoding with the `assistant_model` parameter. You'll notice the fastest speedup with an assistant model that is much smaller than the main model. Add `do_sample=True` to enable token validation with resampling.

-To enable assisted decoding, set the `assistant_model` argument with a model. 
+ + -```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer +```py +from transformers import AutoModelForCausalLM, AutoTokenizer ->>> prompt = "Alice and Bob" ->>> checkpoint = "EleutherAI/pythia-1.4b-deduped" ->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped" +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B") +model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B") +assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M") +inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt") ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) ->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) ->>> outputs = model.generate(**inputs, assistant_model=assistant_model) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a glass of wine.'] +outputs = model.generate(**inputs, assistant_model=assistant_model) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +'Hugging Face is an open-source company that provides a platform for developers to build and deploy machine' ``` @@ -470,139 +188,101 @@ If you're using a `pipeline` object, all you need to do is to pass the assistant -When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, -just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. + + -```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed ->>> set_seed(42) # For reproducibility +Add the `temperature` parameter to control sampling randomness. For speculative decoding, a lower temperature may improve latency. ->>> prompt = "Alice and Bob" ->>> checkpoint = "EleutherAI/pythia-1.4b-deduped" ->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped" +```py +from transformers import AutoModelForCausalLM, AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B") +model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B") +assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M") +inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt") ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) ->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) ->>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are two people who are very different, but they are both very good at what they do. Alice'] +outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +'Hugging Face is an open-source company that is dedicated to creating a better world through technology.' ``` -We recommend to install `scikit-learn` library to enhance the candidate generation strategy and achieve additional speedup. 
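To check how much speculative decoding actually helps on your hardware, a rough timing comparison is usually enough. The sketch below reuses the SmolLM checkpoints from the examples above; the single-run timing and `max_new_tokens=64` are illustrative choices, not a rigorous benchmark.

```py
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt")

def timed_generate(**kwargs):
    # time a single greedy generation call (average several runs for a real benchmark)
    start = time.perf_counter()
    model.generate(**inputs, max_new_tokens=64, do_sample=False, **kwargs)
    return time.perf_counter() - start

baseline = timed_generate()
assisted = timed_generate(assistant_model=assistant_model)
print(f"baseline: {baseline:.2f}s, assisted: {assisted:.2f}s")
```

If the assisted run isn't noticeably faster, try a smaller assistant model; the speed up depends on how often the candidate tokens are accepted.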
+
+

-#### Universal Assisted Decoding
+### Prompt lookup decoding

-Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
-To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
-Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
-in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
-The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
-Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
-to ensure the new tokens include the correct prompt suffix.
+[Prompt lookup decoding](./llm_optims#prompt-lookup-decoding) is a variant of speculative decoding that uses overlapping n-grams as the candidate tokens. It works well for input-grounded tasks such as summarization. Refer to the [prompt lookup decoding](./llm_optims#prompt-lookup-decoding) guide to learn more.

-```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
+Enable prompt lookup decoding with the `prompt_lookup_num_tokens` parameter.

->>> prompt = "Alice and Bob"
->>> checkpoint = "google/gemma-2-9b"
->>> assistant_checkpoint = "double7/vicuna-68m"
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer

->>> assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint)
->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
->>> inputs = tokenizer(prompt, return_tensors="pt")
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
+assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")

->>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
->>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
->>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
+outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20, prompt_lookup_num_tokens=5)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+'Hugging Face is an open-source company that provides a platform for developers to build and deploy machine learning models. It offers a variety of tools'
```

-#### Prompt Lookup
+## DoLa

-Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
-to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
+[Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit differences between the final and early layers. As a result, factual knowledge localized to particular layers is amplified. DoLa is not recommended for smaller models like GPT-2. 
-#### Self-Speculative Decoding +Enable DoLa with the following parameters. -An LLM can be trained to also use its language modeling head with earlier hidden states as input, effectively -skipping layers to yield a lower-quality output -- a technique called early exiting. -We use the lower-quality early exit output as an assistant output, and apply self-speculation to fix the output using the remaining layers. The final generation of that self-speculative solution is the same (or has the same distribution) as the original model's generation. -If the model you're using was trained to do early exit, you can pass -`assistant_early_exit` (integer). In this case, the assistant model will be the same model but exiting early, hence the -"self-speculative" name. Because the assistant model is a portion of the target model, caches and weights can be shared, which results in lower memory requirements. As in other assisted generation methods, the final generated result has the same quality as if no assistant had been used. +- `dola_layers` are the candidate layers to be contrasted with the final layer. It can be a string with `low` or `high` to contrast the lower or higher parts of a layer. `high` is recommended for short-answer tasks like TruthfulQA. `low` is recommended for long-answer reasoning tasks like GSM8K, StrategyQA, FACTOR, and VicunaQA. -```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer + When a model has tied word embeddings, layer 0 is skipped and it begins from layer 2. ->>> prompt = "Alice and Bob" ->>> checkpoint = "facebook/layerskip-llama3.2-1B" + It can also be a list of integers that represent the layer indices between 0 and the total number of layers. Layer 0 is the word embedding, 1 is the first transformer layer, and so on. Refer to the table below for the range of layer indices depending on the number of model layers. ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") +| layers | low | high | +|---|---|---| +| > 40 | (0, 20, 2) | (N - 20, N, 2) | +| <= 40 | range(0, N // 2, 2) | range(N // 2, N, 2) | ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) ->>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a'] -``` +- `repetition_penalty` reduces repetition and it is recommended to set it to 1.2. + + + -### DoLa Decoding +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer -**D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the -hallucinations of LLMs, as described in this paper of ICLR 2024 [DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models](https://arxiv.org/abs/2309.03883). +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B") +model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda") +inputs = tokenizer("What is the highest peak in the world??", return_tensors="pt").to("cuda") -DoLa is achieved by contrasting the differences in logits obtained from final -layers versus earlier layers, thus amplify the factual knowledge localized to particular part of transformer layers. 
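+# dola_layers="high" contrasts the upper layers with the final layer, which the guide recommends for short factual answers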
+outputs = model.generate(**inputs, max_new_tokens=50, dola_layers="high", do_sample=False) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +" Mount EverestMount Everest, called Himalaya in Nepali, is the world's highest peak, lying almost 9.5 kilometers above the sea level and the tallest mountain from 19,036.91 ft. The mountain was" +``` -Do the following two steps to activate DoLa decoding when calling the `model.generate` function: -1. Set the `dola_layers` argument, which can be either a string or a list of integers. - - If set to a string, it can be one of `low`, `high`. - - If set to a list of integers, it should be a list of layer indices between 0 and the total number of layers in the model. The 0-th layer is word embedding, and the 1st layer is the first transformer layer, and so on. -2. Set `repetition_penalty = 1.2` is suggested to reduce repetition in DoLa decoding. + + -See the following examples for DoLa decoding with the 32-layer LLaMA-7B model. +Contrast layers 18 and 20 with the final layer. -```python ->>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed ->>> import torch ->>> from accelerate.test_utils.testing import get_backend - ->>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) ->>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") ->>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16).to(device) ->>> set_seed(42) - ->>> text = "On what date was the Declaration of Independence officially signed?" ->>> inputs = tokenizer(text, return_tensors="pt").to(device) - -# Vanilla greddy decoding ->>> vanilla_output = model.generate(**inputs, do_sample=False, max_new_tokens=50) ->>> tokenizer.batch_decode(vanilla_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True) -['\nThe Declaration of Independence was signed on July 4, 1776.\nWhat was the date of the signing of the Declaration of Independence?\nThe Declaration of Independence was signed on July 4,'] - -# DoLa decoding with contrasting higher part of layers (layers 16,18,...,30) ->>> dola_high_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers='high') ->>> tokenizer.batch_decode(dola_high_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True) -['\nJuly 4, 1776, when the Continental Congress voted to separate from Great Britain. The 56 delegates to the Continental Congress signed the Declaration on August 2, 1776.'] - -# DoLa decoding with contrasting specific layers (layers 28 and 30) ->>> dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2) ->>> tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True) -['\nIn 1891, when he was 54 years old, John Jacob Astor founded his empire. He opened a one-man business and spent the next 27 years working 10-hour days. 
When'] -``` +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B") +model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda") +inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda") -#### Understanding the `dola_layers` argument +outputs = model.generate(**inputs, max_new_tokens=50, dola_layers=[18,20], do_sample=False, repetition_penalty=1.2) +tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True) +" Mount EverestMount Everest, called Himalaya in Nepali, is the world's highest peak above sea level and it rises to an incredible height of 29,028 feet above the ocean. Its summit is over a mile taller than Mt" -`dola_layers` stands for the candidate layers in premature layer selection, as described in the DoLa paper. The selected premature layer will be contrasted with the final layer. -Setting `dola_layers` to `'low'` or `'high'` will select the lower or higher part of the layers to contrast, respectively. -- For `N`-layer models with `N <= 40` layers, the layers of `range(0, N // 2, 2)` and `range(N // 2, N, 2)` are used for `'low'` and `'high'` layers, respectively. -- For models with `N > 40` layers, the layers of `range(0, 20, 2)` and `range(N - 20, N, 2)` are used for `'low'` and `'high'` layers, respectively. -- If the model has tied word embeddings, we skip the word embeddings (0-th) layer and start from the 2nd layer, as the early exit from word embeddings will become identity function. -- Set the `dola_layers` to a list of integers for layer indices to contrast manually specified layers. For example, setting `dola_layers=[28,30]` will contrast the final layer (32-th layer) with the 28-th and 30-th layers. +``` -The paper suggested that contrasting `'high'` layers to improve short-answer tasks like TruthfulQA, and contrasting `'low'` layers to improve all the other long-answer reasoning tasks, such as GSM8K, StrategyQA, FACTOR, and VicunaQA. Applying DoLa to smaller models like GPT-2 is not recommended, as the results shown in the Appendix N of the paper. + + \ No newline at end of file diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index b0cb96293b68..3ef1222b4cb2 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -1,4 +1,4 @@ - - -# Generation with LLMs +# Text generation [[open-in-colab]] -LLMs, or Large Language Models, are the key component behind text generation. In a nutshell, they consist of large pretrained transformer models trained to predict the next word (or, more precisely, token) given some input text. Since they predict one token at a time, you need to do something more elaborate to generate new sentences other than just calling the model -- you need to do autoregressive generation. - -Autoregressive generation is the inference-time procedure of iteratively calling a model with its own generated outputs, given a few initial inputs. In 🤗 Transformers, this is handled by the [`~generation.GenerationMixin.generate`] method, which is available to all models with generative capabilities. - - +Text generation is one of the most popular applications of large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token. 
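To make that token-by-token loop concrete, here is a minimal hand-written sketch of greedy next-token generation. GPT-2 is only used as a small stand-in model for illustration, and in practice [`~GenerationMixin.generate`] handles all of this for you.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# gpt2 is just a small stand-in checkpoint for illustration
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("Hugging Face is an open-source company", return_tensors="pt").input_ids
with torch.no_grad():
    for _ in range(20):  # a predefined maximum length
        logits = model(input_ids).logits
        # greedily pick the most likely next token and append it to the running sequence
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
        if next_token.item() == tokenizer.eos_token_id:  # stop at the EOS token
            break
print(tokenizer.decode(input_ids[0]))
```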
-If you want to jump straight to chatting with a model, [try our chat CLI](quicktour#chat-with-text-generation-models).
+In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.

-
+This guide will show you the basics of text generation with the [`~GenerationMixin.generate`] API and some common pitfalls to avoid.

-This tutorial will show you how to:
+## Generate

-* Generate text with an LLM
-* Avoid common pitfalls
-* Next steps to help you get the most out of your LLM
-
-Before you begin, make sure you have all the necessary libraries installed:
+Before you begin, it's helpful to install [bitsandbytes](https://hf.co/docs/bitsandbytes/index) to quantize really large models to reduce their memory usage.

```bash
-pip install transformers bitsandbytes>=0.39.0 -q
+!pip install transformers bitsandbytes>0.39.0 -q
```

Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more.

+Load an LLM with [`~PreTrainedModel.from_pretrained`] and add the following two parameters to lessen the memory requirements.

-## Generate text
-
-A language model trained for [causal language modeling](tasks/language_modeling) takes a sequence of text tokens as input and returns the probability distribution for the next token.
+- `device_map="auto"` enables Accelerate's [Big Model Inference](./models#big-model-inference) feature for automatically initializing the model skeleton and loading and dispatching the model weights across all available devices, starting with the fastest device (GPU).
+- `quantization_config` is a configuration object that defines the quantization settings. This example uses bitsandbytes as the quantization backend (see the [Quantization](./quantization/overview) section for more available backends) and it loads the model in [4-bits](./quantization/bitsandbytes). A quick way to verify what both options did is sketched below.

-
-
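As a minimal check on what these two options do, the sketch below loads the same Mistral checkpoint and then inspects the result; it assumes a GPU is available for the 4-bit weights, and it relies on the standard `hf_device_map` attribute and `get_memory_footprint` method of the loaded model.

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# load with the two options described above, then inspect the result
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", device_map="auto", quantization_config=quantization_config
)

print(model.hf_device_map)           # how the modules were dispatched across devices
print(model.get_memory_footprint())  # memory taken by the loaded weights, in bytes
```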
- -
"Forward pass of an LLM"
-
- -A critical aspect of autoregressive generation with LLMs is how to select the next token from this probability distribution. Anything goes in this step as long as you end up with a token for the next iteration. This means it can be as simple as selecting the most likely token from the probability distribution or as complex as applying a dozen transformations before sampling from the resulting distribution. - - -
- -
"Autoregressive generation iteratively selects the next token from a probability distribution to generate text"
-
- -The process depicted above is repeated iteratively until some stopping condition is reached. Ideally, the stopping condition is dictated by the model, which should learn when to output an end-of-sequence (`EOS`) token. If this is not the case, generation stops when some predefined maximum length is reached. - -Properly setting up the token selection step and the stopping condition is essential to make your model behave as you'd expect on your task. That is why we have a [`~generation.GenerationConfig`] file associated with each model, which contains a good default generative parameterization and is loaded alongside your model. +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -Let's talk code! +# load model and set up quantization configuration +quantization_config = BitsAndBytesConfig(load_in_4bit=True) +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto", quantization_config=quantization_config) +``` - +Tokenize your input, and set the [`~PreTrainedTokenizer.padding_side`] parameter to `"left"` because a LLM is not trained to continue generation from padding tokens. The tokenizer returns the input ids and attention mask. -If you're interested in basic LLM usage, our high-level [`Pipeline`](pipeline_tutorial) interface is a great starting point. However, LLMs often require advanced features like quantization and fine control of the token selection step, which is best done through [`~generation.GenerationMixin.generate`]. Autoregressive generation with LLMs is also resource-intensive and should be executed on a GPU for adequate throughput. +> [!TIP] +> Process more than one prompt at a time by passing a list of strings to the tokenizer. Batch the inputs to improve throughput at a small cost to latency and memory. - +```py +# tokenize input +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left") +model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda") +``` -First, you need to load the model. +Pass the inputs to [`~GenerationMixin.generate`] to generate tokens, and then [`~PreTrainedTokenizer.batch_decode`] the generated tokens back to text. ```py ->>> from transformers import AutoModelForCausalLM - ->>> model = AutoModelForCausalLM.from_pretrained( -... "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True -... ) +# generate and decode back to text +generated_ids = model.generate(**model_inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +"A list of colors: red, blue, green, yellow, orange, purple, pink," ``` -You'll notice two flags in the `from_pretrained` call: +## Pitfalls - - `device_map` ensures the model is moved to your GPU(s) - - `load_in_4bit` applies [4-bit dynamic quantization](main_classes/quantization) to massively reduce the resource requirements +The section below covers some common issues that you may encounter during text generation and how to solve them. -There are other ways to initialize a model, but this is a good baseline to begin with an LLM. +## Wrong output length -Next, you need to preprocess your text input with a [tokenizer](tokenizer_summary). +[`~GenerationMixin.generate`] returns up to 20 tokens by default unless otherwise specified in a models [`GenerationConfig`]. It is highly recommended to manually set the number of generated tokens with the [`max_new_tokens`] parameter to control the output length. 
[Decoder-only](https://hf.co/learn/nlp-course/chapter1/6?fw=pt) models returns the initial prompt along with the generated tokens. ```py ->>> from transformers import AutoTokenizer ->>> from accelerate.test_utils.testing import get_backend - ->>> DEVICE, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) ->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left") ->>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(DEVICE) +model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda") ``` -The `model_inputs` variable holds the tokenized text input, as well as the attention mask. While [`~generation.GenerationMixin.generate`] does its best effort to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results. - -After tokenizing the inputs, you can call the [`~generation.GenerationMixin.generate`] method to returns the generated tokens. The generated tokens then should be converted to text before printing. + + ```py ->>> generated_ids = model.generate(**model_inputs) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -'A list of colors: red, blue, green, yellow, orange, purple, pink,' +generated_ids = model.generate(**model_inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A sequence of numbers: 1, 2, 3, 4, 5' ``` -Finally, you don't need to do it one sequence at a time! You can batch your inputs, which will greatly improve the throughput at a small latency and memory cost. All you need to do is to make sure you pad your inputs properly (more on that below). + + ```py ->>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default ->>> model_inputs = tokenizer( -... ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True -... ).to(DEVICE) ->>> generated_ids = model.generate(**model_inputs) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True) -['A list of colors: red, blue, green, yellow, orange, purple, pink,', -'Portugal is a country in southwestern Europe, on the Iber'] +generated_ids = model.generate(**model_inputs, max_new_tokens=50) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,' ``` -And that's it! In a few lines of code, you can harness the power of an LLM. + + +## Wrong decoding strategy -## Common pitfalls +The default decoding strategy in [`~GenerationMixin.generate`] is *greedy search*, which selects the next most likely token, unless otherwise specified in a models [`GenerationConfig`]. While this decoding strategy works well for input-grounded tasks (transcription, translation), it is not optimal for more creative use cases (story writing, chat applications). -There are many [generation strategies](generation_strategies), and sometimes the default values may not be appropriate for your use case. If your outputs aren't aligned with what you're expecting, we've created a list of the most common pitfalls and how to avoid them. +For example, enable a [multinomial sampling](./generation_strategies#multinomial-sampling) strategy to generate more diverse outputs. Refer to the [Generation strategy](./generation_strategies) guide for more decoding strategies. 
```py ->>> from transformers import AutoModelForCausalLM, AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") ->>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default ->>> model = AutoModelForCausalLM.from_pretrained( -... "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True -... ) +model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda") ``` -### Generated output is too short/long - -If not specified in the [`~generation.GenerationConfig`] file, `generate` returns up to 20 tokens by default. We highly recommend manually setting `max_new_tokens` in your `generate` call to control the maximum number of new tokens it can return. Keep in mind LLMs (more precisely, [decoder-only models](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt)) also return the input prompt as part of the output. - + + ```py ->>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to(DEVICE) - ->>> # By default, the output will contain up to 20 tokens ->>> generated_ids = model.generate(**model_inputs) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -'A sequence of numbers: 1, 2, 3, 4, 5' - ->>> # Setting `max_new_tokens` allows you to control the maximum length ->>> generated_ids = model.generate(**model_inputs, max_new_tokens=50) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,' +generated_ids = model.generate(**model_inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] ``` -### Incorrect generation mode - -By default, and unless specified in the [`~generation.GenerationConfig`] file, `generate` selects the most likely token at each iteration (greedy decoding). Depending on your task, this may be undesirable; creative tasks like chatbots or writing an essay benefit from sampling. On the other hand, input-grounded tasks like audio transcription or translation benefit from greedy decoding. Enable sampling with `do_sample=True`, and you can learn more about this topic in this [blog post](https://huggingface.co/blog/how-to-generate). + + ```py ->>> # Set seed for reproducibility -- you don't need this unless you want full reproducibility ->>> from transformers import set_seed ->>> set_seed(42) +generated_ids = model.generate(**model_inputs, do_sample=True) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` ->>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to(DEVICE) + + ->>> # LLM + greedy decoding = repetitive, boring output ->>> generated_ids = model.generate(**model_inputs) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -'I am a cat. I am a cat. I am a cat. I am a cat' +## Wrong padding side ->>> # With sampling, the output becomes more creative! ->>> generated_ids = model.generate(**model_inputs, do_sample=True) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -'I am a cat. Specifically, I am an indoor-only cat. I' -``` +Inputs need to be padded if they don't have the same length. But LLMs aren't trained to continue generation from padding tokens, which means the [`~PreTrainedTokenizer.padding_side`] parameter needs to be set to the left of the input. -### Wrong padding side - -LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt) architectures, meaning they continue to iterate on your input prompt. 
If your inputs do not have the same length, they need to be padded. Since LLMs are not trained to continue from pad tokens, your input needs to be left-padded. Make sure you also don't forget to pass the attention mask to generate! + + ```py ->>> # The tokenizer initialized above has right-padding active by default: the 1st sequence, ->>> # which is shorter, has padding on the right side. Generation fails to capture the logic. ->>> model_inputs = tokenizer( -... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" -... ).to(DEVICE) ->>> generated_ids = model.generate(**model_inputs) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +model_inputs = tokenizer( + ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" +).to("cuda") +generated_ids = model.generate(**model_inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] '1, 2, 33333333333' +``` ->>> # With left-padding, it works as expected! ->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left") ->>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default ->>> model_inputs = tokenizer( -... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" -... ).to(DEVICE) ->>> generated_ids = model.generate(**model_inputs) ->>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + + + +```py +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left") +tokenizer.pad_token = tokenizer.eos_token +model_inputs = tokenizer( + ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt" +).to("cuda") +generated_ids = model.generate(**model_inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] '1, 2, 3, 4, 5, 6,' ``` -### Wrong prompt - -Some models and tasks expect a certain input prompt format to work properly. When this format is not applied, you will get a silent performance degradation: the model kinda works, but not as well as if you were following the expected prompt. More information about prompting, including which models and tasks need to be careful, is available in this [guide](tasks/prompting). Let's see an example with a chat LLM, which makes use of [chat templating](chat_templating): - -```python ->>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha") ->>> model = AutoModelForCausalLM.from_pretrained( -... "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True -... ) ->>> set_seed(0) ->>> prompt = """How many helicopters can a human eat in one sitting? Reply as a thug.""" ->>> model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE) ->>> input_length = model_inputs.input_ids.shape[1] ->>> generated_ids = model.generate(**model_inputs, max_new_tokens=20) ->>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]) -"I'm not a thug, but i can tell you that a human cannot eat" ->>> # Oh no, it did not follow our instruction to reply as a thug! Let's see what happens when we write ->>> # a better prompt and use the right template for this model (through `tokenizer.apply_chat_template`) - ->>> set_seed(0) ->>> messages = [ -... { -... "role": "system", -... "content": "You are a friendly chatbot who always responds in the style of a thug", -... }, -... {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, -... 
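If you aren't sure which side is being padded, a quick look at the attention mask makes it obvious. The sketch below assumes the same Mistral tokenizer used in the examples above; with left padding, the shorter sequence's attention mask starts with zeros.

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token  # Mistral has no pad token by default

batch = tokenizer(["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt")
# with padding_side="left", the pad tokens (and the 0s in the attention mask) sit at the start
print(tokenizer.padding_side)
print(batch["attention_mask"])
```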
] ->>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(DEVICE) ->>> input_length = model_inputs.shape[1] ->>> generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20) ->>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]) -'None, you thug. How bout you try to focus on more useful questions?' ->>> # As we can see, it followed a proper thug style 😎 -``` + + -## Further resources +## Wrong prompt format -While the autoregressive generation process is relatively straightforward, making the most out of your LLM can be a challenging endeavor because there are many moving parts. For your next steps to help you dive deeper into LLM usage and understanding: +Some models and tasks expect a certain input prompt format, and if the format is incorrect, the model returns a suboptimal output. You can learn more about prompting in the [prompt engineering](./tasks/prompting) guide. -### Advanced generate usage +For example, a chat model expects the input as a [chat template](./chat_templating). Your prompt should include a `role` and `content` to indicate who is participating in the conversation. If you try to pass your prompt as a single string, the model doesn't always return the expected output. -1. Guide on how to [control different generation methods](generation_strategies), how to set up the generation configuration file, and how to stream the output; -2. [Accelerating text generation](llm_optims); -3. [Prompt templates for chat LLMs](chat_templating); -4. [Prompt design guide](tasks/prompting); -5. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils). Most of the classes, including the logits processors, have usage examples! +```py +from transformers import AutoTokenizer, AutoModelForCausalLM -### LLM leaderboards +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha") +model = AutoModelForCausalLM.from_pretrained( + "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True +) +``` -1. [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), which focuses on the quality of the open-source models; -2. [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard), which focuses on LLM throughput. + + -### Latency, throughput and memory utilization +```py +prompt = """How many cats does it take to change a light bulb? Reply as a pirate.""" +model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda") +input_length = model_inputs.input_ids.shape[1] +generated_ids = model.generate(**model_inputs, max_new_tokens=50) +print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]) +"Aye, matey! 'Tis a simple task for a cat with a keen eye and nimble paws. First, the cat will climb up the ladder, carefully avoiding the rickety rungs. Then, with" +``` -1. Guide on how to [optimize LLMs for speed and memory](llm_tutorial_optimization); -2. Guide on [quantization](main_classes/quantization) such as bitsandbytes and autogptq, which shows you how to drastically reduce your memory requirements. 
+ + -### Related libraries +```py +messages = [ + { + "role": "system", + "content": "You are a friendly chatbot who always responds in the style of a pirate", + }, + {"role": "user", "content": "How many cats does it take to change a light bulb?"}, +] +model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda") +input_length = model_inputs.shape[1] +generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=50) +print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]) +"Arr, matey! According to me beliefs, 'twas always one cat to hold the ladder and another to climb up it an’ change the light bulb, but if yer looking to save some catnip, maybe yer can +``` -1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices; -2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files); -3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation (e.g. JSON, SQL, Python); -4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs; -5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation; -6. [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo), containing additional options to control text generation with 🤗 Transformers. See our related [blog post](https://huggingface.co/blog/logits-processor-zoo). + + From 6691fe5143b85da536396f55d806ff7bfaa4ba83 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 16 Sep 2024 14:02:53 -0700 Subject: [PATCH 050/116] fix --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/generation_strategies.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 353bca2befe4..bace4f431505 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -232,6 +232,8 @@ title: Image Feature Extraction - local: tasks/mask_generation title: Mask Generation + - local: tasks/keypoint_detection + title: Keypoint detection - local: tasks/knowledge_distillation_for_image_classification title: Knowledge Distillation for Computer Vision - title: Multimodal diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index b35e59769649..55dbc6f7fa9e 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -285,4 +285,4 @@ tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tok ```
-
\ No newline at end of file +
From 9eefd8bae3e494ff97ec859f37aadff35344c39d Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 17 Sep 2024 12:21:03 -0700 Subject: [PATCH 051/116] chat pipeline --- docs/source/en/_toctree.yml | 6 +- docs/source/en/conversations.md | 250 +++++------------------- docs/source/en/generation_strategies.md | 2 - 3 files changed, 49 insertions(+), 209 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index bace4f431505..d42a27ddc116 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -74,12 +74,12 @@ title: Getting the most out of LLMs - local: perplexity title: Perplexity of fixed-length models - - title: Chat models + - title: Chat sections: - local: conversations - title: Chatting with Transformers + title: Chat pipeline - local: chat_templating - title: Chat templates + title: Templates - title: Framework-specific inference optimization sections: - local: tf_xla diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md index a48c046b4949..5548063fa606 100644 --- a/docs/source/en/conversations.md +++ b/docs/source/en/conversations.md @@ -14,62 +14,46 @@ rendered properly in your Markdown viewer. --> -# Chatting with Transformers +# Chat pipeline -If you're reading this article, you're almost certainly aware of **chat models**. Chat models are conversational -AIs that you can send and receive messages with. The most famous of these is the proprietary ChatGPT, but there are -now many open-source chat models which match or even substantially exceed its performance. These models are free to -download and run on a local machine. Although the largest and most capable models require high-powered hardware -and lots of memory to run, there are smaller models that will run perfectly well on a single consumer GPU, or even -an ordinary desktop or notebook CPU. +Chat models are conversational models that you can send and receive messages with. There are many chat models available to choose from, but in general, larger models tend to be more capable though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Some mixture-of-expert (MoE) models have names like "8x7B" or "141B-A35B" which basically means it's a 57B and 141B parameter model. Without quantization, you'll need ~2 bytes of memory per parameter. To reduce memory requirements, try quantizing the model. -This guide will help you get started with chat models. We'll start with a brief quickstart guide that uses a convenient, -high-level "pipeline". This is all you need if you just want to start running a chat model -immediately. After the quickstart, we'll move on to more detailed information about -what exactly chat models are, how to choose an appropriate one, and a low-level breakdown of each of the -steps involved in talking to a chat model. We'll also give some tips on optimizing the performance and memory usage -of your chat models. +Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to further help you identify the best chat models for your use case. Models that are specialized in certain domains (medical, legal text, non-English languages, etc.) may sometimes outperform larger general purpose models. +> [!TIP] +> Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)! 
-## Quickstart +This guide shows you how to build and format a conversation, and how to quickly start chatting with a model with the [`TextGenerationPipeline`]. -If you have no time for details, here's the brief summary: Chat models continue chats. This means that you pass them -a conversation history, which can be as short as a single user message, and the model will continue the conversation -by adding its response. Let's see this in action. First, let's build a chat: +## TextGenerationPipeline -```python +The [`TextGenerationPipeline`] is a high-level text generation API with a "chat mode". Chat mode is turned on when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format). + +To start, build a chat history with the following two roles. + +- `system` describes how the model should behave and respond when you're chatting with it. This role isn't supported by all chat models. +- `user` is where you enter your first message to the model. + +```py chat = [ {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} ] ``` -Notice that in addition to the user's message, we added a **system** message at the start of the conversation. Not all -chat models support system messages, but when they do, they represent high-level directives about how the model -should behave in the conversation. You can use this to guide the model - whether you want short or long responses, -lighthearted or serious ones, and so on. If you want the model to do useful work instead of -practicing its improv routine, you can either omit the system message or try a terse one such as "You are a helpful and intelligent -AI assistant who responds to user queries." - -Once you have a chat, the quickest way to continue it is using the [`TextGenerationPipeline`]. -Let's see this in action with `LLaMA-3`. Note that `LLaMA-3` is a gated model, which means you will need to -[apply for access](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and log in with your Hugging Face -account to use it. We'll also use `device_map="auto"`, which will load the model on GPU if there's enough memory -for it, and set the dtype to `torch.bfloat16` to save memory: +Create a [`TextGenerationPipeline`] and pass the `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. -```python +```py import torch from transformers import pipeline -pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") -response = pipe(chat, max_new_tokens=512) -print(response[0]['generated_text'][-1]['content']) +pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") +response = pipeline(chat, max_new_tokens=512) +print(response[0]["generated_text"][-1]["content"]) ``` -And you'll get: - -```text -(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, +```txt +(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright, alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide! So, you wanna know what's fun to do in the Big Apple? 
Well, let me tell you, there's a million @@ -91,22 +75,18 @@ So, there you have it, pal! That's my expert advice on what to do in New York. N excuse me, I've got some oil changes to attend to. (winks) ``` -You can continue the chat by appending your own response to it. The -`response` object returned by the pipeline actually contains the entire chat so far, so we can simply append -a message and pass it back: +Use the `append` method on `chat` to respond to the model's message. -```python -chat = response[0]['generated_text'] +```py +chat = response[0]["generated_text"] chat.append( {"role": "user", "content": "Wait, what's so wild about soup cans?"} ) -response = pipe(chat, max_new_tokens=512) -print(response[0]['generated_text'][-1]['content']) +response = pipeline(chat, max_new_tokens=512) +print(response[0]["generated_text"][-1]["content"]) ``` -And you'll get: - -```text +```txt (laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" @@ -120,171 +100,33 @@ But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. ( But, hey, that's what makes art, art, right? (laughs) ``` -The remainder of this tutorial will cover specific topics such -as performance and memory, or how to select a chat model for your needs. - -## Choosing a chat model - -There are an enormous number of different chat models available on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending), -and new users often feel very overwhelmed by the selection offered. Don't be, though! You really need to just focus on -two important considerations: -- The model's size, which will determine if you can fit it in memory and how quickly it will -run. -- The quality of the model's chat output. - -In general, these are correlated - bigger models tend to be -more capable, but even so there's a lot of variation at a given size point! - -### Size and model naming -The size of a model is easy to spot - it's the number in the model name, like "8B" or "70B". This is the number of -**parameters** in the model. Without quantization, you should expect to need about 2 bytes of memory per parameter. -This means that an "8B" model with 8 billion parameters will need about 16GB of memory just to fit the parameters, -plus a little extra for other overhead. It's a good fit for a high-end consumer GPU with 24GB of memory, such as a 3090 -or 4090. - -Some chat models are "Mixture of Experts" models. These may list their sizes in different ways, such as "8x7B" or -"141B-A35B". The numbers are a little fuzzier here, but in general you can read this as saying that the model -has approximately 56 (8x7) billion parameters in the first case, or 141 billion parameters in the second case. - -Note that it is very common to use quantization techniques to reduce the memory usage per parameter to 8 bits, 4 bits, -or even less. This topic is discussed in more detail in the [Memory considerations](#memory-considerations) section below. - -### But which chat model is best? -Even once you know the size of chat model you can run, there's still a lot of choice out there. One way to sift through -it all is to consult **leaderboards**. 
Two of the most popular leaderboards are the [OpenLLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) -and the [LMSys Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard). Note that the LMSys leaderboard -also includes proprietary models - look at the `licence` column to identify open-source ones that you can download, then -search for them on the [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending). - -### Specialist domains -Some models may be specialized for certain domains, such as medical or legal text, or non-English languages. -If you're working in these domains, you may find that a specialized model will give you big performance benefits. -Don't automatically assume that, though! Particularly when specialized models are smaller or older than the current -cutting-edge, a top-end general-purpose model may still outclass them. Thankfully, we are beginning to see -[domain-specific leaderboards](https://huggingface.co/blog/leaderboard-medicalllm) that should make it easier to locate -the best models for specialized domains. - -## What happens inside the pipeline? - -The quickstart above used a high-level pipeline to chat with a chat model, which is convenient, but not the -most flexible. Let's take a more low-level approach, to see each of the steps involved in chat. Let's start with -a code sample, and then break it down: - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer -import torch - -# Prepare the input as before -chat = [ - {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, - {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} -] - -# 1: Load the model and tokenizer -model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") - -# 2: Apply the chat template -formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) -print("Formatted chat:\n", formatted_chat) - -# 3: Tokenize the chat (This can be combined with the previous step using tokenize=True) -inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False) -# Move the tokenized inputs to the same device the model is on (GPU/CPU) -inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()} -print("Tokenized inputs:\n", inputs) - -# 4: Generate text from the model -outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1) -print("Generated tokens:\n", outputs) - -# 5: Decode the output back to a string -decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True) -print("Decoded output:\n", decoded_output) -``` - -There's a lot in here, each piece of which could be its own document! Rather than going into too much detail, I'll cover -the broad ideas, and leave the details for the linked documents. The key steps are: - -1. [Models](https://huggingface.co/learn/nlp-course/en/chapter2/3) and [Tokenizers](https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt) are loaded from the Hugging Face Hub. -2. The chat is formatted using the tokenizer's [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating) -3. The formatted chat is [tokenized](https://huggingface.co/learn/nlp-course/en/chapter2/4) using the tokenizer. -4. 
We [generate](https://huggingface.co/docs/transformers/en/llm_tutorial) a response from the model. -5. The tokens output by the model are decoded back to a string - -## Performance, memory and hardware - -You probably know by now that most machine learning tasks are run on GPUs. However, it is entirely possible -to generate text from a chat model or language model on a CPU, albeit somewhat more slowly. If you can fit -the model in GPU memory, though, this will usually be the preferable option. - -### Memory considerations - -By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] will load the model in -`float32` precision. This means that it will need 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion -parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in -"bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx -or newer), you can load the model in `bfloat16` precision, using the `torch_dtype` argument as we did above. +## Performance -It is possible to go even lower than 16-bits using "quantization", a method to lossily compress model weights. This -allows each parameter to be squeezed down to 8 bits, 4 bits or even less. Note that, especially at 4 bits, -the model's outputs may be negatively affected, but often this is a tradeoff worth making to fit a larger and more -capable chat model in memory. Let's see this in action with `bitsandbytes`: +Transformers load models in full precision by default, and for a 8B model, this requires ~32GB of memory! Reduce a model's memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index). -```python -from transformers import AutoModelForCausalLM, BitsAndBytesConfig +> [!TIP] +> Refer to the [Quantization](./quantization/overview) section for more information about different quantization backends. -quantization_config = BitsAndBytesConfig(load_in_8bit=True) # You can also try load_in_4bit -model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", quantization_config=quantization_config) -``` - -Or we can do the same thing using the `pipeline` API: +Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipelines `model_kwargs` parameter. -```python +```py from transformers import pipeline, BitsAndBytesConfig -quantization_config = BitsAndBytesConfig(load_in_8bit=True) # You can also try load_in_4bit -pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config}) +quantization_config = BitsAndBytesConfig(load_in_8bit=True) +pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config}) ``` -There are several other options for quantizing models besides `bitsandbytes` - please see the [Quantization guide](./quantization) -for more information. - -### Performance considerations - - - -For a more extensive guide on language model performance and optimization, check out [LLM Inference Optimization](./llm_optims) . - - - - -As a general rule, larger chat models will be slower in addition to requiring more memory. 
It's possible to be -more concrete about this, though: Generating text from a chat model is unusual in that it is bottlenecked by -**memory bandwidth** rather than compute power, because every active parameter must be read from memory for each -token that the model generates. This means that number of tokens per second you can generate from a chat -model is generally proportional to the total bandwidth of the memory it resides in, divided by the size of the model. +In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for each generated token. For a 16GB model, this means 16GB must be read from memory for every generated token. -In our quickstart example above, our model was ~16GB in size when loaded in `bfloat16` precision. -This means that 16GB must be read from memory for every token generated by the model. Total memory bandwidth can -vary from 20-100GB/sec for consumer CPUs to 200-900GB/sec for consumer GPUs, specialized CPUs like -Intel Xeon, AMD Threadripper/Epyc or high-end Apple silicon, and finally up to 2-3TB/sec for data center GPUs like -the Nvidia A100 or H100. This should give you a good idea of the generation speed you can expect from these different -hardware types. +The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types. -Therefore, if you want to improve the speed of text generation, the easiest solution is to either reduce the -size of the model in memory (usually by quantization), or get hardware with higher memory bandwidth. For advanced users, -several other techniques exist to get around this bandwidth bottleneck. The most common are variants on -[assisted generation](https://huggingface.co/blog/assisted-generation), also known as "speculative -sampling". These techniques try to guess multiple future tokens at once, often using a smaller "draft model", and then -confirm these generations with the chat model. If the guesses are validated by the chat model, more than one token can -be generated per forward pass, which greatly alleviates the bandwidth bottleneck and improves generation speed. +| Hardware | Memory bandwidth | +|---|---| +| consumer CPU | 20-100GB/sec | +| specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec | +| data center GPU (NVIDIA A100/H100) | 2-3TB/sec | -Finally, we should also note the impact of "Mixture of Experts" (MoE) models here. Several popular chat models, -such as Mixtral, Qwen-MoE and DBRX, are MoE models. In these models, not every parameter is active for every token generated. -As a result, MoE models generally have much lower memory bandwidth requirements, even though their total size -can be quite large. They can therefore be several times faster than a normal "dense" model of the same size. However, -techniques like assisted generation are generally ineffective for these models because more parameters will become -active with each new speculated token, which will negate the bandwidth and speed benefits that the MoE architecture -provides. +The easiest solution for improving generation speed is to either reduce the model size in memory with quantization or use hardware with higher memory bandwidth. 
You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed. +> [!TIP] +> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token. diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 55dbc6f7fa9e..5ab8bd05534e 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -280,8 +280,6 @@ inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt" outputs = model.generate(**inputs, max_new_tokens=50, dola_layers=[18,20], do_sample=False, repetition_penalty=1.2) tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True) " Mount EverestMount Everest, called Himalaya in Nepali, is the world's highest peak above sea level and it rises to an incredible height of 29,028 feet above the ocean. Its summit is over a mile taller than Mt" - - ```
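To make the speculative decoding idea above concrete, here is a minimal sketch that pairs a small draft model with a larger target model through the `assistant_model` argument of [`~GenerationMixin.generate`]. The checkpoints are placeholders; any target and draft pair that shares a tokenizer works, and the actual speedup depends on how often the draft tokens are accepted.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoints: any target and draft pair that share a tokenizer works.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("Speculative decoding speeds up generation because", return_tensors="pt").to(model.device)

# The draft model proposes several tokens per step and the main model verifies them in a single forward pass.
outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```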
From 88d3c6f6447ed87ce986d36f31fd0600b05c7538 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 23 Sep 2024 14:55:59 -0700 Subject: [PATCH 052/116] chat stuff --- docs/source/en/_toctree.yml | 4 + docs/source/en/chat_extras.md | 299 ++++++++++++++++++++++ docs/source/en/chat_templating_writing.md | 145 +++++++++++ 3 files changed, 448 insertions(+) create mode 100644 docs/source/en/chat_extras.md create mode 100644 docs/source/en/chat_templating_writing.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d42a27ddc116..957e9fc3e3db 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -80,6 +80,10 @@ title: Chat pipeline - local: chat_templating title: Templates + - local: chat_templating_writing + title: Template writing + - local: chat_extras + title: Tools and RAG - title: Framework-specific inference optimization sections: - local: tf_xla diff --git a/docs/source/en/chat_extras.md b/docs/source/en/chat_extras.md new file mode 100644 index 000000000000..697f80d218ff --- /dev/null +++ b/docs/source/en/chat_extras.md @@ -0,0 +1,299 @@ + + +# Tools and RAG + +The [`~PreTrainedTokenizerBase.apply_chat_template`] method supports virtually any additional argument types - strings, lists, dicts - besides the chat message. This makes it possible to use chat templates for many use cases. + +This guide will demonstrate how to use chat templates with tools and retrieval-augmented generation (RAG). + +## Tools + +Tools are functions a large language model (LLM) can call to perform specific tasks. It is a powerful way to extend the capabilities of conversational agents with real-time information, computational tools, or access to large databases. + +Follow the rules below when creating a tool. + +1. The function should have a descriptive name. +2. The function arguments must have a type hint in the function header (don't include in the `Args` block). +3. The function must have a [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstring. +4. The function can have a return type and `Returns` block, but these are optional because most tool use models ignore them. + +An example tool to get temperature and wind speed is shown below. + +```py +def get_current_temperature(location: str, unit: str) -> float: + """ + Get the current temperature at a location. + + Args: + location: The location to get the temperature for, in the format "City, Country" + unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) + Returns: + The current temperature at the specified location in the specified units, as a float. + """ + return 22. # A real function should probably actually get the temperature! + +def get_current_wind_speed(location: str) -> float: + """ + Get the current wind speed in km/h at a given location. + + Args: + location: The location to get the temperature for, in the format "City, Country" + Returns: + The current wind speed at the given location in km/h, as a float. + """ + return 6. # A real function should probably actually get the wind speed! + +tools = [get_current_temperature, get_current_wind_speed] +``` + +Load a model and tokenizer that supports tool use like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B), but you can also consider a larger model like [Command-R](./model_doc/cohere) and [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it. 
+ +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B") +tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B") +model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto") +``` + +Create a chat message. + +```py +messages = [ + {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, + {"role": "user", "content": "Hey, what's the temperature in Paris right now?"} +] +``` + +Use [`~PreTrainedTokenizerBase.apply_chat_template`] on the messages and pass the list of tools to the `tools` parameter. Then you can pass the inputs to the model for generation. + +```py +inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") +inputs = {k: v.to(model.device) for k, v in inputs.items()} +outputs = model.generate(**inputs, max_new_tokens=128) +print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):])) +``` + +```txt + +{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"} +<|im_end|> +``` + +The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature. + +Now append the `get_current_temperature` function and these arguments to the chat message as `tool_call`. The `tool_call` dictionary should be provided to the `assistant` role instead of the `system` or `user`. + +> [!WARNING] +> The OpenAI API uses a JSON string as its `tool_call` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict. + + + + +```py +tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} +messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]}) +``` + +Allow the assistant to read the function outputs and chat with the user. + +```py +inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") +inputs = {k: v.to(model.device) for k, v in inputs.items()} +out = model.generate(**inputs, max_new_tokens=128) +print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) +``` + +```txt +The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|> +``` + + + + +For [Mistral](./model_doc/mistral) and [Mixtral](./model_doc/mixtral) models, you need an additional `tool_call_id`. The `tool_call_id` is 9 randomly generated alphanumeric characters assigned to the `id` key in the `tool_call` dictionary. 
+ +```py +tool_call_id = "9Ae3bDc2F" +tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} +messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]}) +``` + +```py +inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") +inputs = {k: v.to(model.device) for k, v in inputs.items()} +out = model.generate(**inputs, max_new_tokens=128) +print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) +``` + + + + +### Schema + +[`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the model chat template. A LLM never sees the code inside the function. In other words, a LLM doesn't care how the model works technically, it only cares about function **definition** and **arguments**. + +The JSON schema is automatically generated behind the scenes as long as your function follows the rules listed earlier above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually convert a schema for more visibility or debugging. + +```py +from transformers.utils import get_json_schema + +def multiply(a: float, b: float): + """ + A function that multiplies two numbers + + Args: + a: The first number to multiply + b: The second number to multiply + """ + return a * b + +schema = get_json_schema(multiply) +print(schema) +``` + +```json +{ + "type": "function", + "function": { + "name": "multiply", + "description": "A function that multiplies two numbers", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number", + "description": "The first number to multiply" + }, + "b": { + "type": "number", + "description": "The second number to multiply" + } + }, + "required": ["a", "b"] + } + } +} +``` + +You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to definie precise schemas for more complex functions. + +> [!WARNING] +> Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions for example with nested arguments. + +The example below demonstrates writing a schema manually and then passing it to [`~PreTrainedTokenizerBase.apply_chat_template`]. 
+ +```py +# A simple function that takes no arguments +current_time = { + "type": "function", + "function": { + "name": "current_time", + "description": "Get the current local time as a string.", + "parameters": { + 'type': 'object', + 'properties': {} + } + } +} + +# A more complete function that takes two numerical arguments +multiply = { + 'type': 'function', + 'function': { + 'name': 'multiply', + 'description': 'A function that multiplies two numbers', + 'parameters': { + 'type': 'object', + 'properties': { + 'a': { + 'type': 'number', + 'description': 'The first number to multiply' + }, + 'b': { + 'type': 'number', 'description': 'The second number to multiply' + } + }, + 'required': ['a', 'b'] + } + } +} + +model_input = tokenizer.apply_chat_template( + messages, + tools = [current_time, multiply] +) +``` + +## Retrieval-augmented generation (RAG) + +Retrieval-augmented generation (RAG) models enhance a models existing knowledge by allowing it to search documents for additional information before returning a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `content` keys. + +> [!TIP] +> The `documents` parameter for RAG isn't widely supported and many models have chat templates that ignore `documents`. Verify if a model supports `documents` by reading its model card or executing `print(tokenizer.chat_template)` to see if the `documents` key is present. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024) both support `documents` in their RAG chat templates. + +Create a list of documents to pass to the model. + +```py +documents = [ + { + "title": "The Moon: Our Age-Old Foe", + "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." + }, + { + "title": "The Sun: Our Age-Old Friend", + "text": "Although often underappreciated, the sun provides several notable benefits..." + } +] +``` + +Set `chat_template="rag"` in [`~PreTrainedTokenizerBase.apply_chat_template`] and generate a response. 
+ +```py +from transformers import AutoTokenizer, AutoModelForCausalLM + +# Load the model and tokenizer +tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit") +model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto") +device = model.device # Get the device the model is loaded on + +# Define conversation input +conversation = [ + {"role": "user", "content": "What has Man always dreamed of?"} +] + +input_ids = tokenizer.apply_chat_template( + conversation=conversation, + documents=documents, + chat_template="rag", + tokenize=True, + add_generation_prompt=True, + return_tensors="pt").to(device) + +# Generate a response +generated_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + +# Decode and print the generated text along with generation prompt +generated_text = tokenizer.decode(generated_tokens[0]) +print(generated_text) +``` diff --git a/docs/source/en/chat_templating_writing.md b/docs/source/en/chat_templating_writing.md new file mode 100644 index 000000000000..dd793e611b2d --- /dev/null +++ b/docs/source/en/chat_templating_writing.md @@ -0,0 +1,145 @@ + + +# Template writing + +A chat template is a [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) template stored in the tokenizers [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. A chat template performs the following three roles. + +1. Print the role enclosed in `<|` and `|>` (`<|user|>`, `<|assistant|>`, etc.). +2. Print the message followed by an end-of-sequence (EOS) token. +3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model knows to generate an assistant response. + +An example template is shown below. + +```jinja +{%- for message in messages %} + {{- '<|' + message['role'] + |>\n' }} + {{- message['content'] + eos_token }} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|assistant|>\n' }} +{%- endif %} +``` + +The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and some template writing tips. + +## Create a template + +Create a template by writing a Jinja template and then setting it as the chat template in the tokenizer. For example, the template below adds `[ASST]` and `[/ASST]` tags to the assistant messages. + +```jinja +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {%- elif message['role'] == 'system' %} + {{- '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} + {%- elif message['role'] == 'assistant' %} + {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {%- endif %} +{%- endfor %} +``` + +Set the template in the tokenizer, and the next time you use [`~PreTrainedTokenizerBase.apply_chat_template`], the new template is used. + +```py +template = tokenizer.chat_template +template = template.replace("SYS", "SYSTEM") # Change the system token +tokenizer.chat_template = template # Set the new template +``` + +The template is saved in the `tokenizer_config.json` file. Save it to the Hub with [`~PushToHubMixin.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model. 
+ +```py +tokenizer.push_to_hub("model_name") +``` + +## Template writing tips + +The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see what template it's using. Try starting with simple models that don't call any tools or support RAG. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for more details about formatting and syntax. + +This section curates some best practices for writing clean and efficient Jinja templates. + +### Trimming whitespace + +Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because whitespace usage should be intentional. Add `-` to strip any whitespace before a block. + +```jinja +{%- for message in messages %} + {{- message['role'] + message['content'] }} +{%- endfor %} +``` + +The incorrect whitespace usage example below may introduce a newline and indentation in the output. + +```jinja +{% for message in messages %} + {{ message['role'] + message['content'] }} +{% endfor %} +``` + +### Special variables + +There are five special variables available inside a template. You can pass virtually any additional arguments to [`~PreTrainedTokenizerBase.apply_chat_template`] and it will be available inside the template as a variable. However, you should try to keep the number of variables to the five below to make it easier for users to use the chat model without writing custom code to handle model-specific arguments. + +- `messages` contains the chat history as a list of message dicts. +- `tools` contains a list of tools in JSON schema format. +- `documents` contains a list of documents with the format `{"title": Title, "contents": "Contents"}` (designed for RAG models). +- `add_generation_prompt` is a boolean that determines whether to add an assistant header at the end of the conversation. +- `bos_token` and `eos_token` are special tokens extracted from a tokenizers `special_tokens_map`. + +### Callable functions + +There are two callable functions available inside a template. + +- `raise_exception(msg)` raises a `TemplateException`. This is useful for debugging or warning users about incorrect template usage. +- `strftime_now(format_str)` retrieves the current date and time in a specific format which could be useful to include in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python. + +### Compatibility with non-Python Jinja + +Jinka is implemented in multiple languages. and they generally have the same syntax. Writing a template in Python allows you to use Python methods such as [lower](https://docs.python.org/3/library/stdtypes.html#str.lower) on strings or [items](https://docs.python.org/3/library/stdtypes.html#dict.items) on dicts. But this won't work if the template is used in a non-Python implementation, for example, when deploying with Javascript or Rust. + +Make the changes below to ensure compatibility across all Jinja implementations. + +- Replace Python methods with Jinja filters. For example, replace `string.lower()` with `string|lower` or `dict.items()` with `dict|dictitems`. Most of the changes follow the same pattern except `string.strip()`, which is replaced with `string|trim`. Refer to the list of [built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) for a complete list of filters. 
+- Replace `True`, `False`, and `None` (these are Python specific) with `true`, `false`, and `none` respectively. +- Directly rendering a dict or list may return different results in other implementations. For example, string entries may change from single-quote to double-quote. To avoid this, add the [tojson](https://jinja.palletsprojects.com/en/3.1.x/templates/#jinja-filters.tojson) filter to maintain consistency. + +### Big templates + +Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates than can be more than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues. + +Write the template in a separate file and extract it to the chat template. + +```py +open("template.jinja", "w").write(tokenizer.chat_template) +``` + +You could also load an edited template back into the tokenizer. + +```py +tokenizer.chat_template = open("template.jinja").read() +``` + +## Contribute + +Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then you can upload it to the Hub with with [`~PushToHubMixin.push_to_hub`]. + +Even if you're not the model owner, it is still helpful to add a template for a model with an empty chat template or a model that is using a default class template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template. + +```py +tokenizer.chat_template = template +tokenizer.push_to_hub("model_name") +``` From 1c4ea9d59a4de46f61226c2e22b2d8e0e4d0a6a2 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 23 Sep 2024 17:03:08 -0700 Subject: [PATCH 053/116] xla --- docs/source/en/_toctree.yml | 16 ++--- docs/source/en/tf_xla.md | 123 ++++++++++++------------------------ 2 files changed, 47 insertions(+), 92 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 957e9fc3e3db..ea24c03b7aa6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -84,12 +84,16 @@ title: Template writing - local: chat_extras title: Tools and RAG - - title: Framework-specific inference optimization + - title: Optimization sections: - - local: tf_xla - title: XLA Integration for TensorFlow Models - local: perf_torch_compile - title: Optimize inference using `torch.compile()` + title: torch.compile + - local: tf_xla + title: XLA + - local: perf_infer_cpu + title: CPU + - local: perf_infer_gpu_one + title: GPU - local: agents title: Agents - local: agents_advanced @@ -100,10 +104,6 @@ title: Interoperability with GGUF files - local: tiktoken title: Interoperability with TikToken files - - local: perf_infer_cpu - title: CPU inference - - local: perf_infer_gpu_one - title: GPU inference - title: Training sections: - title: Trainer API diff --git a/docs/source/en/tf_xla.md b/docs/source/en/tf_xla.md index a585aec068b1..d6822ab5b9ec 100644 --- a/docs/source/en/tf_xla.md +++ b/docs/source/en/tf_xla.md @@ -1,4 +1,4 @@ - -# XLA Integration for TensorFlow Models +# XLA [[open-in-colab]] -Accelerated Linear Algebra, dubbed XLA, is a compiler for accelerating the runtime of TensorFlow Models. 
From the [official documentation](https://www.tensorflow.org/xla): +[Accelerated Linear Algebra (XLA)](https://openxla.org/xla) is a linear algebra compiler that optimizes model runtime across different hardware and frameworks. -XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. +This guide will look specifically at how to accelerate *TensorFlow* models with XLA. -Using XLA in TensorFlow is simple – it comes packaged inside the `tensorflow` library, and it can be triggered with the `jit_compile` argument in any graph-creating function such as [`tf.function`](https://www.tensorflow.org/guide/intro_to_graphs). When using Keras methods like `fit()` and `predict()`, you can enable XLA simply by passing the `jit_compile` argument to `model.compile()`. However, XLA is not limited to these methods - it can also be used to accelerate any arbitrary `tf.function`. +## TensorFlow -Several TensorFlow methods in 🤗 Transformers have been rewritten to be XLA-compatible, including text generation for models such as [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2), [T5](https://huggingface.co/docs/transformers/model_doc/t5) and [OPT](https://huggingface.co/docs/transformers/model_doc/opt), as well as speech processing for models such as [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper). +XLA can potentially accelerate a TensorFlow model without making any source code changes. It is already packaged with the TensorFlow library, and it is triggered with `jit_compile` in any graph creating function such as [tf.function](https://www.tensorflow.org/api_docs/python/tf/function). -While the exact amount of speed-up is very much model-dependent, for TensorFlow text generation models inside 🤗 Transformers, we noticed a speed-up of ~100x. This document will explain how you can use XLA for these models to get the maximum amount of performance. We’ll also provide links to additional resources if you’re interested to learn more about the benchmarks and our design philosophy behind the XLA integration. +If you're using Keras methods like [fit](https://keras.io/api/models/model_training_apis/#fit-method) and [predict](https://keras.io/api/models/model_training_apis/#predict-method), enable XLA by passing `jit_compile=True` to [compile](https://keras.io/api/models/model_training_apis/#compile-method). -## Running TF functions with XLA +```py +model.compile(jit_compile=True) +``` -Let us consider the following model in TensorFlow: +XLA can be used to accelerate any arbitrary [tf.function](https://www.tensorflow.org/api_docs/python/tf/function). -```py +Models with a TensorFlow implementation like [GPT2](./model_doc/gpt2), [T5](./model_doc/t5), [OPT](./model_doc/opt), and [Whisper](./model_doc/whisper) are XLA compatible. The amount of speed up depends on a model, but in general, TensorFlow models in Transformers get a ~100x speed up. + +### Functions + +A typical forward pass in a TensorFlow model is shown below. To run a forward pass with XLA, wrap the model with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function) and set `jit_compile=True`. + +```diff import tensorflow as tf model = tf.keras.Sequential( [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")] ) -``` - -The above model accepts inputs having a dimension of `(10, )`. 
We can use the model for running a forward pass like so: - -```py # Generate random inputs for the model. batch_size = 16 input_vector_dim = 10 random_inputs = tf.random.normal((batch_size, input_vector_dim)) # Run a forward pass. -_ = model(random_inputs) -``` - -In order to run the forward pass with an XLA-compiled function, we’d need to do: - -```py -xla_fn = tf.function(model, jit_compile=True) -_ = xla_fn(random_inputs) +- _ = model(random_inputs) ++ xla_fn = tf.function(model, jit_compile=True) ++ _ = xla_fn(random_inputs) ``` -The default `call()` function of the `model` is used for compiling the XLA graph. But if there’s any other model function you want to compile into XLA that’s also possible with: +The default `call` function of the model is used to compile the XLA graph. But if there's any other model function you want to compile with XLA, wrap them with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function). ```py my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True) ``` -## Running a TF text generation model with XLA from 🤗 Transformers +### Text generation -To enable XLA-accelerated generation within 🤗 Transformers, you need to have a recent version of `transformers` installed. You can install it by running: - -```bash -pip install transformers --upgrade -``` - -And then you can run the following code: +You could also compile other model functions with XLA. For example, enable XLA for text generation by wrapping the [`~TFGenerationMixin.generate`] method with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function). ```py import tensorflow as tf from transformers import AutoTokenizer, TFAutoModelForCausalLM - # Will error if the minimal version of Transformers is not installed. from transformers.utils import check_min_version check_min_version("4.21.0") - tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="") model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2") input_string = ["TensorFlow is"] -# One line to create an XLA generation function xla_generate = tf.function(model.generate, jit_compile=True) tokenized_input = tokenizer(input_string, return_tensors="tf") @@ -97,18 +86,16 @@ generated_tokens = xla_generate(**tokenized_input, num_beams=2) decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) print(f"Generated -- {decoded_text}") -# Generated -- TensorFlow is an open-source, open-source, distributed-source application # framework for the +"Generated -- TensorFlow is an open-source, open-source, distributed-source application framework for the" ``` -As you can notice, enabling XLA on `generate()` is just a single line of code. The rest of the code remains unchanged. However, there are a couple of gotchas in the above code snippet that are specific to XLA. You need to be aware of those to realize the speed-ups that XLA can bring in. We discuss these in the following section. - -## Gotchas to be aware of +## Tracing -When you are executing an XLA-enabled function (like `xla_generate()` above) for the first time, it will internally try to infer the computation graph, which is time-consuming. This process is known as [“tracing”](https://www.tensorflow.org/guide/intro_to_graphs#when_is_a_function_tracing). +When executing an XLA-enabled function for the first time, it tries to infer the computation graph in a process known as *tracing*. 
This is a time-consuming step, but any consecutive calls to the function will be much faster because it won't have to trace the computation graph again. -You might notice that the generation time is not fast. Successive calls of `xla_generate()` (or any other XLA-enabled function) won’t have to infer the computation graph, given the inputs to the function follow the same shape with which the computation graph was initially built. While this is not a problem for modalities with fixed input shapes (e.g., images), you must pay attention if you are working with variable input shape modalities (e.g., text). +To ensure a function is only traced once, the inputs must have the same shape as when the graph was built. This usually isn't an issue for fixed input shapes like images, but it can be an issue for inputs with variable shapes like text. -To ensure `xla_generate()` always operates with the same input shapes, you can specify the `padding` arguments when calling the tokenizer. +One way to handle this is to pad your text so it always has the same shape. Configure the padding options such as [pad_to_multiple_of](https://hf.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.pad.pad_to_multiple_of) in the tokenizer. ```py import tensorflow as tf @@ -120,7 +107,7 @@ input_string = ["TensorFlow is"] xla_generate = tf.function(model.generate, jit_compile=True) -# Here, we call the tokenizer with padding options. +# Call tokenizer with padding options. tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") generated_tokens = xla_generate(**tokenized_input, num_beams=2) @@ -128,47 +115,15 @@ decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) print(f"Generated -- {decoded_text}") ``` -This way, you can ensure that the inputs to `xla_generate()` will always receive inputs with the shape it was traced with and thus leading to speed-ups in the generation time. You can verify this with the code below: - -```py -import time -import tensorflow as tf -from transformers import AutoTokenizer, TFAutoModelForCausalLM - -tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="
") -model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2") - -xla_generate = tf.function(model.generate, jit_compile=True) - -for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]: - tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") - start = time.time_ns() - generated_tokens = xla_generate(**tokenized_input, num_beams=2) - end = time.time_ns() - print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n") -``` - -On a Tesla T4 GPU, you can expect the outputs like so: - -```bash -Execution time -- 30819.6 ms - -Execution time -- 79.0 ms - -Execution time -- 78.9 ms -``` -The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point will trigger re-tracing and thus leading to slow-downs in the generation time. +In addition to the input shape, any changes to the generation options at any point also triggers tracing. -We didn’t cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases. +## Resources -## Additional Resources +Learn more about XLA with the following resources. -Here, we leave you with some additional resources if you want to delve deeper into XLA in 🤗 Transformers and in general. - -* [This Colab Notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) provides an interactive demonstration if you want to fiddle with the XLA-compatible encoder-decoder (like [T5](https://huggingface.co/docs/transformers/model_doc/t5)) and decoder-only (like [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)) text generation models. -* [This blog post](https://huggingface.co/blog/tf-xla-generate) provides an overview of the comparison benchmarks for XLA-compatible models along with a friendly introduction to XLA in TensorFlow. -* [This blog post](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) discusses our design philosophy behind adding XLA support to the TensorFlow models in 🤗 Transformers. -* Recommended posts for learning more about XLA and TensorFlow graphs in general: - * [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla) - * [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) - * [Better performance with tf.function](https://www.tensorflow.org/guide/function) +- A [notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) demonstrating XLA-compatible encoder-decoder and decoder-only text generation models. +- The [Faster Text Generation with TensorFlow and XLA](https://hf.co/blog/tf-xla-generate) blog post compares benchmarks for XLA-compatible models and provides a friendly introduction to XLA in TensorFlow. +- The [How Hugging Face improved Text Generation performance with XLA](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) blog post discusses the design philosophy behind adding XLA to TensorFlow models in Transformers. +- The [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) guide. +- The [Better performance with tf.function](https://www.tensorflow.org/guide/function) guide. +- The [XLA](https://openxla.org/xla) documentation. 
From 6e704366c85beea71783b3fba5093833df76dce7 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 24 Sep 2024 16:09:00 -0700 Subject: [PATCH 054/116] torch.compile --- docs/source/en/_toctree.yml | 96 +------ docs/source/en/perf_torch_compile.md | 364 +++------------------------ 2 files changed, 38 insertions(+), 422 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ea24c03b7aa6..00fc04fe2146 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -19,6 +19,8 @@ title: Share - local: add_new_model title: Add a new model + - local: modular_transformers + title: Modular transformers - local: task_summary title: What 🤗 Transformers can do - local: tasks_explained @@ -264,98 +266,8 @@ title: Community resources - local: troubleshooting title: Troubleshoot - - local: gguf - title: Interoperability with GGUF files - - local: tiktoken - title: Interoperability with TikToken files - - local: modular_transformers - title: Modularity in `transformers` - - local: how_to_hack_models - title: Model Hacking (overwriting a class to your usage) - title: Developer guides -- sections: - - local: quantization/overview - title: Getting started - - local: quantization/bitsandbytes - title: bitsandbytes - - local: quantization/gptq - title: GPTQ - - local: quantization/awq - title: AWQ - - local: quantization/aqlm - title: AQLM - - local: quantization/vptq - title: SpQR - - local: quantization/spqr - title: VPTQ - - local: quantization/quanto - title: Quanto - - local: quantization/eetq - title: EETQ - - local: quantization/higgs - title: HIGGS - - local: quantization/hqq - title: HQQ - - local: quantization/fbgemm_fp8 - title: FBGEMM_FP8 - - local: quantization/optimum - title: Optimum - - local: quantization/torchao - title: TorchAO - - local: quantization/bitnet - title: BitNet - - local: quantization/compressed_tensors - title: compressed-tensors - - local: quantization/finegrained_fp8 - title: Fine-grained FP8 - - local: quantization/contribute - title: Contribute new quantization method - title: Quantization Methods -- sections: - - local: performance - title: Overview - - local: llm_optims - title: LLM inference optimization - - sections: - - local: perf_train_gpu_one - title: Methods and tools for efficient training on a single GPU - - local: perf_train_gpu_many - title: Multiple GPUs and parallelism - - local: fsdp - title: Fully Sharded Data Parallel - - local: deepspeed - title: DeepSpeed - - local: perf_train_cpu - title: Efficient training on CPU - - local: perf_train_cpu_many - title: Distributed CPU training - - local: perf_train_tpu_tf - title: Training on TPU with TensorFlow - - local: perf_train_special - title: PyTorch training on Apple silicon - - local: perf_hardware - title: Custom hardware for training - - local: hpo_train - title: Hyperparameter Search using Trainer API - title: Efficient training techniques - - sections: - - local: perf_infer_cpu - title: CPU inference - - local: perf_infer_gpu_one - title: GPU inference - - local: perf_infer_gpu_multi - title: Multi-GPU inference - title: Optimizing inference - - local: big_models - title: Instantiate a big model - - local: debugging - title: Debugging - - local: tf_xla - title: XLA Integration for TensorFlow Models - - local: perf_torch_compile - title: Optimize inference using `torch.compile()` - title: Performance and scalability -- sections: +- title: Community + sections: - local: contributing title: How to contribute to Transformers? 
- local: testing diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md index 2155a403b2b7..71fe721bb20f 100644 --- a/docs/source/en/perf_torch_compile.md +++ b/docs/source/en/perf_torch_compile.md @@ -1,4 +1,4 @@ - -# Optimize inference using torch.compile() +# torch.compile -This guide aims to provide a benchmark on the inference speed-ups introduced with [`torch.compile()`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for [computer vision models in 🤗 Transformers](https://huggingface.co/models?pipeline_tag=image-classification&library=transformers&sort=trending). +[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) compiles PyTorch code into optimized kernels that significantly speed up inference. This feature relies on TorchDynamo to compile the code into graphs and TorchInductor to further compile the graphs into optimized kernels. It is a powerful optimization tool and in many cases, it only requires adding a single line of code. -## Benefits of torch.compile - -Depending on the model and the GPU, `torch.compile()` yields up to 30% speed-up during inference. To use `torch.compile()`, simply install any version of `torch` above 2.0. +Wrap a model with torch.compile to compile and return an optimized model. -Compiling a model takes time, so it's useful if you are compiling the model only once instead of every time you infer. -To compile any computer vision model of your choice, call `torch.compile()` on the model as shown below: +```py +from transformers import AutoModelForCausalLM -```diff -from transformers import AutoModelForImageClassification - -model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to(DEVICE) -+ model = torch.compile(model) +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +compiled_model = torch.compile(model) ``` -`compile()` comes with multiple modes for compiling, which essentially differ in compilation time and inference overhead. `max-autotune` takes longer than `reduce-overhead` but results in faster inference. Default mode is fastest for compilation but is not as efficient compared to `reduce-overhead` for inference time. In this guide, we used the default mode. You can learn more about it [here](https://pytorch.org/get-started/pytorch-2.0/#user-experience). - -We benchmarked `torch.compile` with different computer vision models, tasks, types of hardware, and batch sizes on `torch` version 2.0.1. - -## Benchmarking code - -Below you can find the benchmarking code for each task. We warm up the GPU before inference and take the mean time of 300 inferences, using the same image each time. - -### Image Classification with ViT - -```python -import torch -from PIL import Image -import requests -import numpy as np -from transformers import AutoImageProcessor, AutoModelForImageClassification -from accelerate.test_utils.testing import get_backend - -device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) -url = 'http://images.cocodataset.org/val2017/000000039769.jpg' -image = Image.open(requests.get(url, stream=True).raw) +> [!TIP] +> The initial call to torch.compile is slow because the model needs to be compiled. Subsequent calls to the compiled model are much faster because it doesn't need to compile again. 
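To see this in practice, the minimal timing sketch below calls the compiled model a few times. The prompt and the loop are only illustrative, and the absolute numbers depend on your hardware; the point is that the first call pays the compilation cost while later calls with the same input shape reuse the compiled kernels.

```py
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
compiled_model = torch.compile(model)

inputs = tokenizer("Compilation pays off after the first call because", return_tensors="pt").to(model.device)

for step in range(3):
    start = time.perf_counter()
    with torch.no_grad():
        compiled_model(**inputs)
    # The first iteration includes compilation time; later iterations reuse the compiled kernels.
    print(f"call {step}: {time.perf_counter() - start:.2f} s")
```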
-processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to(device) -model = torch.compile(model) +There are several parameters you can use to customize the compilation process. Two of the more important ones are listed below. For a full list of parameters, refer to the [torch.compile documentation](https://pytorch.org/docs/stable/generated/torch.compile.html). -processed_input = processor(image, return_tensors='pt').to(device) +## Modes -with torch.no_grad(): - _ = model(**processed_input) +The `mode` parameter offers several performance options for compiling, and you should try different modes to see which one works best for your use case. -``` - -#### Object Detection with DETR - -```python -from transformers import AutoImageProcessor, AutoModelForObjectDetection -from accelerate.test_utils.testing import get_backend - -device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) -processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") -model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device) -model = torch.compile(model) +- `default` is a balanced option between speed and memory. +- `reduce-overhead` reduces the Python overhead at the expense of a little more memory, but it can be faster. +- `max-autotune` offers the fastest speed, but compiling the code takes longer. -texts = ["a photo of a cat", "a photo of a dog"] -inputs = processor(text=texts, images=image, return_tensors="pt").to(device) +```py +from transformers import AutoModelForCausalLM -with torch.no_grad(): - _ = model(**inputs) +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +compiled_model = torch.compile(model, mode="reduce-overhead") ``` -#### Image Segmentation with Segformer +## Fullgraph -```python -from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation -from accelerate.test_utils.testing import get_backend +Fullgraph attempts to compile the entire model into a single graph to maximize performance. torch.compile raises an error if it encounters a graph break, which means it can't compile the model into a single graph. -device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) -processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") -model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to(device) -model = torch.compile(model) -seg_inputs = processor(images=image, return_tensors="pt").to(device) +```py +from transformers import AutoModelForCausalLM -with torch.no_grad(): - _ = model(**seg_inputs) +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True) ``` -Below you can find the list of the models we benchmarked. 
- -**Image Classification** -- [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) -- [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k) -- [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224) -- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) - -**Image Segmentation** -- [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) -- [facebook/mask2former-swin-tiny-coco-panoptic](https://huggingface.co/facebook/mask2former-swin-tiny-coco-panoptic) -- [facebook/maskformer-swin-base-ade](https://huggingface.co/facebook/maskformer-swin-base-ade) -- [google/deeplabv3_mobilenet_v2_1.0_513](https://huggingface.co/google/deeplabv3_mobilenet_v2_1.0_513) - -**Object Detection** -- [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) -- [facebook/detr-resnet-101](https://huggingface.co/facebook/detr-resnet-101) -- [microsoft/conditional-detr-resnet-50](https://huggingface.co/microsoft/conditional-detr-resnet-50) - -Below you can find visualization of inference durations with and without `torch.compile()` and percentage improvements for each model in different hardware and batch sizes. - -
-
- -
-
- -
-
- -
-
- -
-
- -
-
- -
-
- - -![Duration Comparison on V100 with Batch Size of 1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/torch_compile/v100_1_duration.png) - -![Percentage Improvement on T4 with Batch Size of 4](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/torch_compile/T4_4_percentage.png) - -Below you can find inference durations in milliseconds for each model with and without `compile()`. Note that OwlViT results in OOM in larger batch sizes. - -### A100 (batch size: 1) - -| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | -|:---:|:---:|:---:| -| Image Classification/ViT | 9.325 | 7.584 | -| Image Segmentation/Segformer | 11.759 | 10.500 | -| Object Detection/OwlViT | 24.978 | 18.420 | -| Image Classification/BeiT | 11.282 | 8.448 | -| Object Detection/DETR | 34.619 | 19.040 | -| Image Classification/ConvNeXT | 10.410 | 10.208 | -| Image Classification/ResNet | 6.531 | 4.124 | -| Image Segmentation/Mask2former | 60.188 | 49.117 | -| Image Segmentation/Maskformer | 75.764 | 59.487 | -| Image Segmentation/MobileNet | 8.583 | 3.974 | -| Object Detection/Resnet-101 | 36.276 | 18.197 | -| Object Detection/Conditional-DETR | 31.219 | 17.993 | - - -### A100 (batch size: 4) - -| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | -|:---:|:---:|:---:| -| Image Classification/ViT | 14.832 | 14.499 | -| Image Segmentation/Segformer | 18.838 | 16.476 | -| Image Classification/BeiT | 13.205 | 13.048 | -| Object Detection/DETR | 48.657 | 32.418| -| Image Classification/ConvNeXT | 22.940 | 21.631 | -| Image Classification/ResNet | 6.657 | 4.268 | -| Image Segmentation/Mask2former | 74.277 | 61.781 | -| Image Segmentation/Maskformer | 180.700 | 159.116 | -| Image Segmentation/MobileNet | 14.174 | 8.515 | -| Object Detection/Resnet-101 | 68.101 | 44.998 | -| Object Detection/Conditional-DETR | 56.470 | 35.552 | - -### A100 (batch size: 16) - -| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 -
compile** | -|:---:|:---:|:---:| -| Image Classification/ViT | 40.944 | 40.010 | -| Image Segmentation/Segformer | 37.005 | 31.144 | -| Image Classification/BeiT | 41.854 | 41.048 | -| Object Detection/DETR | 164.382 | 161.902 | -| Image Classification/ConvNeXT | 82.258 | 75.561 | -| Image Classification/ResNet | 7.018 | 5.024 | -| Image Segmentation/Mask2former | 178.945 | 154.814 | -| Image Segmentation/Maskformer | 638.570 | 579.826 | -| Image Segmentation/MobileNet | 51.693 | 30.310 | -| Object Detection/Resnet-101 | 232.887 | 155.021 | -| Object Detection/Conditional-DETR | 180.491 | 124.032 | - -### V100 (batch size: 1) - -| **Task/Model** | **torch 2.0 -
no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|
-| Image Classification/ViT | 10.495 | 6.00 |
-| Image Segmentation/Segformer | 13.321 | 5.862 |
-| Object Detection/OwlViT | 25.769 | 22.395 |
-| Image Classification/BeiT | 11.347 | 7.234 |
-| Object Detection/DETR | 33.951 | 19.388 |
-| Image Classification/ConvNeXT | 11.623 | 10.412 |
-| Image Classification/ResNet | 6.484 | 3.820 |
-| Image Segmentation/Mask2former | 64.640 | 49.873 |
-| Image Segmentation/Maskformer | 95.532 | 72.207 |
-| Image Segmentation/MobileNet | 9.217 | 4.753 |
-| Object Detection/Resnet-101 | 52.818 | 28.367 |
-| Object Detection/Conditional-DETR | 39.512 | 20.816 |
-
-### V100 (batch size: 4)
-
-| **Task/Model** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|
-| Image Classification/ViT | 15.181 | 14.501 |
-| Image Segmentation/Segformer | 16.787 | 16.188 |
-| Image Classification/BeiT | 15.171 | 14.753 |
-| Object Detection/DETR | 88.529 | 64.195 |
-| Image Classification/ConvNeXT | 29.574 | 27.085 |
-| Image Classification/ResNet | 6.109 | 4.731 |
-| Image Segmentation/Mask2former | 90.402 | 76.926 |
-| Image Segmentation/Maskformer | 234.261 | 205.456 |
-| Image Segmentation/MobileNet | 24.623 | 14.816 |
-| Object Detection/Resnet-101 | 134.672 | 101.304 |
-| Object Detection/Conditional-DETR | 97.464 | 69.739 |
-
-### V100 (batch size: 16)
-
-| **Task/Model** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|
-| Image Classification/ViT | 52.209 | 51.633 |
-| Image Segmentation/Segformer | 61.013 | 55.499 |
-| Image Classification/BeiT | 53.938 | 53.581 |
-| Object Detection/DETR | OOM | OOM |
-| Image Classification/ConvNeXT | 109.682 | 100.771 |
-| Image Classification/ResNet | 14.857 | 12.089 |
-| Image Segmentation/Mask2former | 249.605 | 222.801 |
-| Image Segmentation/Maskformer | 831.142 | 743.645 |
-| Image Segmentation/MobileNet | 93.129 | 55.365 |
-| Object Detection/Resnet-101 | 482.425 | 361.843 |
-| Object Detection/Conditional-DETR | 344.661 | 255.298 |
-
-### T4 (batch size: 1)
-
-| **Task/Model** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|
-| Image Classification/ViT | 16.520 | 15.786 |
-| Image Segmentation/Segformer | 16.116 | 14.205 |
-| Object Detection/OwlViT | 53.634 | 51.105 |
-| Image Classification/BeiT | 16.464 | 15.710 |
-| Object Detection/DETR | 73.100 | 53.99 |
-| Image Classification/ConvNeXT | 32.932 | 30.845 |
-| Image Classification/ResNet | 6.031 | 4.321 |
-| Image Segmentation/Mask2former | 79.192 | 66.815 |
-| Image Segmentation/Maskformer | 200.026 | 188.268 |
-| Image Segmentation/MobileNet | 18.908 | 11.997 |
-| Object Detection/Resnet-101 | 106.622 | 82.566 |
-| Object Detection/Conditional-DETR | 77.594 | 56.984 |
-
-### T4 (batch size: 4)
-
-| **Task/Model** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|
-| Image Classification/ViT | 43.653 | 43.626 |
-| Image Segmentation/Segformer | 45.327 | 42.445 |
-| Image Classification/BeiT | 52.007 | 51.354 |
-| Object Detection/DETR | 277.850 | 268.003 |
-| Image Classification/ConvNeXT | 119.259 | 105.580 |
-| Image Classification/ResNet | 13.039 | 11.388 |
-| Image Segmentation/Mask2former | 201.540 | 184.670 |
-| Image Segmentation/Maskformer | 764.052 | 711.280 |
-| Image Segmentation/MobileNet | 74.289 | 48.677 |
-| Object Detection/Resnet-101 | 421.859 | 357.614 |
-| Object Detection/Conditional-DETR | 289.002 | 226.945 |
-
-### T4 (batch size: 16)
-
-| **Task/Model** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|
-| Image Classification/ViT | 163.914 | 160.907 |
-| Image Segmentation/Segformer | 192.412 | 163.620 |
-| Image Classification/BeiT | 188.978 | 187.976 |
-| Object Detection/DETR | OOM | OOM |
-| Image Classification/ConvNeXT | 422.886 | 388.078 |
-| Image Classification/ResNet | 44.114 | 37.604 |
-| Image Segmentation/Mask2former | 756.337 | 695.291 |
-| Image Segmentation/Maskformer | 2842.940 | 2656.88 |
-| Image Segmentation/MobileNet | 299.003 | 201.942 |
-| Object Detection/Resnet-101 | 1619.505 | 1262.758 |
-| Object Detection/Conditional-DETR | 1137.513 | 897.390 |
-
-## PyTorch Nightly
-We also benchmarked on PyTorch nightly (2.1.0dev, find the wheel [here](https://download.pytorch.org/whl/nightly/cu118)) and observed improvement in latency both for uncompiled and compiled models.
-
-### A100
-
-| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|:---:|
-| Image Classification/BeiT | Unbatched | 12.462 | 6.954 |
-| Image Classification/BeiT | 4 | 14.109 | 12.851 |
-| Image Classification/BeiT | 16 | 42.179 | 42.147 |
-| Object Detection/DETR | Unbatched | 30.484 | 15.221 |
-| Object Detection/DETR | 4 | 46.816 | 30.942 |
-| Object Detection/DETR | 16 | 163.749 | 163.706 |
-
-### T4
-
-| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|:---:|
-| Image Classification/BeiT | Unbatched | 14.408 | 14.052 |
-| Image Classification/BeiT | 4 | 47.381 | 46.604 |
-| Image Classification/BeiT | 16 | 42.179 | 42.147 |
-| Object Detection/DETR | Unbatched | 68.382 | 53.481 |
-| Object Detection/DETR | 4 | 269.615 | 204.785 |
-| Object Detection/DETR | 16 | OOM | OOM |
-
-### V100
-
-| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|:---:|
-| Image Classification/BeiT | Unbatched | 13.477 | 7.926 |
-| Image Classification/BeiT | 4 | 15.103 | 14.378 |
-| Image Classification/BeiT | 16 | 52.517 | 51.691 |
-| Object Detection/DETR | Unbatched | 28.706 | 19.077 |
-| Object Detection/DETR | 4 | 88.402 | 62.949 |
-| Object Detection/DETR | 16 | OOM | OOM |
-
-## Reduce Overhead
-We benchmarked `reduce-overhead` compilation mode for A100 and T4 in Nightly.
-
-### A100
-
-| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 - compile** |
-|:---:|:---:|:---:|:---:|
-| Image Classification/ConvNeXT | Unbatched | 11.758 | 7.335 |
-| Image Classification/ConvNeXT | 4 | 23.171 | 21.490 |
-| Image Classification/ResNet | Unbatched | 7.435 | 3.801 |
-| Image Classification/ResNet | 4 | 7.261 | 2.187 |
-| Object Detection/Conditional-DETR | Unbatched | 32.823 | 11.627 |
-| Object Detection/Conditional-DETR | 4 | 50.622 | 33.831 |
-| Image Segmentation/MobileNet | Unbatched | 9.869 | 4.244 |
-| Image Segmentation/MobileNet | 4 | 14.385 | 7.946 |
-
-### T4
-
-| **Task/Model** | **Batch Size** | **torch 2.0 - no compile** | **torch 2.0 -
compile** | -|:---:|:---:|:---:|:---:| -| Image Classification/ConvNeXT | Unbatched | 32.137 | 31.84 | -| Image Classification/ConvNeXT | 4 | 120.944 | 110.209 | -| Image Classification/ResNet | Unbatched | 9.761 | 7.698 | -| Image Classification/ResNet | 4 | 15.215 | 13.871 | -| Object Detection/Conditional-DETR | Unbatched | 72.150 | 57.660 | -| Object Detection/Conditional-DETR | 4 | 301.494 | 247.543 | -| Image Segmentation/MobileNet | Unbatched | 22.266 | 19.339 | -| Image Segmentation/MobileNet | 4 | 78.311 | 50.983 | +## Benchmark results +Refer to the table below for performance benchmarks comparing the mean inference time in milliseconds with torch.compile enabled and disabled across various GPUs and batch sizes on the same image. Select **Subset** in the table below to switch between different GPUs, as well as benchmarks on [PyTorch nightly 2.1.0dev](https://download.pytorch.org/whl/nightly/cu118) and torch.compile with `reduce-overhead` mode enabled. + From 0f3d76982e4706d1bf9b29af5ba7b9efe1c0b30f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 25 Sep 2024 16:14:32 -0700 Subject: [PATCH 055/116] cpu inference --- docs/source/en/_toctree.yml | 4 +- docs/source/en/perf_infer_cpu.md | 97 +++++++++++----------------- docs/source/en/perf_torch_compile.md | 4 +- 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 00fc04fe2146..ae47bccb763a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -92,10 +92,10 @@ title: torch.compile - local: tf_xla title: XLA - - local: perf_infer_cpu - title: CPU - local: perf_infer_gpu_one title: GPU + - local: perf_infer_cpu + title: CPU - local: agents title: Agents - local: agents_advanced diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md index 7f8b525b3df6..dee66b73d25e 100644 --- a/docs/source/en/perf_infer_cpu.md +++ b/docs/source/en/perf_infer_cpu.md @@ -1,4 +1,4 @@ - -# CPU inference +# CPU -With some optimizations, it is possible to efficiently run large model inference on a CPU. One of these optimization techniques involves compiling the PyTorch code into an intermediate format for high-performance environments like C++. The other technique fuses multiple operations into one kernel to reduce the overhead of running each operation separately. +CPUs are a viable and cost-effective inference option. With a few optimization methods, it is possible to achieve good performance with large models on CPUs. These methods include fusing kernels to reduce overhead and compiling your code to a faster intermediate format that can be deployed in production environments. -You'll learn how to use [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) for faster inference, and how to convert your PyTorch code to [TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html). If you're using an Intel CPU, you can also use [graph optimizations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features.html#graph-optimization) from [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/index.html) to boost inference speed even more. Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime or OpenVINO (if you're using an Intel CPU). +This guide will show you a few ways to optimize inference on a CPU. 
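For reference, an unoptimized CPU run looks like the sketch below; the checkpoint is only an example, and this plain eager-mode path is the baseline the methods in this guide improve on.

```py
from transformers import pipeline

# Plain eager-mode CPU inference - the baseline the following sections speed up.
classifier = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    device=-1,  # -1 runs the pipeline on CPU
)
print(classifier("CPU inference works out of the box, just slower than an optimized path."))
```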
-## BetterTransformer +## Optimum -BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are: +[Optimum](https://hf.co/docs/optimum/en/index) is a Hugging Face library focused on optimizing model performance across various hardware. It supports [ONNX Runtime](https://onnxruntime.ai/docs/) (ORT), a model accelerator, for a wide range of hardware and frameworks including CPUs. -1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps -2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors +Optimum provides the [`~optimum.onnxruntime.ORTModel`] class for loading a ONNX models. For example, load the [optimum/roberta-base-squad2](https://hf.co/optimum/roberta-base-squad2) checkpoint for question answering inference. This checkpoint contains a [model.onnx](https://hf.co/optimum/roberta-base-squad2/blob/main/model.onnx) file. -BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention). +```py +from transformers import AutoTokenizer, pipeline +from optimum.onnxruntime import ORTModelForQuestionAnswering + +onnx_qa = pipeline("question-answering", model="optimum/roberta-base-squad2", tokenizer="deepset/roberta-base-squad2") + +question = "What's my name?" +context = "My name is Philipp and I live in Nuremberg." +pred = onnx_qa(question, context) +``` - +> [!TIP] +> Optimum includes an [Intel](https://hf.co/docs/optimum/intel/index) extension that provides additional optimizations such as quantization, pruning, and knowledge distillation for Intel CPUs. This extension also includes tools to convert models to [OpenVINO](https://hf.co/docs/optimum/intel/inference), a toolkit for optimizing and deploying models, for even faster inference. -BetterTransformer is not supported for all models. Check this [list](https://huggingface.co/docs/optimum/bettertransformer/overview#supported-models) to see if a model supports BetterTransformer. +### BetterTransformer - +[BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) is a *fastpath* execution of specialized Transformers functions directly on the hardware level such as a CPU. There are two main components of the fastpath execution. -Before you start, make sure you have 🤗 Optimum [installed](https://huggingface.co/docs/optimum/installation). +- fusing multiple operations into a single kernel for faster and more efficient execution +- skipping unnecessary computation of padding tokens with nested tensors -Enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method: +> [!WARNING] +> BetterTransformer isn't supported for all models. Check this [list](https://hf.co/docs/optimum/bettertransformer/overview#supported-models) to see whether a model supports BetterTransformer. + +BetterTransformer is available through Optimum with the [`~PreTrainedModel.to_bettertransformer`] method. 
```py from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder", torch_dtype="auto") +model = AutoModelForCausalLM.from_pretrained("bigscience/bloom") +model = model.to_bettertransformer() ``` ## TorchScript -TorchScript is an intermediate PyTorch model representation that can be run in production environments where performance is important. You can train a model in PyTorch and then export it to TorchScript to free the model from Python performance constraints. PyTorch [traces](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) a model to return a [`ScriptFunction`] that is optimized with just-in-time compilation (JIT). Compared to the default eager mode, JIT mode in PyTorch typically yields better performance for inference using optimization techniques like operator fusion. +[TorchScript](https://pytorch.org/docs/stable/jit.html) is an intermediate PyTorch model format that can be run in non-Python environments, like C++, where performance is critical. Train a PyTorch model and convert it to a TorchScript function or module with [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html). This function optimizes the model with just-in-time (JIT) compilation, and compared to the default eager mode, JIT-compiled models offer better inference performance. -For a gentle introduction to TorchScript, see the [Introduction to PyTorch TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html) tutorial. +> [!TIP] +> Refer to the [Introduction to PyTorch TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html) tutorial for a gentle introduction to TorchScript. -With the [`Trainer`] class, you can enable JIT mode for CPU inference by setting the `--jit_mode_eval` flag: +On a CPU, enable `torch.jit.trace` with the `--jit_mode_eval` flag in [`Trainer`]. ```bash python examples/pytorch/question-answering/run_qa.py \ @@ -64,26 +79,16 @@ python examples/pytorch/question-answering/run_qa.py \ --jit_mode_eval ``` - - -For PyTorch >= 1.14.0, JIT-mode could benefit any model for prediction and evaluation since the dict input is supported in `jit.trace`. - -For PyTorch < 1.14.0, JIT-mode could benefit a model if its forward parameter order matches the tuple input order in `jit.trace`, such as a question-answering model. If the forward parameter order does not match the tuple input order in `jit.trace`, like a text classification model, `jit.trace` will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users. - - +## IPEX -## IPEX graph optimization +[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html) (IPEX) offers additional optimizations for PyTorch on Intel CPUs. IPEX further optimizes TorchScript with [graph optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html) which fuses operations like Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, Add + LayerNorm, and more, into single kernels for faster execution. -Intel® Extension for PyTorch (IPEX) provides further optimizations in JIT mode for Intel CPUs, and we recommend combining it with TorchScript for even faster performance. 
The IPEX [graph optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html) fuses operations like Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, Add + LayerNorm, and more.
-
-To take advantage of these graph optimizations, make sure you have IPEX [installed](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html):
+Make sure IPEX is installed, and set the `--use_ipex` and `--jit_mode_eval` flags in [`Trainer`] to enable IPEX graph optimization and TorchScript.
 
 ```bash
-pip install intel_extension_for_pytorch
+!pip install intel_extension_for_pytorch
 ```
 
-Set the `--use_ipex` and `--jit_mode_eval` flags in the [`Trainer`] class to enable JIT mode with the graph optimizations:
-
 ```bash
 python examples/pytorch/question-answering/run_qa.py \
 --model_name_or_path csarron/bert-base-uncased-squad-v1 \
@@ -96,31 +101,3 @@ python examples/pytorch/question-answering/run_qa.py \
 --use_ipex \
 --jit_mode_eval
 ```
-
-## 🤗 Optimum
-
-<Tip>
-
-Learn more details about using ORT with 🤗 Optimum in the [Optimum Inference with ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/models) guide. This section only provides a brief and simple example.
-
-</Tip>
-
-ONNX Runtime (ORT) is a model accelerator that runs inference on CPUs by default. ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers, without making too many changes to your code. You only need to replace the 🤗 Transformers `AutoClass` with its equivalent [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and load a checkpoint in the ONNX format.
-
-For example, if you're running inference on a question answering task, load the [optimum/roberta-base-squad2](https://huggingface.co/optimum/roberta-base-squad2) checkpoint which contains a `model.onnx` file:
-
-```py
-from transformers import AutoTokenizer, pipeline
-from optimum.onnxruntime import ORTModelForQuestionAnswering
-
-model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2")
-tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
-
-onnx_qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
-
-question = "What's my name?"
-context = "My name is Philipp and I live in Nuremberg."
-pred = onnx_qa(question, context)
-```
-
-If you have an Intel CPU, take a look at 🤗 [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) which supports a variety of compression techniques (quantization, pruning, knowledge distillation) and tools for converting models to the [OpenVINO](https://huggingface.co/docs/optimum/intel/inference) format for higher performance inference.
diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md
index 71fe721bb20f..941bd343e7ae 100644
--- a/docs/source/en/perf_torch_compile.md
+++ b/docs/source/en/perf_torch_compile.md
@@ -59,7 +59,9 @@ compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
 
 ## Benchmark results
 
-Refer to the table below for performance benchmarks comparing the mean inference time in milliseconds with torch.compile enabled and disabled across various GPUs and batch sizes on the same image. Select **Subset** in the table below to switch between different GPUs, as well as benchmarks on [PyTorch nightly 2.1.0dev](https://download.pytorch.org/whl/nightly/cu118) and torch.compile with `reduce-overhead` mode enabled.
+Refer to the table below for performance benchmarks comparing the mean inference time in milliseconds with torch.compile enabled and disabled across various GPUs and batch sizes on the same image. + +Select **Subset** in the table below to switch between different GPUs, as well as benchmarks on [PyTorch nightly](https://download.pytorch.org/whl/nightly/cu118) 2.1.0dev and torch.compile with `reduce-overhead` mode enabled. From f6c33a16dd13dc19e18aaa7bf0bccc2b9705a4c6 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 18 Oct 2024 15:57:34 -0700 Subject: [PATCH 060/116] finetune --- docs/source/en/_toctree.yml | 18 +- docs/source/en/training.md | 432 +++++++----------------------------- 2 files changed, 86 insertions(+), 364 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b9f247eec35c..ca61fba06fbb 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -9,7 +9,6 @@ - title: Base classes sections: - title: Models - isExpanded: false sections: - local: models title: Load @@ -34,7 +33,6 @@ - local: attention title: Attention mechanisms - title: Preprocessors - isExpanded: false sections: - local: fast_tokenizers title: Tokenizers @@ -55,7 +53,6 @@ - title: Inference sections: - title: Pipeline API - isExpanded: false sections: - local: pipeline_tutorial title: Pipeline @@ -66,7 +63,6 @@ - local: add_new_pipeline title: Add a new pipeline - title: LLMs - isExpanded: false sections: - local: tasks/prompting title: Prompt engineering @@ -85,7 +81,6 @@ - local: perplexity title: Perplexity of fixed-length models - title: Chat - isExpanded: false sections: - local: conversations title: Chat pipeline @@ -96,7 +91,6 @@ - local: chat_extras title: Tools and RAG - title: Optimization - isExpanded: false sections: - local: perf_torch_compile title: torch.compile @@ -115,10 +109,9 @@ - title: Training sections: - title: Trainer API - isExpanded: false sections: - local: training - title: Fine-tune a pretrained model + title: Finetuning - local: trainer title: Trainer - local: hpo_train @@ -126,7 +119,6 @@ - local: run_scripts title: Train with a script - title: Distributed training - isExpanded: false sections: - local: accelerate title: Set up distributed training with 🤗 Accelerate @@ -139,7 +131,6 @@ - local: perf_train_cpu_many title: Distributed CPU training - title: Hardware-specific training - isExpanded: false sections: - local: perf_train_gpu_one title: Methods and tools for efficient training on a single GPU @@ -198,7 +189,6 @@ - title: Resources sections: - title: Task recipes - isExpanded: false sections: - title: Natural language processing sections: @@ -289,7 +279,6 @@ - title: API sections: - title: Main classes - isExpanded: false sections: - local: main_classes/agent title: Agents and Tools @@ -336,7 +325,6 @@ - local: main_classes/image_processor title: Image Processor - title: Models - isExpanded: false sections: - title: Text models sections: @@ -1013,7 +1001,6 @@ - local: model_doc/graphormer title: Graphormer - title: Internal helpers - isExpanded: false sections: - local: internal/modeling_utils title: Custom Layers and Utilities @@ -1033,5 +1020,4 @@ title: General Utilities - local: internal/time_series_utils title: Utilities for Time Series - title: Internal Helpers - title: API + \ No newline at end of file diff --git a/docs/source/en/training.md b/docs/source/en/training.md index fa6ef0c0da6e..f31ab8717f6f 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -1,4 +1,4 @@ - -# Fine-tune a 
pretrained model +# Finetuning [[open-in-colab]] -There are significant benefits to using a pretrained model. It reduces computation costs, your carbon footprint, and allows you to use state-of-the-art models without having to train one from scratch. 🤗 Transformers provides access to thousands of pretrained models for a wide range of tasks. When you use a pretrained model, you train it on a dataset specific to your task. This is known as fine-tuning, an incredibly powerful training technique. In this tutorial, you will fine-tune a pretrained model with a deep learning framework of your choice: +Finetuning adapts a pretrained model to a specific task with a smaller specialized dataset. This approach requires far less data and compute compared to training a model from scratch, which makes it a more accessible option for many users. -* Fine-tune a pretrained model with 🤗 Transformers [`Trainer`]. -* Fine-tune a pretrained model in TensorFlow with Keras. -* Fine-tune a pretrained model in native PyTorch. +Transformers provides the [`Trainer`] API, which offers a comprehensive set of training features, for finetuning any of the models on the [Hub](https://hf.co/models). - +> [!TIP] +> Learn how to finetune models for other tasks in our Task Recipes section! -## Prepare a dataset +This guide will show you how to finetune a model with [`Trainer`] to classify Yelp reviews. - - -Before you can fine-tune a pretrained model, download a dataset and prepare it for training. The previous tutorial showed you how to process data for training, and now you get an opportunity to put those skills to the test! - -Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full) dataset: +Login to your Hugging Face account with your user token to ensure you can access gated models and share your models on the Hub. ```py ->>> from datasets import load_dataset +from huggingface_hub import login ->>> dataset = load_dataset("yelp_review_full") ->>> dataset["train"][100] -{'label': 0, - 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'} +login() ``` -As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. 
To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process#map) method to apply a preprocessing function over the entire dataset: +Start by loading the [Yelp Reviews](https://hf.co/datasets/yelp_review_full) dataset and [preprocess](./fast_tokenizers) (tokenize, pad, and truncate) it for training. Use [`~datasets.Dataset.map`] to preprocess the entire dataset in one step. ```py ->>> from transformers import AutoTokenizer - ->>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") - - ->>> def tokenize_function(examples): -... return tokenizer(examples["text"], padding="max_length", truncation=True) - +from datasets import load_dataset +from transformers import AutoTokenizer ->>> tokenized_datasets = dataset.map(tokenize_function, batched=True) -``` +dataset = load_dataset("yelp_review_full") +tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") -If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes: +def tokenize(examples): + return tokenizer(examples["text"], padding="max_length", truncation=True) -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) +dataset = dataset.map(tokenize, batched=True) ``` - - -## Train +> [!TIP] +> Finetune on a smaller subset of the full dataset to reduce the time it takes, but the results won't be as good compared to finetuning on the full dataset. +> ```py +> small_train = dataset["train"].shuffle(seed=42).select(range(1000)) +> small_eval = dataset["test"].shuffle(seed=42).select(range(1000)) +> ``` -At this point, you should follow the section corresponding to the framework you want to use. You can use the links -in the right sidebar to jump to the one you want - and if you want to hide all of the content for a given framework, -just use the button at the top-right of that framework's block! +## Trainer - - -## Train with PyTorch Trainer +[`Trainer`] is an optimized training loop for Transformers models, making it easy to start training right away without manually writing your own training loop. Pick and choose from a wide range of training features in [`TrainingArguments`] such as gradient accumulation, mixed precision, and options for reporting and logging training metrics. -🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. - -Start by loading your model and specify the number of expected labels. From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels. - -By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. 
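The new Trainer intro above names features like gradient accumulation and mixed precision without showing them; as a rough sketch (values are illustrative, not recommendations), they map onto [`TrainingArguments`] like this:

```py
from transformers import TrainingArguments

# Illustrative values only - pick what fits your hardware and dataset.
training_args = TrainingArguments(
    output_dir="yelp_review_classifier",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # effective batch size of 32
    fp16=True,                      # mixed precision on supported GPUs
    logging_steps=100,
    report_to="none",               # or "tensorboard", "wandb", ...
)
```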
+Load a model and provide the number of expected labels (find this information on the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields)): ```py ->>> from transformers import AutoModelForSequenceClassification +from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5, torch_dtype="auto") +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) +"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']" +"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." ``` - - -You will see a warning about some of the pretrained weights not being used and some weights being randomly -initialized. Don't worry, this is completely normal! The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it. - - +> [!TIP] +> The message above is a reminder that the models pretrained head is discarded and replaced with a randomly initialized classification head. The randomly initialized head needs to be finetuned on your specific task to output meanginful predictions. -### Training hyperparameters +With the model loaded, set up your training hyperparameters in [`TrainingArguments`]. Hyperparameters are variables that control the training process - such as the learning rate, batch size, number of epochs - which in turn impacts model performance. Selecting the correct hyperparameters is important and you should experiment with them to find the best configuration for your task. -Next, create a [`TrainingArguments`] class which contains all the hyperparameters you can tune as well as flags for activating different training options. For this tutorial you can start with the default training [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments), but feel free to experiment with these to find your optimal settings. +For this guide, you can use the default hyperparameters which provide a good baseline to begin with. The only settings to configure in this guide are where to save the checkpoint, how to evaluate model performance during training, and pushing the model to the Hub. -Specify where to save the checkpoints from your training: +[`Trainer`] requires a function to compute and report your metric. For a classification task, you'll use [`evaluate.load`] to load the [accuracy](https://hf.co/spaces/evaluate-metric/accuracy) function from the [Evaluate](https://hf.co/docs/evaluate/index) library. Gather the predictions and labels in [`~evaluate.EvaluationModule.compute`] to calculate the accuracy. ```py ->>> from transformers import TrainingArguments - ->>> training_args = TrainingArguments(output_dir="test_trainer") -``` - -### Evaluate - -[`Trainer`] does not automatically evaluate model performance during training. You'll need to pass [`Trainer`] a function to compute and report metrics. 
The [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library provides a simple [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) function you can load with the [`evaluate.load`] (see this [quicktour](https://huggingface.co/docs/evaluate/a_quick_tour) for more information) function: - -```py ->>> import numpy as np ->>> import evaluate - ->>> metric = evaluate.load("accuracy") -``` +import numpy as np +import evaluate -Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the logits to predictions (remember all 🤗 Transformers models return logits): +metric = evaluate.load("accuracy") -```py ->>> def compute_metrics(eval_pred): -... logits, labels = eval_pred -... predictions = np.argmax(logits, axis=-1) -... return metric.compute(predictions=predictions, references=labels) +def compute_metrics(eval_pred): + logits, labels = eval_pred + # convert the logits to their predicted class + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) ``` -If you'd like to monitor your evaluation metrics during fine-tuning, specify the `eval_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch: +Set up [`TrainingArguments`] with where to save the model and when to compute accuracy during training. The example below sets it to `"epoch"`, which reports the accuracy at the end of each epoch. Add `push_to_hub=True` to upload the model to the Hub after training. ```py ->>> from transformers import TrainingArguments, Trainer +from transformers import TrainingArguments ->>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch") +training_args = TrainingArguments( + output_dir="yelp_review_classifier", + eval_strategy="epoch", + push_to_hub=True, +) ``` -### Trainer - -Create a [`Trainer`] object with your model, training arguments, training and test datasets, and evaluation function: +Create a [`Trainer`] instance and pass it the model, training arguments, training and test datasets, and evaluation function. Then call [`~Trainer.train`] to start training. ```py ->>> trainer = Trainer( -... model=model, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + compute_metrics=compute_metrics, +) +trainer.train() ``` -Then fine-tune your model by calling [`~transformers.Trainer.train`]: +Finally, call [`~Trainer.push_to_hub`] to upload your model and tokenizer to the Hub. ```py ->>> trainer.train() +trainer.push_to_hub() ``` - - - - - -## Train a TensorFlow model with Keras +## TensorFlow -You can also train 🤗 Transformers models in TensorFlow with the Keras API! - -### Loading data for Keras - -When you want to train a 🤗 Transformers model with the Keras API, you need to convert your dataset to a format that -Keras understands. If your dataset is small, you can just convert the whole thing to NumPy arrays and pass it to Keras. -Let's try that first before we do anything more complicated. - -First, load a dataset. We'll use the CoLA dataset from the [GLUE benchmark](https://huggingface.co/datasets/glue), -since it's a simple binary text classification task, and just take the training split for now. 
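Once training finishes and the model is pushed, a quick way to sanity-check the uploaded checkpoint is with a pipeline. This is only a sketch; the repo id below is hypothetical, so substitute your own username and output directory.

```py
from transformers import pipeline

# Hypothetical repo id - replace with <your-username>/yelp_review_classifier.
classifier = pipeline("text-classification", model="your-username/yelp_review_classifier")
print(classifier("The service was slow but the food made up for it."))
```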
+[`Trainer`] is incompatible with Transformers TensorFlow models. Instead, finetune these models with [Keras](https://keras.io/) since they're implemented as standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model). ```py +from transformers import TFAutoModelForSequenceClassification from datasets import load_dataset - -dataset = load_dataset("glue", "cola") -dataset = dataset["train"] # Just take the training split for now -``` - -Next, load a tokenizer and tokenize the data as NumPy arrays. Note that the labels are already a list of 0 and 1s, -so we can just convert that directly to a NumPy array without tokenization! - -```py from transformers import AutoTokenizer -import numpy as np +model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) +dataset = load_dataset("yelp_review_full") tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") -tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) -# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras -tokenized_data = dict(tokenized_data) - -labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 -``` -Finally, load, [`compile`](https://keras.io/api/models/model_training_apis/#compile-method), and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) the model. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: +def tokenize(examples): + return tokenizer(examples["text"]) -```py -from transformers import TFAutoModelForSequenceClassification -from tensorflow.keras.optimizers import Adam - -# Load and compile our model -model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased") -# Lower learning rates are often better for fine-tuning transformers -model.compile(optimizer=Adam(3e-5)) # No loss argument! - -model.fit(tokenized_data, labels) +dataset = dataset.map(tokenize) ``` - - -You don't have to pass a loss argument to your models when you `compile()` them! Hugging Face models automatically -choose a loss that is appropriate for their task and model architecture if this argument is left blank. You can always -override this by specifying a loss yourself if you want to! - - - -This approach works great for smaller datasets, but for larger datasets, you might find it starts to become a problem. Why? -Because the tokenized array and labels would have to be fully loaded into memory, and because NumPy doesn’t handle -“jagged” arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole -dataset. That’s going to make your array even bigger, and all those padding tokens will slow down training too! +There are two methods to convert a dataset to [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). -### Loading data as a tf.data.Dataset +- [`~TFPreTrainedModel.prepare_tf_dataset`] is the recommended way to create a [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) because you can inspect the model to figure out which columns to use as inputs and which columns to discard. This allows you to create a simpler, more performant dataset. 
+- [`~datasets.Dataset.to_tf_dataset`] is a more low-level method from the [Datasets](https://hf.co/docs/datasets/index) library that gives you more control over how a dataset is created by specifying the columns and label columns to use. -If you want to avoid slowing down training, you can load your data as a `tf.data.Dataset` instead. Although you can write your own -`tf.data` pipeline if you want, we have two convenience methods for doing this: - -- [`~TFPreTrainedModel.prepare_tf_dataset`]: This is the method we recommend in most cases. Because it is a method -on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and -discard the others to make a simpler, more performant dataset. -- [`~datasets.Dataset.to_tf_dataset`]: This method is more low-level, and is useful when you want to exactly control how -your dataset is created, by specifying exactly which `columns` and `label_cols` to include. - -Before you can use [`~TFPreTrainedModel.prepare_tf_dataset`], you will need to add the tokenizer outputs to your dataset as columns, as shown in -the following code sample: +Add the tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`] to pad each batch, and you can optionally shuffle the dataset. For more complicated preprocessing, pass the preprocessing function to the `collate_fn` parameter instead. ```py -def tokenize_dataset(data): - # Keys of the returned dictionary will be added to the dataset as columns - return tokenizer(data["text"]) - - -dataset = dataset.map(tokenize_dataset) +tf_dataset = model.prepare_tf_dataset( + dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer +) ``` -Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! Once the -columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly -reduces the number of padding tokens compared to padding the entire dataset. +Finally, [compile](https://keras.io/api/models/model_training_apis/#compile-method) and [fit](https://keras.io/api/models/model_training_apis/#fit-method) the model to start training. +> [!TIP] +> It isn't necessary to pass a loss argument to [compile](https://keras.io/api/models/model_training_apis/#compile-method) because Transformers automatically chooses a loss that is appropriate for the task and architecture. However, you can always specify a loss argument if you want. ```py ->>> tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer) -``` - -Note that in the code sample above, you need to pass the tokenizer to `prepare_tf_dataset` so it can correctly pad batches as they're loaded. -If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument. -If you need to do something more complex than just padding samples (e.g. corrupting tokens for masked language -modelling), you can use the `collate_fn` argument instead to pass a function that will be called to transform the -list of samples into a batch and apply any preprocessing you want. See our -[examples](https://github.com/huggingface/transformers/tree/main/examples) or -[notebooks](https://huggingface.co/docs/transformers/notebooks) to see this approach in action. - -Once you've created a `tf.data.Dataset`, you can compile and fit the model as before: - -```py -model.compile(optimizer=Adam(3e-5)) # No loss argument! 
+from tensorflow.keras.optimizers import Adam +model.compile(optimizer=Adam(3e-5)) model.fit(tf_dataset) ``` - - - - - -## Train in native PyTorch - - - - - -[`Trainer`] takes care of the training loop and allows you to fine-tune a model in a single line of code. For users who prefer to write their own training loop, you can also fine-tune a 🤗 Transformers model in native PyTorch. - -At this point, you may need to restart your notebook or execute the following code to free some memory: - -```py -from accelerate.utils.memory import clear_device_cache -del model -del trainer -clear_device_cache() -``` - -Next, manually postprocess `tokenized_dataset` to prepare it for training. - -1. Remove the `text` column because the model does not accept raw text as an input: - - ```py - >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"]) - ``` - -2. Rename the `label` column to `labels` because the model expects the argument to be named `labels`: - - ```py - >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels") - ``` - -3. Set the format of the dataset to return PyTorch tensors instead of lists: - - ```py - >>> tokenized_datasets.set_format("torch") - ``` - -Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: - -```py ->>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) ->>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) -``` - -### DataLoader - -Create a `DataLoader` for your training and test datasets so you can iterate over batches of data: - -```py ->>> from torch.utils.data import DataLoader - ->>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) ->>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) -``` - -Load your model with the number of expected labels: - -```py ->>> from transformers import AutoModelForSequenceClassification - ->>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) -``` - -### Optimizer and learning rate scheduler - -Create an optimizer and learning rate scheduler to fine-tune the model. Let's use the [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch: - -```py ->>> from torch.optim import AdamW - ->>> optimizer = AdamW(model.parameters(), lr=5e-5) -``` - -Create the default learning rate scheduler from [`Trainer`]: - -```py ->>> from transformers import get_scheduler - ->>> num_epochs = 3 ->>> num_training_steps = num_epochs * len(train_dataloader) ->>> lr_scheduler = get_scheduler( -... name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps -... ) -``` - -Lastly, specify `device` to use a GPU if you have access to one. Otherwise, training on a CPU may take several hours instead of a couple of minutes. - -```py ->>> import torch ->>> from accelerate.test_utils.testing import get_backend - ->>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) ->>> model.to(device) -``` - - - -Get free access to a cloud GPU if you don't have one with a hosted notebook like [Colaboratory](https://colab.research.google.com/) or [SageMaker StudioLab](https://studiolab.sagemaker.aws/). - - - -Great, now you are ready to train! 
🥳 - -### Training loop - -To keep track of your training progress, use the [tqdm](https://tqdm.github.io/) library to add a progress bar over the number of training steps: - -```py ->>> from tqdm.auto import tqdm - ->>> progress_bar = tqdm(range(num_training_steps)) - ->>> model.train() ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... outputs = model(**batch) -... loss = outputs.loss -... loss.backward() - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) -``` - -### Evaluate - -Just like how you added an evaluation function to [`Trainer`], you need to do the same when you write your own training loop. But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [`~evaluate.add_batch`] and calculate the metric at the very end. - -```py ->>> import evaluate - ->>> metric = evaluate.load("accuracy") ->>> model.eval() ->>> for batch in eval_dataloader: -... batch = {k: v.to(device) for k, v in batch.items()} -... with torch.no_grad(): -... outputs = model(**batch) - -... logits = outputs.logits -... predictions = torch.argmax(logits, dim=-1) -... metric.add_batch(predictions=predictions, references=batch["labels"]) - ->>> metric.compute() -``` - - - - - -## Additional resources - -For more fine-tuning examples, refer to: - -- [🤗 Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) includes scripts - to train common NLP tasks in PyTorch and TensorFlow. +## Resources -- [🤗 Transformers Notebooks](notebooks) contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow. +Refer to the Transformers [examples](https://github.com/huggingface/transformers/tree/main/examples) for more detailed training scripts on various tasks. You can also check out the [notebooks](./notebooks) for interactive examples. 
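A couple of other [`Trainer`] conveniences are worth keeping in mind alongside these resources. The sketch below reuses the `trainer` instance from earlier in this guide.

```py
# Evaluate the finetuned model on the evaluation split passed to Trainer.
metrics = trainer.evaluate()
print(metrics)

# An interrupted run can also be resumed from the last checkpoint in output_dir.
trainer.train(resume_from_checkpoint=True)
```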
From 4ef333d7ea7c680d63e92676fa1b31affc11a5e2 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 21 Oct 2024 11:26:50 -0700 Subject: [PATCH 061/116] toctree --- docs/source/en/_toctree.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ca61fba06fbb..edb85958dcb4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -7,6 +7,7 @@ - local: installation title: Installation - title: Base classes + isExpanded: False sections: - title: Models sections: @@ -51,6 +52,7 @@ - local: pad_truncation title: Padding and truncation - title: Inference + isExpanded: False sections: - title: Pipeline API sections: @@ -107,6 +109,7 @@ - local: multilingual title: Run inference with multilingual models - title: Training + isExpanded: False sections: - title: Trainer API sections: @@ -187,6 +190,7 @@ - local: torchscript title: Export to TorchScript - title: Resources + isExpanded: False sections: - title: Task recipes sections: @@ -277,6 +281,7 @@ - local: glossary title: Glossary - title: API + isExpanded: False sections: - title: Main classes sections: From d520d47a86b44d73f529fd61f902055594617e2c Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 22 Oct 2024 15:13:22 -0700 Subject: [PATCH 062/116] trainer --- docs/source/en/_toctree.yml | 6 +- docs/source/en/optimizers.md | 23 + docs/source/en/pipeline_tutorial.md | 2 +- docs/source/en/trainer.md | 821 ++++------------------------ 4 files changed, 125 insertions(+), 727 deletions(-) create mode 100644 docs/source/en/optimizers.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index edb85958dcb4..db4ded7f420a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -113,10 +113,12 @@ sections: - title: Trainer API sections: - - local: training - title: Finetuning - local: trainer title: Trainer + - local: training + title: Finetuning + - local: optimizers + title: Optimizers - local: hpo_train title: Hyperparameter Search using Trainer API - local: run_scripts diff --git a/docs/source/en/optimizers.md b/docs/source/en/optimizers.md new file mode 100644 index 000000000000..9375fe56bd03 --- /dev/null +++ b/docs/source/en/optimizers.md @@ -0,0 +1,23 @@ + + +# Optimizers + +## LOMO + +## GrokAdamW + +## Schedule Free diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index b1ece3fd8d74..7460743e3bcc 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -53,7 +53,7 @@ This guide will introduce you to the [`Pipeline`], demonstrate its features, and Here are some examples of how to use [`Pipeline`] for different tasks and modalities. - + ```py diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 8cfe5dfc6afd..60611dd8b43a 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -1,4 +1,4 @@ - -# Agents, supercharged - Multi-agents, External tools, and more - -[[open-in-colab]] - -### What is an agent? - -> [!TIP] -> If you're new to `transformers.agents`, make sure to first read the main [agents documentation](./agents). - -In this page we're going to highlight several advanced uses of `transformers.agents`. - -## Multi-agents - -Multi-agent has been introduced in Microsoft's framework [Autogen](https://huggingface.co/papers/2308.08155). -It simply means having several agents working together to solve your task instead of only one. -It empirically yields better performance on most benchmarks. 
The reason for this better performance is conceptually simple: for many tasks, rather than using a do-it-all system, you would prefer to specialize units on sub-tasks. Here, having agents with separate tool sets and memories allows to achieve efficient specialization. - -You can easily build hierarchical multi-agent systems with `transformers.agents`. - -To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools. - -Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]: - -```py -from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent - -llm_engine = HfApiEngine() - -web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine) - -managed_web_agent = ManagedAgent( - agent=web_agent, - name="web_search", - description="Runs web searches for you. Give it your query as an argument." -) - -manager_agent = ReactCodeAgent( - tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent] -) - -manager_agent.run("Who is the CEO of Hugging Face?") -``` - -> [!TIP] -> For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia). - - -## Advanced tool usage - -### Directly define a tool by subclassing Tool, and share it to the Hub - -Let's take again the tool example from main documentation, for which we had implemented a `tool` decorator. - -If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass. - -The custom tool needs: -- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`. -- An attribute `description` is used to populate the agent's system prompt. -- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input. -- An `output_type` attribute, which specifies the output type. -- A `forward` method which contains the inference code to be executed. - -The types for both `inputs` and `output_type` should be amongst [Pydantic formats](https://docs.pydantic.dev/latest/concepts/json_schema/#generating-json-schema). - -```python -from transformers import Tool -from huggingface_hub import list_models - -class HFModelDownloadsTool(Tool): - name = "model_download_counter" - description = """ - This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. - It returns the name of the checkpoint.""" - - inputs = { - "task": { - "type": "string", - "description": "the task category (such as text-classification, depth-estimation, etc)", - } - } - output_type = "string" - - def forward(self, task: str): - model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) - return model.id -``` - -Now that the custom `HfModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use. 
- - -```python -from model_downloads import HFModelDownloadsTool - -tool = HFModelDownloadsTool() -``` - -You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with read access. - -```python -tool.push_to_hub("{your_username}/hf-model-downloads") -``` - -Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent. - -```python -from transformers import load_tool, CodeAgent - -model_download_tool = load_tool("m-ric/hf-model-downloads") -``` - -### Import a Space as a tool 🚀 - -You can directly import a Space from the Hub as a tool using the [`Tool.from_space`] method! - -You only need to provide the id of the Space on the Hub, its name, and a description that will help you agent understand what the tool does. Under the hood, this will use [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space. - -For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image. - -``` -from transformers import Tool - -image_generation_tool = Tool.from_space( - "black-forest-labs/FLUX.1-dev", - name="image_generator", - description="Generate an image from a prompt") - -image_generation_tool("A sunny beach") -``` -And voilà, here's your image! 🏖️ - - - -Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it. - -```python -from transformers import ReactCodeAgent - -agent = ReactCodeAgent(tools=[image_generation_tool]) - -agent.run( - "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit' -) -``` - -```text -=== Agent thoughts: -improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background" - -Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt. -=== Agent is executing the code below: -image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background") -final_answer(image) -``` - - - -How cool is this? 🤩 - -### Use gradio-tools - -[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging -Face Spaces as tools. It supports many existing Spaces as well as custom Spaces. - -Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images. - -Import and instantiate the tool, then pass it to the `Tool.from_gradio` method: - -```python -from gradio_tools import StableDiffusionPromptGeneratorTool -from transformers import Tool, load_tool, CodeAgent - -gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool() -prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool) -``` - -> [!WARNING] -> gradio-tools require *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible. 
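To show where the wrapped tool fits, here is a rough sketch that hands it to an agent together with the text-to-image tool used elsewhere in this guide; it assumes the imports above and the default LLM engine.

```python
from transformers import ReactCodeAgent

image_generation_tool = load_tool("m-ric/text-to-image")

# The prompt generator refines the prompt before the image tool consumes it.
agent = ReactCodeAgent(tools=[prompt_generator_tool, image_generation_tool])
agent.run("Improve this prompt, then generate an image of it.", prompt="A rabbit wearing a space suit")
```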
- -### Use LangChain tools - -We love Langchain and think it has a very compelling suite of tools. -To import a tool from LangChain, use the `from_langchain()` method. - -Here is how you can use it to recreate the intro's search result using a LangChain web search tool. -This tool will need `pip install google-search-results` to work properly. -```python -from langchain.agents import load_tools -from transformers import Tool, ReactCodeAgent - -search_tool = Tool.from_langchain(load_tools(["serpapi"])[0]) - -agent = ReactCodeAgent(tools=[search_tool]) - -agent.run("How many more blocks (also denoted as layers) are in BERT base encoder compared to the encoder from the architecture proposed in Attention is All You Need?") -``` - -## Display your agent run in a cool Gradio interface - -You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example: - -```py -import gradio as gr -from transformers import ( - load_tool, - ReactCodeAgent, - HfApiEngine, - stream_to_gradio, -) - -# Import tool from Hub -image_generation_tool = load_tool("m-ric/text-to-image") - -llm_engine = HfApiEngine("meta-llama/Meta-Llama-3-70B-Instruct") - -# Initialize the agent with the image generation tool -agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine) - - -def interact_with_agent(task): - messages = [] - messages.append(gr.ChatMessage(role="user", content=task)) - yield messages - for msg in stream_to_gradio(agent, task): - messages.append(msg) - yield messages + [ - gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!") - ] - yield messages - - -with gr.Blocks() as demo: - text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.") - submit = gr.Button("Run illustrator agent!") - chatbot = gr.Chatbot( - label="Agent", - type="messages", - avatar_images=( - None, - "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png", - ), - ) - submit.click(interact_with_agent, [text_input], [chatbot]) - -if __name__ == "__main__": - demo.launch() -``` diff --git a/docs/source/en/hpo_train.md b/docs/source/en/hpo_train.md index 49dde04fe606..60e5451b8e35 100644 --- a/docs/source/en/hpo_train.md +++ b/docs/source/en/hpo_train.md @@ -13,124 +13,155 @@ rendered properly in your Markdown viewer. --> -# Hyperparameter Search using Trainer API +# Hyperparameter search -🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. The [`Trainer`] provides API for hyperparameter search. This doc shows how to enable it in example. +Hyperparameter search discovers an optimal set of hyperparameters that produces the best model performance. [`Trainer`] supports several hyperparameter search backends - [Optuna](https://optuna.readthedocs.io/en/stable/index.html), [SigOpt](https://docs.sigopt.com/), [Weights & Biases](https://docs.wandb.ai/), [Ray Tune](https://docs.ray.io/en/latest/tune/index.html) - through [`~Trainer.hyperparameter_search`] to optimize an objective or even multiple objectives. -## Hyperparameter Search backend +This guide will go over how to set up a hyperparameter search for each of the backends. -[`Trainer`] supports four hyperparameter search backends currently: -[optuna](https://optuna.org/), [sigopt](https://sigopt.com/), [raytune](https://docs.ray.io/en/latest/tune/index.html) and [wandb](https://wandb.ai/site/sweeps). 
- -you should install them before using them as the hyperparameter search backend ```bash pip install optuna/sigopt/wandb/ray[tune] ``` -## How to enable Hyperparameter search in example +To use [`~Trainer.hyperparameter_search`], you need to create a `model_init` function. This function includes basic model information (arguments and configuration) because it needs to be reinitialized for each search trial in the run. + +> [!WARNING] +> The `model_init` function is incompatible with the [optimizers](./main_classes/trainer#transformers.Trainer.optimizers) parameter. Subclass [`Trainer`] and override the [`~Trainer.create_optimizer_and_scheduler`] method to create a custom optimizer and scheduler. -Define the hyperparameter search space, different backends need different format. +An example `model_init` function is shown below. -For sigopt, see sigopt [object_parameter](https://docs.sigopt.com/ai-module-api-references/api_reference/objects/object_parameter), it's like following: ```py ->>> def sigopt_hp_space(trial): -... return [ -... {"bounds": {"min": 1e-6, "max": 1e-4}, "name": "learning_rate", "type": "double"}, -... { -... "categorical_values": ["16", "32", "64", "128"], -... "name": "per_device_train_batch_size", -... "type": "categorical", -... }, -... ] +def model_init(trial): + return AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + token=True if model_args.use_auth_token else None, + ) ``` -For optuna, see optuna [object_parameter](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html#sphx-glr-tutorial-10-key-features-002-configurations-py), it's like following: +Pass `model_init` to [`Trainer`] along with everything else you need for training. Then you can call [`~Trainer.hyperparameter_search`] to start the search. -```py ->>> def optuna_hp_space(trial): -... return { -... "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), -... "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]), -... } -``` +[`~Trainer.hyperparameter_search`] accepts a [direction](./main_classes/trainer#transformers.Trainer.hyperparameter_search.direction) parameter to specify whether to minimize, maximize, or minimize and maximize multiple objectives. You'll also need to set the [backend](./main_classes/trainer#transformers.Trainer.hyperparameter_search.backend) you're using, an [object](./main_classes/trainer#transformers.Trainer.hyperparameter_search.hp_space) containing the hyperparameters to optimize for, the [number of trials](./main_classes/trainer#transformers.Trainer.hyperparameter_search.n_trials) to run, and a [compute_objective](./main_classes/trainer#transformers.Trainer.hyperparameter_search.compute_objective) to return the objective values. -Optuna provides multi-objective HPO. You can pass `direction` in `hyperparameter_search` and define your own compute_objective to return multiple objective values. The Pareto Front (`List[BestRun]`) will be returned in hyperparameter_search, you should refer to the test case `TrainerHyperParameterMultiObjectOptunaIntegrationTest` in [test_trainer](https://github.com/huggingface/transformers/blob/main/tests/trainer/test_trainer.py). 
It's like following +> [!TIP] +> If [compute_objective](./main_classes/trainer#transformers.Trainer.hyperparameter_search.compute_objective) isn't defined, the default [compute_objective](./main_classes/trainer#transformers.Trainer.hyperparameter_search.compute_objective) is called which is the sum of an evaluation metric like F1. ```py ->>> best_trials = trainer.hyperparameter_search( -... direction=["minimize", "maximize"], -... backend="optuna", -... hp_space=optuna_hp_space, -... n_trials=20, -... compute_objective=compute_objective, -... ) +from transformers import Trainer + +trainer = Trainer( + model=None, + args=training_args, + train_dataset=small_train_dataset, + eval_dataset=small_eval_dataset, + compute_metrics=compute_metrics, + processing_class=tokenizer, + model_init=model_init, + data_collator=data_collator, +) +trainer.hyperparameter_search(...) ``` -For raytune, see raytune [object_parameter](https://docs.ray.io/en/latest/tune/api/search_space.html), it's like following: +The following examples demonstrate how to perform a hyperparameter search for the learning rate and training batch size using the different backends. -```py ->>> def ray_hp_space(trial): -... return { -... "learning_rate": tune.loguniform(1e-6, 1e-4), -... "per_device_train_batch_size": tune.choice([16, 32, 64, 128]), -... } -``` + + -For wandb, see wandb [object_parameter](https://docs.wandb.ai/guides/sweeps/configuration), it's like following: +[Optuna](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html#sphx-glr-tutorial-10-key-features-002-configurations-py) optimizes categorical, integers, and floats. ```py ->>> def wandb_hp_space(trial): -... return { -... "method": "random", -... "metric": {"name": "objective", "goal": "minimize"}, -... "parameters": { -... "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4}, -... "per_device_train_batch_size": {"values": [16, 32, 64, 128]}, -... }, -... } +def optuna_hp_space(trial): + return { + "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), + "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]), + } + +best_trials = trainer.hyperparameter_search( + direction=["minimize", "maximize"], + backend="optuna", + hp_space=optuna_hp_space, + n_trials=20, + compute_objective=compute_objective, +) ``` -Define a `model_init` function and pass it to the [`Trainer`], as an example: + + + +[Ray Tune](https://docs.ray.io/en/latest/tune/api/search_space.html) optimizes floats, integers, and categorical parameters but it also offers multiple sampling distributions for each parameter such as uniform and log-uniform. + ```py ->>> def model_init(trial): -... return AutoModelForSequenceClassification.from_pretrained( -... model_args.model_name_or_path, -... from_tf=bool(".ckpt" in model_args.model_name_or_path), -... config=config, -... cache_dir=model_args.cache_dir, -... revision=model_args.model_revision, -... token=True if model_args.use_auth_token else None, -... 
) +def ray_hp_space(trial): + return { + "learning_rate": tune.loguniform(1e-6, 1e-4), + "per_device_train_batch_size": tune.choice([16, 32, 64, 128]), + } + +best_trials = trainer.hyperparameter_search( + direction=["minimize", "maximize"], + backend="ray", + hp_space=ray_hp_space, + n_trials=20, + compute_objective=compute_objective, +) ``` -Create a [`Trainer`] with your `model_init` function, training arguments, training and test datasets, and evaluation function: + + + +[SigOpt](https://docs.sigopt.com/ai-module-api-references/api_reference/objects/object_parameter) optimizes double, integer, and categorical parameters. ```py ->>> trainer = Trainer( -... model=None, -... args=training_args, -... train_dataset=small_train_dataset, -... eval_dataset=small_eval_dataset, -... compute_metrics=compute_metrics, -... processing_class=tokenizer, -... model_init=model_init, -... data_collator=data_collator, -... ) +def sigopt_hp_space(trial): + return [ + {"bounds": {"min": 1e-6, "max": 1e-4}, "name": "learning_rate", "type": "double"}, + { + "categorical_values": ["16", "32", "64", "128"], + "name": "per_device_train_batch_size", + "type": "categorical", + }, + ] + +best_trials = trainer.hyperparameter_search( + direction=["minimize", "maximize"], + backend="sigopt", + hp_space=sigopt_hp_space, + n_trials=20, + compute_objective=compute_objective, +) ``` -Call hyperparameter search, get the best trial parameters, backend could be `"optuna"`/`"sigopt"`/`"wandb"`/`"ray"`. direction can be`"minimize"` or `"maximize"`, which indicates whether to optimize greater or lower objective. + + -You could define your own compute_objective function, if not defined, the default compute_objective will be called, and the sum of eval metric like f1 is returned as objective value. +[Weights & Biases](https://docs.wandb.ai/guides/sweeps/sweep-config-keys) also optimizes integers, floats, and categorical parameters but it also includes support for different search strategies and distribution options. ```py ->>> best_trial = trainer.hyperparameter_search( -... direction="maximize", -... backend="optuna", -... hp_space=optuna_hp_space, -... n_trials=20, -... compute_objective=compute_objective, -... ) +def wandb_hp_space(trial): + return { + "method": "random", + "metric": {"name": "objective", "goal": "minimize"}, + "parameters": { + "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4}, + "per_device_train_batch_size": {"values": [16, 32, 64, 128]}, + }, + } + +best_trials = trainer.hyperparameter_search( + direction=["minimize", "maximize"], + backend="wandb", + hp_space=wandb_hp_space, + n_trials=20, + compute_objective=compute_objective, +) ``` -## Hyperparameter search For DDP finetune -Currently, Hyperparameter search for DDP is enabled for optuna and sigopt. Only the rank-zero process will generate the search trial and pass the argument to other ranks. + + + +## Distributed Data Parallel + +[`Trainer`] only supports hyperparameter search for distributed data parallel (DDP) on the Optuna and SigOpt backends. Only the rank-zero process is used to generate the search trial, and the resulting parameters are passed along to the other ranks. 
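As a rough sketch of a DDP run (reusing the `trainer` and `optuna_hp_space` defined above, and assuming the script is launched with something like `torchrun --nproc_per_node=2 train.py`), the search call itself does not change:

```py
# The same call works under DDP; only the launcher differs.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",          # DDP search is only supported for Optuna and SigOpt
    hp_space=optuna_hp_space,  # search space defined earlier
    n_trials=20,
)
# Rank 0 samples each trial's hyperparameters and shares them with the other ranks.
```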
From 93b21f80f10c865e2fb3468cbdf317b9040b6b7e Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 31 Oct 2024 14:19:24 -0700 Subject: [PATCH 066/116] accelerate --- docs/source/en/_toctree.yml | 6 +- docs/source/en/accelerate.md | 199 ++++++++++++++++++++--------------- 2 files changed, 117 insertions(+), 88 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 3de8479d0534..418b186cb22e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -121,12 +121,10 @@ title: Optimizers - local: hpo_train title: Hyperparameter search - - local: run_scripts - title: Train with a script - title: Distributed training sections: - local: accelerate - title: Set up distributed training with 🤗 Accelerate + title: Accelerate - local: perf_train_gpu_many title: Multiple GPUs and parallelism - local: fsdp @@ -264,6 +262,8 @@ title: Image-text-to-text - local: tasks/video_text_to_text title: Video-text-to-text + - local: run_scripts + title: Training scripts - local: benchmarks title: Benchmarks - local: notebooks diff --git a/docs/source/en/accelerate.md b/docs/source/en/accelerate.md index e0a7a9c65623..0093e0fbca23 100644 --- a/docs/source/en/accelerate.md +++ b/docs/source/en/accelerate.md @@ -1,4 +1,4 @@ - -# Distributed training with 🤗 Accelerate +# Accelerate -As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. +[Accelerate](https://hf.co/docs/accelerate/index) is a library designed to simplify distributed training on any type of setup with PyTorch by uniting the most common frameworks ([Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/)) for it into a single interface. [`Trainer`] is powered by Accelerate under the hood, enabling loading big models and distributed training. -## Setup - -Get started by installing 🤗 Accelerate: +This guide will show you two ways to use Accelerate with Transformers, using FSDP as the backend. The first method demonstrates distributed training with [`Trainer`], and the second method demonstrates adapting a PyTorch training loop. For more detailed information about Accelerate, please refer to the [documentation](https://hf.co/docs/accelerate/index). ```bash pip install accelerate ``` -Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device. +Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-config) in the command line to answer a series of prompts about your training system. This creates and saves a configuration file to help Accelerate correctly set up training based on your setup. 
-```py ->>> from accelerate import Accelerator +```bash +accelerate config +``` ->>> accelerator = Accelerator() +Depending on your setup and the answers you provide, an example configuration file for distributing training with FSDP on one machine with two GPUs. + +```yaml +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_forward_prefetch: false + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false ``` -## Prepare to accelerate +## Trainer -The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer: +Pass the path to the saved configuration file to [`TrainingArguments`], and from there, pass your [`TrainingArguments`] to [`Trainer`]. ```py ->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -... train_dataloader, eval_dataloader, model, optimizer -... ) +from transformers import TrainingArguments, Trainer + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + fsdp_config="path/to/fsdp_config", + fsdp_strategy="full_shard", + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + processing_class=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +trainer.train() ``` -## Backward +## Native PyTorch -The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method: +Accelerate can also be added to any PyTorch training loop to enable distributed training. The [`~accelerate.Accelerator`] is the main entry point for adapting your PyTorch code to work with Accelerate. It automatically detects your distributed training setup and initializes all the necessary components for training. You don't need to explicitly place your model on a device because [`~accelerate.Accelerator`] knows which device to move your model to. ```py ->>> for epoch in range(num_epochs): -... for batch in train_dataloader: -... outputs = model(**batch) -... loss = outputs.loss -... accelerator.backward(loss) - -... optimizer.step() -... lr_scheduler.step() -... optimizer.zero_grad() -... progress_bar.update(1) +from accelerate import Accelerator + +accelerator = Accelerator() +device = accelerator.device ``` -As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! +All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. 
This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.AcceleratedOptimizer`] and [`~accelerate.AcceleratedScheduler`], and creates a new shardable dataloader. -```diff -+ from accelerate import Accelerator - from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler +```py +train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( + train_dataloader, eval_dataloader, model, optimizer +) +``` -+ accelerator = Accelerator() +Replace `loss.backward` in your training loop with Accelerates [`~accelerate.Accelerator.backward`] method to scale the gradients and determine the appropriate `backward` method to use depending on your framework (for example, DeepSpeed or Megatron). - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) - optimizer = AdamW(model.parameters(), lr=3e-5) +```py +for epoch in range(num_epochs): + for batch in train_dataloader: + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` -- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") -- model.to(device) +Combine everything into a function and make it callable as a script. -+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( -+ train_dataloader, eval_dataloader, model, optimizer -+ ) +```py +from accelerate import Accelerator + +def main(): + accelerator = Accelerator() - num_epochs = 3 - num_training_steps = num_epochs * len(train_dataloader) - lr_scheduler = get_scheduler( - "linear", - optimizer=optimizer, - num_warmup_steps=0, - num_training_steps=num_training_steps + model, optimizer, training_dataloader, scheduler = accelerator.prepare( + model, optimizer, training_dataloader, scheduler ) - progress_bar = tqdm(range(num_training_steps)) - - model.train() - for epoch in range(num_epochs): - for batch in train_dataloader: -- batch = {k: v.to(device) for k, v in batch.items()} - outputs = model(**batch) - loss = outputs.loss -- loss.backward() -+ accelerator.backward(loss) - - optimizer.step() - lr_scheduler.step() - optimizer.zero_grad() - progress_bar.update(1) + for batch in training_dataloader: + optimizer.zero_grad() + inputs, targets = batch + outputs = model(inputs) + loss = loss_function(outputs, targets) + accelerator.backward(loss) + optimizer.step() + scheduler.step() + +if __name__ == "__main__": + main() ``` -## Train - -Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. +From the command line, call [accelerate launch](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-launch) to run your training script. Any additional arguments or parameters can be passed here as well. -### Train with a script - -If you are running your training from a script, run the following command to create and save a configuration file: - -```bash -accelerate config -``` - -Then launch your training with: +To launch your training script on two GPUs, add the `--num_processes` argument. ```bash -accelerate launch train.py -``` - -### Train with a notebook - -🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. 
Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]: - -```py ->>> from accelerate import notebook_launcher - ->>> notebook_launcher(training_function) +accelerate launch --num_processes=2 your_script.py ``` -For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate). +Refer to the [Launching Accelerate scripts](https://hf.co/docs/accelerate/main/en/basic_tutorials/launch) for more details. From 435e904f4b7dd87b09f1bf4f2b386095451701f9 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 5 Nov 2024 16:15:14 -0800 Subject: [PATCH 067/116] parallelism --- docs/source/en/_toctree.yml | 2 +- docs/source/en/perf_train_gpu_many.md | 659 ++------------------------ 2 files changed, 50 insertions(+), 611 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 418b186cb22e..915076ac6ec8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -126,7 +126,7 @@ - local: accelerate title: Accelerate - local: perf_train_gpu_many - title: Multiple GPUs and parallelism + title: Parallelism methods - local: fsdp title: Fully Sharded Data Parallel - local: deepspeed diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md index bf9467d19d22..18aa41a43f63 100644 --- a/docs/source/en/perf_train_gpu_many.md +++ b/docs/source/en/perf_train_gpu_many.md @@ -1,4 +1,4 @@ - -# Efficient Training on Multiple GPUs +# Parallelism methods -If training a model on a single GPU is too slow or if the model's weights do not fit in a single GPU's memory, transitioning -to a multi-GPU setup may be a viable option. Prior to making this transition, thoroughly explore all the strategies covered -in the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) as they are universally applicable -to model training on any number of GPUs. Once you have employed those strategies and found them insufficient for your -case on a single GPU, consider moving to multiple GPUs. +Multi-GPU setups are effective for accelerating training and fitting large models in memory that otherwise wouldn't fit on a single GPU. It relies on parallelizing the workload across GPUs. There are several types of parallelism such as data parallelism, tensor parallelism, pipeline parallelism, and model parallelism. Each type of parallelism splits the workload differently, whether it's the data or the model. -Transitioning from a single GPU to multiple GPUs requires the introduction of some form of parallelism, as the workload -must be distributed across the resources. Multiple techniques can be employed to achieve parallelism, such as data -parallelism, tensor parallelism, and pipeline parallelism. It's important to note that there isn't a one-size-fits-all -solution, and the optimal settings depend on the specific hardware configuration you are using. +This guide will discuss the various parallelism methods, combining them, and choosing an appropriate strategy for your setup. For more details about distributed training, refer to the [Accelerate](https://hf.co/docs/accelerate/index) documentation. -This guide offers an in-depth overview of individual types of parallelism, as well as guidance on ways to combine -techniques and choosing an appropriate approach. For step-by-step tutorials on distributed training, please refer to -the [🤗 Accelerate documentation](https://huggingface.co/docs/accelerate/index). 
+## Data parallelism - +Data parallelism evenly distributes data across multiple GPUs. Each GPU holds a copy of the model and concurrently proccesses their portion of the data. At the end, the results from each GPU are synchronized and combined. -While the main concepts discussed in this guide are likely applicable across frameworks, here we focus on -PyTorch-based implementations. +Data parallelism significantly reduces training time by processing data in parallel, and it is scalable to the number of GPUs available. However, synchronizing results from each GPU can add overhead. - +There are two types of data parallelism, DataParallel (DP) and DistributedDataParallel (DDP). -Before diving deeper into the specifics of each technique, let's go over the rough decision process when training -large models on a large infrastructure. +### DataParallel -## Scalability strategy +[DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) supports distributed training on a *single machine* with multiple GPUs. -Begin by estimating how much vRAM is required to train your model. For models hosted on the 🤗 Hub, use our -[Model Memory Calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage), which gives you -accurate calculations within a few percent margin. +1. The default GPU, `GPU 0`, reads a batch of data and sends a mini batch of it to the other GPUs. +2. An up-to-date model is replicated from `GPU 0` to the other GPUs. +3. A `forward` pass is performed on each GPU and their outputs are sent to `GPU 0` to compute the loss. +4. The loss is distributed from `GPU 0` to the other GPUs for the `backward` pass. +5. The gradients from each GPU are sent back to `GPU 0` and averaged. -**Parallelization strategy for a single Node / multi-GPU setup** +### DistributedDataParallel -When training a model on a single node with multiple GPUs, your choice of parallelization strategy can significantly -impact performance. Here's a breakdown of your options: +[DistributedDataParallel](https://pytorch.org/docs/main/notes/ddp.html) supports distributed training across *multiple machines* with multiple GPUs. -**Case 1: Your model fits onto a single GPU** +1. The main process replicates the model from the default GPU, `GPU 0`, to each GPU. +2. Each GPU directly processes a mini batch of data. +3. The local gradients are averaged across all GPUs during the `backward` pass. -If your model can comfortably fit onto a single GPU, you have two primary options: +DDP is recommended because it reduces communication overhead between GPUs, efficiently utilizes each GPU, and scales to more than one machine. -1. DDP - Distributed DataParallel -2. [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054) - depending on the situation and configuration used, this method may or may not be faster, however, it's worth experimenting with it. +### ZeRO data parallelism -**Case 2: Your model doesn't fit onto a single GPU:** +[Zero Redundancy Optimizer](https://www.deepspeed.ai/tutorials/zero/) is a more memory efficient type of data parallelism. It significantly improves memory efficiency by partitioning parameters, gradients, and optimizer states across data parallel processes to reduce memory usage. There are three ZeRO stages: -If your model is too large for a single GPU, you have several alternatives to consider: - -1. PipelineParallel (PP) -2. [ZeRO](https://arxiv.org/abs/1910.02054) -3. 
[TensorParallel](#tensor-parallelism) (TP) - -With very fast inter-node connectivity (e.g., NVLINK or NVSwitch) all three strategies (PP, ZeRO, TP) should result in -similar performance. However, without these, PP will be faster than TP or ZeRO. The degree of TP may also -make a difference. It's best to experiment with your specific setup to determine the most suitable strategy. - -TP is almost always used within a single node. That is TP size <= GPUs per node. - -**Case 3: Largest layer of your model does not fit onto a single GPU** - -1. If you are not using ZeRO, you have to use TensorParallel (TP), because PipelineParallel (PP) alone won't be sufficient to accommodate the large layer. -2. If you are using ZeRO, additionally adopt techniques from the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one). - -**Parallelization strategy for a multi-Node / multi-GPU setup** - -* When you have fast inter-node connectivity (e.g., NVLINK or NVSwitch) consider using one of these options: - - 1. ZeRO - as it requires close to no modifications to the model - 2. A combination of PipelineParallel(PP) with TensorParallel(TP) and DataParallel(DP) - this approach will result in fewer communications, but requires significant changes to the model - -* When you have slow inter-node connectivity and still low on GPU memory: - - 1. Employ a combination of DataParallel(DP) with PipelineParallel(PP), TensorParallel(TP), and ZeRO. - -In the following sections of this guide we dig deeper into how these different parallelism methods work. - -## Data Parallelism - -Even with only 2 GPUs, you can readily leverage the accelerated training capabilities offered by PyTorch's built-in features, -such as `DataParallel` (DP) and `DistributedDataParallel` (DDP). Note that -[PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) recommends to prefer -`DistributedDataParallel` (DDP) over `DataParallel` (DP) for multi-GPU training as it works for all models. -Let's take a look at how these two methods work and what makes them different. - -### DataParallel vs DistributedDataParallel - -To understand the key differences in inter-GPU communication overhead between the two methods, let's review the processes per batch: - -[DDP](https://pytorch.org/docs/master/notes/ddp.html): - -- At the start time the main process replicates the model once from GPU 0 to the rest of GPUs -- Then for each batch: - 1. Each GPU directly consumes its mini-batch of data. - 2. During `backward`, once the local gradients are ready, they are averaged across all processes. - -[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html): - -For each batch: - 1. GPU 0 reads the batch of data and then sends a mini-batch to each GPU. - 2. The up-to-date model is replicated from GPU 0 to each GPU. - 3. `forward` is executed, and output from each GPU is sent to GPU 0 to compute the loss. - 4. The loss is distributed from GPU 0 to all GPUs, and `backward` is run. - 5. Gradients from each GPU are sent to GPU 0 and averaged. - -Key differences include: -1. DDP performs only a single communication per batch - sending gradients, while DP performs five different data exchanges per batch. -DDP copies data using [torch.distributed](https://pytorch.org/docs/master/distributed.html), while DP copies data within -the process via Python threads (which introduces limitations associated with GIL). 
As a result, **`DistributedDataParallel` (DDP) is generally faster than `DataParallel` (DP)** unless you have slow GPU card inter-connectivity. -2. Under DP, GPU 0 performs significantly more work than other GPUs, resulting in GPU under-utilization. -3. DDP supports distributed training across multiple machines, whereas DP does not. - -This is not an exhaustive list of differences between DP and DDP, however, other nuances are out of scope of this guide. -You can get a deeper understanding of these methods by reading this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/). - -Let's illustrate the differences between DP and DDP with an experiment. We'll benchmark the differences between DP and -DDP with an added context of NVLink presence: - -* Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`). -* Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`. - -To disable the NVLink feature on one of the benchmarks, we use `NCCL_P2P_DISABLE=1`. - -Here is the benchmarking code and outputs: - -**DP** - -```bash -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ -python examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69} -``` - -**DDP w/ NVlink** - -```bash -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ -torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} -``` - -**DDP w/o NVlink** - -```bash -rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \ -torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ ---model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ ---do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} -``` - -Here are the same benchmarking results gathered in a table for convenience: - -| Type | NVlink | Time | -| :----- | ----- | ---: | -| 2:DP | Y | 110s | -| 2:DDP | Y | 101s | -| 2:DDP | N | 131s | - -As you can see, in this case DP is ~10% slower than DDP with NVlink, but ~15% faster than DDP without NVlink. -The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, -the more a slow link will impede the overall runtime. - -## ZeRO Data Parallelism - -ZeRO-powered data parallelism (ZeRO-DP) is illustrated in the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/). +- Stage 1 partitions the optimizer states +- Stage 2 partitions the optimizer and gradient states +- Stage 3 partitions the optimizer, gradient, and parameters
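A minimal sketch of selecting a ZeRO stage through [`TrainingArguments`] is shown below (assuming DeepSpeed is installed; the config values are illustrative placeholders, and `deepspeed` also accepts a path to a JSON file):

```py
from transformers import TrainingArguments

# Illustrative ZeRO-3 config; "auto" lets the Trainer fill in matching values.
ds_config = {
    "zero_optimization": {"stage": 3},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

training_args = TrainingArguments(
    output_dir="zero3-output",
    per_device_train_batch_size=8,
    deepspeed=ds_config,
)
```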
[diagram: DeepSpeed-Image-1]
- -While it may appear complex, it is a very similar concept to `DataParallel` (DP). The difference is that instead of -replicating the full model parameters, gradients and optimizer states, each GPU stores only a slice of it. Then, at -run-time when the full layer parameters are needed just for the given layer, all GPUs synchronize to give each other -parts that they miss. - -To illustrate this idea, consider a simple model with 3 layers (La, Lb, and Lc), where each layer has 3 parameters. -Layer La, for example, has weights a0, a1 and a2: - -``` -La | Lb | Lc ----|----|--- -a0 | b0 | c0 -a1 | b1 | c1 -a2 | b2 | c2 -``` - -If we have 3 GPUs, ZeRO-DP splits the model onto 3 GPUs like so: - -``` -GPU0: -La | Lb | Lc ----|----|--- -a0 | b0 | c0 - -GPU1: -La | Lb | Lc ----|----|--- -a1 | b1 | c1 - -GPU2: -La | Lb | Lc ----|----|--- -a2 | b2 | c2 -``` - -In a way, this is the same horizontal slicing as tensor parallelism, as opposed to Vertical -slicing, where one puts whole layer-groups on different GPUs. Now let's see how this works: - -Each of these GPUs will get the usual mini-batch as it works in DP: - -``` -x0 => GPU0 -x1 => GPU1 -x2 => GPU2 -``` - -The inputs are passed without modifications as if they would be processed by the original model. - -First, the inputs get to the layer `La`. What happens at this point? - -On GPU0: the x0 mini-batch requires the a0, a1, a2 parameters to do its forward path through the layer, but the GPU0 has only a0. -It will get a1 from GPU1 and a2 from GPU2, bringing all the pieces of the model together. - -In parallel, GPU1 gets another mini-batch - x1. GPU1 has the a1 parameter, but needs a0 and a2, so it gets those from GPU0 and GPU2. -Same happens to GPU2 that gets the mini-batch x2. It gets a0 and a1 from GPU0 and GPU1. - -This way each of the 3 GPUs gets the full tensors reconstructed and makes a forward pass with its own mini-batch. -As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. -The reconstruction is done efficiently via a pre-fetch. - -Then the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. - - - -This mechanism is similar to an efficient group backpacking strategy: person A carries the tent, person B carries the stove, -and person C carries the axe. Each night they all share what they have with others and get from others what they don't have, -and in the morning they pack up their allocated type of gear and continue on their way. This is what ZeRO DP/Sharded DDP is. -Compare this strategy to the simple one where each person has to carry their own tent, stove and axe (similar to -DataParallel (DP and DDP) in PyTorch), which would be far more inefficient. - - - -While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. -If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism -which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism -which is discussed next. 
- -Implementations: - -- [DeepSpeed](https://www.deepspeed.ai/tutorials/zero/) ZeRO-DP stages 1+2+3 -- [`Accelerate` integration](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed) -- [`transformers` integration](main_classes/trainer#trainer-integrations) - -## From Naive Model Parallelism to Pipeline Parallelism - -To explain Pipeline parallelism, we'll first look into Naive Model Parallelism (MP), also known as Vertical MP. This approach -involves distributing groups of model layers across multiple GPUs by assigning specific layers to specific GPUs with `.to()`. -As data flows through these layers, it is moved to the same GPU as the layer, while the other layers remain untouched. - -We refer to this Model parallelism as "Vertical" because of how models are typically visualized. For example, the -following diagram shows an 8-layer model split vertically into two slices, placing layers 0-3 onto -GPU0 and 4-7 to GPU1: - -``` -================ -| Layer | | -| 0 | | -| 1 | GPU0 | -| 2 | | -| 3 | | -================ -| Layer | | -| 4 | | -| 5 | GPU1 | -| 6 | | -| 7 | | -================ -``` - -In this example, when data moves from layer 0 to 3, it's no different from regular forward pass. However, passing data -from layer 3 to 4 requires moving it from GPU0 to GPU1, introducing a communication overhead. If the participating -GPUs are on the same compute node (e.g. same physical machine) this copying is fast, but if the GPUs are distributed -across different compute nodes (e.g. multiple machines), the communication overhead could be substantially greater. - -Following that, layers 4 to 7 work as they would in the original model. Upon completion of the 7th layer, there is often -a need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be -computed and the optimizer can do its work. - -Naive Model Parallelism comes several shortcomings: -- **All but one GPU are idle at any given moment**: if 4 GPUs are used, it's nearly identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. -- **Overhead in data transfer between devices**: E.g. 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, but a single 24GB card will complete the training faster, because it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) -- **Copying shared embeddings**: Shared embeddings may need to get copied back and forth between GPUs. - -Now that you are familiar with how the naive approach to model parallelism works and its shortcomings, let's look at Pipeline Parallelism (PP). -PP is almost identical to a naive MP, but it solves the GPU idling problem by chunking the incoming batch into micro-batches -and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. - -The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) -shows the naive MP on the top, and PP on the bottom: - -
[diagram: MP vs PP]
-At the bottom of the diagram, you can observe that the Pipeline Parallelism (PP) approach minimizes the number of idle -GPU zones, referred to as 'bubbles'. Both parts of the diagram show a parallelism level of degree 4, meaning that 4 GPUs -are involved in the pipeline. You can see that there's a forward path of 4 pipe stages (F0, F1, F2 and F3) followed by -a backward path in reverse order (B3, B2, B1, and B0). - -PP introduces a new hyperparameter to tune - `chunks`, which determines how many data chunks are sent in a sequence -through the same pipe stage. For example, in the bottom diagram you can see `chunks=4`. GPU0 performs the same -forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do complete their work. -Only when the other GPUs begin to complete their work, GPU0 starts to work again doing the backward path for chunks -3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). - -Note that this is the same concept as gradient accumulation steps. PyTorch uses `chunks`, while DeepSpeed refers -to the same hyperparameter as gradient accumulation steps. - -Because of the chunks, PP introduces the notion of micro-batches (MBS). DP splits the global data batch size into -mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of -256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each -Pipeline stage works with a single micro-batch at a time. To calculate the global batch size of the DP + PP setup, -use the formula: `mbs * chunks * dp_degree` (`8 * 32 * 4 = 1024`). -With `chunks=1` you end up with the naive MP, which is inefficient. With a large `chunks` value you end up with -tiny micro-batch sizes which is also inefficient. For this reason, we encourage to experiment with the `chunks` value to -find the one that leads to the most efficient GPUs utilization. - -You may notice a bubble of "dead" time on the diagram that can't be parallelized because the last `forward` stage -has to wait for `backward` to complete the pipeline. The purpose of finding the best value for `chunks` is to enable a high -concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble. - -Pipeline API solutions have been implemented in: -- PyTorch -- DeepSpeed -- Megatron-LM - -These come with some shortcomings: -- They have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model. -- Currently the Pipeline API is very restricted. If you had a bunch of Python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693 -- Conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage. -- They have to arrange each layer so that the output of one layer becomes an input to the other layer. 
- -More recent solutions include: -- Varuna -- Sagemaker - -We have not experimented with Varuna and SageMaker but their papers report that they have overcome the list of problems -mentioned above and that they require smaller changes to the user's model. - -Implementations: -- [PyTorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py) -- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/) -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API. -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS. -- [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on the Hugging Face Transformers. - -🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support. -The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This -is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that. - -DeepSpeed and Megatron-LM integrations are available in [🤗 Accelerate](https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed) - -Other approaches: - -DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html) +## Model parallelism -
[diagram: Interleaved pipeline execution]
+Model parallelism distributes a model across multiple GPUs. There are several ways to split a model, but the typical method distributes the model layers across GPUs. On the `forward` pass, the first GPU processes a batch of data and passes it to the next group of layers on the next GPU. For the `backward` pass, the data is sent backward from the final layer to the first layer. -Here the bubble (idle time) is further minimized by prioritizing backward passes. Varuna further attempts to improve the -schedule by using simulations to discover the most efficient scheduling. +Model parallelism is a useful strategy for training models that are too large to fit into the memory of a single GPU. However, GPU utilization is unbalanced because only one GPU is active at a time. Passing results between GPUs also adds communication overhead and it can be a bottleneck. -OSLO has pipeline parallelism implementation based on the Transformers without `nn.Sequential` conversion. +## Pipeline parallelism -## Tensor Parallelism +Pipeline parallelism is conceptually very similar to model parallelism, but it's more efficient because it reduces the amount of idle GPU time. Instead of waiting for each GPU to finish processing a batch of data, pipeline parallelism creates *micro-batches* of data. As soon as one micro-batch is finished, it is passed to the next GPU. This way, each GPU can concurrently process part of the data without waiting for the other GPU to completely finish processing a mini batch of data. -In Tensor Parallelism, each GPU processes a slice of a tensor and only aggregates the full tensor for operations requiring it. -To describe this method, this section of the guide relies on the concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473). +Pipeline parallelism shares the same advantages as model parallelism, but it optimizes GPU utilization and reduces idle time. But pipeline parallelism can be more complex because models may need to be rewritten as a sequence of [nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html) modules and it also isn't possible to completely reduce idle time because the last `forward` pass must also wait for the `backward` pass to finish. -The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`. -The dot dot-product part of it, following the Megatron's paper notation, can be written as `Y = GeLU(XA)`, where `X` is -an input vector, `Y` is the output vector, and `A` is the weight matrix. +## Tensor parallelism -If we look at the computation in matrix form, you can see how the matrix multiplication can be split between multiple GPUs: +Tensor parallelism distributes large tensor computations across multiple GPUs. The tensors are sliced horizontally or vertically and each slice is processed by a separate GPU. Each GPU performs its calculations on its tensor slice and the results are synchronized at the end to reconstruct the final result. -
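As a toy, single-process illustration of the idea (plain PyTorch, not a distributed implementation): splitting the weight matrix column-wise and concatenating the per-slice results reproduces the full output, because the activation is applied element-wise.

```py
import torch
import torch.nn.functional as F

# Toy check: Y = GeLU(X @ A) computed from two column slices of A.
X = torch.randn(4, 8)
A = torch.randn(8, 6)

A1, A2 = A.chunk(2, dim=1)  # in real tensor parallelism each slice lives on its own GPU
Y_sliced = torch.cat([F.gelu(X @ A1), F.gelu(X @ A2)], dim=1)

Y_full = F.gelu(X @ A)
assert torch.allclose(Y_sliced, Y_full, atol=1e-6)
```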
[diagram: Parallel GEMM]
+Tensor parallelism is effective for training large models that don't fit into the memory of a single GPU. It is also faster and more efficient because each GPU can process its tensor slice in parallel, and it can be combined with other parallelism methods. Like other parallelism methods though, tensor parallelism adds communication overhead between GPUs. -If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, -then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently: +## Hybrid parallelism -
[diagram: Independent GeLU]
- -Using this principle, we can update a multi-layer perceptron of arbitrary depth, without the need for any synchronization -between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors -provide a helpful illustration for that: - -
[diagram: Parallel shard processing]
- -Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having -multiple independent heads! - -
[diagram: Parallel self-attention]
- -Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. -Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use -nodes that have at least 8 GPUs. - -This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530). -by [@anton-l](https://github.com/anton-l). +Parallelism methods can be combined to achieve even greater memory savings and more efficiently train models with billions of parameters. -Alternative names: -- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/training/#model-parallelism) +### Data parallelism and pipeline parallelism -Implementations: -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific -- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment) -- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS. -- [OSLO](https://github.com/tunib-ai/oslo) has the tensor parallelism implementation based on the Transformers. -- [`transformers` integration](main_classes/trainer) tensor parallelism is available through tp_size attribute for models having `base_tp_plan`. Further you can look at [example usage](perf_infer_gpu_multi) +Data and pipeline parallelism distributes the data across GPUs and divides each mini batch of data into micro-batches to achieve pipeline parallelism. -SageMaker combines TP with DP for a more efficient processing. +Each data parallel rank treats the process as if there were only one GPU instead of two, but GPUs 0 and 1 can offload micro-batches of data to GPUs 2 and 3 and reduce idle time. -🤗 Transformers status: -- core: uses PyTorch 2 APIs to support tensor parallelism to models having base_tp_plan in their respective config classes. -- Alternatively, you can as well try [parallelformers](https://github.com/tunib-ai/parallelformers) that provides this support for most of our models. Training mode with TP is as well supported natively in transformers. -- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/) - -🤗 Accelerate integrates with [TP from Megatron-LM](https://huggingface.co/docs/accelerate/v0.23.0/en/usage_guides/megatron_lm). - -## Data Parallelism + Pipeline Parallelism - -The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates -how one can combine DP with PP. +This approach optimizes parallel data processing by reducing idle GPU utilization.
[diagram: DP + PP-2d]
-Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 -and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. -And GPU1 does the same by enlisting GPU3 to its aid. - -Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs. +### ZeRO data parallelism, pipeline parallelism, and model parallelism (3D parallelism) -Implementations: -- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) -- [OSLO](https://github.com/tunib-ai/oslo) +Data, pipeline and model parallelism combine to form [3D parallelism](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) to optimize memory and compute efficiency. -🤗 Transformers status: not yet implemented +Memory effiiciency is achieved by splitting the model across GPUs and also dividing it into stages to create a pipeline. This allows GPUs to work in parallel on micro-batches of data, reducing the memory usage of the model, optimizer, and activations. -## Data Parallelism + Pipeline Parallelism + Tensor Parallelism +Compute efficiency is enabled by ZeRO data parallelism where each GPU only stores a slice of the model, optimizer, and activations. This allows higher communication bandwidth between data parallel nodes because communication can occur independently or in parallel with the other pipeline stages. -To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram. +This approach is scalable to extremely large models with trillions of parameters.
[diagram: dp-pp-tp-3d]
- -This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well. - -Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs. - -Implementations: -- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP. -- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) -- [Varuna](https://github.com/microsoft/varuna) -- [SageMaker](https://arxiv.org/abs/2111.05972) -- [OSLO](https://github.com/tunib-ai/oslo) - -🤗 Transformers status: not yet implemented, since we have no PP and TP. - -## ZeRO Data Parallelism + Pipeline Parallelism + Tensor Parallelism - -One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been -discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. -But it can be combined with PP and TP. - -When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding). - -While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have negative -performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate -the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, -small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with -minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to impact the performance. - -In addition, there are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already -reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP. - -ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required. - -And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1 optimizer states can be offloaded to CPU. - -Implementations: -- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) and [Megatron-Deepspeed from BigScience](https://github.com/bigscience-workshop/Megatron-DeepSpeed), which is the fork of the former repo. -- [OSLO](https://github.com/tunib-ai/oslo) - -Important papers: - -- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model]( -https://arxiv.org/abs/2201.11990) - -🤗 Transformers status: not yet implemented, since we have no PP. - -## FlexFlow - -[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem in a slightly different approach. - -Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358) - -It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter. - -1. Sample = Data Parallelism (sample-wise parallel) -2. Operator = Parallelize a single operation into several sub-operations -3. Attribute = Data Parallelism (length-wise parallel) -4. Parameter = Model Parallelism (regardless of dimension - horizontal or vertical) - -Examples: -* Sample - -Let's take 10 batches of sequence length 512. 
If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512. - -* Operator - -If we perform layer normalization, we compute std first and mean second, and then we can normalize data. -Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 -devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time. - -* Attribute - -We have 10 batches of 512 length. If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256. - -* Parameter - -It is similar with tensor model parallelism or naive layer-wise model parallelism. - -
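To make the Sample and Attribute splits above concrete, here is a small sketch (an illustration assuming a hypothetical hidden size of 64, not FlexFlow code) that chunks the same batch along the two different dimensions.

```python
import torch

# 10 batches of sequence length 512, with a hypothetical hidden size of 64
data = torch.randn(10, 512, 64)

# Sample parallelism: split the batch dimension across 2 devices -> two shards of (5, 512, 64)
sample_shards = data.chunk(2, dim=0)

# Attribute parallelism: split the sequence (length) dimension -> two shards of (10, 256, 64)
attribute_shards = data.chunk(2, dim=1)

print([tuple(s.shape) for s in sample_shards])
print([tuple(s.shape) for s in attribute_shards])
```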
- flex-flow-soap -
- -The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) -fast-intra-connect/slow-inter-connect and it automatically optimizes all these algorithmically deciding which -parallelisation to use where. - -One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and -fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. - -So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best -strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan -for that. And then you can train. A different setup will have its own custom optimization. - -🤗 Transformers status: Transformers models are FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), -which is a prerequisite for FlexFlow, however, changes are required on the FlexFlow side to make it work with Transformers models. - -## GPU selection - -When training on multiple GPUs, you can specify the number of GPUs to use and in what order. This can be useful for instance when you have GPUs with different computing power and want to use the faster GPU first. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) to use only a subset of the available GPUs, and you don't need Accelerate or the [DeepSpeed integration](./main_classes/deepspeed). - -### Number of GPUs - -For example, if you have 4 GPUs and you only want to use the first 2: - - - - -Use the `--nproc_per_node` to select how many GPUs to use. - -```bash -torchrun --nproc_per_node=2 trainer-program.py ... -``` - - - - -Use `--num_processes` to select how many GPUs to use. - -```bash -accelerate launch --num_processes 2 trainer-program.py ... -``` - - - - -Use `--num_gpus` to select how many GPUs to use. - -```bash -deepspeed --num_gpus 2 trainer-program.py ... -``` - - - - -### Order of GPUs - -Now, to select which GPUs to use and their order, you'll use the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in a `~/bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if you have 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2: - -```bash -CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ... -``` - -Only the 2 physical GPUs (0 and 2) are "visible" to PyTorch and these are mapped to `cuda:0` and `cuda:1` respectively. You can also reverse the order of the GPUs to use 2 first. Now, the mapping is `cuda:1` for GPU 0 and `cuda:0` for GPU 2. - -```bash -CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ... -``` - -You can also set the `CUDA_VISIBLE_DEVICES` environment variable to an empty value to create an environment without GPUs. - -```bash -CUDA_VISIBLE_DEVICES= python trainer-program.py ... -``` - - - -As with any environment variable, they can be exported instead of being added to the command line. However, this is not recommended because it can be confusing if you forget how the environment variable was setup and you end up using the wrong GPUs. 
Instead, it is common practice to set the environment variable for a specific training run on the same command line. - - - -`CUDA_DEVICE_ORDER` is an alternative environment variable you can use to control how the GPUs are ordered. You can either order them by: - -1. PCIe bus ID's that matches the order of [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/.doxygen/docBin/html/index.html) for NVIDIA and AMD GPUs respectively - -```bash -export CUDA_DEVICE_ORDER=PCI_BUS_ID -``` - -2. GPU compute ability - -```bash -export CUDA_DEVICE_ORDER=FASTEST_FIRST -``` - -The `CUDA_DEVICE_ORDER` is especially useful if your training setup consists of an older and newer GPU, where the older GPU appears first, but you cannot physically swap the cards to make the newer GPU appear first. In this case, set `CUDA_DEVICE_ORDER=FASTEST_FIRST` to always use the newer and faster GPU first (`nvidia-smi` or `rocm-smi` still reports the GPUs in their PCIe order). Or you could also set `export CUDA_VISIBLE_DEVICES=1,0`. From c5783bd6fd1549b9f8c4ab707cf1f958fb15dc30 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 18 Nov 2024 12:47:11 -0800 Subject: [PATCH 068/116] fsdp --- docs/source/en/_toctree.yml | 2 +- docs/source/en/fsdp.md | 85 ++++++++++++++++++++----------------- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 915076ac6ec8..94121ae5f8f1 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -128,7 +128,7 @@ - local: perf_train_gpu_many title: Parallelism methods - local: fsdp - title: Fully Sharded Data Parallel + title: FullyShardedDataParallel - local: deepspeed title: DeepSpeed - local: perf_train_cpu_many diff --git a/docs/source/en/fsdp.md b/docs/source/en/fsdp.md index 2c4f114dec85..51817045f2d0 100644 --- a/docs/source/en/fsdp.md +++ b/docs/source/en/fsdp.md @@ -1,4 +1,4 @@ - -# Fully Sharded Data Parallel +# FullyShardedDataParallel -[Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) is a data parallel method that shards a model's parameters, gradients and optimizer states across the number of available GPUs (also called workers or *rank*). Unlike [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html), FSDP reduces memory-usage because a model is replicated on each GPU. This improves GPU memory-efficiency and allows you to train much larger models on fewer GPUs. FSDP is integrated with the Accelerate, a library for easily managing training in distributed environments, which means it is available for use from the [`Trainer`] class. +[Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) is a [parallelism](./perf_train_gpu_many) method that combines the advantages of data and model parallelism for distributed training. -Before you start, make sure Accelerate is installed and at least PyTorch 2.1.0 or newer. +Unlike [DistributedDataParallel (DDP)](./perf_train_gpu_many#distributeddataparallel), FSDP saves more memory because it doesn't replicate a model on each GPU. It shards the models parameters, gradients and optimizer states across GPUs. Each model shard processes a portion of the data and the results are synchronized to speed up training. 
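For a sense of what this sharding looks like at the PyTorch level, here is a minimal sketch using the native `FullyShardedDataParallel` wrapper (an illustration only; the rest of this guide configures FSDP through Accelerate and [`Trainer`]). It assumes two or more GPUs and a script launched with `torchrun` so the distributed environment variables are already set.

```python
import os

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# torchrun sets LOCAL_RANK plus the rendezvous variables used by init_process_group
torch.distributed.init_process_group("nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024)).cuda()
model = FSDP(model)  # parameters, gradients, and optimizer states are sharded across ranks
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

x = torch.randn(8, 1024, device="cuda")
loss = model(x).sum()
loss.backward()
optimizer.step()
```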
+ +This guide covers how to setup training a model with FSDP using [Accelerate](https://hf.co/docs/accelerate/index), a library for managing distributed training. ```bash pip install accelerate ``` -## FSDP configuration +## Configuration options -To start, run the [`accelerate config`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) command to create a configuration file for your training environment. Accelerate uses this configuration file to automatically setup the correct training environment based on your selected training options in `accelerate config`. +Always start by running the [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config) command to help Accelerate setup the correct distributed training environment. ```bash accelerate config ``` -When you run `accelerate config`, you'll be prompted with a series of options to configure your training environment. This section covers some of the most important FSDP options. To learn more about the other available FSDP options, take a look at the [fsdp_config](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.fsdp_config) parameters. +The section below discusses some of the more important FSDP configuration options. Learn more about other available options in the [fsdp_config](https://hf.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.fsdp_config) parameter. ### Sharding strategy -FSDP offers a number of sharding strategies to select from: - -* `FULL_SHARD` - shards model parameters, gradients and optimizer states across workers; select `1` for this option -* `SHARD_GRAD_OP`- shard gradients and optimizer states across workers; select `2` for this option -* `NO_SHARD` - don't shard anything (this is equivalent to DDP); select `3` for this option -* `HYBRID_SHARD` - shard model parameters, gradients and optimizer states within each worker where each worker also has a full copy; select `4` for this option -* `HYBRID_SHARD_ZERO2` - shard gradients and optimizer states within each worker where each worker also has a full copy; select `5` for this option +FSDP offers several sharding strategies to distribute a model. Refer to the table below to help you choose the best strategy for your setup. Specify a strategy with the `fsdp_sharding_strategy` parameter in the configuration file. -This is enabled by the `fsdp_sharding_strategy` flag. +| sharding strategy | description | parameter value | +|---|---|---| +| `FULL_SHARD` | shards model parameters, gradients, and optimizer states | `1` | +| `SHARD_GRAD_OP` | shards gradients and optimizer states | `2` | +| `NO_SHARD` | don't shard the model | `3` | +| `HYBRID_SHARD` | shards model parameters, gradients, and optimizer states within each GPU | `4` | +| `HYBRID_SHARD_ZERO2` | shards gradients and optimizer states within each GPU | `5` | ### CPU offload -You could also offload parameters and gradients when they are not in use to the CPU to save even more GPU memory and help you fit large models where even FSDP may not be sufficient. This is enabled by setting `fsdp_offload_params: true` when running `accelerate config`. +Offload model parameters and gradients when they aren't being used to the CPU to save additional GPU memory. This is useful for scenarios where a model is too large even with FSDP. + +Specify `fsdp_offload_params: true` in the configuration file to enable offloading. ### Wrapping policy -FSDP is applied by wrapping each layer in the network. 
The wrapping is usually applied in a nested way where the full weights are discarded after each forward pass to save memory for use in the next layer. The *auto wrapping* policy is the simplest way to implement this and you don't need to change any code. You should select `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP` to wrap a Transformer layer and `fsdp_transformer_layer_cls_to_wrap` to specify which layer to wrap (for example `BertLayer`).
+FSDP is applied by wrapping each layer in the network. The wrapping is usually applied in a nested way where the full weights are discarded after each forward pass to save memory for the next layer.
+
+There are several wrapping policies available, but the *auto wrapping* policy is the simplest and doesn't require any changes to your code. Specify `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP` to wrap a Transformer layer and `fsdp_transformer_layer_cls_to_wrap` to determine which layer to wrap (for example, `BertLayer`).

-Otherwise, you can choose a size-based wrapping policy where FSDP is applied to a layer if it exceeds a certain number of parameters. This is enabled by setting `fsdp_wrap_policy: SIZE_BASED_WRAP` and `min_num_param` to the desired size threshold.
+Size-based wrapping is also available. If a layer exceeds a certain number of parameters, it is wrapped. Specify `fsdp_wrap_policy: SIZE_BASED_WRAP` and `min_num_param` to set the minimum number of parameters for a layer to be wrapped.

-### Checkpointing
+### Checkpoints

-Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method.
+Intermediate checkpoints should be saved as a sharded state dict because saving the full state dict - even with CPU offloading - is time consuming and can cause `NCCL Timeout` errors due to indefinite hanging during broadcasting.
+
+Specify `fsdp_state_dict_type: SHARDED_STATE_DICT` in the configuration file to save the sharded state dict. Now you can resume training from the sharded state dict with the [`~accelerate.Accelerator.load_state`] method.

```py
-# directory containing checkpoints
-accelerator.load_state("ckpt")
+accelerator.load_state("directory/containing/checkpoints")
```

-However, when training ends, you want to save the full state dict because sharded state dict is only compatible with FSDP.
+Once training is complete though, you should save the full state dict because the sharded state dict is only compatible with FSDP.

```py
 if trainer.is_fsdp_enabled:
-    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
+    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
 trainer.save_model(script_args.output_dir)
```

### TPU

-[PyTorch XLA](https://pytorch.org/xla/release/2.1/index.html) supports FSDP training for TPUs and it can be enabled by modifying the FSDP configuration file generated by `accelerate config`. In addition to the sharding strategies and wrapping options specified above, you can add the parameters shown below to the file.
+[PyTorch XLA](https://pytorch.org/xla/release/2.1/index.html), a package for running PyTorch on XLA devices, enables FSDP on TPUs. Modify the configuration file to include the parameters below.
Refer to the [xla_fsdp_settings](https://github.com/pytorch/xla/blob/2e6e183e0724818f137c8135b34ef273dea33318/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py#L128) parameter for additional XLA-specific parameters you can configure for FSDP. ```yaml xla: True # must be set to True to enable PyTorch/XLA -xla_fsdp_settings: # XLA-specific FSDP parameters -xla_fsdp_grad_ckpt: True # use gradient checkpointing +xla_fsdp_settings: # XLA specific FSDP parameters +xla_fsdp_grad_ckpt: True # enable gradient checkpointing ``` -The [`xla_fsdp_settings`](https://github.com/pytorch/xla/blob/2e6e183e0724818f137c8135b34ef273dea33318/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py#L128) allow you to configure additional XLA-specific parameters for FSDP. - -## Launch training +## Training -An example FSDP configuration file may look like: +After running [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config), your configuration file should be ready. An example configuration file is show below that fully shards the parameter, gradient and optimizer states on two GPUs. Your file may look different depending on how you setup your configuration. ```yaml compute_environment: LOCAL_MACHINE @@ -119,20 +124,22 @@ tpu_use_sudo: false use_cpu: false ``` -To launch training, run the [`accelerate launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command and it'll automatically use the configuration file you previously created with `accelerate config`. +Run the [accelerate launch](https://hf.co/docs/accelerate/package_reference/cli#accelerate-launch) command to launch a training script with the FSDP configurations you chose in the configuration file. ```bash -accelerate launch my-trainer-script.py +accelerate launch my-training-script.py ``` +It is also possible to directly specify some of the FSDP arguments in the command line. + ```bash -accelerate launch --fsdp="full shard" --fsdp_config="path/to/fsdp_config/ my-trainer-script.py +accelerate launch --fsdp="full shard" --fsdp_config="path/to/fsdp_config/" my-training-script.py ``` -## Next steps +## Resources -FSDP can be a powerful tool for training really large models and you have access to more than one GPU or TPU. By sharding the model parameters, optimizer and gradient states, and even offloading them to the CPU when they're inactive, FSDP can reduce the high cost of large-scale training. If you're interested in learning more, the following may be helpful: +FSDP is a powerful tool for training large models with fewer GPUs compared to some other parallelism strategies. Refer to the following resources below to learn even more about FSDP. -* Follow along with the more in-depth Accelerate guide for [FSDP](https://huggingface.co/docs/accelerate/usage_guides/fsdp). -* Read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) blog post. -* Read the [Scaling PyTorch models on Cloud TPUs with FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) blog post. +- Follow along with the more in-depth Accelerate guide for [FSDP](https://hf.co/docs/accelerate/usage_guides/fsdp). +- Read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) blog post. 
+- Read the [Scaling PyTorch models on Cloud TPUs with FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) blog post. From 33bf02a99b7e1ddba351df41b8591e59217c36d3 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 18 Nov 2024 12:56:15 -0800 Subject: [PATCH 069/116] update --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 94121ae5f8f1..7292316b11dc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -100,6 +100,8 @@ title: XLA - local: perf_infer_gpu_one title: GPU + - local: perf_infer_gpu_multi + title: Distributed inference - local: perf_infer_cpu title: CPU - local: agents From 682b2594fe86661af81c1efc6253d78ca341ec25 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 19 Nov 2024 13:07:22 -0800 Subject: [PATCH 070/116] distributed cpu --- docs/source/en/_toctree.yml | 8 +- docs/source/en/perf_train_cpu_many.md | 199 +++++++++++--------------- 2 files changed, 88 insertions(+), 119 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7292316b11dc..c0328d24c1fc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -101,7 +101,7 @@ - local: perf_infer_gpu_one title: GPU - local: perf_infer_gpu_multi - title: Distributed inference + title: Distributed GPU inference - local: perf_infer_cpu title: CPU - local: agents @@ -127,14 +127,14 @@ sections: - local: accelerate title: Accelerate - - local: perf_train_gpu_many - title: Parallelism methods - local: fsdp title: FullyShardedDataParallel - local: deepspeed title: DeepSpeed - local: perf_train_cpu_many - title: Distributed CPU training + title: Distributed CPUs + - local: perf_train_gpu_many + title: Parallelism methods - title: Hardware-specific training sections: - local: perf_train_gpu_one diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md index d6a029c471de..cf6bb047ef61 100644 --- a/docs/source/en/perf_train_cpu_many.md +++ b/docs/source/en/perf_train_cpu_many.md @@ -13,69 +13,57 @@ rendered properly in your Markdown viewer. --> -# Efficient Training on Multiple CPUs +# Distributed CPUs -When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling -distributed CPU training efficiently on [bare metal](#usage-in-trainer) and [Kubernetes](#usage-with-kubernetes). +CPUs are commonly available and can be a cost-effective option for training when GPUs are unavailable. When training large models or if a single CPU is too slow, distributed training with CPUs can help speed up training. -## Intel® oneCCL Bindings for PyTorch +This guide demonstrates how to perform distributed training with multiple CPUs using a [DistributedDataParallel (DDP)](./perf_train_gpu_many#distributeddataparallel) strategy on bare metal with [`Trainer`] and a Kubernetes cluster. All examples shown in this guide depend on the [Intel oneAPI HPC Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/hpc-toolkit.html). -[Intel® oneCCL](https://github.com/oneapi-src/oneCCL) (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. 
For more information on oneCCL, please refer to the [oneCCL documentation](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html) and [oneCCL specification](https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html). +There are two toolkits you'll need from Intel oneAPI. -Module `oneccl_bindings_for_pytorch` (`torch_ccl` before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now +1. [oneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html) is a toolkit that includes efficient implementations of collectives commonly used in deep learning such as all-gather, all-reduce, and reduce-scatter. To install from a prebuilt wheel, make sure you always use the latest release. Refer to the table [here](https://github.com/intel/torch-ccl#install-prebuilt-wheel) to check if a version of oneCCL is supported for a Python and PyTorch version. -Check more detailed information for [oneccl_bind_pt](https://github.com/intel/torch-ccl). - -### Intel® oneCCL Bindings for PyTorch installation - -Wheel files are available for the following Python versions: - -| Extension Version | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | -| :---------------: | :--------: | :--------: | :--------: | :---------: | :---------: | -| 2.5.0 | | √ | √ | √ | √ | -| 2.4.0 | | √ | √ | √ | √ | -| 2.3.0 | | √ | √ | √ | √ | -| 2.2.0 | | √ | √ | √ | √ | - -Please run `pip list | grep torch` to get your `pytorch_version`. ```bash -pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu +# installs oneCCL for PyTorch 2.4.0 +pip install oneccl_bind_pt==2.4.0 -f https://developer.intel.com/ipex-whl-stable-cpu ``` -where `{pytorch_version}` should be your PyTorch version, for instance 2.4.0. -Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). -Versions of oneCCL and PyTorch must match. - -## Intel® MPI library -Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit. +> [!TIP] +> Refer to the oneCCL [installation](https://github.com/intel/torch-ccl#installation) for more details. -oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it. +1. [MPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) is a message-passing interface for communications between hardware and networks. The oneCCL toolkit is installed along with MPI, but you need to source the environment as shown below before using it. ```bash oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") source $oneccl_bindings_for_pytorch_path/env/setvars.sh ``` -#### Intel® Extension for PyTorch installation +Lastly, install the [Intex Extension for PyTorch (IPEX)](https://intel.github.io/intel-extension-for-pytorch/index.html) which enables additional performance optimizations for Intel hardware such as weight sharing and better thread runtime control. -Intel Extension for PyTorch (IPEX) provides performance optimizations for CPU training with both Float32 and BFloat16 (refer to the [single CPU section](./perf_train_cpu) to learn more). 
+```bash +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +``` +> [!TIP] +> Refer to the IPEX [installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation) for more details. -The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example. +## Trainer +[`Trainer`] supports distributed training with CPUs with the oneCCL backend. Add the `--ddp_backend ccl` parameter in the command arguments to enable it. -## Usage in Trainer -To enable multi CPU distributed training in the Trainer with the ccl backend, users should add **`--ddp_backend ccl`** in the command arguments. + + -Let's see an example with the [question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) +The example below demonstrates the [run_qa.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) script. It enables training with two processes on one Xeon CPU, with one process running per socket. +> [!TIP] +> Tune the variable `OMP_NUM_THREADS/CCL_WORKER_COUNT` for optimal performance. -The following command enables training with 2 processes on one Xeon node, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. -```shell script - export CCL_WORKER_COUNT=1 - export MASTER_ADDR=127.0.0.1 - mpirun -n 2 -genv OMP_NUM_THREADS=23 \ - python3 examples/pytorch/question-answering/run_qa.py \ +```bash +export CCL_WORKER_COUNT=1 +export MASTER_ADDR=127.0.0.1 +mpirun -n 2 -genv OMP_NUM_THREADS=23 \ +python3 run_qa.py \ --model_name_or_path google-bert/bert-large-uncased \ --dataset_name squad \ --do_train \ @@ -90,21 +78,31 @@ The following command enables training with 2 processes on one Xeon node, with o --ddp_backend ccl \ --use_ipex ``` -The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. -In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. -```shell script - cat hostfile - xxx.xxx.xxx.xxx #node0 ip - xxx.xxx.xxx.xxx #node1 ip + + + +Scale the training script to four processes on two Xeon CPUs (`node0` and `node1`) by setting `-n 4` and `ppn 2`. The `ppn` parameter specifies the number of processes per node, with one process running per socket. + +Assume `node0` is the main process and create a configuration file containing the IP addresses of each node (for example, hostfile) and pass the configuration file path as an argument. + +```bash +cat hostfile +xxx.xxx.xxx.xxx #node0 ip +xxx.xxx.xxx.xxx #node1 ip ``` -Now, run the following command in node0 and **4DDP** will be enabled in node0 and node1 with BF16 auto mixed precision: -```shell script - export CCL_WORKER_COUNT=1 - export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip - mpirun -f hostfile -n 4 -ppn 2 \ + +Run the script below on `node0` to enable DDP on `node0` and `node1` and train with bf16 auto mixed precision. + +> [!TIP] +> Tune the variable `OMP_NUM_THREADS/CCL_WORKER_COUNT` for optimal performance. 
+ +```bash +export CCL_WORKER_COUNT=1 +export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip +mpirun -f hostfile -n 4 -ppn 2 \ -genv OMP_NUM_THREADS=23 \ - python3 examples/pytorch/question-answering/run_qa.py \ +python3 run_qa.py \ --model_name_or_path google-bert/bert-large-uncased \ --dataset_name squad \ --do_train \ @@ -121,25 +119,20 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an --bf16 ``` -## Usage with Kubernetes + + -The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the -[Kubeflow PyTorchJob training operator](https://www.kubeflow.org/docs/components/training/user-guides/pytorch). +## Kubernetes -### Setup +Distributed training with CPUs can also be deployed to a Kubernetes cluster with [PyTorchJob](https://www.kubeflow.org/docs/components/training/user-guides/pytorch/). Before you get started, you should perform the following setup steps. -This example assumes that you have: -* Access to a Kubernetes cluster with [Kubeflow installed](https://www.kubeflow.org/docs/started/installing-kubeflow) -* [`kubectl`](https://kubernetes.io/docs/tasks/tools) installed and configured to access the Kubernetes cluster -* A [Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes) that can be used - to store datasets and model files. There are multiple options for setting up the PVC including using an NFS - [storage class](https://kubernetes.io/docs/concepts/storage/storage-classes) or a cloud storage bucket. -* A Docker container that includes your model training script and all the dependencies needed to run the script. For - distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel - oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers. +1. Ensure you have access to a Kubernetes cluster with [Kubeflow](https://www.kubeflow.org/docs/started/installing-kubeflow/) installed. +1. Install and configure [kubectl](https://kubernetes.io/docs/tasks/tools) to interact with the cluster. +1. Set up a [PersistentVolumeClaim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to store datasets and model files. There are multiple options to choose from, including a [StorageClass](https://kubernetes.io/docs/concepts/storage/storage-classes/) or a cloud storage bucket. +1. Set up a Docker container for the training script and all required dependencies such as PyTorch, Transformers, IPEX, oneCCL, and OpenSSH to facilitate communicattion between containers. + +The example Dockerfile below uses a base image that supports distributed training with CPUs, and extracts Transformers to the `/workspace` directory to include the training scripts in the image. The image needs to be built and copied to the clusters nodes or pushed to a container registry prior to deployment. 
-The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then -extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image: ```dockerfile FROM intel/intel-optimized-pytorch:2.4.0-pip-multinode @@ -157,26 +150,17 @@ RUN pip install --no-cache-dir \ mkdir transformers && \ curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf - ``` -The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the -PyTorchJob to the cluster. - -### PyTorchJob Specification File - -The [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/user-guides/pytorch) is used to run the distributed -training job on the cluster. The yaml file for the PyTorchJob defines parameters such as: - * The name of the PyTorchJob - * The number of replicas (workers) - * The python script and it's parameters that will be used to run the training job - * The types of resources (node selector, memory, and CPU) needed for each worker - * The image/tag for the Docker container to use - * Environment variables - * A volume mount for the PVC - -The volume mount defines a path where the PVC will be mounted in the container for each worker pod. This location can be -used for the dataset, checkpoint files, and the saved model after training completes. - -The snippet below is an example of a yaml file for a PyTorchJob with 4 workers running the -[question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering). + +### PyTorchJob + +[PyTorchJob](https://www.kubeflow.org/docs/components/training/user-guides/pytorch/) is an extension of the Kubernetes API for running PyTorch training jobs on Kubernetes. It includes a yaml file that defines the training jobs parameters such as the name of the PyTorchJob, number of workers, types of resources for each worker, and more. + +The volume mount parameter is a path to where the PVC is mounted in the container for each worker pod. The PVC is typically used to hold the dataset, checkpoint files, and the model after it has finished training. + +The example yaml file below sets up four workers on the [run_qa.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) script. Adapt the yaml file based on your training script and number of nodes in your cluster. + +The CPU resource limits and requests are defined in [CPU units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu). One CPU unit is equivalent to one physical CPU core or virtual core. The CPU units defined in the yaml file should be less than the amount of available CPU and memory capacity of a single machine in order to leave some resources for kubelet and the system. For a `Guaranteed` [quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod), set the same CPU and memory amounts for both the resource limits and requests. + ```yaml apiVersion: "kubeflow.org/v1" kind: PyTorchJob @@ -255,35 +239,22 @@ spec: emptyDir: medium: Memory ``` -To run this example, update the yaml based on your training script and the nodes in your cluster. 
- - - -The CPU resource limits/requests in the yaml are defined in -[cpu units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu) -where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical -host or a VM). The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of -available CPU/memory capacity on a single machine. It is usually a good idea to not use the entire machine's capacity in -order to leave some resources for the kubelet and OS. In order to get ["guaranteed"](https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#guaranteed) -[quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod) for the worker pods, -set the same CPU and memory amounts for both the resource limits and requests. - - ### Deploy -After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed -to the cluster using: +After you've setup the PyTorchJob yaml file with the appropriate settings for your cluster and training job, deploy it to the cluster with the command below. + ```bash export NAMESPACE= kubectl create -f pytorchjob.yaml -n ${NAMESPACE} ``` -The `kubectl get pods -n ${NAMESPACE}` command can then be used to list the pods in your namespace. You should see -the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of "Pending" as -the containers get pulled and created, then the status should change to "Running". -``` +List the pods in the namespace with `kubectl get pods -n ${NAMESPACE}`. At first, the status may be "Pending" but it should change to "Running" once the containers are pulled and created. + +```bash +kubectl get pods -n ${NAMESPACE} + NAME READY STATUS RESTARTS AGE ... transformers-pytorchjob-worker-0 1/1 Running 0 7m37s @@ -293,16 +264,14 @@ transformers-pytorchjob-worker-3 1/1 Running ... ``` -The logs for worker can be viewed using `kubectl logs -n ${NAMESPACE}`. Add `-f` to stream the logs, for example: +Inspect the logs for each worker with the following command. Add `-f` to stream the logs. + ```bash kubectl logs transformers-pytorchjob-worker-0 -n ${NAMESPACE} -f ``` -After the training job completes, the trained model can be copied from the PVC or storage location. When you are done -with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml -n ${NAMESPACE}`. - -## Summary +Once training is complete, the trained model can be copied from the PVC or storage location. Delete the PyTorchJob resource from the cluster with the command below. -This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes -cluster. Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training -performance, and can be used as a template to run your own workload on multiple nodes. 
+```bash +kubectl delete -f pytorchjob.yaml -n ${NAMESPACE} +``` From b7d2f3fb568798e299178d6e4ec27681bf59a12f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 20 Nov 2024 15:14:24 -0800 Subject: [PATCH 071/116] hardware training --- docs/source/en/_toctree.yml | 10 +- docs/source/en/perf_hardware.md | 154 ++++++-------------------- docs/source/en/perf_train_cpu.md | 91 +++++++-------- docs/source/en/perf_train_cpu_many.md | 2 +- docs/source/en/perf_train_gpu_one.md | 4 +- docs/source/en/perf_train_special.md | 52 ++------- docs/source/en/perf_train_tpu_tf.md | 4 +- 7 files changed, 95 insertions(+), 222 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c0328d24c1fc..dcf4cdfc02d8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -138,15 +138,15 @@ - title: Hardware-specific training sections: - local: perf_train_gpu_one - title: Methods and tools for efficient training on a single GPU + title: GPU - local: perf_train_cpu - title: Efficient training on CPU + title: CPU - local: perf_train_tpu_tf - title: Training on TPU with TensorFlow + title: TPU wih TensorFlow - local: perf_train_special - title: PyTorch training on Apple silicon + title: Apple Silicon - local: perf_hardware - title: Custom hardware for training + title: Build your own machine - local: peft title: Load and train adapters with 🤗 PEFT - local: sagemaker diff --git a/docs/source/en/perf_hardware.md b/docs/source/en/perf_hardware.md index 260fe5b71ccb..4827c40bed85 100644 --- a/docs/source/en/perf_hardware.md +++ b/docs/source/en/perf_hardware.md @@ -1,155 +1,73 @@ - +# Build your own machine -# Custom hardware for training +One of the most important consideration when building a machine for deep learning is the GPU choice. GPUs are the standard workhorse for deep learning owing to their tensor cores for performing very efficient matrix multiplication and high memory bandwidth. To train large models, you either need a more powerful GPU, multiple GPUs, or take advantage of techniques that offload some of the load to the CPU or NVMe. -The hardware you use to run model training and inference can have a big effect on performance. For a deep dive into GPUs make sure to check out Tim Dettmer's excellent [blog post](https://timdettmers.com/2020/09/07/which-gpu-for-deep-learning/). +This guide provides some practical tips for setting up a GPU for deep learning. For a more detailed discussion and comparison of GPUs, take a look at the [Which GPU(s) to Get for Deep Learning](https://timdettmers.com/2023/01/30/which-gpu-for-deep-learning/) blog post. -Let's have a look at some practical advice for GPU setups. +## Power -## GPU -When you train bigger models you have essentially three options: +High-end consumer GPUs may have two or three PCIe 8-pin power sockets, and you should make sure you have the same number of 12V PCIe 8-pin cables connected to each socket. Don't use a *pigtail cable*, a single cable with two splits at one end, to connect two sockets or else you won't get full performance from your GPU. -- bigger GPUs -- more GPUs -- more CPU and NVMe (offloaded to by [DeepSpeed-Infinity](main_classes/deepspeed#nvme-support)) +Each PCIe 8-pin power cable should be connected to a 12V rail on the power supply unit (PSU) and can deliver up to 150W. Other GPUs may use a PCIe 12-pin connector which can deliver up to 500-600W. Lower-end GPUs may only use a PCIe 6-pin connector which supplies up to 75W. -Let's start at the case where you have a single GPU. 
+It is important the PSU has stable voltage otherwise it may not be able to supply the GPU with enough power to function properly during peak usage. -### Power and Cooling +## Cooling -If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling. +An overheated GPU throttles its performance and can even shutdown if it's too hot to prevent damage. Keeping the GPU temperature low, anywhere between 158 - 167F, is essential for delivering full perfomance and maintaining its lifespan. Once temperatures reach 183 - 194F, the GPU may begin to throttle performance. -**Power**: +## Multi-GPU connectivity -Some high end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. Do not use the 2 splits at one end of the same cable (also known as pigtail cable). That is if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end! You won't get the full performance out of your card otherwise. +When your setup uses multiple GPUs, it is important to consider how they're connected. [NVLink](https://www.nvidia.com/en-us/design-visualization/nvlink-bridges/) connections are faster than PCIe bridges, but you should also consider the [parallelism](./perf_train_gpu_many) strategy you're using. For example, in DistributedDataParallel, GPUs communicate less frequently compared to ZeRO-DP. In this case, a slower connection is not as important. -Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power. - -Some other cards may use a PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power. - -Low end cards may use 6-Pin connectors, which supply up to 75W of power. - -Additionally you want the high-end PSU that has stable voltage. Some lower quality ones may not give the card the stable voltage it needs to function at its peak. - -And of course the PSU needs to have enough unused Watts to power the card. - -**Cooling**: - -When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot. - -It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. The throttling down is likely to start at around 84-90C. But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. - -Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. - -### Multi-GPU Connectivity - -If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. If the GPUs are on the same physical node, you can run: +Run the command below to check how your GPUs are connected. ```bash nvidia-smi topo -m ``` -and it will tell you how the GPUs are inter-connected. On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: + + -``` +[NVLink](https://www.nvidia.com/en-us/design-visualization/nvlink-bridges/) is a high-speed communication system designed by NVIDIA for connecting multiple NVIDIA GPUs. 
Training [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) on a small sample of the [wikitext](https://huggingface.co/datasets/Salesforce/wikitext) dataset is ~23% faster with NVLink. + +On a machine with two GPUs connected with NVLink, an example output of `nvidia-smi topo -m` is shown below. + +```bash GPU0 GPU1 CPU Affinity NUMA Affinity GPU0 X NV2 0-23 N/A GPU1 NV2 X 0-23 N/A ``` -on a different machine w/o NVLink we may see: -``` - GPU0 GPU1 CPU Affinity NUMA Affinity -GPU0 X PHB 0-11 N/A -GPU1 PHB X 0-11 N/A -``` - -The report includes this legend: - -``` - X = Self - SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) - NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node - PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) - PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) - PIX = Connection traversing at most a single PCIe bridge - NV# = Connection traversing a bonded set of # NVLinks -``` +`NV2` indicates `GPU0` and `GPU1` are connected by 2 NVLinks. -So the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and the second report `PHB` we have a typical consumer-level PCIe+Bridge setup. + + -Check what type of connectivity you have on your setup. Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB). - -Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. - -#### NVlink - -[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia. - -Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf): - -> Third-Generation NVLink® -> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, -> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four -> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth -> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink. -> (Note that 3-Way and 4-Way SLI configurations are not supported.) - -So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture. - -Let's compare the execution of an `openai-community/gpt2` language model training over a small sample of wikitext. - -The results are: - - -| NVlink | Time | -| ----- | ---: | -| Y | 101s | -| N | 131s | - - -You can see that NVLink completes the training ~23% faster. In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink. - -Here is the full benchmark code and outputs: +On a machine with two GPUs connected with a PCIe bridge, an example output of `nvidia-smi topo -m` is shown below. 
```bash -# DDP w/ NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} - -# DDP w/o NVLink - -rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \ ---nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \ ---dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train ---output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 - -{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} + GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A ``` -Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`) -Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0` +`PHB` indicates `GPU0` and `GPU1` are connected by a PCIe bridge. + + + diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index ab2f735ecbdd..2f87641008a7 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -1,4 +1,4 @@ - -# Efficient Training on CPU +# CPU -This guide focuses on training large models efficiently on CPU. +A modern CPU is capable of efficiently training large models by leveraging the underlying optimizations built into the hardware and training on fp16 or bf16 datatypes. -## Mixed precision with IPEX -Mixed precision uses single (fp32) and half-precision (bf16/fp16) data types in a model to accelerate training or inference while still preserving much of the single-precision accuracy. Modern CPUs such as 3rd, 4th, and 5th Gen Intel® Xeon® Scalable processors natively support bf16. 6th Gen Intel® Xeon® Scalable processors natively support bf16 and fp16. You should get more performance out of the box by enabling mixed precision training with bf16 or fp16. +This guide focuses on how to train large models on an Intel CPU using mixed precision and the [Intel Extension for PyTorch (IPEX)](https://intel.github.io/intel-extension-for-pytorch/index.html) library. -To further maximize training performance, you can use Intel® Extension for PyTorch (IPEX), which is a library built on PyTorch and adds additional CPU instruction level architecture (ISA) level support such as Intel® Advanced Vector Extensions 512 Vector Neural Network Instructions (Intel® AVX512-VNNI), and Intel® Advanced Matrix Extensions (Intel® AMX) for an extra performance boost on Intel CPUs. However, CPUs with only AVX2 (e.g., AMD or older Intel CPUs) are not guaranteed to have better performance under IPEX. +Install IPEX with the command below. You can find your PyTorch version by running `pip list | grep torch` in the command line. -Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. AMP support for bf16/fp16 on CPUs and bf16/fp16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. You can get better performance and user experience with IPEX AMP. 
+```bash +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +``` + +> [!TIP] +> Refer to the IPEX [installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation) guide for more details. -Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). +IPEX provides additional performance optimizations for Intel CPUs. These include additional CPU instruction level architecture (ISA) support such as [Intel AVX512-VNNI](https://en.wikichip.org/wiki/x86/avx512_vnni) and [Intel AMX](https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/what-is-intel-amx.html). Both of these features are designed to accelerate matrix multiplication. Older AMD and Intel CPUs with only Intel AVX2, however, aren't guaranteed better performance with IPEX. -### IPEX installation: +IPEX also supports [Auto Mixed Precision (AMP)](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html) training with the fp16 and bf16 datatypes. Reducing precision speeds up training and reduces memory usage because it requires less computation. The loss in accuracy from using full-precision is minimal. 3rd, 4th, and 5th generation Intel Xeon Scalable processors natively support bf16, and the 6th generation processor also natively supports fp16 in addition to bf16. -IPEX release is following PyTorch, to install via pip: +AMP is enabled for CPU backends training with PyTorch. -| PyTorch Version | IPEX version | -| :---------------: | :----------: | -| 2.5.0 | 2.5.0+cpu | -| 2.4.0 | 2.4.0+cpu | -| 2.3.0 | 2.3.0+cpu | -| 2.2.0 | 2.2.0+cpu | +[`Trainer`] supports AMP training with a CPU by adding the `--use_cpu`, `--use_ipex`, and `--bf16` parameters. The example below demonstrates the [run_qa.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) script. -Please run `pip list | grep torch` to get your `pytorch_version`, so you can get the `IPEX version_name`. ```bash -pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +python run_qa.py \ + --model_name_or_path google-bert/bert-base-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --use_ipex \ + --bf16 \ + --use_cpu ``` -You can check the latest versions in [ipex-whl-stable-cpu](https://developer.intel.com/ipex-whl-stable-cpu) if needed. - -Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). - -### Usage in Trainer -To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` or `fp16`, and `no_cuda` in training command arguments. - -Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - -- Training with IPEX using BF16 auto mixed precision on CPU: -
 python examples/pytorch/question-answering/run_qa.py \
---model_name_or_path google-bert/bert-base-uncased \
---dataset_name squad \
---do_train \
---do_eval \
---per_device_train_batch_size 12 \
---learning_rate 3e-5 \
---num_train_epochs 2 \
---max_seq_length 384 \
---doc_stride 128 \
---output_dir /tmp/debug_squad/ \
---use_ipex \
---bf16 \
---use_cpu
- -If you want to enable `use_ipex` and `bf16` in your script, add these parameters to `TrainingArguments` like this: -```diff + +These parameters can also be added to [`TrainingArguments`] as shown below. + +```py training_args = TrainingArguments( - output_dir=args.output_path, -+ bf16=True, -+ use_ipex=True, -+ use_cpu=True, - **kwargs + output_dir="./outputs", + bf16=True, + use_ipex=True, + use_cpu=True, ) ``` -### Practice example +## Resources -Blog: [Accelerating PyTorch Transformers with Intel Sapphire Rapids](https://huggingface.co/blog/intel-sapphire-rapids) +Learn more about training on Intel CPUs in the [Accelerating PyTorch Transformers with Intel Sapphire Rapids](https://huggingface.co/blog/intel-sapphire-rapids) blog post. diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md index cf6bb047ef61..4f14329abfd1 100644 --- a/docs/source/en/perf_train_cpu_many.md +++ b/docs/source/en/perf_train_cpu_many.md @@ -1,4 +1,4 @@ - -# Methods and tools for efficient training on a single GPU +# GPU This guide demonstrates practical techniques that you can use to increase the efficiency of your model's training by optimizing memory utilization, speeding up the training, or both. If you'd like to understand how GPU is utilized during diff --git a/docs/source/en/perf_train_special.md b/docs/source/en/perf_train_special.md index d98d3e0e32e5..16611f16da4c 100644 --- a/docs/source/en/perf_train_special.md +++ b/docs/source/en/perf_train_special.md @@ -1,4 +1,4 @@ - -# PyTorch training on Apple silicon +# Apple Silicon -Previously, training models on a Mac was limited to the CPU only. With the release of PyTorch v1.12, you can take advantage of training models with Apple's silicon GPUs for significantly faster performance and training. This is powered in PyTorch by integrating Apple's Metal Performance Shaders (MPS) as a backend. The [MPS backend](https://pytorch.org/docs/stable/notes/mps.html) implements PyTorch operations as custom Metal shaders and places these modules on a `mps` device. +Apple Silicon (M series) features a unified memory architecture making it possible to efficiently train large models locally and improves performance by reducing latency associated with data retrieval. You can take advantage of Apple Silicon for training with PyTorch due to its integration with [Metal Performance Shaders (MPS)](https://pytorch.org/docs/stable/notes/mps.html). - +The `mps` backend requires macOS 12.3 or later. -Some PyTorch operations are not implemented in MPS yet and will throw an error. To avoid this, you should set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU kernels instead (you'll still see a `UserWarning`). +> [!WARNING] +> Some PyTorch operations are not implemented in MPS yet. To avoid an error, set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to fallback on the CPU kernels. Please open an issue in the [PyTorch](https://github.com/pytorch/pytorch/issues) repository if you encounter any other issues. -
+[`TrainingArguments`] and [`Trainer`] detect and set the backend device to `mps` if an Apple Silicon device is available. No additional changes are required to enable training on your device.

-If you run into any other errors, please open an issue in the [PyTorch](https://github.com/pytorch/pytorch/issues) repository because the [`Trainer`] only integrates the MPS backend.

+The `mps` backend doesn't support [distributed training](https://pytorch.org/docs/stable/distributed.html#backends).

-
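+If you want to verify the setup first, PyTorch's `torch.backends.mps` module reports whether MPS support is built and available. A minimal check looks like this:
+
+```py
+import torch
+
+# both should return True on a supported Apple Silicon setup
+print(torch.backends.mps.is_built())
+print(torch.backends.mps.is_available())
+```
+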
+## Resources -With the `mps` device set, you can: - -* train larger networks or batch sizes locally -* reduce data retrieval latency because the GPU's unified memory architecture allows direct access to the full memory store -* reduce costs because you don't need to train on cloud-based GPUs or add additional local GPUs - -Get started by making sure you have PyTorch installed. MPS acceleration is supported on macOS 12.3+. - -```bash -pip install torch torchvision torchaudio -``` - -[`TrainingArguments`] uses the `mps` device by default if it's available which means you don't need to explicitly set the device. For example, you can run the [run_glue.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) script with the MPS backend automatically enabled without making any changes. - -```diff -export TASK_NAME=mrpc - -python examples/pytorch/text-classification/run_glue.py \ - --model_name_or_path google-bert/bert-base-cased \ - --task_name $TASK_NAME \ -- --use_mps_device \ - --do_train \ - --do_eval \ - --max_seq_length 128 \ - --per_device_train_batch_size 32 \ - --learning_rate 2e-5 \ - --num_train_epochs 3 \ - --output_dir /tmp/$TASK_NAME/ \ - --overwrite_output_dir -``` - -Backends for [distributed setups](https://pytorch.org/docs/stable/distributed.html#backends) like `gloo` and `nccl` are not supported by the `mps` device which means you can only train on a single GPU with the MPS backend. - -You can learn more about the MPS backend in the [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) blog post. +Learn more about the MPS backend in the [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) blog post. diff --git a/docs/source/en/perf_train_tpu_tf.md b/docs/source/en/perf_train_tpu_tf.md index 1897c1ad745f..efdfaf38aeaa 100644 --- a/docs/source/en/perf_train_tpu_tf.md +++ b/docs/source/en/perf_train_tpu_tf.md @@ -1,4 +1,4 @@ - -# Training on TPU with TensorFlow +# TPU wih TensorFlow From cd7173d6305fd0a09b43674790aa3fa9c8eddb45 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 9 Dec 2024 08:35:47 -0800 Subject: [PATCH 072/116] gpu training --- docs/source/en/perf_train_gpu_one.md | 530 +++------------------------ 1 file changed, 56 insertions(+), 474 deletions(-) diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index a85961b0e819..514516ac1600 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -15,520 +15,102 @@ rendered properly in your Markdown viewer. # GPU -This guide demonstrates practical techniques that you can use to increase the efficiency of your model's training by -optimizing memory utilization, speeding up the training, or both. If you'd like to understand how GPU is utilized during -training, please refer to the [Model training anatomy](model_memory_anatomy) conceptual guide first. This guide -focuses on practical techniques. +GPUs are commonly used to train deep learning models due to their high memory bandwidth and parallel processing capabilities. Depending on your GPU and model size, it is possible to even train models with billions of parameters. The key is to find the right balance between GPU memory utilization (data throughput/training time) and training speed. - +This guide will show you the features available in Transformers for efficiently training a model on a single GPU. 
In many cases, you'll want to use a combination of these features to optimize training. -If you have access to a machine with multiple GPUs, these approaches are still valid, plus you can leverage additional methods outlined in the [multi-GPU section](perf_train_gpu_many). +Refer to the table below to quickly help you identify the features relevant to your training scenario. - +| Feature | Training speed | Memory usage | +|---|---|---| +| batch size | yes | yes | +| gradient accumulation | no | yes | +| gradient checkpointing | no | yes | +| mixed precision | yes | depends | +| optimizers | yes | yes | +| data preloading | yes | no | +| torch_empty_cache_steps | no | yes | +| torch.compile | yes | no | +| PEFT | no | yes | -When training large models, there are two aspects that should be considered at the same time: +## Trainer -* Data throughput/training time -* Model performance +[Trainer](./trainer) supports many useful training features that can be configured through [`TrainingArguments`]. This section highlights some of the more important features for optimizing training. -Maximizing the throughput (samples/second) leads to lower training cost. This is generally achieved by utilizing the GPU -as much as possible and thus filling GPU memory to its limit. If the desired batch size exceeds the limits of the GPU memory, -the memory optimization techniques, such as gradient accumulation, can help. +### Batch size -However, if the preferred batch size fits into memory, there's no reason to apply memory-optimizing techniques because they can -slow down the training. Just because one can use a large batch size, does not necessarily mean they should. As part of -hyperparameter tuning, you should determine which batch size yields the best results and then optimize resources accordingly. +Batch size is one of the most important hyperparameters for efficient GPU training because it affects memory usage and training speed. Larger batch sizes lead to faster training because it takes advantage of GPUs parallel processing power. It is recommended to use batch sizes that are powers of 2, such as 8, 64, 128, 256, 512, etc. The batch size depends on your GPU and the models data type. -The methods and tools covered in this guide can be classified based on the effect they have on the training process: - -| Method/tool | Improves training speed | Optimizes memory utilization | -|:--------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------|:-----------------------------| -| [Batch size choice](#batch-size-choice) | Yes | Yes | -| [Gradient accumulation](#gradient-accumulation) | No | Yes | -| [Gradient checkpointing](#gradient-checkpointing) | No | Yes | -| [Mixed precision training](#mixed-precision-training) | Yes | Maybe* | -| [torch_empty_cache_steps](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps) | No | Yes | -| [Optimizer choice](#optimizer-choice) | Yes | Yes | -| [Data preloading](#data-preloading) | Yes | No | -| [DeepSpeed Zero](#deepspeed-zero) | No | Yes | -| [torch.compile](#using-torchcompile) | Yes | No | -| [Parameter-Efficient Fine Tuning (PEFT)](#using--peft) | No | Yes | - - - -*Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a -large model and a small batch size, the memory use will be larger. 
- - - -You can combine the above methods to get a cumulative effect. These techniques are available to you whether you are -training your model with [`Trainer`] or writing a pure PyTorch loop, in which case you can [configure these optimizations -with 🤗 Accelerate](#using--accelerate). - -If these methods do not result in sufficient gains, you can explore the following options: -* [Look into building your own custom Docker container with efficient software prebuilds](#efficient-software-prebuilds) -* [Consider a model that uses Mixture of Experts (MoE)](#mixture-of-experts) -* [Convert your model to BetterTransformer to leverage PyTorch native attention](#using-pytorch-native-attention-and-flash-attention) - -Finally, if all of the above is still not enough, even after switching to a server-grade GPU like A100, consider moving -to a multi-GPU setup. All these approaches are still valid in a multi-GPU setup, plus you can leverage additional parallelism -techniques outlined in the [multi-GPU section](perf_train_gpu_many). - -## Batch size choice - -To achieve optimal performance, start by identifying the appropriate batch size. It is recommended to use batch sizes and -input/output neuron counts that are of size 2^N. Often it's a multiple of 8, but it can be -higher depending on the hardware being used and the model's dtype. - -For reference, check out NVIDIA's recommendation for [input/output neuron counts]( -https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and -[batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size) for -fully connected layers (which are involved in GEMMs (General Matrix Multiplications)). - -[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) -define the multiplier based on the dtype and the hardware. For instance, for fp16 data type a multiple of 8 is recommended, unless -it's an A100 GPU, in which case use multiples of 64. - -For parameters that are small, consider also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization). -This is where tiling happens and the right multiplier can have a significant speedup. - -## Gradient Accumulation - -The **gradient accumulation** method aims to calculate gradients in smaller increments instead of computing them for the -entire batch at once. This approach involves iteratively calculating gradients in smaller batches by performing forward -and backward passes through the model and accumulating the gradients during the process. Once a sufficient number of -gradients have been accumulated, the model's optimization step is executed. By employing gradient accumulation, it -becomes possible to increase the **effective batch size** beyond the limitations imposed by the GPU's memory capacity. -However, it is important to note that the additional forward and backward passes introduced by gradient accumulation can -slow down the training process. - -You can enable gradient accumulation by adding the `gradient_accumulation_steps` argument to [`TrainingArguments`]: +Configure [`~TrainingArguments.per_device_train_batch_size`] in [`TrainingArguments`]. ```py -training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args) -``` - -In the above example, your effective batch size becomes 4. 

-
-Alternatively, use 🤗 Accelerate to gain full control over the training loop. Find the 🤗 Accelerate example
-[further down in this guide](#using--accelerate).
+from transformers import TrainingArguments

-While it is advised to max out GPU usage as much as possible, a high number of gradient accumulation steps can
-result in a more pronounced training slowdown. Consider the following example. Let's say, the `per_device_train_batch_size=4`
-without gradient accumulation hits the GPU's limit. If you would like to train with batches of size 64, do not set the
-`per_device_train_batch_size` to 1 and `gradient_accumulation_steps` to 64. Instead, keep `per_device_train_batch_size=4`
-and set `gradient_accumulation_steps=16`. This results in the same effective batch size while making better use of
-the available GPU resources.
+args = TrainingArguments(
+    per_device_train_batch_size=256,
+    per_device_eval_batch_size=256,
+)
+```

-For additional information, please refer to batch size and gradient accumulation benchmarks for [RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004392537)
-and [A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1005033957).
+Refer to the NVIDIA [Performance](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) guide to learn more about how input features, output neuron counts, and batch size affect performance. These are involved in the General Matrix Multiplications (GEMMs) performed by the GPU. Larger parameters are better for parallelization and efficiency.

-## Gradient Checkpointing
+The [Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) section is also useful for selecting a batch size that maximizes the speed of tensor multiplication based on the data type and GPU. For example, multiples of 8 are recommended for fp16, unless it's an A100 GPU, in which case use multiples of 64.

-Some large models may still face memory issues even when the batch size is set to 1 and gradient accumulation is used.
-This is because there are other components that also require memory storage.
+Finally, consider [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) for smaller parameters. Tile quantization results when matrix dimensions aren't divisible by a GPU's thread block tile size, causing the GPU to underutilize its resources. Selecting the correct batch size multiplier, such that the matrix is divisible by the tile size, can significantly speed up training.

-Saving all activations from the forward pass in order to compute the gradients during the backward pass can result in
-significant memory overhead. The alternative approach of discarding the activations and recalculating them when needed
-during the backward pass, would introduce a considerable computational overhead and slow down the training process.
+### Gradient accumulation

-**Gradient checkpointing** offers a compromise between these two approaches and saves strategically selected activations
-throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. For
-an in-depth explanation of gradient checkpointing, refer to [this great article](https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9).
+Gradient accumulation overcomes memory constraints - useful for fitting a very large model that otherwise wouldn't fit on a single GPU - by accumulating gradients over multiple mini-batches before updating the parameters. This reduces memory by storing fewer gradients and enables training with a larger *effective batch size* because usually, the parameters are updated from a single batch of data. Training can slow down though due to the additional forward and backward passes introduced by gradient accumulation.

-To enable gradient checkpointing in the [`Trainer`], pass the corresponding a flag to [`TrainingArguments`]:
+Configure [`~TrainingArguments.gradient_accumulation_steps`] in [`TrainingArguments`] to enable gradient accumulation.

 ```py
-training_args = TrainingArguments(
-    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
+from transformers import TrainingArguments
+
+# effective batch size of 64
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
 )
 ```

-
-Alternatively, use 🤗 Accelerate - find the 🤗 Accelerate example [further in this guide](#using--accelerate).
-
-
-
-While gradient checkpointing may improve memory efficiency, it slows training by approximately 20%. 
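+As a rough sketch of the arithmetic, the effective batch size is the per-device batch size multiplied by the number of gradient accumulation steps (and by the number of devices when training on more than one GPU):
+
+```py
+# assuming the `args` defined above and a single device;
+# multiply by the device count for multi-GPU setups
+effective_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps
+print(effective_batch_size)  # 4 * 16 = 64
+```
+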
+Finally, consider [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) for smaller parameters. Tile quantization results when matrix dimensions aren't divisible by a GPUs thread block tile size, causing the GPU to underutilize its resources. Selecting the correct batch size multiplier, such that the matrix is divisible by the tile size, can significantly speed up training. - +### Gradient accumulation -## Mixed precision training +Gradient accumulation overcomes memory constraints - useful for fitting a very large model that otherwise wouldn't fit on a single GPU - by accumulating gradients over multiple mini-batches before updating the parameters. This reduces memory by storing fewer gradients and enables training with a larger *effective batch size* because usually, the parameters are updated from a single batch of data. Training can slow down though due to the additional forward and backward passes introduced by gradient accumulation. -**Mixed precision training** is a technique that aims to optimize the computational efficiency of training models by -utilizing lower-precision numerical formats for certain variables. Traditionally, most models use 32-bit floating point -precision (fp32 or float32) to represent and process variables. However, not all variables require this high precision -level to achieve accurate results. By reducing the precision of certain variables to lower numerical formats like 16-bit -floating point (fp16 or float16), we can speed up the computations. Because in this approach some computations are performed -in half-precision, while some are still in full precision, the approach is called mixed precision training. - -Most commonly mixed precision training is achieved by using fp16 (float16) data types, however, some GPU architectures -(such as the Ampere architecture) offer bf16 and tf32 (CUDA internal data type) data types. Check -out the [NVIDIA Blog](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/) to learn more about -the differences between these data types. - -### fp16 - -The main advantage of mixed precision training comes from saving the activations in half precision (fp16). -Although the gradients are also computed in half precision they are converted back to full precision for the optimization -step so no memory is saved here. -While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes. -This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU). - -To enable mixed precision training, set the `fp16` flag to `True`: +Configure [`~TrainingArguments.per_device_train_batch_size`] in [`TrainingArguments`] to enable gradient accumulation. ```py -training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args) -``` - -If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example [further in this guide](#using--accelerate). - -### BF16 - -If you have access to an Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. While -bf16 has a worse precision than fp16, it has a much bigger dynamic range. In fp16 the biggest number you can have -is `65504` and any number above that will result in an overflow. A bf16 number can be as large as `3.39e+38` (!) 
which -is about the same as fp32 - because both have 8-bits used for the numerical range. - -You can enable BF16 in the 🤗 Trainer with: - -```python -training_args = TrainingArguments(bf16=True, **default_args) -``` - -### TF32 - -The Ampere hardware uses a magical data type called tf32. It has the same numerical range as fp32 (8-bits), but instead -of 23 bits precision it has only 10 bits (same as fp16) and uses only 19 bits in total. It's "magical" in the sense that -you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput -improvement. All you need to do is to add the following to your code: - -```python -import torch -torch.backends.cuda.matmul.allow_tf32 = True -torch.backends.cudnn.allow_tf32 = True -``` - -CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series. - -According to [NVIDIA research](https://developer.nvidia.com/blog/accelerating-ai-training-with-tf32-tensor-cores/), the -majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32. -If you're already using fp16 or bf16 mixed precision it may help with the throughput as well. - -You can enable this mode in the 🤗 Trainer: - -```python -TrainingArguments(tf32=True, **default_args) -``` - - - -tf32 can't be accessed directly via `tensor.to(dtype=torch.tf32)` because it is an internal CUDA data type. You need `torch>=1.7` to use tf32 data types. - - - -For additional information on tf32 vs other precisions, please refer to the following benchmarks: -[RTX-3090](https://github.com/huggingface/transformers/issues/14608#issuecomment-1004390803) and -[A100](https://github.com/huggingface/transformers/issues/15026#issuecomment-1004543189). - -## Flash Attention 2 - -You can speedup the training throughput by using Flash Attention 2 integration in transformers. Check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2) to learn more about how to load a model with Flash Attention 2 modules. - -## Optimizer choice - -The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves -good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory -footprint of the order of the number of model parameters. To remedy this, you can use an alternative optimizer. -For example if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed for NVIDIA GPUs, or [ROCmSoftwarePlatform/apex](https://github.com/ROCmSoftwarePlatform/apex) for AMD GPUs, `adamw_apex_fused` will give you the -fastest training experience among all supported AdamW optimizers. - -[`Trainer`] integrates a variety of optimizers that can be used out of box: `adamw_hf`, `adamw_torch`, `adamw_torch_fused`, -`adamw_apex_fused`, `adamw_anyprecision`, `adafactor`, or `adamw_bnb_8bit`. More optimizers can be plugged in via a third-party implementation. - -Let's take a closer look at two alternatives to AdamW optimizer: -1. `adafactor` which is available in [`Trainer`] -2. `adamw_bnb_8bit` is also available in Trainer, but a third-party integration is provided below for demonstration. - -For comparison, for a 3B-parameter model, like “google-t5/t5-3b”: -* A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB) -* Adafactor optimizer will need more than 12GB. 
It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra. -* 8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized. - -### Adafactor - -Adafactor doesn't store rolling averages for each element in weight matrices. Instead, it keeps aggregated information -(sums of rolling averages row- and column-wise), significantly reducing its footprint. However, compared to Adam, -Adafactor may have slower convergence in certain cases. - -You can switch to Adafactor by setting `optim="adafactor"` in [`TrainingArguments`]: - -```py -training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args) -``` - -Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training) -you can notice up to 3x improvement while maintaining the throughput! However, as mentioned before, the convergence of -Adafactor can be worse than Adam. - -### 8-bit Adam - -Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization -means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the -idea behind mixed precision training. +from transformers import TrainingArguments -To use `adamw_bnb_8bit`, you simply need to set `optim="adamw_bnb_8bit"` in [`TrainingArguments`]: - -```py -training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bnb_8bit", **default_args) -``` - -However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated. - -First, follow the installation guide in the GitHub [repo](https://github.com/bitsandbytes-foundation/bitsandbytes) to install the `bitsandbytes` library -that implements the 8-bit Adam optimizer. - -Next you need to initialize the optimizer. This involves two steps: -* First, group the model's parameters into two groups - one where weight decay should be applied, and the other one where it should not. Usually, biases and layer norm parameters are not weight decayed. -* Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer. 
- -```py -import bitsandbytes as bnb -from torch import nn -from transformers.trainer_pt_utils import get_parameter_names - -training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) - -decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias", "layernorm", "rmsnorm"]) -optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if n in decay_parameters], - "weight_decay": training_args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if n not in decay_parameters], - "weight_decay": 0.0, - }, -] - -optimizer_kwargs = { - "betas": (training_args.adam_beta1, training_args.adam_beta2), - "eps": training_args.adam_epsilon, -} -optimizer_kwargs["lr"] = training_args.learning_rate -adam_bnb_optim = bnb.optim.Adam8bit( - optimizer_grouped_parameters, - betas=(training_args.adam_beta1, training_args.adam_beta2), - eps=training_args.adam_epsilon, - lr=training_args.learning_rate, +# effective batch size of 64 +args = TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=16, ) ``` -Finally, pass the custom optimizer as an argument to the `Trainer`: - -```py -trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) -``` - -Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training), -you can expect to get about a 3x memory improvement and even slightly higher throughput as using Adafactor. - -### multi_tensor - -pytorch-nightly introduced `torch.optim._multi_tensor` which should significantly speed up the optimizers for situations -with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub [issue](https://github.com/huggingface/transformers/issues/9965). - -## Data preloading - -One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it -can handle. By default, everything happens in the main process, and it might not be able to read the data from disk fast -enough, and thus create a bottleneck, leading to GPU under-utilization. Configure the following arguments to reduce the bottleneck: - -- `DataLoader(pin_memory=True, ...)` - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory. -- `DataLoader(num_workers=4, ...)` - spawn several workers to preload data faster. During training, watch the GPU utilization stats; if it's far from 100%, experiment with increasing the number of workers. Of course, the problem could be elsewhere, so many workers won't necessarily lead to better performance. - -When using [`Trainer`], the corresponding [`TrainingArguments`] are: `dataloader_pin_memory` (`True` by default), and `dataloader_num_workers` (defaults to `0`). - -## DeepSpeed ZeRO - -DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate. -It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale -deep learning training. - -If your model fits onto a single GPU and you have enough space to fit a small batch size, you don't need to use DeepSpeed -as it'll only slow things down. However, if the model doesn't fit onto a single GPU or you can't fit a small batch, you can -leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. 
In this case, you need to separately -[install the library](main_classes/deepspeed#installation), then follow one of the guides to create a configuration file -and launch DeepSpeed: - -* For an in-depth guide on DeepSpeed integration with [`Trainer`], review [the corresponding documentation](main_classes/deepspeed), specifically the -[section for a single GPU](main_classes/deepspeed#deployment-with-one-gpu). Some adjustments are required to use DeepSpeed in a notebook; please take a look at the [corresponding guide](main_classes/deepspeed#deployment-in-notebooks). -* If you prefer to use 🤗 Accelerate, refer to [🤗 Accelerate DeepSpeed guide](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed). - -## Using torch.compile - -PyTorch 2.0 introduced a new compile function that doesn't require any modification to existing PyTorch code but can -optimize your code by adding a single line of code: `model = torch.compile(model)`. - -If using [`Trainer`], you only need `to` pass the `torch_compile` option in the [`TrainingArguments`]: - -```python -training_args = TrainingArguments(torch_compile=True, **default_args) -``` - -`torch.compile` uses Python's frame evaluation API to automatically create a graph from existing PyTorch programs. After -capturing the graph, different backends can be deployed to lower the graph to an optimized engine. -You can find more details and benchmarks in [PyTorch documentation](https://pytorch.org/get-started/pytorch-2.0/). - -`torch.compile` has a growing list of backends, which can be found in by calling `torchdynamo.list_backends()`, each of which with its optional dependencies. - -Choose which backend to use by specifying it via `torch_compile_backend` in the [`TrainingArguments`]. Some of the most commonly used backends are: - -**Debugging backends**: -* `dynamo.optimize("eager")` - Uses PyTorch to run the extracted GraphModule. This is quite useful in debugging TorchDynamo issues. -* `dynamo.optimize("aot_eager")` - Uses AotAutograd with no compiler, i.e, just using PyTorch eager for the AotAutograd's extracted forward and backward graphs. This is useful for debugging, and unlikely to give speedups. - -**Training & inference backends**: -* `dynamo.optimize("inductor")` - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels [Read more](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) -* `dynamo.optimize("nvfuser")` - nvFuser with TorchScript. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) -* `dynamo.optimize("aot_nvfuser")` - nvFuser with AotAutograd. [Read more](https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593) -* `dynamo.optimize("aot_cudagraphs")` - cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757) - -**Inference-only backend**s: -* `dynamo.optimize("ofi")` - Uses TorchScript optimize_for_inference. [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html) -* `dynamo.optimize("fx2trt")` - Uses NVIDIA TensorRT for inference optimizations. [Read more](https://pytorch.org/TensorRT/tutorials/getting_started_with_fx_path.html) -* `dynamo.optimize("onnxrt")` - Uses ONNXRT for inference on CPU/GPU. [Read more](https://onnxruntime.ai/) -* `dynamo.optimize("ipex")` - Uses IPEX for inference on CPU. 
[Read more](https://github.com/intel/intel-extension-for-pytorch) +Try to avoid too many gradient accumulation steps because it can really slow down training. Consider the example below, where the maximum batch size that'll fit on your GPU is 4. You should keep your batch size at 4 to better utilize the GPU. -For an example of using `torch.compile` with 🤗 Transformers, check out this [blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features](https://www.philschmid.de/getting-started-pytorch-2-0-transformers) +| batch size | gradient accumulation steps | effective batch size | | +|---|---|---|---| +| 1 | 64 | 64 | 👎 | +| 4 | 16 | 64 | 👍 | -## Using 🤗 PEFT +### Gradient checkpointing -[Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. +Gradient checkpointing reduces memory usage by only storing some of the intermediate activations during the backward pass and recomputing the remaining activations. This avoids storing *all* of the intermediate activations from the forward pass, which can require a lot of memory overhead. However, it comes at the cost of slower training speed (~20%). -As a result the [memory associated to the optimizer states and gradients](https://huggingface.co/docs/transformers/model_memory_anatomy#anatomy-of-models-memory) are greatly reduced. - -For example with a vanilla AdamW, the memory requirement for the optimizer state would be: -* fp32 copy of parameters: 4 bytes/param -* Momentum: 4 bytes/param -* Variance: 4 bytes/param - -Suppose a model with 7B parameters and 200 million parameters injected with [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora). - -The memory requirement for the optimizer state of the plain model would be 12 * 7 = 84 GB (assuming 7B trainable parameters). - -Adding Lora increases slightly the memory associated to the model weights and substantially decreases memory requirement for the optimizer state to 12 * 0.2 = 2.4GB. - -Read more about PEFT and its detailed usage in [the PEFT documentation](https://huggingface.co/docs/peft/) or [PEFT repository](https://github.com/huggingface/peft). - -## Using 🤗 Accelerate - -With [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) you can use the above methods while gaining full -control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications. - -Suppose you have combined the methods in the [`TrainingArguments`] like so: +Configure [`~TrainingArguments.gradient_checkpointing`] in [`TrainingArguments`] to enable gradient checkpointing. 
```py -training_args = TrainingArguments( - per_device_train_batch_size=1, - gradient_accumulation_steps=4, +from transformers import TrainingArguments + +args = TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=16, gradient_checkpointing=True, - fp16=True, - **default_args, ) ``` -The full example training loop with 🤗 Accelerate is only a handful of lines of code long: - -```py -from accelerate import Accelerator -from torch.utils.data.dataloader import DataLoader - -dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) - -if training_args.gradient_checkpointing: - model.gradient_checkpointing_enable() - -accelerator = Accelerator(fp16=training_args.fp16) -model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) - -model.train() -for step, batch in enumerate(dataloader, start=1): - loss = model(**batch).loss - loss = loss / training_args.gradient_accumulation_steps - accelerator.backward(loss) - if step % training_args.gradient_accumulation_steps == 0: - optimizer.step() - optimizer.zero_grad() -``` - -First we wrap the dataset in a [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). -Then we can enable gradient checkpointing by calling the model's [`~PreTrainedModel.gradient_checkpointing_enable`] method. -When we initialize the [`Accelerator`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator) -we can specify if we want to use mixed precision training and it will take care of it for us in the [`prepare`] call. -During the [`prepare`](https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.prepare) -call the dataloader will also be distributed across workers should we use multiple GPUs. We use the same [8-bit optimizer](#8-bit-adam) from the earlier example. - -Finally, we can add the main training loop. Note that the `backward` call is handled by 🤗 Accelerate. We can also see -how gradient accumulation works: we normalize the loss, so we get the average at the end of accumulation and once we have -enough steps we run the optimization. - -Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the -benefit of more flexibility in the training loop. For a full documentation of all features have a look at the -[Accelerate documentation](https://huggingface.co/docs/accelerate/index). - - -## Efficient Software Prebuilds - -PyTorch's [pip and conda builds](https://pytorch.org/get-started/locally/#start-locally) come prebuilt with the cuda toolkit -which is enough to run PyTorch, but it is insufficient if you need to build cuda extensions. - -At times, additional efforts may be required to pre-build some components. For instance, if you're using libraries like `apex` that -don't come pre-compiled. In other situations figuring out how to install the right cuda toolkit system-wide can be complicated. -To address these scenarios PyTorch and NVIDIA released a new version of NGC docker container which already comes with -everything prebuilt. You just need to install your programs on it, and it will run out of the box. - -This approach is also useful if you want to tweak the pytorch source and/or make a new customized build. -To find the docker image version you want start [with PyTorch release notes](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/), -choose one of the latest monthly releases. 
Go into the release's notes for the desired release, check that the environment's -components are matching your needs (including NVIDIA Driver requirements!) and then at the very top of that document go -to the corresponding NGC page. If for some reason you get lost, here is [the index of all PyTorch NGC images](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch). - -Next follow the instructions to download and deploy the docker image. - -## Mixture of Experts - -Some recent papers reported a 4-5x training speedup and a faster inference by integrating -Mixture of Experts (MoE) into the Transformer models. - -Since it has been discovered that more parameters lead to better performance, this technique allows to increase the -number of parameters by an order of magnitude without increasing training costs. - -In this approach every other FFN layer is replaced with a MoE Layer which consists of many experts, with a gated function -that trains each expert in a balanced way depending on the input token's position in a sequence. - -![MoE Transformer 2x block](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/perf-moe-transformer.png) - -(source: [GLAM](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html)) - -You can find exhaustive details and comparison tables in the papers listed at the end of this section. +### Mixed precision -The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude -larger than its dense equivalent. Various distillation and approaches are proposed to how to overcome the much higher memory requirements. +Mixed precision accelerates training speed by performing some calculations in half-precision and some in full-precision. The half-precision calculations boosts training speed because it's not as computationally expensive as performing the calculations in full-precision. Preserving some of the calculations in full-precision maintains accuracy. -There is direct trade-off though, you can use just a few experts with a 2-3x smaller base model instead of dozens or -hundreds experts leading to a 5x smaller model and thus increase the training speed moderately while increasing the -memory requirements moderately as well. 
+### Optimizers -Most related papers and implementations are built around Tensorflow/TPUs: +### Data preloading -- [GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding](https://arxiv.org/abs/2006.16668) -- [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) -- [GLaM: Generalist Language Model (GLaM)](https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html) +## PyTorch -And for Pytorch DeepSpeed has built one as well: [DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale](https://arxiv.org/abs/2201.05596), [Mixture of Experts](https://www.deepspeed.ai/tutorials/mixture-of-experts/) - blog posts: [1](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/), [2](https://www.microsoft.com/en-us/research/publication/scalable-and-efficient-moe-training-for-multitask-multilingual-models/) and specific deployment with large transformer-based natural language generation models: [blog post](https://www.deepspeed.ai/2021/12/09/deepspeed-moe-nlg.html), [Megatron-Deepspeed branch](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training). +### torch.empty_cache_steps -## Using PyTorch native attention and Flash Attention +### torch.compile -PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers and is used by default for `torch>=2.1.1` when an implementation is available. Please refer to [PyTorch scaled dot product attention](https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) for a list of supported models and more details. +### PyTorch scaled dot production attention -Check out this [blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about acceleration and memory-savings with SDPA. +## PEFT From 2b849cfaf9116010a04ba119cd17cbe8ed31d726 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 10 Dec 2024 09:43:23 -0800 Subject: [PATCH 073/116] gpu training 2 --- docs/source/en/perf_train_gpu_one.md | 210 ++++++++++++++++++++++++++- 1 file changed, 209 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index 514516ac1600..888995898ab9 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -99,18 +99,226 @@ args = TrainingArguments( ### Mixed precision -Mixed precision accelerates training speed by performing some calculations in half-precision and some in full-precision. The half-precision calculations boosts training speed because it's not as computationally expensive as performing the calculations in full-precision. Preserving some of the calculations in full-precision maintains accuracy. +Mixed precision accelerates training speed by performing some calculations in half-precision (fp16) and some in full-precision (fp32). The half-precision calculations boosts training speed because it's not as computationally expensive as performing the calculations in full-precision. Meanwhile, preserving some of the calculations in full-precision maintains accuracy. + +There are several data types available for mixed precision training. 
+
+
+
+The main advantage of mixed precision training is saving the activations in fp16.
+
+Configure [`~TrainingArguments.fp16`] in [`TrainingArguments`] to enable mixed precision training with the fp16 data type.
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    fp16=True,
+)
+```
+
+fp16 isn't memory-optimized because the gradients that are computed in fp16 are converted back to fp32 during the optimization step. You may end up using more GPU memory, especially for small batch sizes, because there are now two versions (fp16 and fp32) of the model on the GPU.
+
+
+
+
+[bf16](https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus) trades off some precision for a much larger dynamic range, which is helpful for avoiding overflow and underflow errors. You can use bf16 without adding any loss scaling methods like you would with fp16. bf16 is supported by NVIDIA's Ampere architecture or newer.
+
+Configure [`~TrainingArguments.bf16`] in [`TrainingArguments`] to enable mixed precision training with the bf16 data type.
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    bf16=True,
+)
+```
+
+
+
+
+[tf32](https://blogs.nvidia.com/blog/tensorfloat-32-precision-format/) is a mode on NVIDIA Ampere GPUs that converts the convolution and matrix multiplication inputs to tf32. All other storage and operations are kept in fp32. This allows tf32 to maintain the same range as fp32, the same precision as fp16 and more precision than bf16. Combining tf32 with fp16 or bf16 mixed precision training can improve throughput by 16x.
+
+tf32 is enabled by default on NVIDIA Ampere GPUs, but you can also add the code below to your fp32 training or inference code to explicitly enable it.
+
+```py
+import torch
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+```
+
+Configure [`~TrainingArguments.tf32`] in [`TrainingArguments`] to enable mixed precision training with tf32 mode.
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    bf16=True,
+    tf32=True,
+)
+```
+
+
+
 ### Optimizers
 
+Transformers implements the [AdamW (adamw_torch)](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch by default. But because it stores a weighted average of past gradients, it requires additional memory proportional to the number of model parameters to store the past gradients. This can be an issue when training very large models, and in such cases, you should consider choosing a different optimizer. For example, if you have [Apex](https://nvidia.github.io/apex/index.html) installed on either [NVIDIA](https://github.com/NVIDIA/apex) or [AMD](https://github.com/ROCm/apex), then using the `adamw_apex_fused` optimizer provides the fastest training among all AdamW optimizers.
+
+Configure [`~TrainingArguments.optim`] in [`TrainingArguments`] to choose an optimizer. 
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    bf16=True,
+    optim="adamw_bnb_8bit",
+)
+```
+
+There are many optimizers to choose from (refer to [OptimizerNames](https://github.com/huggingface/transformers/blob/34f4080ff59b1668d919a1ba9f8bc4a3a2a3f478/src/transformers/training_args.py#L145) for a full supported list) depending on your training scenario. For example, Adafactor can significantly reduce memory requirements by storing a weighted average of a row or column instead of each element in the matrix at the cost of slower convergence. Another example is using an [8-bit AdamW optimizer](https://huggingface.co/docs/bitsandbytes) from bitsandbytes to quantize optimizer states. The optimizer state is stored in a lower precision and dequantized before being used in the optimizer step.
+
+Refer to the [optimizer](./optimizers) guide to learn about more specialized optimizers.
+
 ### Data preloading
 
+Data preloading loads and prepares batches of data in advance on the CPU to ensure the GPU is continuously working, reducing GPU idling and increasing utilization. There are two ways to preload data.
+
+1. Allocate pinned memory on the CPU to store the data and transfer it directly to the GPU.
+2. Increase the number of CPU threads or workers to preload the data faster.
+
+Configure [`~TrainingArguments.dataloader_pin_memory`] and [`~TrainingArguments.dataloader_num_workers`] in [`TrainingArguments`] to allocate pinned memory and increase the number of workers.
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    bf16=True,
+    optim="adamw_bnb_8bit",
+    dataloader_pin_memory=True,
+    dataloader_num_workers=4,
+)
+```
+
 ## PyTorch
 
+PyTorch provides several features for reducing memory requirements and increasing training speed. These features can often be enabled in Transformers by only adding a few lines of code.
+
 ### torch.empty_cache_steps
 
+The [torch.cuda.empty_cache](https://pytorch.org/docs/stable/generated/torch.cuda.empty_cache.html#torch.cuda.empty_cache) function releases unused cached memory, which can help avoid out-of-memory (OOM) errors at the cost of ~10% slower training.
+
+Configure [`~TrainingArguments.torch_empty_cache_steps`] in [`TrainingArguments`] to call torch.cuda.empty_cache after a certain number of training steps.
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    bf16=True,
+    optim="adamw_bnb_8bit",
+    dataloader_pin_memory=True,
+    dataloader_num_workers=4,
+    torch_empty_cache_steps=4,
+)
+```
+
 ### torch.compile
 
+[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) compiles PyTorch code into optimized kernels that significantly speed up training. This feature relies on TorchDynamo to capture PyTorch graphs with the Frame Evaluation API. The graph can be further compiled into optimized kernels for different backends.
+
+Configure [`~TrainingArguments.torch_compile`] in [`TrainingArguments`] to enable it, and configure [`~TrainingArguments.torch_compile_backend`] to select a backend to use. 
+
+```py
+from transformers import TrainingArguments
+
+args = TrainingArguments(
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=16,
+    gradient_checkpointing=True,
+    bf16=True,
+    optim="adamw_bnb_8bit",
+    dataloader_pin_memory=True,
+    dataloader_num_workers=4,
+    torch_empty_cache_steps=4,
+    torch_compile=True,
+    torch_compile_backend="inductor"
+)
+```
+
+Refer to the table below to help you choose the right backend for your training scenario.
+
+| backend | description | goal |
+|---|---|---|
+| eager | uses PyTorch to run extracted GraphModule | debugging |
+| aot_eager | uses PyTorch eager mode for AOTAutograd's extracted forward and backward graphs | debugging |
+| inductor | uses TorchInductor with AOTAutograd and CUDA Graphs by leveraging Triton kernels | training and inference |
+| nvfuser | uses nvFuser with TorchScript | training and inference |
+| aot_nvfuser | uses nvFuser with AOTAutograd | training and inference |
+| aot_cudagraphs | uses CUDA Graphs with AOTAutograd | training and inference |
+| ofi | uses TorchScript's [optimize_for_inference](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html#torch-jit-optimize-for-inference) | inference |
+| fx2trt | uses [Torch-TensorRT](https://pytorch.org/TensorRT/tutorials/getting_started_with_fx_path.html) | inference |
+| onnxrt | uses [ONNX-RT](https://onnxruntime.ai/) for CPU and GPU inference | inference |
+| ipex | uses [IPEX](https://github.com/intel/intel-extension-for-pytorch) for CPU inference | inference |
+
 ### PyTorch scaled dot production attention
 
+PyTorch's [torch.nn.functional.scaled_dot_product_attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) is a native implementation of the scaled dot product attention mechanism. SDPA is more efficient and optimized than the original attention mechanism in transformer models. It supports three types of scaled dot product attention.
+
+- [FlashAttention2](https://github.com/Dao-AILab/flash-attention) is automatically enabled for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate type first.
+- [xFormers](https://github.com/facebookresearch/xformers) or Memory-Efficient Attention supports models with the fp32 torch type.
+- C++ implementation of scaled dot product attention.
+
+SDPA is enabled by default for PyTorch 2.1.1+, but it can be explicitly enabled by setting `attn_implementation="sdpa"` in [`~PreTrainedModel.from_pretrained`].
+
+```py
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", attn_implementation="sdpa")
+```
+
 ## PEFT
 
+[PEFT](https://huggingface.co/docs/peft/index), a library of parameter-efficient finetuning methods, enables training and storing large models, often on consumer GPUs, by only finetuning a small number of extra model parameters on top of the pretrained model. A significant amount of memory is saved because the GPU doesn't need to store the optimizer states and gradients for the pretrained base model.
+
+[Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a very common PEFT method that decomposes the weight matrix into two smaller trainable matrices. Refer to the PEFT [Quicktour](https://huggingface.co/docs/peft/quicktour) for more details, but the example below demonstrates how to create a LoRA adapter for training. 
+ +```py +from peft import LoraConfig, TaskType, get_peft_model +from transformers import AutoModelForCausalLM + +# create LoRA configuration object +peft_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, # type of task to train on + inference_mode=False, # set to False for training + r=8, # dimension of the smaller matrices + lora_alpha=32, # scaling factor + lora_dropout=0.1 # dropout of LoRA layers +) + +# create a LoRA adapter +model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b") +model = get_peft_model(model, peft_config) +# print the number of parameters you're actually training +model.print_trainable_parameters +``` + +The model is ready to be passed to [`Trainer`] for training. From 6d262440ded77552eca80452ff8d04b914388d9f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 10 Dec 2024 15:12:34 -0800 Subject: [PATCH 074/116] peft --- docs/source/en/_toctree.yml | 8 +- docs/source/en/main_classes/peft.md | 23 +++ docs/source/en/peft.md | 239 ++++++++------------------- docs/source/en/perf_train_gpu_one.md | 44 +---- docs/source/en/perf_train_tpu_tf.md | 2 +- 5 files changed, 105 insertions(+), 211 deletions(-) create mode 100644 docs/source/en/main_classes/peft.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index dcf4cdfc02d8..094e5f2ee44c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -142,15 +142,13 @@ - local: perf_train_cpu title: CPU - local: perf_train_tpu_tf - title: TPU wih TensorFlow + title: TPU - local: perf_train_special title: Apple Silicon - local: perf_hardware title: Build your own machine - local: peft - title: Load and train adapters with 🤗 PEFT - - local: sagemaker - title: Run training on Amazon SageMaker + title: PEFT - local: debugging title: Debugging - local: model_memory_anatomy @@ -317,6 +315,8 @@ title: Optimization - local: main_classes/output title: Model outputs + - local: main_classes/peft + title: PEFT - local: main_classes/pipelines title: Pipelines - local: main_classes/processors diff --git a/docs/source/en/main_classes/peft.md b/docs/source/en/main_classes/peft.md new file mode 100644 index 000000000000..85790f120ebf --- /dev/null +++ b/docs/source/en/main_classes/peft.md @@ -0,0 +1,23 @@ + + +# PEFT + +The [`~integrations.PeftAdapterMixin`] provides functions from the [PEFT](https://huggingface.co/docs/peft/index) library for managing adapters with Transformers. This mixin currently supports LoRA, IA3, and AdaLora. Prefix tuning methods (prompt tuning, prompt learning) aren't supported because they can't be injected into a torch module. + +[[autodoc]] integrations.PeftAdapterMixin + - load_adapter + - add_adapter + - set_adapter + - disable_adapters + - enable_adapters + - active_adapters + - get_adapter_state_dict diff --git a/docs/source/en/peft.md b/docs/source/en/peft.md index e1777114dbcf..e69cca412528 100644 --- a/docs/source/en/peft.md +++ b/docs/source/en/peft.md @@ -1,4 +1,4 @@ - -# Load adapters with 🤗 PEFT +# PEFT [[open-in-colab]] -[Parameter-Efficient Fine Tuning (PEFT)](https://huggingface.co/blog/peft) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. The adapters are trained to learn task-specific information. This approach has been shown to be very memory-efficient with lower compute usage while producing results comparable to a fully fine-tuned model. 
+[PEFT](https://huggingface.co/docs/peft/index), a library of parameter-efficient finetuning methods, enables training and storing large models, often on consumer GPUs. These methods only finetune a small number of extra model parameters, also known as adapters, on top of the pretrained model. A significant amount of memory is saved because the GPU doesn't need to store the optimizer states and gradients for the pretrained base model. Adapters are very lightweight, making it convenient to share, store, and load them.
+
+This guide provides a short overview of the PEFT library and how to use it for training with Transformers. For more details, refer to the PEFT [documentation](https://huggingface.co/docs/peft/index).

-Adapters trained with PEFT are also usually an order of magnitude smaller than the full model, making it convenient to share, store, and load them.
+Install PEFT with the command below.

-
- -
The adapter weights for a OPTForCausalLM model stored on the Hub are only ~6MB compared to the full size of the model weights, which can be ~700MB.
-
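To make the size claim above concrete, the following sketch (which assumes the PEFT library is installed; the base model and LoRA settings are arbitrary illustrations rather than values taken from this guide) attaches a LoRA adapter and prints how few parameters actually train.

```py
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

# arbitrary small base model, chosen only to illustrate the size difference
base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

# attach a LoRA adapter; only the small adapter matrices are trainable
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.1)
peft_model = get_peft_model(base_model, peft_config)

# reports trainable vs. total parameters, typically well under 1% trainable
peft_model.print_trainable_parameters()
```

Saving this model afterwards writes only the adapter weights, which is why the checkpoint mentioned in the caption above is a few megabytes rather than several hundred.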
+Install PEFT with the command below. -If you're interested in learning more about the 🤗 PEFT library, check out the [documentation](https://huggingface.co/docs/peft/index). - -## Setup - -Get started by installing 🤗 PEFT: + + ```bash -pip install peft +pip install -U peft ``` -If you want to try out the brand new features, you might be interested in installing the library from source: + + ```bash pip install git+https://github.com/huggingface/peft.git ``` -## Supported PEFT models - -🤗 Transformers natively supports some PEFT methods, meaning you can load adapter weights stored locally or on the Hub and easily run or train them with a few lines of code. The following methods are supported: - -- [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora) -- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3) -- [AdaLoRA](https://arxiv.org/abs/2303.10512) - -If you want to use other PEFT methods, such as prompt learning or prompt tuning, or learn about the 🤗 PEFT library in general, please refer to the [documentation](https://huggingface.co/docs/peft/index). + + +> [!TIP] +> PEFT currently supports the LoRA, IA3, and AdaLoRA methods for Transformers. To use another PEFT method, such as prompt learniing or prompt tuning, you'll need to use the PEFT library directly. -## Load a PEFT adapter - -To load and use a PEFT adapter model from 🤗 Transformers, make sure the Hub repository or local directory contains an `adapter_config.json` file and the adapter weights, as shown in the example image above. Then you can load the PEFT adapter model using the `AutoModelFor` class. For example, to load a PEFT adapter model for causal language modeling: - -1. specify the PEFT model id -2. pass it to the [`AutoModelForCausalLM`] class +[Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a very common PEFT method that decomposes the weight matrix into two smaller trainable matrices. Start by defining a [`~peft.LoraConfig`] object with the parameters shown below. ```py -from transformers import AutoModelForCausalLM, AutoTokenizer - -peft_model_id = "ybelkada/opt-350m-lora" -model = AutoModelForCausalLM.from_pretrained(peft_model_id) -``` +from peft import LoraConfig, TaskType, get_peft_model +from transformers import AutoModelForCausalLM - - -You can load a PEFT adapter with either an `AutoModelFor` class or the base model class like `OPTForCausalLM` or `LlamaForCausalLM`. - - - -You can also load a PEFT adapter by calling the `load_adapter` method: - -```py -from transformers import AutoModelForCausalLM, AutoTokenizer - -model_id = "facebook/opt-350m" -peft_model_id = "ybelkada/opt-350m-lora" - -model = AutoModelForCausalLM.from_pretrained(model_id) -model.load_adapter(peft_model_id) +# create LoRA configuration object +lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, # type of task to train on + inference_mode=False, # set to False for training + r=8, # dimension of the smaller matrices + lora_alpha=32, # scaling factor + lora_dropout=0.1 # dropout of LoRA layers +) ``` -Check out the [API documentation](#transformers.integrations.PeftAdapterMixin) section below for more details. - -## Load in 8bit or 4bit - -The `bitsandbytes` integration supports 8bit and 4bit precision data types, which are useful for loading large models because it saves memory (see the `bitsandbytes` integration [guide](./quantization#bitsandbytes-integration) to learn more). 
Add the `load_in_8bit` or `load_in_4bit` parameters to [`~PreTrainedModel.from_pretrained`] and set `device_map="auto"` to effectively distribute the model to your hardware: +Add [`~peft.LoraConfig`] to the model with [`~integrations.PeftAdapterMixin.add_adapter`]. The model is ready to be passed to [`Trainer`] for training. ```py -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig - -peft_model_id = "ybelkada/opt-350m-lora" -model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model.add_adapter(lora_config, adapter_name="lora_1") +trainer = Trainer(model=model, ...) +trainer.train() ``` -## Add a new adapter +To add an additional trainable adapter on top of a model with an existing adapter attached, specify the modules you want to train in [modules_to_save()](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig.modules_to_save). -You can use [`~peft.PeftModel.add_adapter`] to add a new adapter to a model with an existing adapter as long as the new adapter is the same type as the current one. For example, if you have an existing LoRA adapter attached to a model: +For example, to train the `lm_head` module on top of a causal language model with a LoRA adapter attached, set `modules_to_save=["lm_head"]`. Add the adapter to the model as shown below, and then pass it to [`Trainer`]. ```py -from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM from peft import LoraConfig -model_id = "facebook/opt-350m" -model = AutoModelForCausalLM.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b") lora_config = LoraConfig( target_modules=["q_proj", "k_proj"], - init_lora_weights=False + modules_to_save=["lm_head"], ) -model.add_adapter(lora_config, adapter_name="adapter_1") -``` - -To add a new adapter: - -```py -# attach new adapter with same config -model.add_adapter(lora_config, adapter_name="adapter_2") +model.add_adapter(lora_config) +trainer = Trainer(model=model, ...) +trainer.train() ``` -Now you can use [`~peft.PeftModel.set_adapter`] to set which adapter to use: +Save your adapter with [`~PreTrainedModel.save_pretrained`] to reuse it. -```py -# use adapter_1 -model.set_adapter("adapter_1") -output_disabled = model.generate(**inputs) -print(tokenizer.decode(output_disabled[0], skip_special_tokens=True)) - -# use adapter_2 -model.set_adapter("adapter_2") -output_enabled = model.generate(**inputs) -print(tokenizer.decode(output_enabled[0], skip_special_tokens=True)) -``` +## Load adapter -## Enable and disable adapters +To load an adapter with Transformers, the Hub repository or local directory must contain an `adapter_config.json` file and the adapter weights. Load the adapter with [`~PreTrainedModel.from_pretrained`] or with [`~integrations.PeftAdapterMixin.load_adapter`]. -Once you've added an adapter to a model, you can enable or disable the adapter module. 
To enable the adapter module: + + ```py -from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer -from peft import PeftConfig +from transformers import AutoModelForCausalLM -model_id = "facebook/opt-350m" -adapter_model_id = "ybelkada/opt-350m-lora" -tokenizer = AutoTokenizer.from_pretrained(model_id) -text = "Hello" -inputs = tokenizer(text, return_tensors="pt") - -model = AutoModelForCausalLM.from_pretrained(model_id) -peft_config = PeftConfig.from_pretrained(adapter_model_id) - -# to initiate with random weights -peft_config.init_lora_weights = False - -model.add_adapter(peft_config) -model.enable_adapters() -output = model.generate(**inputs) +model = AutoModelForCausalLM.from_pretrained("klcsp/gemma7b-lora-alpaca-11-v1") ``` -To disable the adapter module: + + ```py -model.disable_adapters() -output = model.generate(**inputs) -``` - -## Train a PEFT adapter - -PEFT adapters are supported by the [`Trainer`] class so that you can train an adapter for your specific use case. It only requires adding a few more lines of code. For example, to train a LoRA adapter: - - +from transformers import AutoModelForCausalLM -If you aren't familiar with fine-tuning a model with [`Trainer`], take a look at the [Fine-tune a pretrained model](training) tutorial. +model = AutoModelForCausalLM.from_pretrained("google/gemma-7b") +model.load_adapter("klcsp/gemma7b-lora-alpaca-11-v1") +``` - +For very large models, it is helpful to load a quantized version of the model in 8 or 4-bit precision to save memory. Transformers supports quantization with its [bitsandbytes](https://huggingface.co/docs/bitsandbytes/index) integration. Specify in [`BitsAndBytesConfig`] whether you want to load a model in 8 or 4-bit precision. -1. Define your adapter configuration with the task type and hyperparameters (see [`~peft.LoraConfig`] for more details about what the hyperparameters do). +For multiple devices, add `device_map="auto"` to automatically distribute the model across your hardware. ```py -from peft import LoraConfig +from transformers import AutoModelForCausalLM, BitsAndBytesConfig -peft_config = LoraConfig( - lora_alpha=16, - lora_dropout=0.1, - r=64, - bias="none", - task_type="CAUSAL_LM", +model = AutoModelForCausalLM.from_pretrained( + "klcsp/gemma7b-lora-alpaca-11-v1", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", ) ``` -2. Add adapter to the model. - -```py -model.add_adapter(peft_config) -``` +## Set adapter -3. Now you can pass the model to [`Trainer`]! +[`~integrations.PeftAdapterMixin.add_adapter`] adds a new adapter to a model. To add a second adapter, the new adapter must be the same type as the first adapter. Use the [`~integrations.PeftAdapterMixin.add_adapter.adapter_name`] parameter to assign a name to the adapter. ```py -trainer = Trainer(model=model, ...) -trainer.train() +model.add_adapter(lora_config, adapter_name="lora_2") ``` -To save your trained adapter and load it back: +Once added, use [`~integrations.PeftAdapterMixin.set_adapter`] to force a model to use the specified adapter and disable the other adapters. ```py -model.save_pretrained(save_dir) -model = AutoModelForCausalLM.from_pretrained(save_dir) +model.set_adapter("lora_2") ``` -## Add additional trainable layers to a PEFT adapter +## Enable and disable adapter -You can also fine-tune additional trainable adapters on top of a model that has adapters attached by passing `modules_to_save` in your PEFT config. 
For example, if you want to also fine-tune the lm_head on top of a model with a LoRA adapter: +[`~integrations.PeftAdapterMixin.enable_adapters`] is a broader function that enables *all* adapters attached to a model, and [`~integrations.PeftAdapterMixin.disable_adapters`] disables *all* attached adapters. ```py -from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer -from peft import LoraConfig - -model_id = "facebook/opt-350m" -model = AutoModelForCausalLM.from_pretrained(model_id) - -lora_config = LoraConfig( - target_modules=["q_proj", "k_proj"], - modules_to_save=["lm_head"], -) +model.add_adapter(lora_1) +model.add_adapter(lora_2) +model.enable_adapters() -model.add_adapter(lora_config) +# disable all adapters +model.disable_adapters() ``` - -## API docs - -[[autodoc]] integrations.PeftAdapterMixin - - load_adapter - - add_adapter - - set_adapter - - disable_adapters - - enable_adapters - - active_adapters - - get_adapter_state_dict - - - - - diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index 888995898ab9..12e4c27d3679 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -154,7 +154,7 @@ torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True ``` -Configure [`~TrainingArguments.tf32`] in [`TrainingArguments`] to enable mixed precision training with tf32 mode. +Configure [tf32()](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.tf32) in [`TrainingArguments`] to enable mixed precision training with tf32 mode. ```py from transformers import TrainingArguments @@ -185,7 +185,7 @@ args = TrainingArguments( gradient_accumulation_steps=16, gradient_checkpointing=True, bf16=True, - optim="adamw_bnb_8bit + optim="adamw_bnb_8bit" ) ``` @@ -210,7 +210,7 @@ args = TrainingArguments( gradient_accumulation_steps=16, gradient_checkpointing=True, bf16=True, - optim="adamw_bnb_8bit, + optim="adamw_bnb_8bit", dataloader_pin_memory=True, dataloader_num_workers=4, ) @@ -224,7 +224,7 @@ PyTorch provides several features for reducing memory requirements and increasin The [torch.cuda.empty_cache](https://pytorch.org/docs/stable/generated/torch.cuda.empty_cache.html#torch.cuda.empty_cache) function releases unused cached memory, which can help avoid out-of-memory (OOM) errors at the cost of ~10% slower training. -Configure [`~TrainingArguments.torch_empty_cache_steps`] in [`TrainingArguments`] to enable torch.empty_cache after a certain number of training steps. +Configure [torch_empty_cache_steps()](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps) in [`TrainingArguments`] to enable torch.empty_cache after a certain number of training steps. ```py from transformers import TrainingArguments @@ -245,7 +245,7 @@ args = TrainingArguments( [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) compiles PyTorch code into optimized kernels that significantly speed up training. This feature relies on TorchDynamo to capture PyTorch graphs with the Frame Evaluation API. The graph can be further compiled into optimized kernels for different backends. -Configure [`~TrainingArguments.torch_compile`] in [`TrainingArguments`] to enable it, and configure [`~TrainingArguments.torch_compile_backend`] to select a backend to use. 
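If you want to smoke-test a compile backend outside of [`Trainer`], a standalone call to `torch.compile` accepts the same backend strings; the sketch below is only an illustration (the model choice is arbitrary and not part of the documented setup).

```py
import torch
from transformers import AutoModelForCausalLM

# arbitrary small model for a quick standalone check of a backend
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# "inductor" is the default backend; swap in another supported string to compare
compiled_model = torch.compile(model, backend="inductor")
```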
+Configure [`~TrainingArguments.torch_compile`] in [`TrainingArguments`] to enable it, and configure [torch_compile_backend()](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.torch_compile_backend) to select a backend to use. ```py from transformers import TrainingArguments @@ -255,7 +255,7 @@ args = TrainingArguments( gradient_accumulation_steps=16, gradient_checkpointing=True, bf16=True, - optim="adamw_bnb_8bit, + optim="adamw_bnb_8bit", dataloader_pin_memory=True, dataloader_num_workers=4, torch_empty_cache_steps=4, @@ -279,9 +279,9 @@ Refer to the table below to help you choose the right backend for your training | onnxrt | uses [ONNX-RT](https://onnxruntime.ai/) for CPU and GPU inference | inference | | ipex | uses [IPEX](https://github.com/intel/intel-extension-for-pytorch) for CPU inference | inference | -### PyTorch scaled dot production attention +### Scaled dot production attention -PyTorch's [torch.nn.functional.scaled_dot_product_attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) is a native implementation of the scaled dot product attention mechanism. SDPA is more efficient and optimized than the original attention mechanism in transformer models. It supports three types of scaled dot product attention. +[torch.nn.functional.scaled_dot_product_attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) is a native PyTorch implementation of the scaled dot product attention mechanism. SDPA is more efficient and optimized than the original attention mechanism in transformer models. It supports three types of scaled dot product attention. - [FlashAttention2](https://github.com/Dao-AILab/flash-attention) is automatically enabled for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate type first. - [xFormers](https://github.com/facebookresearch/xformers) or Memory-Efficient Attention supports models with the fp32 torch type. @@ -294,31 +294,3 @@ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", attn_implementation="sdpa") ``` - -## PEFT - -[PEFT](https://huggingface.co/docs/peft/index), a library of parameter-efficient finetuning methods, enable training and storing large models often on consumer GPUs by only finetuning a small number of extra model parameters on top of the pretrained model. A significant amount of memory is saved because the GPU doesn't need to store the optimizer states and gradients for the pretrained base model. - -[Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a very common PEFT method that decomposes the weight matrix into two smaller trainable matrices. Refer to the PEFT [Quicktour](https://huggingface.co/docs/peft/quicktour) for more details, but the example below demonstrates how to create a LoRA adapter for training. 
- -```py -from peft import LoraConfig, TaskType, get_peft_model -from transformers import AutoModelForCausalLM - -# create LoRA configuration object -peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, # type of task to train on - inference_mode=False, # set to False for training - r=8, # dimension of the smaller matrices - lora_alpha=32, # scaling factor - lora_dropout=0.1 # dropout of LoRA layers -) - -# create a LoRA adapter -model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b") -model = get_peft_model(model, peft_config) -# print the number of parameters you're actually training -model.print_trainable_parameters -``` - -The model is ready to be passed to [`Trainer`] for training. diff --git a/docs/source/en/perf_train_tpu_tf.md b/docs/source/en/perf_train_tpu_tf.md index efdfaf38aeaa..8c64dab189a6 100644 --- a/docs/source/en/perf_train_tpu_tf.md +++ b/docs/source/en/perf_train_tpu_tf.md @@ -13,7 +13,7 @@ rendered properly in your Markdown viewer. --> -# TPU wih TensorFlow +# TPU From 1d1daa263500c54ed76cee401b46fb03464673f7 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 11 Dec 2024 09:36:04 -0800 Subject: [PATCH 075/116] distrib debug --- docs/source/en/_toctree.yml | 5 +- docs/source/en/debugging.md | 256 ++++++++++-------------------------- docs/source/en/sagemaker.md | 28 ---- 3 files changed, 75 insertions(+), 214 deletions(-) delete mode 100644 docs/source/en/sagemaker.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 094e5f2ee44c..499e7e8db5c5 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -133,6 +133,8 @@ title: DeepSpeed - local: perf_train_cpu_many title: Distributed CPUs + - local: debugging + title: Multi-GPU debugging - local: perf_train_gpu_many title: Parallelism methods - title: Hardware-specific training @@ -149,11 +151,10 @@ title: Build your own machine - local: peft title: PEFT - - local: debugging - title: Debugging - local: model_memory_anatomy title: Model training anatomy - title: Quantization + isExpanded: false sections: - local: quantization/overview title: Getting started diff --git a/docs/source/en/debugging.md b/docs/source/en/debugging.md index 76e87f063206..af6f05df10c4 100644 --- a/docs/source/en/debugging.md +++ b/docs/source/en/debugging.md @@ -1,4 +1,4 @@ - -# Debugging +# Multi-GPU debugging -Training on multiple GPUs can be a tricky endeavor whether you're running into installation issues or communication problems between your GPUs. This debugging guide covers some issues you may run into and how to resolve them. +Distributed training can be tricky because you have to ensure you're using the correct CUDA version across your system, you may encounter inter-communication issues between GPUs, and there may be underflow or overflow problems in your model. This guide covers how to debug these issues, especially as it relates to DeepSpeed and PyTorch. -## DeepSpeed CUDA installation +## DeepSpeed CUDA issues -If you're using DeepSpeed, you've probably already installed it with the following command. +DeepSpeed compiles CUDA C++ which can be a potential source of errors when building PyTorch extensions that require CUDA. These errors depend on how CUDA is installed on your system. This section focuses on PyTorch built with *CUDA 10.2* ```bash pip install deepspeed ``` -DeepSpeed compiles CUDA C++ code and it can be a potential source of errors when building PyTorch extensions that require CUDA. 
These errors depend on how CUDA is installed on your system, and this section focuses on PyTorch built with *CUDA 10.2*. - - - -For any other installation issues, please [open an issue](https://github.com/deepspeedai/DeepSpeed/issues) with the DeepSpeed team. - - +> [!TIP] +> For any other installation issues, please [open an issue](https://github.com/microsoft/DeepSpeed/issues) with the DeepSpeed team. ### Non-identical CUDA toolkits -PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed system-wide. If you don't have CUDA installed system-wide, you should install it first. +PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere. -The exact location may vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly setup and added to your `PATH` environment variable, you can find the installation location with the following command: +The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly setup and added to your `PATH` environment variable, you can find the installation location with the following command. ```bash which nvcc @@ -46,23 +41,23 @@ which nvcc ### Multiple CUDA toolkits -You may also have more than one CUDA toolkit installed system-wide. +You may also have more than one CUDA toolkit installed on your system. ```bash /usr/local/cuda-10.2 /usr/local/cuda-11.0 ``` -Typically, package installers set the paths to whatever the last version was installed. If the package build fails because it can't find the right CUDA version (despite it being installed system-wide already), then you need to configure the `PATH` and `LD_LIBRARY_PATH` environment variables to point to the correct path. +Typically, package installers set the paths to whatever the last version was installed. If the package build fails because it can't find the right CUDA version (despite it being installed already), then you need to configure the `PATH` and `LD_LIBRARY_PATH` environment variables to point to the correct path. -Take a look at the contents of these environment variables first: +Take a look at the contents of the following environment variables first. ```bash echo $PATH echo $LD_LIBRARY_PATH ``` -`PATH` lists the locations of the executables and `LD_LIBRARY_PATH` lists where to look for shared libraries. Earlier entries are prioritized over later ones, and `:` is used to separate multiple entries. To tell the build program where to find the specific CUDA toolkit you want, insert the correct path to list first. This command prepends rather than overwrites the existing values. +`PATH` lists the locations of the executables and `LD_LIBRARY_PATH` lists where to look for shared libraries. Earlier entries are prioritized over later ones, and `:` is used to separate multiple entries. To find a specific CUDA toolkit, insert the correct path to list first. This command prepends rather than overwrites the existing values. 
```bash # adjust the version and full path if needed @@ -70,23 +65,23 @@ export PATH=/usr/local/cuda-10.2/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH ``` -In addition, you should also check the directories you assign actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`) and while it is unlikely your system names them differently, you should check the actual names and change them accordingly. +In addition, you should also check that the assigned directories you actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`), and while it is unlikely your system names them differently, you should check the actual names and change them accordingly. ### Older CUDA versions Sometimes, older CUDA versions may refuse to build with newer compilers. For example, if you have `gcc-9` but CUDA wants `gcc-7`. Usually, installing the latest CUDA toolkit enables support for the newer compiler. -You could also install an older version of the compiler in addition to the one you're currently using (or it may already be installed but it's not used by default and the build system can't see it). To resolve this, you can create a symlink to give the build system visibility to the older compiler. +You could also install an older version of the compiler in addition to the one you're currently using (or it may already be installed but it's not used by default and the build system can't see it). To resolve this, create a symlink to give the build system visibility to the older compiler. ```bash -# adapt the path to your system +# adjust the path to your system sudo ln -s /usr/bin/gcc-7 /usr/local/cuda-10.2/bin/gcc sudo ln -s /usr/bin/g++-7 /usr/local/cuda-10.2/bin/g++ ``` ### Prebuild -If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, you can try to prebuild the DeepSpeed modules before installing them. To make a local build for DeepSpeed: +If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, try to prebuild the DeepSpeed modules before installing them. Run the commands below to make a local build for DeepSpeed. ```bash git clone https://github.com/deepspeedai/DeepSpeed/ @@ -97,19 +92,16 @@ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \ --disable-pip-version-check 2>&1 | tee build.log ``` - - -To use NVMe offload, add the `DS_BUILD_AIO=1` parameter to the build command and make sure you install the libaio-dev package system-wide. - - +> [TIP] +> Add the `DS_BUILD_AIO=1` parameter to the build command to use NVMe offload. Make sure you install the libaio-dev package across your system. -Next, you'll have to specify your GPU's architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command: +Next, specify your GPU's architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command. 
```bash python -c "import torch; print(torch.cuda.get_arch_list())" ``` -Find the architecture for a GPU with the following command: +Find the architecture for a GPU with the following command. @@ -121,7 +113,7 @@ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capa -To find the architecture for GPU `0`: +Run the following command to find the architecture for GPU `0`. The results will show a value for `major` and `minor`, which is your GPU architecture. The GPU architecture below is `8.6`. ```bash CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ @@ -129,8 +121,6 @@ print(torch.cuda.get_device_properties(torch.device('cuda'))) "_CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82)" ``` -This means your GPU architecture is `8.6`. - @@ -138,7 +128,7 @@ If you get `8, 6`, then you can set `TORCH_CUDA_ARCH_LIST="8.6"`. For multiple G It is also possible to not specify `TORCH_CUDA_ARCH_LIST` and the build program automatically queries the GPU architecture of the build. However, it may or may not match the actual GPU on the target machine which is why it is better to explicitly specify the correct architecture. -For training on multiple machines with the same setup, you'll need to make a binary wheel: +For training on multiple machines with the same setup, you'll need to make a binary wheel as shown below. ```bash git clone https://github.com/deepspeedai/DeepSpeed/ @@ -148,88 +138,64 @@ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \ python setup.py build_ext -j8 bdist_wheel ``` -This command generates a binary wheel that'll look something like `dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`. Now you can install this wheel locally or on another machine. +This command generates a binary wheel that'll look something like `dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`. Install this wheel locally or on another machine. ```bash pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl ``` -## Multi-GPU Network Issues Debug +## Communication issues -When training or inferencing with `DistributedDataParallel` and multiple GPU, if you run into issue of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues. +Distributed training involves communication between processes and or nodes and this can be a potential source of errors. -```bash -wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py -``` - -For example to test how 2 GPUs interact do: +Download the script below to diagnose network issues, and then run it to test GPU communication. The example command below tests how two GPUs communicate. Adjust the `--nproc_per_node` and `--nnodes` parameters to adapt it to your system. ```bash +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py ``` -If both processes can talk to each and allocate GPU memory each will print an OK status. -For more GPUs or nodes adjust the arguments in the script. +The script prints an `OK` status if both GPUs are able to communicate and allocate memory. Take a closer look at the diagnostic script for more details and a recipe for running it in a SLURM environment. 
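As a stripped-down stand-in for what the diagnostic script checks, the sketch below (the file name and launch command are placeholders) has each rank join an NCCL process group and run a single all-reduce, which only completes if the GPUs can reach each other.

```py
# save as gpu_comm_check.py and launch with, for example:
# torchrun --nproc_per_node 2 gpu_comm_check.py
import os

import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

# every rank contributes its rank id; the sum is only correct if communication works
payload = torch.tensor([float(dist.get_rank())], device="cuda")
dist.all_reduce(payload, op=dist.ReduceOp.SUM)
print(f"rank {dist.get_rank()}: OK, all_reduce sum = {payload.item()}")

dist.destroy_process_group()
```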
-You will find a lot more details inside the diagnostics script and even a recipe to how you could run it in a SLURM environment. - -An additional level of debug is to add `NCCL_DEBUG=INFO` environment variable as follows: +Add the `NCCL_DEBUG=INFO` environment variable to report more NCCL-related debugging information. ```bash NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py ``` -This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. Or if you're not sure how to interpret the output you can share the log file in an Issue. - - - -## Underflow and Overflow Detection - - - -This feature is currently available for PyTorch-only. +## Underflow and overflow detection - +Underflow and overflow can occur when activations or weights are `inf`, `nan`, and when `loss=NaN`. This may indicate an underflow or overflow issue. To detect these issues, activate the `DebugUnderflowOverflow` module in [`TrainingArguments.debug`] or import and add the module to your own training loop or another trainer class. - + + -For multi-GPU training it requires DDP (`torch.distributed.launch`). +```py +from transformers import TrainingArguments - - - - -This feature can be used with any `nn.Module`-based model. - - - -If you start getting `loss=NaN` or the model exhibits some other abnormal behavior due to `inf` or `nan` in -activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily -you can accomplish that easily by activating a special module that will do the detection automatically. - -If you're using [`Trainer`], you just need to add: - -```bash ---debug underflow_overflow +args = TrainingArguments( + debug="underflow_overflow", + ... +) ``` -to the normal command line arguments, or pass `debug="underflow_overflow"` when creating the -[`TrainingArguments`] object. - -If you're using your own training loop or another Trainer you can accomplish the same with: + + -```python +```py from transformers.debug_utils import DebugUnderflowOverflow debug_overflow = DebugUnderflowOverflow(model) ``` -[`~debug_utils.DebugUnderflowOverflow`] inserts hooks into the model that immediately after each -forward call will test input and output variables and also the corresponding module's weights. As soon as `inf` or -`nan` is detected in at least one element of the activations or weights, the program will assert and print a report -like this (this was caught with `google/mt5-small` under fp16 mixed precision): + + -``` +The [`~debug_utils.DebugUnderflowOverflow`] module inserts hooks into the model to test the input and output variables and the corresponding model weights after each forward call. If `inf` or `nan` is detected in at least one element of the activations or weights, the module prints a report like the one shown below. + +The example below is for fp16 mixed precision training with a [google/mt5-small](https://huggingface.co/google/mt5-small). + +```shell Detected inf/nan during batch_number=0 Last 21 forward frames: abs min abs max metadata @@ -269,48 +235,20 @@ abs min abs max metadata 0.00e+00 inf output ``` -The example output has been trimmed in the middle for brevity. - -The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames, -the inputs and outputs were in the range of `1e4`. 
So when this training was done under fp16 mixed precision the very -last step overflowed (since under `fp16` the largest number before `inf` is `64e3`). To avoid overflows under -`fp16` the activations must remain way below `1e4`, because `1e4 * 1e4 = 1e8` so any matrix multiplication with -large activations is going to lead to a numerical overflow condition. - -At the very start of the trace you can discover at which batch number the problem occurred (here `Detected inf/nan during batch_number=0` means the problem occurred on the first batch). +At the start of the report, you can see which batch number the error occurred. In this case, it occurred on the first batch. -Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting -for. If we look just at this frame: +Each frame describes the module it is reporting on. For example, the frame below inspected `encoder.block.2.layer.1.layer_norm`. This indicates the layer norm in the first layer of the second block of the encoder. The forward calls are to `T5LayerNorm`. -``` +```shell encoder.block.2.layer.1.layer_norm T5LayerNorm 8.69e-02 4.18e-01 weight 2.65e-04 3.42e+03 input[0] 1.79e-06 4.65e+00 output ``` -Here, `encoder.block.2.layer.1.layer_norm` indicates that it was a layer norm for the first layer, of the second -block of the encoder. And the specific calls of the `forward` is `T5LayerNorm`. - -Let's look at the last few frames of that report: +The last frame reports on the `Dropout.forward` function. It called the `dropout` attribute from inside the `DenseReluDense` class. You can observe that the overflow (`inf`) occurred in the first layer of the encoders second block in the first batch. The absolute largest input element was 6.27e+04. -``` -Detected inf/nan during batch_number=0 -Last 21 forward frames: -abs min abs max metadata -[...] - encoder.block.2.layer.1.DenseReluDense.wi_0 Linear -2.17e-07 4.50e+00 weight -1.79e-06 4.65e+00 input[0] -2.68e-06 3.70e+01 output - encoder.block.2.layer.1.DenseReluDense.wi_1 Linear -8.08e-07 2.66e+01 weight -1.79e-06 4.65e+00 input[0] -1.27e-04 2.37e+02 output - encoder.block.2.layer.1.DenseReluDense.wo Linear -1.01e-06 6.44e+00 weight -0.00e+00 9.74e+03 input[0] -3.18e-04 6.27e+04 output +```shell encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense 1.79e-06 4.65e+00 input[0] 3.18e-04 6.27e+04 output @@ -319,22 +257,11 @@ abs min abs max metadata 0.00e+00 inf output ``` -The last frame reports for `Dropout.forward` function with the first entry for the only input and the second for the -only output. You can see that it was called from an attribute `dropout` inside `DenseReluDense` class. We can see -that it happened during the first layer, of the 2nd block, during the very first batch. Finally, the absolute largest -input elements was `6.27e+04` and same for the output was `inf`. - -You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was -around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which renormalizes -the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an -overflow (`inf`). - -As you can see it's the previous frames that we need to look into when the numbers start going into very large for fp16 -numbers. 
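The fp16 ceiling this explanation relies on is easy to verify on its own; the short sketch below (plain PyTorch, independent of the report above) shows an activation around `6.27e+04` tipping over into `inf` once it is pushed past roughly `6.4e+04`.

```py
import torch

# float16 saturates at 65504, a little above 6.4e4
print(torch.finfo(torch.float16).max)

x = torch.tensor([6.27e4], dtype=torch.float16)
print(x)        # still finite, close to the limit
print(x * 1.1)  # exceeds the representable range and becomes inf
```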
+The `T5DenseGatedGeluDense.forward` function output activations had an absolute maximum value of 6.27e+04 which is close to fp16s maximum limit of 6.4e+04. In the next step, `Dropout` renormalizes the weights, after zeroing some elements, which pushes the absolute maximum value to greater than 6.4e+04 resulting in an overflow. -Let's match the report to the code from `models/t5/modeling_t5.py`: +Now that you know where the error is happening, you can investigate the modeling code in [modeling_t5.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py). -```python +```py class T5DenseGatedGeluDense(nn.Module): def __init__(self, config): super().__init__() @@ -353,29 +280,11 @@ class T5DenseGatedGeluDense(nn.Module): return hidden_states ``` -Now it's easy to see the `dropout` call, and all the previous calls as well. - -Since the detection is happening in a forward hook, these reports are printed immediately after each `forward` -returns. - -Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers -started to go up and most likely switch to the `fp32` mode here, so that the numbers don't overflow when multiplied -or summed up. Of course, there might be other solutions. For example, we could turn off `amp` temporarily if it's -enabled, after moving the original `forward` into a helper wrapper, like so: - -```python -def _forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - +One solution is to back a few steps before the values started growing too large and switch to fp32 so the numbers don't overflow when multiplied or summed. Another potential solution is to temporarily disable mixed precision training (`amp`). +```py import torch - def forward(self, hidden_states): if torch.is_autocast_enabled(): with torch.cuda.amp.autocast(enabled=False): @@ -384,14 +293,11 @@ def forward(self, hidden_states): return self._forward(hidden_states) ``` -Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may -want to analyse the intermediary stages of any specific `forward` function as well. In such a case you can use the -`detect_overflow` helper function to inject the detector where you want it, for example: +The report only returns inputs and outputs of full frames, so you may also want to analyze the intermediate values of any `forward` function as well. Add the `detect_overflow` function after the forward calls to track `inf` or `nan` values in the intermediate `forwarded_states`. -```python +```py from debug_utils import detect_overflow - class T5LayerFF(nn.Module): [...] @@ -403,40 +309,25 @@ class T5LayerFF(nn.Module): return hidden_states + self.dropout(forwarded_states) ``` -You can see that we added 2 of these and now we track if `inf` or `nan` for `forwarded_states` was detected -somewhere in between. - -Actually, the detector already reports these because each of the calls in the example above is a `nn.Module`, but -let's say if you had some local direct calculations this is how you'd do that. 
- -Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from -its default, e.g.: +Finally, you can configure the number of frames printed by [`~debug_utils.DebugUnderflowOverflow`]. -```python +```py from transformers.debug_utils import DebugUnderflowOverflow debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) ``` -### Specific batch absolute min and max value tracing +### Batch tracing -The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off. +[`~debug_utils.DebugUnderflowOverflow`] is able to trace the absolute minimum and maximum values in each batch with the underflow and overflow feature disabled. This is useful for identifying where errors are occurring in the model. -Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a given -batch, and only do that for batches 1 and 3. Then you instantiate this class as: +The example below shows how to trace the minimum and maximum values in batches 1 and 3 (batches are zero-indexd). -```python +```py debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) ``` -And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does. - -Batches are 0-indexed. - -This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward -right to that area. Here is a sample truncated output for such configuration: - -``` +```shell *** Starting batch number=1 *** abs min abs max metadata shared Embedding @@ -465,13 +356,10 @@ abs min abs max metadata [...] ``` -Here you will get a huge number of frames dumped - as many as there were forward calls in your model, so it may or may -not what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For example, if -a problem starts happening at batch number 150. So you can dump traces for batches 149 and 150 and compare where -numbers started to diverge. +[`~debug_utils.DebugUnderflowOverflow`] reports on a large number of frames which is easier for debugging. Once you know where a problem is occurring, say batch 150, then you can focus the trace for batches 149 and 150 and compare where the numbers are diverging. -You can also specify the batch number after which to stop the training, with: +It is also possible to abort the trace after a certain batch number, for example, batch 3. -```python +```py debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) ``` diff --git a/docs/source/en/sagemaker.md b/docs/source/en/sagemaker.md deleted file mode 100644 index 41802d9d42b2..000000000000 --- a/docs/source/en/sagemaker.md +++ /dev/null @@ -1,28 +0,0 @@ - - -# Run training on Amazon SageMaker - -The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. 
- -### Table of Contents - -- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) -- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) From 2685ca36665744d6ed37301bee7427259b69ac84 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 11 Dec 2024 15:53:30 -0800 Subject: [PATCH 076/116] deepspeed 1 --- docs/source/en/debugging.md | 4 +- docs/source/en/deepspeed.md | 1049 ++++------------------------------- 2 files changed, 109 insertions(+), 944 deletions(-) diff --git a/docs/source/en/debugging.md b/docs/source/en/debugging.md index af6f05df10c4..07ea2ed59b06 100644 --- a/docs/source/en/debugging.md +++ b/docs/source/en/debugging.md @@ -92,7 +92,7 @@ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \ --disable-pip-version-check 2>&1 | tee build.log ``` -> [TIP] +> [!TIP] > Add the `DS_BUILD_AIO=1` parameter to the build command to use NVMe offload. Make sure you install the libaio-dev package across your system. Next, specify your GPU's architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command. @@ -167,7 +167,7 @@ NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 to Underflow and overflow can occur when activations or weights are `inf`, `nan`, and when `loss=NaN`. This may indicate an underflow or overflow issue. To detect these issues, activate the `DebugUnderflowOverflow` module in [`TrainingArguments.debug`] or import and add the module to your own training loop or another trainer class. - + ```py diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index cb21b7e8fca8..89a6ba59eb8c 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -16,27 +16,21 @@ rendered properly in your Markdown viewer. # DeepSpeed -[DeepSpeed](https://www.deepspeed.ai/) is a PyTorch optimization library that makes distributed training memory-efficient and fast. At its core is the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which enables training large models at scale. ZeRO works in several stages: +[DeepSpeed](https://www.deepspeed.ai/) is designed to optimize distributed training for large models with data, model, pipeline, and even a combination of all three [parallelism](./perf_train_gpu_many) strategies to provide better memory efficiency and faster training speeds. This is achieved by the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which consists of three stages. -* ZeRO-1, optimizer state partitioning across GPUs -* ZeRO-2, gradient partitioning across GPUs -* ZeRO-3, parameter partitioning across GPUs +| ZeRO stage | description | +|---|---| +| 1 | partition optimizer states | +| 2 | partition optimizer and gradient states | +| 3 | partition optimizer, gradient, and parameters | -In GPU-limited environments, ZeRO also enables offloading optimizer memory and computation from the GPU to the CPU to fit and train really large models on a single GPU. DeepSpeed is integrated with the Transformers [`Trainer`] class for all ZeRO stages and offloading. All you need to do is provide a config file or you can use a provided template. 
For inference, Transformers support ZeRO-3 and offloading since it allows loading huge models. +Each stage progressively saves more memory, allowing really large models to fit and be trained on a single GPU. DeepSpeed is integrated with [`Trainer`] for all ZeRO stages and offloading optimizer memory and computations from the GPU to the CPU. Provide a config file or one of the example templates to [`Trainer`] to enable DeepSpeed features. -This guide will walk you through how to deploy DeepSpeed training, the features you can enable, how to setup the config files for different ZeRO stages, offloading, inference, and using DeepSpeed without the [`Trainer`]. +This guide walks you through setting up a DeepSpeed config file, how to enable its features in [`Trainer`], and deploy training. -## Installation +Install DeepSpeed from either PyPI or Transformers. For more detailed installation instructions, refer to the DeepSpeed [installation](https://www.deepspeed.ai/tutorials/advanced-install/) or GitHUB [README](https://github.com/microsoft/deepspeed#installation). -DeepSpeed is available to install from PyPI or Transformers (for more detailed installation options, take a look at the DeepSpeed [installation details](https://www.deepspeed.ai/tutorials/advanced-install/) or the GitHub [README](https://github.com/deepspeedai/DeepSpeed#installation)). - - - -If you're having difficulties installing DeepSpeed, check the [DeepSpeed CUDA installation](../debugging#deepspeed-cuda-installation) guide. While DeepSpeed has a pip installable PyPI package, it is highly recommended to [install it from source](https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source) to best match your hardware and to support certain features, like 1-bit Adam, which aren’t available in the PyPI distribution. - - - - + ```bash @@ -53,9 +47,12 @@ pip install transformers[deepspeed] -## Memory requirements +> [!WARNING] +> Refer to the [DeepSpeed CUDA installation](./debugging#deepspeed-cuda-issues) if you're having trouble with your installation. While DeepSpeed has a pip installable package, it is highly recommended to [install it from source](https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source) to ensure it matches your hardware and to support certain features which aren't available in the PyPI distribution. -Before you begin, it is a good idea to check whether you have enough GPU and CPU memory to fit your model. DeepSpeed provides a tool for estimating the required CPU/GPU memory. For example, to estimate the memory requirements for the [bigscience/T0_3B](bigscience/T0_3B) model on a single GPU: +DeepSpeed provides a tool for estimating the required CPU and GPU memory for the parameters, optimizer and gradient states. You'll also need some memory for the CUDA kernels and activations. + +Run the command below to check the memory requirements for [bigscience/T0_3B](https://huggingface.co/docs/transformers/main/en/bigscience/T0_3B) on a single GPU. ```bash $ python -c 'from transformers import AutoModel; \ @@ -75,64 +72,48 @@ SW: Model with 2783M total params, 65M largest layer params. 15.56GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=0 ``` -This means you either need a single 80GB GPU without CPU offload or a 8GB GPU and a ~60GB CPU to offload to (these are just the memory requirements for the parameters, optimizer states and gradients, and you'll need a bit more for the CUDA kernels and activations). 
You should also consider the tradeoff between cost and speed because it'll be cheaper to rent or buy a smaller GPU but it'll take longer to train your model. - -If you have enough GPU memory make sure you disable CPU/NVMe offload to make everything faster. +> [!TIP] +> If you have enough GPU memory, disable CPU and NVMe offload to speed everything up. -## Select a ZeRO stage +## Choose a ZeRO stage -After you've installed DeepSpeed and have a better idea of your memory requirements, the next step is selecting a ZeRO stage to use. In order of fastest and most memory-efficient: +Consider the table below to help you choose the appropriate ZeRO stage for training because there is a trade-off between training speed and memory usage. The table orders the ZeRO stages from fastest to slowest and from least memory usage to most. -| Fastest | Memory efficient | -|------------------|------------------| -| ZeRO-1 | ZeRO-3 + offload | -| ZeRO-2 | ZeRO-3 | +| fastest | memory usage | +|---|---| +| ZeRO-1 | ZeRO-3 + offload | +| ZeRO-2 | ZeRO-3 | | ZeRO-2 + offload | ZeRO-2 + offload | -| ZeRO-3 | ZeRO-2 | -| ZeRO-3 + offload | ZeRO-1 | - -To find what works best for you, start with the fastest approach and if you run out of memory, try the next stage which is slower but more memory efficient. Feel free to work in whichever direction you prefer (starting with the most memory efficient or fastest) to discover the appropriate balance between speed and memory usage. +| ZeRO-3 | ZeRO-2 | +| ZeRO-3 + offload | ZeRO-1 | -A general process you can use is (start with batch size of 1): +Decide the type of performance you're optimizing for, speed or memory, and then work backwards to discover the best ZeRO stage for your use case. For example, if you're optimizing for speed, start with the fastest ZeRO stage and if you run out of memory, try the next stage which is slower but more memory efficient. -1. enable gradient checkpointing -2. try ZeRO-2 -3. try ZeRO-2 and offload the optimizer -4. try ZeRO-3 -5. try ZeRO-3 and offload parameters to the CPU -6. try ZeRO-3 and offload parameters and the optimizer to the CPU -7. try lowering various default values like a narrower search beam if you're using the [`~GenerationMixin.generate`] method -8. try mixed half-precision (fp16 on older GPU architectures and bf16 on Ampere) over full-precision weights -9. add more hardware if possible or enable Infinity to offload parameters and the optimizer to a NVMe -10. once you're not running out of memory, measure effective throughput and then try to increase the batch size as large as you can to maximize GPU efficiency -11. lastly, try to optimize your training setup by disabling some offload features or use a faster ZeRO stage and increasing/decreasing the batch size to find the best tradeoff between speed and memory usage +## Config file +Enable DeepSpeed in [`Trainer`] with a config file containing all the parameters for how to configure and setup your training. When the training script is executed, DeepSpeed logs the configuration from [`Trainer`] to the console so you can see exactly what's being used. -## DeepSpeed configuration file - -DeepSpeed works with the [`Trainer`] class by way of a config file containing all the parameters for configuring how you want setup your training run. When you execute your training script, DeepSpeed logs the configuration it received from [`Trainer`] to the console so you can see exactly what configuration was used. 
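To give the `auto` convention discussed a little further down a concrete shape, here is a minimal illustrative sketch (the fields and `output_dir` are placeholders, not recommendations): a ZeRO-2 config built as a nested dict that leaves several values as `auto` for [`Trainer`] to fill in.

```py
from transformers import TrainingArguments

# illustrative ZeRO-2 config; "auto" defers to values Trainer already manages
ds_config = {
    "zero_optimization": {"stage": 2},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "fp16": {"enabled": "auto"},
}

args = TrainingArguments(
    output_dir="ds-output",  # placeholder path
    per_device_train_batch_size=8,
    fp16=True,
    deepspeed=ds_config,
)
# pass `args` to Trainer together with your model and dataset
```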
- - - -Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. You can also find more practical examples of various DeepSpeed configuration examples on the [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) repository or the main [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) repository. To quickly find specific examples, you can: - -```bash -git clone https://github.com/deepspeedai/DeepSpeedExamples -cd DeepSpeedExamples -find . -name '*json' -# find examples with the Lamb optimizer -grep -i Lamb $(find . -name '*json') -``` - - +> [!TIP] +> Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. There are also practical examples of various DeepSpeed configuration examples in the [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) main [DeepSpeed](https://github.com/microsoft/DeepSpeed) repository. Run the command below to quickly find specific examples. +> +> ```bash +> git clone https://github.com/microsoft/DeepSpeedExamples +> cd DeepSpeedExamples +> find . -name '*json' +> # find examples with the Lamb optimizer +> grep -i Lamb $(find . -name '*json') +> ``` -The DeepSpeed configuration file is passed as a path to a JSON file if you're training from the command line interface or as a nested `dict` object if you're using the [`Trainer`] in a notebook setting. +The config file is passed as a path to a JSON file if you're training from the command line interface or as a nested dict object if you're using [`Trainer`] in a notebook. ```py -TrainingArguments(..., deepspeed="path/to/deepspeed_config.json") +TrainingArguments( + deepspeed="path/to/deepspeed_config.json", + ..., +) ``` @@ -140,45 +121,49 @@ TrainingArguments(..., deepspeed="path/to/deepspeed_config.json") ```py ds_config_dict = dict(scheduler=scheduler_params, optimizer=optimizer_params) -args = TrainingArguments(..., deepspeed=ds_config_dict) -trainer = Trainer(model, args, ...) +args = TrainingArguments( + deepspeed=ds_config_dict, + ..., +) +trainer = Trainer( + model, + args, + ..., +) ``` -### DeepSpeed and Trainer parameters +### DeepSpeed versus Trainer parameters -There are three types of configuration parameters: +There are three types of config parameters. -1. Some of the configuration parameters are shared by [`Trainer`] and DeepSpeed, and it can be difficult to identify errors when there are conflicting definitions. To make it easier, these shared configuration parameters are configured from the [`Trainer`] command line arguments. +1. Some config parameters are shared by DeepSpeed and [`Trainer`] making it difficult to identify errors when there are conflicting definitions. In this case, configure these parameters from the [`Trainer`] command line arguments. +1. Some config parameters are automatically derived from the model configuration and don't need to be manually configured. [`Trainer`] uses the config value `auto` to set the most correct or efficient option. You could define these parameters explicitly, but you must take care to ensure the [`Trainer`] and DeepSpeed config parameters match. Mismatches may cause training to fail in very difficult to detect ways. +1. Some config parameters are specific to DeepSpeed and should be manually set based on your training requirements. -2. 
Some configuration parameters that are automatically derived from the model configuration so you don't need to manually adjust these values. The [`Trainer`] uses a configuration value `auto` to determine set the most correct or efficient value. You could set your own configuration parameters explicitly, but you must take care to ensure the [`Trainer`] arguments and DeepSpeed configuration parameters agree. Mismatches may cause the training to fail in very difficult to detect ways! +There are two ways to modify the config parameters. -3. Some configuration parameters specific to DeepSpeed only which need to be manually set based on your training needs. +> [!TIP] +> Some values, such as `scheduler.params.total_num_steps`, are calculated by the [`Trainer`] during training. -You could also modify the DeepSpeed configuration and edit [`TrainingArguments`] from it: +1. Create or load a DeepSpeed config to use as the main config. +1. Create a [`TrainingArguments`] object based on the DeepSpeed config values. -1. Create or load a DeepSpeed configuration to use as the main configuration -2. Create a [`TrainingArguments`] object based on these DeepSpeed configuration values +### ZeRO stage -Some values, such as `scheduler.params.total_num_steps` are calculated by the [`Trainer`] during training. +Each ZeRO stage has its own config, as defined in `zero_optimization`. -### ZeRO configuration +For a more detailed explanation of each parameter, refer to the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. These parameters must be setup with DeepSpeed because [`Trainer`] doesn't provide equivalent command line arguments. -There are three configurations, each corresponding to a different ZeRO stage. Stage 1 is not as interesting for scalability, and this guide focuses on stages 2 and 3. The `zero_optimization` configuration contains all the options for what to enable and how to configure them. For a more detailed explanation of each parameter, take a look at the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. - - -DeepSpeed doesn’t validate parameter names and any typos fallback on the parameter's default setting. You can watch the DeepSpeed engine startup log messages to see what values it is going to use. - - - -The following configurations must be setup with DeepSpeed because the [`Trainer`] doesn't provide equivalent command line arguments. +> [!WARNING] +> DeepSpeed doesn't validate parameter names and any typos will fallback on the parameters default setting. Observe the DeepSpeed engine startup log messages to see what values are being used. -ZeRO-1 shards the optimizer states across GPUs, and you can expect a tiny speed up. The ZeRO-1 config can be setup like this: +ZeRO-1 shards the optimizer states across GPUs and you can expect a small speed up. ```yml { @@ -191,11 +176,11 @@ ZeRO-1 shards the optimizer states across GPUs, and you can expect a tiny speed -ZeRO-2 shards the optimizer and gradients across GPUs. This stage is primarily used for training since its features are not relevant to inference. Some important parameters to configure for better performance include: +ZeRO-2 shards the optimizer and gradient states across GPUs. This stage is primarily used for training since its features are not relevant to inference. Some important parameters to configure for better performance include the following. * `offload_optimizer` should be enabled to reduce GPU memory usage. 
-* `overlap_comm` when set to `true` trades off increased GPU memory usage to lower allreduce latency. This feature uses 4.5x the `allgather_bucket_size` and `reduce_bucket_size` values. In this example, they're set to `5e8` which means it requires 9GB of GPU memory. If your GPU memory is 8GB or less, you should reduce `overlap_comm` to lower the memory requirements and prevent an out-of-memory (OOM) error. -* `allgather_bucket_size` and `reduce_bucket_size` trade off available GPU memory for communication speed. The smaller their values, the slower communication is and the more GPU memory is available. You can balance, for example, whether a bigger batch size is more important than a slightly slower training time. +* `overlap_comm` when set to `true` uses increased GPU memory usage in exchange for lower allreduce latency. This feature uses 4.5x the `allgather_bucket_size` and `reduce_bucket_size` values. In this example, they're set to `5e8` which means it requires 9GB of GPU memory. If your GPU memory is 8GB or less, you should reduce `overlap_comm` to lower the memory requirements and prevent an out-of-memory (OOM) error. +* `allgather_bucket_size` and `reduce_bucket_size` trade-off available GPU memory for communication speed. The smaller their values, the slower communication is and the more GPU memory is available. You can balance, for example, whether a bigger batch size is more important than a slightly slower training time. * `round_robin_gradients` is available in DeepSpeed 0.4.4 for CPU offloading. It parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism). ```yml @@ -220,19 +205,19 @@ ZeRO-2 shards the optimizer and gradients across GPUs. This stage is primarily u -ZeRO-3 shards the optimizer, gradient, and parameters across GPUs. Unlike ZeRO-2, ZeRO-3 can also be used for inference, in addition to training, because it allows large models to be loaded on multiple GPUs. Some important parameters to configure include: +ZeRO-3 shards the optimizer and gradient states, and parameters across GPUs. Unlike ZeRO-2, ZeRO-3 can also be used for inference in addition to training because it loads large models onto multiple GPUs. Some important parameters to configure include the following. -* `device: "cpu"` can help if you're running out of GPU memory and if you have free CPU memory available. This allows offloading model parameters to the CPU. +* `device: "cpu"` can help if you're running out of GPU memory and if you have free CPU memory available. This offloads model parameters to the CPU. * `pin_memory: true` can improve throughput, but less memory becomes available for other processes because the pinned memory is reserved for the specific process that requested it and it's typically accessed much faster than normal CPU memory. -* `stage3_max_live_parameters` is the upper limit on how many full parameters you want to keep on the GPU at any given time. Reduce this value if you encounter an OOM error. -* `stage3_max_reuse_distance` is a value for determining when a parameter is used again in the future, and it helps decide whether to throw the parameter away or to keep it. If the parameter is going to be reused (if the value is less than `stage3_max_reuse_distance`), then it is kept to reduce communication overhead. 
This is super helpful when activation checkpointing is enabled and you want to keep the parameter in the forward recompute until the backward pass. But reduce this value if you encounter an OOM error. +* `stage3_max_live_parameters` is the upper limit on how many full parameters to keep on the GPU at any given time. Reduce this value if you encounter an OOM error. +* `stage3_max_reuse_distance` is a value for determining when a parameter is used again in the future, and it helps decide whether to throw the parameter away or to keep it. If the parameter is going to be reused (if the value is less than `stage3_max_reuse_distance`), then it is kept to reduce communication overhead. This is helpful when activation checkpointing is enabled and you want to keep the parameter in the forward recompute until the backward pass. But reduce this value if you encounter an OOM error. * `stage3_gather_16bit_weights_on_model_save` consolidates fp16 weights when a model is saved. For large models and multiple GPUs, this is expensive in terms of memory and speed. You should enable it if you're planning on resuming training. -* `sub_group_size` controls which parameters are updated during the optimizer step. Parameters are grouped into buckets of `sub_group_size` and each bucket is updated one at a time. When used with NVMe offload, `sub_group_size` determines when model states are moved in and out of CPU memory from during the optimization step. This prevents running out of CPU memory for extremely large models. `sub_group_size` can be left to its default value if you aren't using NVMe offload, but you may want to change it if you: +* `sub_group_size` controls which parameters are updated during the optimizer step. Parameters are grouped into buckets of `sub_group_size` and each bucket is updated one at a time. When used with NVMe offload, `sub_group_size` determines when model states are moved in and out of CPU memory during the optimization step. This prevents running out of CPU memory for extremely large models. `sub_group_size` can be left to its default value if you aren't using NVMe offload, but you may want to change it if you: - 1. Run into an OOM error during the optimizer step. In this case, reduce `sub_group_size` to reduce memory usage of the temporary buffers. - 2. The optimizer step is taking a really long time. In this case, increase `sub_group_size` to improve bandwidth utilization as a result of increased data buffers. + 1. Run into an OOM error during the optimization step. In this case, reduce `sub_group_size` to reduce memory usage of the temporary buffers. + 2. The optimization step is taking a really long time. In this case, increase `sub_group_size` to improve bandwidth utilization as a result of increased data buffers. -* `reduce_bucket_size`, `stage3_prefetch_bucket_size`, and `stage3_param_persistence_threshold` are dependent on a model's hidden size. It is recommended to set these values to `auto` and allow the [`Trainer`] to automatically assign the values. +* `reduce_bucket_size`, `stage3_prefetch_bucket_size`, and `stage3_param_persistence_threshold` are dependent on a models hidden size. It is recommended to set these values to `auto` and allow the [`Trainer`] to automatically assign the values. ```yml { @@ -259,7 +244,9 @@ ZeRO-3 shards the optimizer, gradient, and parameters across GPUs. 
Unlike ZeRO-2
}
```

-You can use the [`deepspeed.zero.Init`](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) context manager to initialize a model faster:
+#### Initialize large models
+
+With ZeRO-3, use the [deepspeed.zero.Init](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) context manager to initialize a model faster.

```py
from transformers import T5ForConditionalGeneration, T5Config
import deepspeed

config = T5Config.from_pretrained("google-t5/t5-small")
@@ -270,7 +257,10 @@ with deepspeed.zero.Init():
    model = T5ForConditionalGeneration(config)
```

-For pretrained models, the DeepSped config file needs to have `is_deepspeed_zero3_enabled: true` setup in [`TrainingArguments`] and it needs a ZeRO configuration enabled. The [`TrainingArguments`] object must be created **before** calling the model [`~PreTrainedModel.from_pretrained`].
+The DeepSpeed config file needs to have `is_deepspeed_zero3_enabled: true` set up in [`TrainingArguments`] and it needs a ZeRO configuration enabled. The [`TrainingArguments`] object must be created **before** calling [`~PreTrainedModel.from_pretrained`].
+
+> [!TIP]
+> You'll need ZeRO-3 when the fp16 weights don't fit on a single GPU. But if you're able to load the fp16 weights, set `torch_dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].

```py
from transformers import AutoModel, Trainer, TrainingArguments
@@ -280,34 +270,31 @@ model = AutoModel.from_pretrained("google-t5/t5-small")
trainer = Trainer(model=model, args=training_args, ...)
```

-You'll need ZeRO-3 if the fp16 weights don't fit on a single GPU. If you're able to load fp16 weights, then make sure you specify `torch_dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].
-
-Another consideration for ZeRO-3 is if you have multiple GPUs, no single GPU has all the parameters unless it's the parameters for the currently executing layer. To access all parameters from all the layers at once, such as loading pretrained model weights in [`~PreTrainedModel.from_pretrained`], one layer is loaded at a time and immediately partitioned to all GPUs. This is because for very large models, it isn't possible to load the weights on one GPU and then distribute them across the other GPUs due to memory limitations.
+When there are multiple GPUs, no single GPU has all the parameters unless it's the parameters of the currently executing layer. To access all parameters from all the layers at once, such as loading pretrained model weights in [`~PreTrainedModel.from_pretrained`], one layer is loaded at a time and immediately partitioned to all GPUs. For very large models, it isn't possible to load the weights onto one GPU and then distribute them across the other GPUs due to memory limitations.

-If you encounter a model parameter weight that looks like the following, where `tensor([1.])` or the parameter size is 1 instead of a larger multi-dimensional shape, this means the parameter is partitioned and this is a ZeRO-3 placeholder.
+If you encounter a model parameter weight where `tensor([1.])` or the parameter size is 1 instead of a larger multidimensional shape, it means the parameter is partitioned and this is a ZeRO-3 placeholder.
```py
tensor([1.0], device="cuda:0", dtype=torch.float16, requires_grad=True)
```

-
-
-For more information about initializing large models with ZeRO-3 and accessing the parameters, take a look at the [Constructing Massive Models](https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models) and [Gathering Parameters](https://deepspeed.readthedocs.io/en/latest/zero3.html#gathering-parameters) guides.
-
-
+> [!TIP]
+> For more information about initializing large models with ZeRO-3 and accessing the parameters, take a look at the [Constructing Massive Models](https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models) and [Gathering Parameters](https://deepspeed.readthedocs.io/en/latest/zero3.html#gathering-parameters) guides.

-### NVMe configuration
+### NVMe

-[ZeRO-Infinity](https://hf.co/papers/2104.07857) allows offloading model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.
+[ZeRO-Infinity](https://hf.co/papers/2104.07857) offloads model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.

-Depending on the CPU and/or NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none. You should also make sure the `nvme_path` is pointing to an NVMe device, because while it still works with a normal hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read and ~3GB/s for write operations. Lastly, [run a benchmark](https://github.com/deepspeedai/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.
+Depending on the CPU and NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none of them. Make sure the `nvme_path` points to an NVMe device, because while it still works with a regular hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read operations and ~3GB/s for write operations.

-The example ZeRO-3/Infinity configuration file below sets most of the parameter values to `auto`, but you could also manually add these values.
+Consider running a [benchmark](https://github.com/microsoft/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.

-```yml
+The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter values to `auto`, but you can also manually configure these values.
+ +```yaml { "fp16": { "enabled": "auto", @@ -381,856 +368,34 @@ The example ZeRO-3/Infinity configuration file below sets most of the parameter } ``` -## DeepSpeed features - -There are a number of important parameters to specify in the DeepSpeed configuration file which are briefly described in this section. - -### Activation/gradient checkpointing - -Activation and gradient checkpointing trades speed for more GPU memory which allows you to overcome scenarios where your GPU is out of memory or to increase your batch size for better performance. To enable this feature: - -1. For a Hugging Face model, set `model.gradient_checkpointing_enable()` or `--gradient_checkpointing` in the [`Trainer`]. -2. For a non-Hugging Face model, use the DeepSpeed [Activation Checkpointing API](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html). You could also replace the Transformers modeling code and replace `torch.utils.checkpoint` with the DeepSpeed API. This approach is more flexible because you can offload the forward activations to the CPU memory instead of recalculating them. - -### Optimizer and scheduler - -DeepSpeed and Transformers optimizer and scheduler can be mixed and matched as long as you don't enable `offload_optimizer`. When `offload_optimizer` is enabled, you could use a non-DeepSpeed optimizer (except for LAMB) as long as it has both a CPU and GPU implementation. - - - -The optimizer and scheduler parameters for the config file can be set from the command line to avoid hard to find errors. For example, if the learning rate is set to a different value in another place you can override it from the command line. Aside from the optimizer and scheduler parameters, you'll need to ensure your [`Trainer`] command line arguments match the DeepSpeed configuration. - - - - - - -DeepSpeed offers several [optimizers](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters) (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. If you don't configure the optimizer in the config, the [`Trainer`] automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: `lr`, `adam_beta1`, `adam_beta2`, `adam_epsilon`, `weight_decay`. - -You can set the parameters to `"auto"` or manually input your own desired values. - -```yaml -{ - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - } -} -``` - -You can also use an unsupported optimizer by adding the following to the top level configuration. - -```yaml -{ - "zero_allow_untested_optimizer": true -} -``` - -From DeepSpeed==0.8.3 on, if you want to use offload, you'll also need to the following to the top level configuration because offload works best with DeepSpeed's CPU Adam optimizer. - -```yaml -{ - "zero_force_ds_cpu_optimizer": false -} -``` - - - - -DeepSpeed supports the LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR learning rate [schedulers](https://www.deepspeed.ai/docs/config-json/#scheduler-parameters). 
- -Transformers and DeepSpeed provide two of the same schedulers: - -* WarmupLR is the same as `--lr_scheduler_type constant_with_warmup` in Transformers -* WarmupDecayLR is the same as `--lr_scheduler_type linear` in Transformers (this is the default scheduler used in Transformers) - -If you don't configure the scheduler in the config, the [`Trainer`] automatically selects WarmupDecayLR and either uses the supplied values or the default values for the following parameters from the command line: `warmup_min_lr`, `warmup_max_lr`, `warmup_num_steps`, `total_num_steps` (automatically calculated during run time if `max_steps` is not provided). - -You can set the parameters to `"auto"` or manually input your own desired values. - -```yaml -{ - "scheduler": { - "type": "WarmupDecayLR", - "params": { - "total_num_steps": "auto", - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - } -} -``` - - - - -### Precision - -Deepspeed supports fp32, fp16, and bf16 mixed precision. - - - - -If your model doesn't work well with mixed precision, for example if it wasn't pretrained in mixed precision, you may encounter overflow or underflow issues which can cause NaN loss. For these cases, you should use full fp32 precision by explicitly disabling the default fp16 mode. - -```yaml -{ - "fp16": { - "enabled": false - } -} -``` - -For Ampere GPUs and PyTorch > 1.7, it automatically switches to the more efficient [tf32](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) format for some operations but the results are still in fp32. You can control it from the [`Trainer`] by setting `--tf32` to enable it, and `--tf32 0` or `--no_tf32` to disable it. - - - - -To configure PyTorch AMP-like fp16 mixed precision reduces memory usage and accelerates training speed. [`Trainer`] automatically enables or disables fp16 based on the value of `args.fp16_backend`, and the rest of the config can be set by you. fp16 is enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend amp` or `--fp16_full_eval`. - -```yaml -{ - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - } -} -``` - -For additional DeepSpeed fp16 training options, take a look at the [FP16 Training Options](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) reference. - -To configure Apex-like fp16 mixed precision, setup the config as shown below with `"auto"` or your own values. [`Trainer`] automatically configure `amp` based on the values of `args.fp16_backend` and `args.fp16_opt_level`. It can also be enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend apex` or `--fp16_opt_level 01`. - -```yaml -{ - "amp": { - "enabled": "auto", - "opt_level": "auto" - } -} -``` - - - +## Training features -To use bf16, you'll need at least DeepSpeed==0.6.0. bf16 has the same dynamic range as fp32 and doesn’t require loss scaling. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desired because this format's low precision can lead to lossy accumulation. +DeepSpeed supports many training features that can be configured in the config file. This section describes some of the most important features. 
-bf16 can be setup in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`. - -```yaml -{ - "bf16": { - "enabled": "auto" - } -} -``` - - - +### Activation and gradient checkpointing ### Batch size -The batch size can be auto-configured or explicitly set. If you choose to use the `"auto"` option, [`Trainer`] sets `train_micro_batch_size_per_gpu` to the value of args.`per_device_train_batch_size` and `train_batch_size` to `args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps`. - -```yaml -{ - "train_micro_batch_size_per_gpu": "auto", - "train_batch_size": "auto" -} -``` +### Communication data type ### Gradient accumulation -Gradient accumulation can be auto-configured or explicitly set. If you choose to use the `"auto"` option, [`Trainer`] sets it to the value of `args.gradient_accumulation_steps`. - -```yaml -{ - "gradient_accumulation_steps": "auto" -} - -``` - ### Gradient clipping -Gradient clipping can be auto-configured or explicitly set. If you choose to use the `"auto"` option, [`Trainer`] sets it to the value of `args.max_grad_norm`. - -```yaml -{ - "gradient_clipping": "auto" -} -``` - -### Communication data type - -For communication collectives like reduction, gathering and scattering operations, a separate data type is used. - -All gather and scatter operations are performed in the same data type the data is in. For example, if you're training with bf16, the data is also gathered in bf16 because gathering is a non-lossy operation. - -Reduce operations are lossy, for example when gradients are averaged across multiple GPUs. When the communication is done in fp16 or bf16, it is more likely to be lossy because adding multiple numbers in low precision isn't exact. This is especially the case with bf16 which has a lower precision than fp16. For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients. - -You can choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it is downcasted to whichever half-precision dtype you're training in. - -```yaml -{ - "communication_data_type": "fp32" -} -``` - -### Universal Checkpointing - -[Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) is an efficient and flexible feature for saving and loading model checkpoints. It enables seamless model training continuation and fine-tuning across different model architectures, parallelism techniques, and training configurations. - -Resume training with a universal checkpoint by setting [load_universal](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to `true` in the config file. - -```yaml -{ - "checkpoint": { - "load_universal": true - } -} -``` - -## Deployment - -DeepSpeed can be deployed by different launchers such as [torchrun](https://pytorch.org/docs/stable/elastic/run.html), the `deepspeed` launcher, or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). To deploy, add `--deepspeed ds_config.json` to the [`Trainer`] command line. It’s recommended to use DeepSpeed’s [`add_config_arguments`](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any necessary command line arguments to your code. 
- -This guide will show you how to deploy DeepSpeed with the `deepspeed` launcher for different training setups. You can check out this [post](https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400) for more practical usage examples. - - - - - -To deploy DeepSpeed on multiple GPUs, add the `--num_gpus` parameter. If you want to use all available GPUs, you don't need to add `--num_gpus`. The example below uses 2 GPUs. +### Mixed precision training -```bash -deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ ---deepspeed tests/deepspeed/ds_config_zero3.json \ ---model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ ---output_dir output_dir --overwrite_output_dir --fp16 \ ---do_train --max_train_samples 500 --num_train_epochs 1 \ ---dataset_name wmt16 --dataset_config "ro-en" \ ---source_lang en --target_lang ro -``` - - - - -To deploy DeepSpeed on a single GPU, add the `--num_gpus` parameter. It isn't necessary to explicitly set this value if you only have 1 GPU because DeepSpeed deploys all GPUs it can see on a given node. - -```bash -deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ ---deepspeed tests/deepspeed/ds_config_zero2.json \ ---model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ ---output_dir output_dir --overwrite_output_dir --fp16 \ ---do_train --max_train_samples 500 --num_train_epochs 1 \ ---dataset_name wmt16 --dataset_config "ro-en" \ ---source_lang en --target_lang ro -``` - -DeepSpeed is still useful with just 1 GPU because you can: - -1. Offload some computations and memory to the CPU to make more GPU resources available to your model to use a larger batch size or fit a very large model that normally won't fit. -2. Minimize memory fragmentation with it's smart GPU memory management system which also allows you to fit bigger models and data batches. - - - -Set the `allgather_bucket_size` and `reduce_bucket_size` values to 2e8 in the [ZeRO-2](#zero-configuration) configuration file to get better performance on a single GPU. - - - - - - -### Multi-node deployment - -A node is one or more GPUs for running a workload. A more powerful setup is a multi-node setup which can be launched with the `deepspeed` launcher. For this guide, let's assume there are two nodes with 8 GPUs each. The first node can be accessed `ssh hostname1` and the second node with `ssh hostname2`. Both nodes must be able to communicate with each other locally over ssh without a password. - -By default, DeepSpeed expects your multi-node environment to use a shared storage. If this is not the case and each node can only see the local filesystem, you need to adjust the config file to include a [`checkpoint`](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to allow loading without access to a shared filesystem: - -```yaml -{ - "checkpoint": { - "use_node_local_storage": true - } -} -``` - -You could also use the [`Trainer`]'s `--save_on_each_node` argument to automatically add the above `checkpoint` to your config. - - - - -For [torchrun](https://pytorch.org/docs/stable/elastic/run.html), you have to ssh to each node and run the following command on both of them. The launcher waits until both nodes are synchronized before launching the training. - -```bash -torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \ ---master_port=9901 your_program.py --deepspeed ds_config.json -``` - - - - -For the `deepspeed` launcher, start by creating a `hostfile`. 
- -```bash -hostname1 slots=8 -hostname2 slots=8 -``` +### Optimizer and scheduler -Then you can launch the training with the following command. The `deepspeed` launcher automatically launches the command on both nodes at once. +## Deploy -```bash -deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 \ -your_program.py --deepspeed ds_config.json -``` - -Check out the [Resource Configuration (multi-node)](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) guide for more details about configuring multi-node compute resources. - - - +### Multi-node ### SLURM -In a SLURM environment, you'll need to adapt your SLURM script to your specific SLURM environment. An example SLURM script may look like: - -```bash -#SBATCH --job-name=test-nodes # name -#SBATCH --nodes=2 # nodes -#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! -#SBATCH --cpus-per-task=10 # number of cores per tasks -#SBATCH --gres=gpu:8 # number of gpus -#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) -#SBATCH --output=%x-%j.out # output file name - -export GPUS_PER_NODE=8 -export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -export MASTER_PORT=9901 - -srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \ - --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \ - --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ -your_program.py --deepspeed ds_config.json' -``` - -Then you can schedule your multi-node deployment with the following command which launches training simultaneously on all nodes. - -```bash -sbatch launch.slurm -``` - ### Notebook -The `deepspeed` launcher doesn't support deployment from a notebook so you'll need to emulate the distributed environment. However, this only works for 1 GPU. If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. This means you have to use the `deepspeed` launcher which can't be emulated as shown here. - -```py -# DeepSpeed requires a distributed environment even when only one process is used. -# This emulates a launcher in the notebook -import os - -os.environ["MASTER_ADDR"] = "localhost" -os.environ["MASTER_PORT"] = "9994" # modify if RuntimeError: Address already in use -os.environ["RANK"] = "0" -os.environ["LOCAL_RANK"] = "0" -os.environ["WORLD_SIZE"] = "1" - -# Now proceed as normal, plus pass the DeepSpeed config file -training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") -trainer = Trainer(...) -trainer.train() -``` - -If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated cell. 
- -```py -%%bash -cat <<'EOT' > ds_config_zero3.json -{ - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "optimizer": { - "type": "AdamW", - "params": { - "lr": "auto", - "betas": "auto", - "eps": "auto", - "weight_decay": "auto" - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" - } - }, - - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu", - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e9, - "reduce_bucket_size": "auto", - "stage3_prefetch_bucket_size": "auto", - "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_16bit_weights_on_model_save": true - }, - - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "steps_per_print": 2000, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "wall_clock_breakdown": false -} -EOT -``` - -If the training script is in a file and not in a notebook cell, you can launch `deepspeed` normally from the shell in a notebook cell. For example, to launch `run_translation.py`: - -```py -!git clone https://github.com/huggingface/transformers -!cd transformers; deepspeed examples/pytorch/translation/run_translation.py ... -``` - -You could also use `%%bash` magic and write multi-line code to run the shell program, but you won't be able to view the logs until training is complete. With `%%bash` magic, you don't need to emulate a distributed environment. - -```py -%%bash - -git clone https://github.com/huggingface/transformers -cd transformers -deepspeed examples/pytorch/translation/run_translation.py ... -``` - ## Save model weights -DeepSpeed stores the main full precision fp32 weights in custom checkpoint optimizer files (the glob pattern looks like `global_step*/*optim_states.pt`) and are saved under the normal checkpoint. - - - - -A model trained with ZeRO-2 saves the pytorch_model.bin weights in fp16. To save the model weights in fp16 for a model trained with ZeRO-3, you need to set `"stage3_gather_16bit_weights_on_model_save": true` because the model weights are partitioned across multiple GPUs. Otherwise, the [`Trainer`] won't save the weights in fp16 and it won't create a pytorch_model.bin file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights and you won't be able to load them. - -```yaml -{ - "zero_optimization": { - "stage3_gather_16bit_weights_on_model_save": true - } -} -``` - - - - -The full precision weights shouldn't be saved during training because it can require a lot of memory. It is usually best to save the fp32 weights offline after training is complete. But if you have a lot of free CPU memory, it is possible to save the fp32 weights during training. This section covers both online and offline approaches. 
- -### Online - -You must have saved at least one checkpoint to load the latest checkpoint as shown in the following: - -```py -from transformers.trainer_utils import get_last_checkpoint -from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint - -checkpoint_dir = get_last_checkpoint(trainer.args.output_dir) -fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) -``` - -If you've enabled the `--load_best_model_at_end` parameter to track the best checkpoint in [`TrainingArguments`], you can finish training first and save the final model explicitly. Then you can reload it as shown below: - -```py -from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint - -checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final") -trainer.deepspeed.save_checkpoint(checkpoint_dir) -fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) -``` - - - -Once `load_state_dict_from_zero_checkpoint` is run, the model is no longer usable in DeepSpeed in the context of the same application. You'll need to initialize the DeepSpeed engine again since `model.load_state_dict(state_dict)` removes all the DeepSpeed magic from it. Only use this at the very end of training. - - - -You can also extract and load the state_dict of the fp32 weights: - -```py -from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint - -state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu -model = model.cpu() -model.load_state_dict(state_dict) -``` - -### Offline - -DeepSpeed provides a zero_to_fp32.py script at the top-level of the checkpoint folder for extracting weights at any point. This is a standalone script and you don't need a configuration file or [`Trainer`]. - -For example, if your checkpoint folder looked like this: - -```bash -$ ls -l output_dir/checkpoint-1/ --rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json -drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ --rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest --rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt --rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin --rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt --rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json --rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model --rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json --rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json --rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin --rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* -``` - -To reconstruct the fp32 weights from the DeepSpeed checkpoint (ZeRO-2 or ZeRO-3) subfolder `global_step1`, run the following command to create and consolidate the full fp32 weights from multiple GPUs into a single pytorch_model.bin file. The script automatically discovers the subfolder containing the checkpoint. - -```py -python zero_to_fp32.py . pytorch_model.bin -``` - - - -Run `python zero_to_fp32.py -h` for more usage details. The script requires 2x the general RAM of the final fp32 weights. - - - - - - -## ZeRO Inference - -[ZeRO Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html) places the model weights in CPU or NVMe memory to avoid burdening the GPU which makes it possible to run inference with huge models on a GPU. Inference doesn't require any large additional amounts of memory for the optimizer states and gradients so you can fit much larger batches and/or sequence lengths on the same hardware. 
- -ZeRO Inference shares the same configuration file as [ZeRO-3](#zero-configuration), and ZeRO-2 and ZeRO-1 configs won't work because they don't provide any benefits for inference. - -To run ZeRO Inference, pass your usual training arguments to the [`TrainingArguments`] class and add the `--do_eval` argument. - -```bash -deepspeed --num_gpus=2 your_program.py --do_eval --deepspeed ds_config.json -``` - -## Non-Trainer DeepSpeed integration - -DeepSpeed also works with Transformers without the [`Trainer`] class. This is handled by the [`HfDeepSpeedConfig`] which only takes care of gathering ZeRO-3 parameters and splitting a model across multiple GPUs when you call [`~PreTrainedModel.from_pretrained`]. - - - -If you want everything automatically taken care of for you, try using DeepSpeed with the [`Trainer`]! You'll need to follow the [DeepSpeed documentation](https://www.deepspeed.ai/), and manually configure the parameter values in the config file (you can't use the `"auto"` value). - - - -To efficiently deploy ZeRO-3, you must instantiate the [`HfDeepSpeedConfig`] object before the model and keep that object alive: - - - - -```py -from transformers.integrations import HfDeepSpeedConfig -from transformers import AutoModel -import deepspeed - -ds_config = {...} # deepspeed config object or path to the file -# must run before instantiating the model to detect zero 3 -dschf = HfDeepSpeedConfig(ds_config) # keep this object alive -model = AutoModel.from_pretrained("openai-community/gpt2") -engine = deepspeed.initialize(model=model, config_params=ds_config, ...) -``` - - - - -[`HfDeepSpeedConfig`] is not required for ZeRO-1 or ZeRO-2. - -```py -from transformers.integrations import HfDeepSpeedConfig -from transformers import AutoModel, AutoConfig -import deepspeed - -ds_config = {...} # deepspeed config object or path to the file -# must run before instantiating the model to detect zero 3 -dschf = HfDeepSpeedConfig(ds_config) # keep this object alive -config = AutoConfig.from_pretrained("openai-community/gpt2") -model = AutoModel.from_config(config) -engine = deepspeed.initialize(model=model, config_params=ds_config, ...) -``` - - - - -### Non-Trainer ZeRO Inference - -To run ZeRO Inference without the [`Trainer`] in cases where you can’t fit a model onto a single GPU, try using additional GPUs or/and offloading to CPU memory. The important nuance to understand here is that the way ZeRO is designed, you can process different inputs on different GPUs in parallel. - -Make sure to: - -* disable CPU offload if you have enough GPU memory (since it slows things down). -* enable bf16 if you have an Ampere or newer GPU to make things faster. If you don’t have one of these GPUs, you may enable fp16 as long as you don’t use a model pretrained in bf16 (T5 models) because it may lead to an overflow error. - -Take a look at the following script to get a better idea of how to run ZeRO Inference without the [`Trainer`] on a model that won't fit on a single GPU. - -```py -#!/usr/bin/env python - -# This script demonstrates how to use Deepspeed ZeRO in an inference mode when one can't fit a model -# into a single GPU -# -# 1. Use 1 GPU with CPU offload -# 2. Or use multiple GPUs instead -# -# First you need to install deepspeed: pip install deepspeed -# -# Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2 -# small GPUs can handle it. or 1 small GPU and a lot of CPU memory. 
-# -# To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU - -# you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to -# process multiple inputs at once. -# -# The provided deepspeed config also activates CPU memory offloading, so chances are that if you -# have a lot of available CPU memory and you don't mind a slowdown you should be able to load a -# model that doesn't normally fit into a single GPU. If you have enough GPU memory the program will -# run faster if you don't want offload to CPU - so disable that section then. -# -# To deploy on 1 gpu: -# -# deepspeed --num_gpus 1 t0.py -# or: -# python -m torch.distributed.run --nproc_per_node=1 t0.py -# -# To deploy on 2 gpus: -# -# deepspeed --num_gpus 2 t0.py -# or: -# python -m torch.distributed.run --nproc_per_node=2 t0.py - -from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM -from transformers.integrations import HfDeepSpeedConfig -import deepspeed -import os -import torch - -os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers - -# distributed setup -local_rank = int(os.getenv("LOCAL_RANK", "0")) -world_size = int(os.getenv("WORLD_SIZE", "1")) -torch.cuda.set_device(local_rank) -deepspeed.init_distributed() - -model_name = "bigscience/T0_3B" - -config = AutoConfig.from_pretrained(model_name) -model_hidden_size = config.d_model - -# batch size has to be divisible by world_size, but can be bigger than world_size -train_batch_size = 1 * world_size - -# ds_config notes -# -# - enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be -# faster. -# -# - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g. -# all official t5 models are bf16-pretrained -# -# - set offload_param.device to "none" or completely remove the `offload_param` section if you don't -# - want CPU offload -# -# - if using `offload_param` you can manually finetune stage3_param_persistence_threshold to control -# - which params should remain on gpus - the larger the value the smaller the offload size -# -# For in-depth info on Deepspeed config see -# https://huggingface.co/docs/transformers/main/main_classes/deepspeed - -# keeping the same format as json for consistency, except it uses lower case for true/false -# fmt: off -ds_config = { - "fp16": { - "enabled": False - }, - "bf16": { - "enabled": False - }, - "zero_optimization": { - "stage": 3, - "offload_param": { - "device": "cpu", - "pin_memory": True - }, - "overlap_comm": True, - "contiguous_gradients": True, - "reduce_bucket_size": model_hidden_size * model_hidden_size, - "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, - "stage3_param_persistence_threshold": 10 * model_hidden_size - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False -} -# fmt: on - -# next line instructs transformers to partition the model directly over multiple gpus using -# deepspeed.zero.Init when model's `from_pretrained` method is called. -# -# **it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)** -# -# otherwise the model will first be loaded normally and only partitioned at forward time which is -# less efficient and when there is little CPU RAM may fail -dschf = HfDeepSpeedConfig(ds_config) # keep this object alive - -# now a model can be loaded. 
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name) - -# initialise Deepspeed ZeRO and store only the engine object -ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -ds_engine.module.eval() # inference - -# Deepspeed ZeRO can process unrelated inputs on each GPU. So for 2 gpus you process 2 inputs at once. -# If you use more GPUs adjust for more. -# And of course if you have just one input to process you then need to pass the same string to both gpus -# If you use only one GPU, then you will have only rank 0. -rank = torch.distributed.get_rank() -if rank == 0: - text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy" -elif rank == 1: - text_in = "Is this review positive or negative? Review: this is the worst restaurant ever" - -tokenizer = AutoTokenizer.from_pretrained(model_name) -inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) -with torch.no_grad(): - outputs = ds_engine.module.generate(inputs, synced_gpus=True) -text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(f"rank{rank}:\n in={text_in}\n out={text_out}") -``` - -Save the script as t0.py and launch it: - -```bash -$ deepspeed --num_gpus 2 t0.py -rank0: - in=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy - out=Positive -rank1: - in=Is this review positive or negative? Review: this is the worst restaurant ever - out=negative -``` - -This is a very basic example and you'll want to adapt it to your use case. - -### Generate - -Using multiple GPUs with ZeRO-3 for generation requires synchronizing the GPUs by setting `synced_gpus=True` in the [`~GenerationMixin.generate`] method. Otherwise, if one GPU is finished generating before another one, the whole system hangs because the remaining GPUs haven't received the weight shard from the GPU that finished first. - -For Transformers>=4.28, if `synced_gpus` is automatically set to `True` if multiple GPUs are detected during generation. - -## Troubleshoot - -When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obviously and you can see DeepSpeed modules in the exception)! The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the [DeepSpeed repository](https://github.com/deepspeedai/DeepSpeed). - -For issues related to the Transformers integration, please provide the following information: - -* the full DeepSpeed config file - -* the command line arguments of the [`Trainer`], or [`TrainingArguments`] arguments if you're scripting the [`Trainer`] setup yourself (don't dump the [`TrainingArguments`] which has dozens of irrelevant entries) - -* the outputs of: - -```bash -python -c 'import torch; print(f"torch: {torch.__version__}")' -python -c 'import transformers; print(f"transformers: {transformers.__version__}")' -python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")' -``` - -* a link to a Google Colab notebook to reproduce the issue - -* if impossible, a standard and non-custom dataset we can use and also try to use an existing example to reproduce the issue with - -The following sections provide a guide for resolving two of the most common issues. 
- -### DeepSpeed process killed at startup - -When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than your system has or your process tried to allocate more CPU memory than allowed leading the OS kernel to terminate the process. In this case, check whether your configuration file has either `offload_optimizer`, `offload_param` or both configured to offload to the CPU. - -If you have NVMe and ZeRO-3 setup, experiment with offloading to the NVMe ([estimate](https://deepspeed.readthedocs.io/en/latest/memory.html) the memory requirements for your model). - -### NaN loss - -NaN loss often occurs when a model is pretrained in bf16 and then you try to use it with fp16 (especially relevant for TPU trained models). To resolve this, use fp32 or bf16 if your hardware supports it (TPU, Ampere GPUs or newer). - -The other issue may be related to using fp16. For example, if this is your fp16 configuration: - -```yaml -{ - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - } -} -``` - -You might see the following `OVERFLOW!` messages in the logs: - -```bash -0%| | 0/189 [00:00 Date: Thu, 12 Dec 2024 14:10:32 -0800 Subject: [PATCH 077/116] deepspeed 2 --- docs/source/en/deepspeed.md | 620 +++++++++++++++++++++++++++++++++++- 1 file changed, 617 insertions(+), 3 deletions(-) diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 89a6ba59eb8c..b27afa9bea7a 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -372,30 +372,644 @@ The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter val DeepSpeed supports many training features that can be configured in the config file. This section describes some of the most important features. -### Activation and gradient checkpointing +### Gradient checkpointing + +Gradient checkpointing saves memory by only storing some of the intermediate activations instead of storing *all* of them. It is useful for fitting larger models on the GPU without running out of memory or to increase the batch size for better performance. Training speed is slower though. + +* For a Hugging Face model, set `model.gradient_checkpointing_enable()` or add `--gradient_checkpointing` in the [`TrainingArguments`]. +* For a non-Hugging Face model, use the DeepSpeed [Activation Checkpointing API](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html). Replacing Transformers modeling code and [torch.utils.checkpoint](https://pytorch.org/docs/stable/checkpoint.html) with the DeepSpeed API gives you more flexibility because you can offload the forward activations to the CPU memory instead of recalculating them. ### Batch size +The batch size can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets `train_micro_batch_size_per_gpu` and `train_batch_size` to the value of `world_size * per_device_train_batch_size * gradient_accumulation_steps`. + +```yaml +{ + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto" +} +``` + ### Communication data type +A separate data type is used for communication collectives like reduction, gathering and scattering operations. + +All gather and scatter operations are performed in the same data type the data is in. For example, if you're training in bf16, the data is also gathered in bf16 because gathering is a non-lossy operation. 
+ +Reduce operations are lossy, for example, when gradients are averaged across multiple GPUs. When the communication is done if fp16 or bf16, it's more likely to be lossy because adding multiple numbers in low precision isn't exact. This is especially the case with bf16 which has a lower precision than fp16. For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients. + +Choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it's downcasted to whichever half-precision data type you're training in. + +```yaml +{ + "communication_data_type": "fp32" +} +``` + ### Gradient accumulation +Gradient accumulation accumulates gradients over several mini-batches of data before updating parameters. It stores less gradients and enables training with a larger *effective batch size*. Training speed is slower though, but it's useful for overcoming memory constraints. + +Gradient accumulation can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `gradient_accumulation_steps`. + +```yaml +{ + "gradient_accumulation_steps": "auto" +} +``` + ### Gradient clipping +Gradient clipping is useful for preventing exploding gradients which can lead to instability during training. It sets a maximum threshold value and rescales the gradients if their norm exceeds the threshold. + +Gradient clipping can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `max_grad_norm`. + +```yaml +{ + "gradient_clipping": "auto" +} +``` + ### Mixed precision training +Mixed precision accelerates training speed by performing some calculations in half-precision, but it also maintains some calculations in full-precision to preserve accuracy. DeepSpeed supports fp32, fp16, and bf16 data types. + + + + +Train in fp32 if a model wasn't pretrained in mixed precision because it may cause underflow or overflow errors. Disable fp16, the default, in this case. + +```yaml +{ + "fp16": { + "enabled": false + } +} +``` + +For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) mode is automatically enabled for some operations but the results are still in fp32. Configure it from the [`Trainer`] by setting `--tf32` to enable it, and `--tf32 0` or `--no_tf32` to disable it. + + + + +To configure AMP-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically enables or disables fp16 based on the value of `fp16_backend`, and the rest of the config can be set by you. fp16 is enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend amp` or `--fp16_full_eval`. + +```yaml +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +``` + +For additional DeepSpeed fp16 training options, take a look at the [FP16 Training Options](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) reference. + +To configure Apex-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. 
[`Trainer`] automatically configures `amp` based on the values of `fp16_backend` and `fp16_opt_level`. It can also be enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend apex` or `--fp16_opt_level 01`. + +```yaml +{ + "amp": { + "enabled": "auto", + "opt_level": "auto" + } +} +``` + + + + +> [!TIP] +> bf16 requires DeepSpeed 0.6.0. + +bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because bf16s low precision can lead to lossy accumulation. + +bf16 can be set up in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`. + +```yaml +{ + "bf16": { + "enabled": "auto" + } +} +``` + + + + ### Optimizer and scheduler +DeepSpeed and Transformers optimizers and schedulers can be mixed and matched if `offload_optimizer` isn't enabled. When `offload_optimizer` is enabled, use a non-DeepSpeed optimizer (except for LAMB) as long as it has it a CPU and GPU implementation. + +Set the optimizer and scheduler parameters for the config file from the command line to avoid hard to find errors. For example, if the learning rate is set to a different value in another place, you can override it from the command line. + + + + +DeepSpeed offers several [optimizers](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters) (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. If you don't configure the optimizer in the config, [`Trainer`] automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: `lr`, `adam_beta1`, `adam_beta2`, `adam_epsilon`, `weight_decay`. + +You can set the parameters to `"auto"` or manually input your own values. + +```yaml +{ + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + } +} +``` + +Use an unsupported optimizer by adding the following to the top level configuration. + +```yaml +{ + "zero_allow_untested_optimizer": true +} +``` + +From DeepSpeed 0.8.3+, if you want to use offload, you'll also need to add the following to the top level configuration because offload works best with DeepSpeed's CPU Adam optimizer. + +```yaml +{ + "zero_force_ds_cpu_optimizer": false +} +``` + + + + +DeepSpeed supports the LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR learning rate [schedulers](https://www.deepspeed.ai/docs/config-json/#scheduler-parameters). + +Transformers and DeepSpeed provide two of the same schedulers: + +* WarmupLR is the same as `--lr_scheduler_type constant_with_warmup` in Transformers. +* WarmupDecayLR is the same as `--lr_scheduler_type linear` in Transformers (this is the default scheduler used in Transformers). + +If you don't configure the scheduler in the config file, [`Trainer`] automatically selects WarmupDecayLR and either uses the supplied values or the default values for the following parameters from the command line: `warmup_min_lr`, `warmup_max_lr`, `warmup_num_steps`, `total_num_steps` (automatically calculated during run time if `max_steps` is not provided). + +You can set the parameters to `"auto"` or manually input your own values. 
+ +```yaml +{ + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + } +} +``` + + + + ## Deploy +DeepSpeed can be deployed with its native launcher, [torchrun](https://pytorch.org/docs/stable/elastic/run.html) or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). + +Add the `--deepspeed ds_config.json` argument to the [`Trainer`] command line. It is recommended to use DeepSpeeds [add_config_arguments](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any other command line arguments to your code. + + + + +To deploy DeepSpeed on multiple GPUs, add `--num_gpus`. You don't need to add `--num_gpus` if you're planning on using all available GPUs. + +```bash +deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ +--deepspeed tests/deepspeed/ds_config_zero3.json \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ +--output_dir output_dir --overwrite_output_dir --fp16 \ +--do_train --max_train_samples 500 --num_train_epochs 1 \ +--dataset_name wmt16 --dataset_config "ro-en" \ +--source_lang en --target_lang ro +``` + + + + +DeepSpeed is still useful with just one GPU because you can: + +1. Offload some computations and memory to the CPU to make more GPU resources available to your model to use a larger batch size or fit a very large model that normally won't fit. +2. Minimize memory fragmentation with its smart GPU memory management system which also allows you to fit bigger models and data batches. + +To deploy DeepSpeed on a single GPU, add `--num_gpus`. You don't need to add `--num_gpus` if you only have one GPU because DeepSpeed deploys all GPUs it can see on a given node. + +> [!TIP] +> Set the `allgather_bucket_size` and `reduce_bucket_size` values to 2e8 in the [ZeRO-2](#zero-configuration) configuration file to get better performance on a single GPU. + +```bash +deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ +--deepspeed tests/deepspeed/ds_config_zero2.json \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ +--output_dir output_dir --overwrite_output_dir --fp16 \ +--do_train --max_train_samples 500 --num_train_epochs 1 \ +--dataset_name wmt16 --dataset_config "ro-en" \ +--source_lang en --target_lang ro +``` + + + + ### Multi-node -### SLURM +A multi-node setup consists of multiple nodes, where each node has one of more GPUs running a workload. DeepSpeed expects a shared storage system, but if this is not the case, you need to adjust the config file to include a [checkpoint](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to allow loading without access to a shared filesystem. + +```yaml +{ + "checkpoint": { + "use_node_local_storage": true + } +} +``` + +You could also use the `--save_on_each_node` parameter in [`TrainingArguments`] to automatically add the above `checkpoint` to your config. + +The examples below for the torchrun and DeepSpeed launcher shows how to deploy two nodes with eight GPUs each. Access the first node with `ssh hotname1` and the second node with `ssh hostname2`. Both nodes must be able to communicate with each other locally over ssh without a password. + + + + +With [torchrun](https://pytorch.org/docs/stable/elastic/run.html), ssh to each node and run the following command on both of them. 
The launcher waits until both nodes are synchronized before launching the training. + +```bash +torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \ +--master_port=9901 your_program.py --deepspeed ds_config.json +``` + + + + +Create a `hostfile` for the DeepSpeed launcher. + +```bash +hostname1 slots=8 +hostname2 slots=8 +``` + +The DeepSpeed launcher automatically launches the command on both nodes at once with the command below. + +```bash +deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 \ +your_program.py --deepspeed ds_config.json +``` + +Check out the [Resource Configuration (multi-node)](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) guide for more details about configuring multi-node compute resources. + + + + +### Slurm + +[Slurm](https://slurm.schedmd.com/documentation.html) is a cluster management and job scheduling system. An example Slurm script is shown below. + +```bash +#SBATCH --job-name=test-nodes # name +#SBATCH --nodes=2 # nodes +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=10 # number of cores per tasks +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +#SBATCH --output=%x-%j.out # output file name + +export GPUS_PER_NODE=8 +export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_PORT=9901 + +srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \ + --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ +your_program.py --deepspeed ds_config.json' +``` + +Launch training simultaneously on all nodes with the command below. + +```bash +sbatch launch.slurm +``` + +### Jupyter Notebook -### Notebook +To use DeepSpeed in a Jupyter Notebook, you need to emulate a distributed environment because the launcher doesn't support deployment from a notebook. This is only supported for one GPU. To use multiple GPUs, you must use a multi-process environment, which means you have to use the DeepSpeed launcher which can't be emulated as shown here. + +```py +# emulate a launcher in the notebook +import os + +os.environ["MASTER_ADDR"] = "localhost" +os.environ["MASTER_PORT"] = "9994" # modify if RuntimeError: Address already in use +os.environ["RANK"] = "0" +os.environ["LOCAL_RANK"] = "0" +os.environ["WORLD_SIZE"] = "1" + +training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") +trainer = Trainer(...) +trainer.train() +``` + +Create a config file on the fly in the notebook in the current directory with a dedicated cell. 
+ +```py +%%bash +cat <<'EOT' > ds_config_zero3.json +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} +EOT +``` + +If the training script is in a file and not a notebook cell, launch DeepSpeed from the shell in the notebook cell. + +```py +!git clone https://github.com/huggingface/transformers +!cd transformers; deepspeed examples/pytorch/translation/run_translation.py ... +``` + +Another option is to use `%%bash` to run the shell program without emulating the distributed environment. However, you won't be able to view the logs until training is complete. + +```py +%%bash + +git clone https://github.com/huggingface/transformers +cd transformers +deepspeed examples/pytorch/translation/run_translation.py ... +``` ## Save model weights +DeepSpeed stores the main fp32 weights in custom checkpoint optimizer files (`global_step*/*optim_states.pt`) which are saved under the normal checkpoint. + ### fp16 +ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3, set `"stage3_gather_16bit_weights_on_model_save": true` in the config file, because the weights are distributed across multiple GPUs. + +If you don't, [`Trainer`] won't save the weights in fp16 and it won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it. + +```yaml +{ + "zero_optimization": { + "stage": 3, + "stage3_gather_16bit_weights_on_model_save": true + } +} +``` + ### fp32 + +fp32 weights shouldn't be saved during training because it can require a lot of memory, unless you have a lot of free CPU memory. It is usually best to save the fp32 weights offline after training is complete. + + + + +DeepSpeed provies a [zero_to_fp32.py](https://github.com/microsoft/DeepSpeed/blob/91829476a8fd4d0d9268c03c1d56795d20a51c12/deepspeed/utils/zero_to_fp32.py#L14) script at the top-level checkpoint folder for extracting weights at any point. This is a standalone script and you don't need a config file or [`Trainer`]. + +For example, if your checkpoint folder looks like the one shown below, then you can run the following command to create and consolidate the fp32 weights from multiple GPUs into a single `pytorch_model.bin` file. The script automatically discovers the subfolder `global_step1` which contains the checkpoint. 
+ +```bash +$ ls -l output_dir/checkpoint-1/ +-rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json +drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ +-rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest +-rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt +-rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin +-rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt +-rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json +-rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model +-rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json +-rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json +-rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin +-rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* +``` + +> [!TIP] +> Run `python zero_to_fp32.py -h` for more usage details. The script requires 2x the general RAM of the final fp32 weights. + +```bash +python zero_to_fp32.py . pytorch_model.bin +``` + + + + +Adding the `--load_best_model_at_end` parameter in [`TrainingArguments`] tracks the best checkpoint so you can finish training first and save the final model explicitly. Reload the model as shown below. + +> [!WARNING] +> Once [load_state_dict_from_zero_checkpoint](https://deepspeed.readthedocs.io/en/stable/model-checkpointing.html#deepspeed.utils.zero_to_fp32.load_state_dict_from_zero_checkpoint) is run, the model is no longer usable in DeepSpeed in the context of the same application. You'll need to reinitialize the DeepSpeed engine because `model.load_state_dict(state_dict)` removes all the DeepSpeed magic from it. Only use this function once training is complete. + +```py +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + +checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final") +trainer.deepspeed.save_checkpoint(checkpoint_dir) +fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) +``` + +You must have saved at least one checkpoint to load the latest checkpoint as shown in the example below. + +```py +from transformers.trainer_utils import get_last_checkpoint +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + +checkpoint_dir = get_last_checkpoint(trainer.args.output_dir) +fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) +``` + +Use `load_state_dict` to extract and load the state_dict of the fp32 weights. + +```py +from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + +state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) +model = model.cpu() +model.load_state_dict(state_dict) +``` + + + + +## Non-Trainer integration + +DeepSpeed also works with Transformers without [`Trainer`]. The [`~integrations.HfDeepSpeedConfig`] is responsible for gathering ZeRO-3 parameters and partitioning a model across multiple GPUs when [`~PreTrainedModel.from_pretrained`] is called. + +You must instantiate [`HfDeepSpeedConfig`] before loading a model to efficiently deploy ZeRO-3. + + + + +```py +from transformers.integrations import HfDeepSpeedConfig +from transformers import AutoModel +import deepspeed + +# DeepSpeed config object or path to the file +ds_config = {...} +# must run before instantiating the model to detect ZeRO-3 +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +model = AutoModel.from_pretrained("openai-community/gpt2") +engine = deepspeed.initialize(model=model, config_params=ds_config, ...) +``` + + + + +[`HfDeepSpeedConfig`] is not required for ZeRO-1 or ZeRO-2. 
+ +```py +from transformers.integrations import HfDeepSpeedConfig +from transformers import AutoModel, AutoConfig +import deepspeed + +# DeepSpeed config object or path to the file +ds_config = {...} +# must run before instantiating the model to detect zero 3 +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +# randomly intialize model weights +config = AutoConfig.from_pretrained("openai-community/gpt2") +model = AutoModel.from_config(config) +engine = deepspeed.initialize(model=model, config_params=ds_config, ...) +``` + + + + +## Troubleshoot + +One of the first things to check when you encounter an error is whether DeepSpeed is the cause because often it isn't. Retry your setup without DeepSpeed, and if the error persists, report the issue. If the issue is unrelated to the Transformers integration, please open the issue on the DeepSpeed [repository](https://github.com/microsoft/DeepSpeed). + +For issues related to the Transformers integration, please provide the following information. + +* The full DeepSpeed config file. +* The command line arguments for [`Trainer`] or the [`TrainingArguments`] if you're scripting the [`Trainer`] setup yourself (don't dump the entire [`TrainingArguments`] which contains many irrelevant entries). +* The outputs of the following commands. + + ```bash + python -c 'import torch; print(f"torch: {torch.__version__}")' + python -c 'import transformers; print(f"transformers: {transformers.__version__}")' + python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")' + ``` + +* A link to a Google Colab notebook to reproduce the issue. +* A standard or non-custom dataset or an existing example to reproduce the issue. + +The following sections provide a guide for resolving two of the most common issues. + +### Process killed at startup + +When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than is available on your system. Or the process may have tried to allocate more CPU memory than allowed, leading the OS kernel to termine the process. + +In this case, check whether your config file has either `offload_optimizer`, `offlload_param`, or both configured to offload to the CPU. + +If you have NVM3 and ZeRO-3 set up, experiment with offloading to the NVMe ([estimate](https://deepspeed.readthedocs.io/en/latest/memory.html) the memory requirements of a model first) instead. + +### NaN loss + +NaN loss often occurs when a model is pretrained in bf16 and you try to use it with fp16 (especially relevant to TPU trained models). To resolve this, use fp32 or bf16 if your hardware (TPUs, Ampere GPUs or newer) supports it. + +It is also possible that fp16 is causing overflow. For example, if your config file looks like the one below, you may see the following overflow errors in the logs. + +```yaml +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +``` + +The `OVERFLOW!` error below is a result of the DeepSpeed loss scaler unable to find a scaling coefficient to overcome the loss overflow. Try a higher `initial_scale_power` value in this case (32 usually works). 
+ +```bash +0%| | 0/189 [00:00 Date: Fri, 13 Dec 2024 14:56:55 -0800 Subject: [PATCH 078/116] chat toctree --- docs/source/en/_toctree.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 499e7e8db5c5..ade035da6bb3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -82,16 +82,16 @@ title: Getting the most out of LLMs - local: perplexity title: Perplexity of fixed-length models - - title: Chat - sections: - - local: conversations - title: Chat pipeline - - local: chat_templating - title: Templates - - local: chat_templating_writing - title: Template writing - - local: chat_extras - title: Tools and RAG + - title: Chat + sections: + - local: conversations + title: Chat pipeline + - local: chat_templating + title: Templates + - local: chat_templating_writing + title: Template writing + - local: chat_extras + title: Tools and RAG - title: Optimization sections: - local: perf_torch_compile From 638a6fc3dd4fea81ffaa172a9cd0ebe0203510b6 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 18 Dec 2024 18:07:11 -0800 Subject: [PATCH 079/116] quant pt 1 --- docs/source/en/_toctree.yml | 34 +++++----- docs/source/en/quantization/eetq.md | 38 ++++++++--- docs/source/en/quantization/hqq.md | 66 ++++++++++++------- docs/source/en/quantization/overview.md | 84 ++++--------------------- 4 files changed, 104 insertions(+), 118 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ade035da6bb3..88de99a5c4fb 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -154,37 +154,38 @@ - local: model_memory_anatomy title: Model training anatomy - title: Quantization - isExpanded: false + isExpanded: False sections: - local: quantization/overview - title: Getting started - - local: quantization/bitsandbytes - title: bitsandbytes - - local: quantization/gptq - title: GPTQ - - local: quantization/awq - title: AWQ + title: Overview - local: quantization/aqlm title: AQLM - - local: quantization/quanto - title: Quanto + - local: quantization/awq + title: AWQ + - local: quantization/bitnet + title: BitNet + - local: quantization/bitsandbytes + title: bitsandbytes + - local: quantization/compressed_tensors + title: compressed-tensors - local: quantization/eetq title: EETQ - - local: quantization/hqq - title: HQQ - local: quantization/fbgemm_fp8 title: FBGEMM FP8 + - local: quantization/gptq + title: GPTQ + - local: quantization/hqq + title: HQQ - local: quantization/optimum title: Optimum + - local: quantization/quanto + title: Quanto - local: quantization/torchao title: TorchAO - - local: quantization/bitnet - title: BitNet - - local: quantization/compressed_tensors - title: compressed-tensors - local: quantization/contribute title: Contribute new quantization method - title: Deploy to production + isExpanded: False sections: - local: serialization title: Export to ONNX @@ -274,6 +275,7 @@ - local: troubleshooting title: Troubleshoot - title: Community + isExpanded: False sections: - local: contributing title: How to contribute to Transformers? diff --git a/docs/source/en/quantization/eetq.md b/docs/source/en/quantization/eetq.md index bf2c4e0e6466..5fabd2aa7e4b 100644 --- a/docs/source/en/quantization/eetq.md +++ b/docs/source/en/quantization/eetq.md @@ -16,32 +16,50 @@ rendered properly in your Markdown viewer. 
# EETQ -The [EETQ](https://github.com/NetEase-FuXi/EETQ) library supports int8 per-channel weight-only quantization for NVIDIA GPUS. The high-performance GEMM and GEMV kernels are from FasterTransformer and TensorRT-LLM. It requires no calibration dataset and does not need to pre-quantize your model. Moreover, the accuracy degradation is negligible owing to the per-channel quantization. +The [Easy & Efficient Quantization for Transformers (EETQ)](https://github.com/NetEase-FuXi/EETQ) library supports int8 weight-only per-channel quantization for NVIDIA GPUs. It uses high-performance GEMM and GEMV kernels from [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM). The attention layer is optimized with [FlashAttention2](https://github.com/Dao-AILab/flash-attention). No calibration dataset is required, and the model doesn't need to be pre-quantized. Accuracy degradation is negligible owing to the per-channel quantization. -Make sure you have eetq installed from the [release page](https://github.com/NetEase-FuXi/EETQ/releases) -``` +EETQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft). + +Install EETQ from the [release page](https://github.com/NetEase-FuXi/EETQ/releases) or [source code](https://github.com/NetEase-FuXi/EETQ). CUDA 11.4+ is required for EETQ. + + + + +```bash pip install --no-cache-dir https://github.com/NetEase-FuXi/EETQ/releases/download/v1.0.0/EETQ-1.0.0+cu121+torch2.1.2-cp310-cp310-linux_x86_64.whl ``` -or via the source code https://github.com/NetEase-FuXi/EETQ. EETQ requires CUDA capability <= 8.9 and >= 7.0 -``` + + + + +```bash git clone https://github.com/NetEase-FuXi/EETQ.git cd EETQ/ git submodule update --init --recursive pip install . ``` -An unquantized model can be quantized via "from_pretrained". + +
+ +Quantize a model on-the-fly by defining the quantization data type in [`EetqConfig`]. + ```py from transformers import AutoModelForCausalLM, EetqConfig -path = "/path/to/model" + quantization_config = EetqConfig("int8") -model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", quantization_config=quantization_config) +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B", + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) ``` -A quantized model can be saved via "saved_pretrained" and be reused again via the "from_pretrained". +Save the quantized model with [`~PreTrainedModel.save_pretrained`] so it can be reused again with [`~PreTrainedModel.from_pretrained`]. ```py quant_path = "/path/to/save/quantized/model" model.save_pretrained(quant_path) model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="auto") -``` \ No newline at end of file +``` diff --git a/docs/source/en/quantization/hqq.md b/docs/source/en/quantization/hqq.md index 34608cd64fd8..1c6a418abb64 100755 --- a/docs/source/en/quantization/hqq.md +++ b/docs/source/en/quantization/hqq.md @@ -14,27 +14,43 @@ rendered properly in your Markdown viewer. --> +# HQQ -# HQQ +[Half-Quadratic Quantization (HQQ)](https://github.com/mobiusml/hqq/) supports fast on-the-fly quantization for 8, 4, 3, 2, and even 1-bits. It doesn't require calibration data, and it is compatible with any model modality (LLMs, vision, etc.). -Half-Quadratic Quantization (HQQ) implements on-the-fly quantization via fast robust optimization. It doesn't require calibration data and can be used to quantize any model. -Please refer to the official package for more details. +HQQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft) and is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training. -For installation, we recommend you use the following approach to get the latest version and build its corresponding CUDA kernels: -``` +Install HQQ with the following command to get the latest version and to build its corresponding CUDA kernels. + +```bash pip install hqq ``` -To quantize a model, you need to create an [`HqqConfig`]. There are two ways of doing it: -``` Python +You can choose to either replace all the linear layers in a model with the same quantization config or dedicate a specific quantization config for specific linear layers. + + + + +Quantize a model by creating a [`HqqConfig`] and specifying the `nbits` and `group_size` to replace for all the linear layers ([torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) of the model. + +``` py from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig -# Method 1: all linear layers will use the same quantization config quant_config = HqqConfig(nbits=8, group_size=64) +model = transformers.AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B", + torch_dtype=torch.float16, + device_map="cuda", + quantization_config=quant_config +) ``` -``` Python -# Method 2: each linear layer with the same tag will use a dedicated quantization config + + + +Quantize a model by creating a dictionary specifying the `nbits` and `group_size` for the linear layers to quantize. Pass them to [`HqqConfig`] and set which layers to quantize with the config. This approach is especially useful for quantizing mixture-of-experts (MoEs) because they are less affected ly lower quantization settings. 
+ +``` py q4_config = {'nbits':4, 'group_size':64} q3_config = {'nbits':3, 'group_size':32} quant_config = HqqConfig(dynamic_config={ @@ -47,23 +63,31 @@ quant_config = HqqConfig(dynamic_config={ 'mlp.up_proj' :q3_config, 'mlp.down_proj':q3_config, }) -``` - -The second approach is especially interesting for quantizing Mixture-of-Experts (MoEs) because the experts are less affected by lower quantization settings. - -Then you simply quantize the model as follows -``` Python model = transformers.AutoModelForCausalLM.from_pretrained( - model_id, + "meta-llama/Llama-3.1-8B", torch_dtype=torch.float16, device_map="cuda", quantization_config=quant_config ) ``` -## Optimized Runtime +## Backends + +HQQ supports various backends, including pure PyTorch and custom dequantization CUDA kernels. These backends are suitable for older GPUs and PEFT/QLoRA training. + +```py +from hqq.core.quantize import * + +HQQLinear.set_backend(HQQBackend.PYTORCH) +``` + +For faster inference, HQQ supports 4-bit fused kernels (torchao and Marlin) after a model is quantized. These can reach up to 200 tokens/sec on a single 4090. The example below demonstrates enabling the torchao_int4 backend. + +```py +from hqq.utils.patching import prepare_for_inference + +prepare_for_inference("model", backend="torchao_int4") +``` -HQQ supports various backends, including pure PyTorch and custom dequantization CUDA kernels. These backends are suitable for older gpus and peft/QLoRA training. -For faster inference, HQQ supports 4-bit fused kernels (TorchAO and Marlin), reaching up to 200 tokens/sec on a single 4090. -For more details on how to use the backends, please refer to https://github.com/mobiusml/hqq/?tab=readme-ov-file#backend +Refer to the [Backend](https://github.com/mobiusml/hqq/#backend) guide for more details. diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 94696e300a57..eb5dcc8a848d 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -1,4 +1,4 @@ - -# Quantization +# Overview -Quantization techniques focus on representing data with less information while also trying to not lose too much accuracy. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size which makes it easier to store and reduces memory-usage. Lower precision can also speedup inference because it takes less time to perform calculations with fewer bits. +Quantization lowers the memory requirements of loading and using a model by storing the weights in a lower precision while trying to preserve as much accuracy as possible. Weights are typically stored in full-precision (fp32) floating point representations, but half-precision (fp16 or bf16) are increasingly popular data types given the large size of models today. Some quantization methods can reduce the precision even further to integer representations, like int8 or int4. - +Transformers supports many quantization methods, each with their pros and cons, so you can pick the best one for your specific use case. Some methods require calibration for greater accuracy and extreme compression (1-2 bits), while other methods work out of the box with on-the-fly quantization. -Interested in adding a new quantization method to Transformers? Read the [HfQuantizer](./contribute) guide to learn how! 
+Use the Space below to help you pick a quantization method depending on your hardware and number of bits to quantize to. - + - +## Resources -If you are new to the quantization field, we recommend you to check out these beginner-friendly courses about quantization in collaboration with DeepLearning.AI: +If you are new to quantization, we recommend checking out these beginner-friendly quantization courses in collaboration with DeepLearning.AI. * [Quantization Fundamentals with Hugging Face](https://www.deeplearning.ai/short-courses/quantization-fundamentals-with-hugging-face/) * [Quantization in Depth](https://www.deeplearning.ai/short-courses/quantization-in-depth/) - - - -## When to use what? - -The community has developed many quantization methods for various use cases. With Transformers, you can run any of these integrated methods depending on your use case because each method has their own pros and cons. - -For example, some quantization methods require calibrating the model with a dataset for more accurate and "extreme" compression (up to 1-2 bits quantization), while other methods work out of the box with on-the-fly quantization. - -Another parameter to consider is compatibility with your target device. Do you want to quantize on a CPU, GPU, or Apple silicon? - -In short, supporting a wide range of quantization methods allows you to pick the best quantization method for your specific use case. - -Use the table below to help you decide which quantization method to use. - -| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library | -|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------| -| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | -| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | -| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | -| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | -| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | -| [SpQR](./spqr.md) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ | -| [FINEGRAINED_FP8](./finegrained_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | | - - -**1:** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. - - - - - -**2:** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. - - - - - -**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/AMD/Apple Silicon. - - - - - -**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max/Arc GPUs. - - - - - -**5:** torchao only supports int4 weight on Metal (Apple Silicon). 
- - - From 6503dce2162d4351d87cf772a29c36bcda2d6668 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 19 Dec 2024 16:24:46 -0800 Subject: [PATCH 080/116] quant pt 2 --- docs/source/en/_toctree.yml | 10 +- .../en/quantization/compressed_tensors.md | 95 ++++++------------- docs/source/en/quantization/contribute.md | 64 +++++++------ docs/source/en/quantization/eetq.md | 2 +- docs/source/en/quantization/fbgemm_fp8.md | 41 ++++---- docs/source/en/quantization/hqq.md | 9 +- docs/source/en/quantization/optimum.md | 2 +- docs/source/en/quantization/quanto.md | 69 +++++++------- docs/source/en/quantization/torchao.md | 83 ++++++---------- 9 files changed, 160 insertions(+), 215 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 88de99a5c4fb..d38ebf4f6b11 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -13,8 +13,6 @@ sections: - local: models title: Load - - local: gguf - title: GGUF - local: custom_models title: Create a custom model - local: how_to_hack_models @@ -171,7 +169,9 @@ - local: quantization/eetq title: EETQ - local: quantization/fbgemm_fp8 - title: FBGEMM FP8 + title: FBGEMM + - local: gguf + title: GGUF - local: quantization/gptq title: GPTQ - local: quantization/hqq @@ -181,9 +181,9 @@ - local: quantization/quanto title: Quanto - local: quantization/torchao - title: TorchAO + title: torchao - local: quantization/contribute - title: Contribute new quantization method + title: Contribute - title: Deploy to production isExpanded: False sections: diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index 177e26144589..9e8fc03fa10f 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -13,98 +13,61 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -# Compressed Tensors -The [`compressed-tensors`](https://github.com/neuralmagic/compressed-tensors) library provides a versatile and efficient way to store and manage compressed model checkpoints. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more. +# compressed-tensors -Some of the supported formats include: -1. `dense` -2. `int-quantized` ([sample](https://huggingface.co/nm-testing/tinyllama-w8a8-compressed-hf-quantizer)): INT8 quantized models -3. `float-quantized` ([sample](https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat)): FP8 quantized models; currently support E4M3 -4. `pack-quantized` ([sample](https://huggingface.co/nm-testing/tinyllama-w4a16-compressed-hf-quantizer)): INT4 or INT8 weight-quantized models, packed into INT32. For INT4, the weights have an INT4 range but are stored as INT8 and then packed into INT32. +[compressed-tensors](https://github.com/neuralmagic/compressed-tensors) extends [safetensors](https://github.com/huggingface/safetensors) files to compressed tensor data types to provide a unified checkpoint format for storing and loading various quantization and sparsity formats such dense, int-quantized (int8), float-quantized (fp8), and pack-quantized (int4 or int8 weight-quantized packed into int32). -Compressed models can be easily created using [llm-compressor](https://github.com/vllm-project/llm-compressor). 
-Alternatively models can be created independently and serialized with a compressed tensors config. +compressed-tensors supports fine-tuning with [PEFT](https://huggingface.co/docs/peft) and includes the following features as well. -To find existing models on the Hugging Face Model Hub, search for the [`compressed-tensors` tag](https://huggingface.co/models?other=compressed-tensors). +- fp8, int4, int8 weight and activation precisions. +- Quantization scales and zero-points strategies for [tensor, channel, group, block, token](https://github.com/neuralmagic/compressed-tensors/blob/83b2e7a969d70606421a76b9a3d112646077c8de/src/compressed_tensors/quantization/quant_args.py#L43-L52). +- Dynamic per-token activation quantization (or any static strategy). +- Weight sparsity (unstructured or semi-structured like 2:4) can be composed with quantization for extreme compression. +- Quantization of arbitrary modules, not just [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) modules. +- Targeted support for specific modules by name or class. -#### Features: - - Weight and activation precisions: FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT) - - Quantization scales and zero-points strategies: [tensor, channel, group, block, token](https://github.com/neuralmagic/compressed-tensors/blob/83b2e7a969d70606421a76b9a3d112646077c8de/src/compressed_tensors/quantization/quant_args.py#L43-L52) - - Dynamic per-token activation quantization (or any static strategy) - - Sparsity in weights (unstructured or semi-structured like 2:4) can be composed with quantization for extreme compression - - Supports quantization of arbitrary modules, not just Linear modules - - Targeted support or ignoring of modules by name or class +Install compressed-tensors from [PyPI](https://pypi.org/project/compressed-tensors) to get the latest stable release (recommended) or install it from source to get the latest features. -## Installation + + -It is recommended to install stable releases of compressed-tensors from [PyPI](https://pypi.org/project/compressed-tensors): ```bash pip install compressed-tensors ``` -Developers who want to experiment with the latest features can also install the package from source: + +Hello, my name is [Name]. I am a [Your Profession/Student] and I am here to learn about the [Course/Program] at [University/Institution]. I am excited to be here and I am looking forward to', '<|begin_of_text|>The capital of France is Paris, which is located in the north-central part of the country. Paris is the most populous city in France and is known for its stunning architecture, art museums, fashion, and romantic atmosphere. The city is home to', "<|begin_of_text|>The future of AI is here, and it's already changing the way we live and work. From virtual assistants to self-driving cars, AI is transforming industries and revolutionizing the way we interact with technology. But what does the future of AI hold"] -""" - -``` - -The above shows a quick example for running generation using a `compressed-tensors` -model. Currently, once loaded the model cannot be saved. - -## Deep dive into a compressed-tensors model checkpoint - -In this example we will examine how the compressed-tensors model nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf is defined through its configuration entry and see how this translates to the loaded model representation. 
+## Model checkpoint -First, let us look at the [`quantization_config` of the model](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json). At a glance it looks overwhelming with the number of entries but this is because compressed-tensors is a format that allows for flexible expression both during and after model compression. +compressed-tensor models are defined through its configuration entry. The following example is taken from the [nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json) `config.json` file. -In practice for checkpoint loading and inference the configuration can be simplified to not include all the default or empty entries, so we will do that here to focus on what compression is actually represented. +There are a lot of entries to allow for flexible expression both during and after compression, but the entries for loading and inference can be simplified to focus on just a few key entries. ```yaml "quantization_config": { @@ -130,9 +93,9 @@ In practice for checkpoint loading and inference the configuration can be simpli }, ``` -We can see from the above configuration that it is specifying one config group that includes weight and activation quantization to FP8 with a static per-tensor strategy. It is also worth noting that in the `ignore` list there is an entry to skip quantization of the `lm_head` module, so that module should be untouched in the checkpoint. +The config file specifies the quantization of a config group (`group_0`), which includes weight and activation quantization to fp8 with a static per-tensor strategy. The `lm_head` module is unquantized as shown in the `ignore` key. -To see the result of the configuration in practice, we can simply use the [safetensors viewer](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf?show_file_info=model.safetensors.index.json) on the model card to see the quantized weights, input_scale, and weight_scale for all of the Linear modules in the first model layer (and so on for the rest of the layers). +For a more detailed look at the model weights, use the [safetensors viewer](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf?show_file_info=model.safetensors.index.json) on the model card to see the quantized weights, input scale, and weight scale for all [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) modules. | Tensors | Shape | Precision | | ------- | ----- | --------- | @@ -160,7 +123,7 @@ model.layers.0.self_attn.v_proj.input_scale | [1] | BF16 model.layers.0.self_attn.v_proj.weight | [1 024, 4 096] | F8_E4M3 model.layers.0.self_attn.v_proj.weight_scale | [1] | BF16 -When we load the model with the compressed-tensors HFQuantizer integration, we can see that all of the Linear modules that are specified within the quantization configuration have been replaced by `CompressedLinear` modules that manage the compressed weights and forward pass for inference. Note that the `lm_head` mentioned before in the ignore list is still kept as an unquantized Linear module. 
+When loading a compressed-tensors model with the [`~quantizers.HFQuantizer`] integration, all the [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) modules specified in the quantization config are replaced by [CompressedLinear](https://github.com/neuralmagic/compressed-tensors/blob/975cb223b19fcac2b98a4271d17668462d4d6e1d/src/compressed_tensors/linear/compressed_linear.py#L30) modules that manage the compressed weights and forward pass for inference. The `lm_head` module is still kept as an unquantized nn.Linear module. ```python from transformers import AutoModelForCausalLM diff --git a/docs/source/en/quantization/contribute.md b/docs/source/en/quantization/contribute.md index fb7ef6992223..0d56d7e9d71a 100644 --- a/docs/source/en/quantization/contribute.md +++ b/docs/source/en/quantization/contribute.md @@ -14,56 +14,58 @@ rendered properly in your Markdown viewer. --> -# Contribute new quantization method +# Contribute -Transformers supports and integrates many quantization methods such as QLoRA, GPTQ, LLM.int8, and AWQ. However, there are other quantization approaches that are not yet integrated. To make adding and using these quantization methods with Transformers models easier, you should use the [`HfQuantizer`] class. The [`HfQuantizer`] is designed as an internal helper class for adding a quantization method instead of something you apply to every PyTorch module. +Transformers supports many quantization methods such as QLoRA, GPTQ, LLM.int8, and AWQ. However, there are still many more quantization approaches that haven't been integrated yet. To make adding and using these quantization methods with Transformers easier, use the [`~quantizers.HfQuantizer`] class. [`~quantizers.HfQuantizer`] is designed to be an internal helper class for adding a quantization method instead of something applied to every PyTorch module. -This guide will show you how to integrate a new quantization method with the [`HfQuantizer`] class. +This guide will show you how to integrate a new quantization method with [`~quantizers.HfQuantizer`]. ## Requirements -Before integrating a new quantization method into Transformers, ensure the method you are trying to add meets the following prerequisites. Only quantization methods that can be run with PyTorch modules are currently supported. +Before integrating a new quantization method into Transformers, ensure the method meets the following requirements. Only quantization methods that can be run with PyTorch modules are supported. -- The quantization method is available through a Python package that is pip-installable by anyone (it is also fine if you can only install the package from source). Ideally, pre-compiled kernels are included in the pip package. -- The method can run on commonly-used hardware (CPU, GPU, ...). -- The method is wrapped in a `nn.Module` (e.g., `Linear8bitLt`, `Linear4bit`), and the quantized linear layer should have the following definition: +- The quantization method is available through a Python package that is pip-installable (it is also fine if you can only install the package from source). Ideally, pre-compiled kernels are included in the pip package. +- The method can run on commonly-used hardware (CPU, GPU, etc.). +- The method is wrapped in a [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) ([`~bitsandbytes.nn.Linear8bitLt`], [`~bitsandbytes.nn.Linear4bit`]), and the quantized linear layer should have the following definition. 
-```py -class Linear4bit(nn.Module): - def __init__(self, ...): - ... - - def forward(self, x): - return my_4bit_kernel(x, self.weight, self.bias) -``` + ```py + class Linear4bit(nn.Module): + def __init__(self, ...): + ... + + def forward(self, x): + return my_4bit_kernel(x, self.weight, self.bias) + ``` -This way, Transformers models can be easily quantized by replacing some instances of `nn.Linear` with a target class. + This way, Transformers models are easily quantized by replacing instances of [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) with a target class. - The quantization method should be serializable. You can save the quantized weights locally or push them to the Hub. -- Make sure the package that contains the quantization kernels/primitive is stable (no frequent breaking changes). +- Make sure the package containing the quantization kernels/primitive is stable (no frequent breaking changes). -For some quantization methods, they may require "pre-quantizing" the models through data calibration (e.g., AWQ). In this case, we prefer to only support inference in Transformers and let the third-party library maintained by the ML community deal with the model quantization itself. +Some quantization methods may require "pre-quantizing" the model through data calibration (AWQ). In this case, we prefer to only support inference in Transformers and let the third-party library maintained by the ML community deal handle the model quantization itself. -## Build a new HFQuantizer class +## Create new HFQuantizer class -1. Create a new quantization config class inside [src/transformers/utils/quantization_config.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/quantization_config.py) and make sure to expose the new quantization config inside Transformers main `init` by adding it to the [`_import_structure`](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py#L1088) object of [src/transformers/__init__.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py). +1. Create a new quantization config class inside [src/transformers/utils/quantization_config.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/quantization_config.py). Add the new quantization config to the [_import_structure](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py#L1088) inside Transformers' [src/transformers/__init__.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py) file. -2. Create a new file inside [src/transformers/quantizers/](https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers) named `quantizer_your_method.py`, and make it inherit from [src/transformers/quantizers/base.py::HfQuantizer](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/base.py#L28). Make sure to add the new quantizer and quantization config in the quantization auto-mapping in [src/transformers/quantizers/auto.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/auto.py). +2. 
Create a new file inside [src/transformers/quantizers/](https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers) named `quantizer_your_method.py`, and make it inherit from [`~quantizers.HfQuantizer]. Make sure to add the new quantizer and quantization config in the quantization auto-mapping in [src/transformers/quantizers/auto.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/auto.py). -3. Define the following class attributes/property methods for your quantization method: +3. Define the following class attributes and property methods for your quantization method. -* `requires_calibration`: Whether the quantization method requires a data calibration process. If set to `True`, you can only support inference (with quantized weights) and not inference and quantization. -* `required_packages`: A list of strings of the required packages to use the quantized weights. You might need to define some new utility methods such as `is_auto_awq_available` in [transformers/src/utils/import_utils.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/import_utils.py). -* `requires_parameters_quantization`: Only required if your quantization method requires extra attention to the underlying `nn.Parameter` object. For example, bitsandbytes uses `Params4bit` and `Int8Param`, which requires some extra attention when quantizing the model. Most of the recent quantization method packs int2/int4 weights inside `torch.uint8` weights, so this flag should not be really required (set to `False` by default). -* `is_serializable`: A property method to determine whether the method is serializable or not. -* `is_trainable`: A property method to determine whether you can fine-tune models on top of the quantization method (with or without PEFT approaches). + - `requires_calibration`: Whether the quantization method requires a data calibration process. If set to `True`, you can only support inference (with quantized weights) and not inference and quantization. + - `required_packages`: A list of strings of the required packages to use the quantized weights. You might need to define some new utility methods such as `is_auto_awq_available` in [transformers/src/utils/import_utils.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/import_utils.py). + - `requires_parameters_quantization`: Only required if your quantization method requires extra attention to the underlying [nn.Parameter](https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html) object. For example, bitsandbytes uses [`~bitsandbytes.nn.Params4bit`] and [`~bitsandbytes.nn.Int8Params`], which requires some extra attention when quantizing the model. Most of the recent quantization method packs int2 and int4 weights inside [torch.uint8](https://pytorch.org/docs/stable/tensors.html) weights, so this flag should not be really required (set to `False` by default). + - `is_serializable`: A property method to determine whether the method is serializable or not. + - `is_trainable`: A property method to determine whether you can fine-tune models on top of the quantization method (with or without PEFT approaches). -4. Write the `validate_environment` and `update_torch_dtype` methods. These methods are called before creating the quantized model to ensure users use the right configuration. 
You can have a look at how this is done on other quantizers.
+4. Write the `validate_environment` and `update_torch_dtype` methods. These methods are called before creating the quantized model to ensure users use the right configuration. Refer to other quantizers for an example of how it is implemented.
 
-5. Write the `_process_model_before_weight_loading` method. In Transformers, the quantized models are initialized first on the `"meta"` device before loading the weights. This means the `_process_model_before_weight_loading` method takes care of manipulating the model skeleton to replace some modules (e.g., `nn.Linear`) with the target modules (quantization modules). You can define a module replacement logic or any other utility method by creating a new file in [transformers/src/integrations/](https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/integrations) and exposing the relevant methods in that folder's `__init__.py` file. The best starting point would be to have a look at another quantization methods such as [quantizer_awq.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/quantizer_awq.py).
+5. Write the `_process_model_before_weight_loading` method. In Transformers, the quantized models are initialized first on the `"meta"` device before loading the weights. This means the `_process_model_before_weight_loading` method takes care of manipulating the model skeleton to replace some modules ([nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) with the target modules (quantization modules).
+
+   You can define module replacement logic or any other utility method by creating a new file in [transformers/src/integrations/](https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/integrations) and exposing the relevant methods in that folder's `__init__.py` file. The best starting point would be to have a look at another quantization method such as [quantizer_awq.py](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/quantizer_awq.py).
 
 6. Write the `_process_model_after_weight_loading` method. This method enables implementing additional features that require manipulating the model after loading the weights.
 
-7. Document everything! Make sure your quantization method is documented by adding a new file under `docs/source/en/quantization` and adding a new row in the table in `docs/source/en/quantization/overview.md`.
+7. Document everything! Make sure your quantization method is documented by adding a new file under `docs/source/en/quantization`.
 
-8. Add tests! You should add tests by first adding the package in our nightly Dockerfile inside `docker/transformers-quantization-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out how it is implemented for other quantization methods.
+8. You should add tests by adding the package in our nightly Dockerfile inside `docker/transformers-quantization-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out existing quantization methods to see how it is implemented. 
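+
+The pieces described in steps 2 to 6 can be combined into a single skeleton. The sketch below is only illustrative: `MyMethodHfQuantizer` and `my_quant_lib` are placeholder names, and the method bodies are left empty for you to fill in with your quantization method's logic.
+
+```py
+from transformers.quantizers import HfQuantizer
+
+
+class MyMethodHfQuantizer(HfQuantizer):
+    # placeholder attributes for a hypothetical quantization method
+    requires_calibration = False
+    required_packages = ["my_quant_lib"]
+    requires_parameters_quantization = False
+
+    def validate_environment(self, *args, **kwargs):
+        # check that the required packages and hardware are available
+        ...
+
+    def update_torch_dtype(self, torch_dtype):
+        # adjust the default dtype if your method expects a specific one
+        return torch_dtype
+
+    def _process_model_before_weight_loading(self, model, **kwargs):
+        # swap nn.Linear modules in the model skeleton for your quantized modules
+        ...
+
+    def _process_model_after_weight_loading(self, model, **kwargs):
+        # optional post-processing once the quantized weights are loaded
+        ...
+
+    @property
+    def is_serializable(self):
+        return True
+
+    @property
+    def is_trainable(self):
+        return False
+```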
diff --git a/docs/source/en/quantization/eetq.md b/docs/source/en/quantization/eetq.md
index 5fabd2aa7e4b..07cb25e437f8 100644
--- a/docs/source/en/quantization/eetq.md
+++ b/docs/source/en/quantization/eetq.md
@@ -22,7 +22,7 @@ EETQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft).
 
 Install EETQ from the [release page](https://github.com/NetEase-FuXi/EETQ/releases) or [source code](https://github.com/NetEase-FuXi/EETQ). CUDA 11.4+ is required for EETQ.
 
-
+
 
 ```bash
diff --git a/docs/source/en/quantization/fbgemm_fp8.md b/docs/source/en/quantization/fbgemm_fp8.md
index 61cf8a059bf2..b9382f74952a 100644
--- a/docs/source/en/quantization/fbgemm_fp8.md
+++ b/docs/source/en/quantization/fbgemm_fp8.md
@@ -14,46 +14,43 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# FBGEMM FP8
+# FBGEMM
 
-With FBGEMM FP8 quantization method, you can quantize your model in FP8 (W8A8):
-- the weights will be quantized in 8bit (FP8) per channel
-- the activation will be quantized in 8bit (FP8) per token
-
-It relies on the [FBGEMM](https://github.com/pytorch/FBGEMM) library which provides efficient low-precision general matrix multiplication for small batch sizes and support for accuracy-loss minimizing techniques such as row-wise quantization and outlier-aware quantization.
+[FBGEMM (Facebook GEneral Matrix Multiplication)](https://github.com/pytorch/FBGEMM) is a low-precision matrix multiplication library for small batch sizes, with support for accuracy-loss minimizing techniques such as row-wise quantization and outlier-aware quantization. With FBGEMM, quantize a model's weights to 8-bits/channel and the activations to 8-bits/token (also known as fp8 or w8a8).
 
 > [!TIP]
-> You need a GPU with compute capability>=9 (e.g. H100)
+> You need a GPU with [compute capability 9+](https://developer.nvidia.com/cuda-gpus#collapseOne) like an H100.
 
-Before you begin, make sure the following libraries are installed with their latest version:
+Install the FBGEMM_GPU package with the command below to ensure you have the latest version.
 
 ```bash
 pip install --upgrade accelerate fbgemm-gpu torch
 ```
 
-If you are having issues with fbgemm-gpu and torch library, you might need to install the nightly release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch)
+If you're having installation issues, try installing the [nightly release](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch).
 
-By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type.
+Create a [`FbgemmFp8Config`] and pass it to [`~PreTrainedModel.from_pretrained`] to quantize a model to fp8. 
```py -from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer +from transformers import FbgemmFp8Config, AutoModelForCausalLM -model_name = "meta-llama/Meta-Llama-3-8B" quantization_config = FbgemmFp8Config() -quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) - -tokenizer = AutoTokenizer.from_pretrained(model_name) -input_text = "What are we having for dinner?" -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") - -output = quantized_model.generate(**input_ids, max_new_tokens=10) -print(tokenizer.decode(output[0], skip_special_tokens=True)) +quantized_model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3-8B", + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) ``` -A quantized model can be saved via "saved_pretrained" and be reused again via the "from_pretrained". +[`~PreTrainedModel.save_pretrained`] and [`~PreTrainedModel.from_pretrained`] enable saving and loading a quantized model. ```py quant_path = "/path/to/save/quantized/model" model.save_pretrained(quant_path) model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="auto") -``` \ No newline at end of file +``` + +## Resources + +Read the [Open-sourcing FBGEMM for state-of-the-art server-side inference](https://engineering.fb.com/2018/11/07/ml-applications/fbgemm/) blog post for more details on FBGEMM. diff --git a/docs/source/en/quantization/hqq.md b/docs/source/en/quantization/hqq.md index 1c6a418abb64..cc7b5f8cd9bc 100755 --- a/docs/source/en/quantization/hqq.md +++ b/docs/source/en/quantization/hqq.md @@ -36,7 +36,7 @@ Quantize a model by creating a [`HqqConfig`] and specifying the `nbits` and `gro ``` py from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig -quant_config = HqqConfig(nbits=8, group_size=64) +quant_config = HqqConfig(nbits=8, group_size=64) model = transformers.AutoModelForCausalLM.from_pretrained( "meta-llama/Llama-3.1-8B", torch_dtype=torch.float16, @@ -72,6 +72,9 @@ model = transformers.AutoModelForCausalLM.from_pretrained( ) ``` + + + ## Backends HQQ supports various backends, including pure PyTorch and custom dequantization CUDA kernels. These backends are suitable for older GPUs and PEFT/QLoRA training. @@ -91,3 +94,7 @@ prepare_for_inference("model", backend="torchao_int4") ``` Refer to the [Backend](https://github.com/mobiusml/hqq/#backend) guide for more details. + +## Resources + +Read the [Half-Quadratic Quantization of Large Machine Learning Models](https://mobiusml.github.io/hqq_blog/) blog post for more details about HQQ. diff --git a/docs/source/en/quantization/optimum.md b/docs/source/en/quantization/optimum.md index d90b4c818e43..5498e715ee18 100644 --- a/docs/source/en/quantization/optimum.md +++ b/docs/source/en/quantization/optimum.md @@ -16,4 +16,4 @@ rendered properly in your Markdown viewer. # Optimum -The [Optimum](https://huggingface.co/docs/optimum/index) library supports quantization for Intel, Furiosa, ONNX Runtime, GPTQ, and lower-level PyTorch quantization functions. Consider using Optimum for quantization if you're using specific and optimized hardware like Intel CPUs, Furiosa NPUs or a model accelerator like ONNX Runtime. \ No newline at end of file +[Optimum](https://huggingface.co/docs/optimum/index) is an optimization library that supports quantization for Intel, Furiousa, ONNX Runtime, GPTQ, and lower-level PyTorch quantization functions. 
It is designed to enhance performance for specific hardware - Intel CPUs/HPUs, AMD GPUs, Furiosa NPUs, etc. - and model accelerators like ONNX Runtime.
diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md
index 7feadefd83d2..e3526de62a42 100644
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -14,55 +14,56 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Optimum-quanto
+# Optimum Quanto
 
-
+[Quanto](https://github.com/huggingface/optimum-quanto) is a PyTorch quantization backend for [Optimum](https://huggingface.co/docs/optimum/index). It features linear quantization for weights (float8, int8, int4, int2) with accuracy very similar to full-precision models. Quanto is compatible with any model modality and device, making it simple to use regardless of hardware.
 
-Try optimum-quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
+Quanto is also compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for faster generation.
 
-
-
-
-[🤗 optimum-quanto](https://github.com/huggingface/optimum-quanto) library is a versatile pytorch quantization toolkit. The quantization method used is the linear quantization. Quanto provides several unique features such as:
-
-- weights quantization (`float8`,`int8`,`int4`,`int2`)
-- activation quantization (`float8`,`int8`)
-- modality agnostic (e.g CV,LLM)
-- device agnostic (e.g CUDA,XPU,MPS,CPU)
-- compatibility with `torch.compile`
-- easy to add custom kernel for specific device
-- supports quantization aware training
-
-
-Before you begin, make sure the following libraries are installed:
+Install Quanto with the following command.
 
 ```bash
 pip install optimum-quanto accelerate transformers
 ```
 
-Now you can quantize a model by passing [`QuantoConfig`] object in the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers.
-
-The integration with transformers only supports weights quantization. For the more complex use case such as activation quantization, calibration and quantization aware training, you should use [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead.
+Quantize a model by creating a [`QuantoConfig`] and specifying the `weights` parameter to quantize to. This works for any model in any modality as long as it contains [torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layers.
 
-By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type.
+> [!TIP]
+> The Transformers integration only supports weight quantization. Use the Quanto library directly if you need activation quantization, calibration, or QAT. 
```py from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig -model_id = "facebook/opt-125m" -tokenizer = AutoTokenizer.from_pretrained(model_id) -quantization_config = QuantoConfig(weights="int8") -quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config) +quant_config = QuantoConfig(weights="int8") +model = transformers.AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B", + torch_dtype="auto", + device_map="auto", + quantization_config=quant_config +) ``` -Note that serialization is not supported yet with transformers but it is coming soon! If you want to save the model, you can use quanto library instead. +## torch.compile + +Wrap a Quanto model with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for faster generation. + +```py +import torch +from transformers import AutoModelForSpeechSeq2Seq, QuantoConfig + +quant_config = QuantoConfig(weights="int8") +model = AutoModelForSpeechSeq2Seq.from_pretrained( + "openai/whisper-large-v2", + torch_dtype="auto", + device_map="auto", + quantization_config=quant_config +) + +model = torch.compile(model) +``` -Optimum-quanto library uses linear quantization algorithm for quantization. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/optimum-quanto/tree/main/bench/generation) +## Resources -
-
- llama-2-7b-quanto-perplexity -
-
+Read the [Quanto: a PyTorch quantization backend for Optimum](https://huggingface.co/blog/quanto-introduction) blog post to learn more about the library design and benchmarks. -The library is versatile enough to be compatible with most PTQ optimization algorithms. The plan in the future is to integrate the most popular algorithms in the most seamless possible way (AWQ, Smoothquant). \ No newline at end of file +For more hands-on examples, take a look at the Quanto [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing). \ No newline at end of file diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 46fb0f8cbb9a..cb1291ce0df5 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -9,81 +9,56 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -# TorchAO +# torchao -[TorchAO](https://github.com/pytorch/ao) is an architecture optimization library for PyTorch, it provides high performance dtypes, optimization techniques and kernels for inference and training, featuring composability with native PyTorch features like `torch.compile`, FSDP etc.. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks). +[torchao](https://github.com/pytorch/ao) is a PyTorch architecture optimization library with support for custom high performance data types, quantization, and sparsity. It is composable with native PyTorch features such as [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training. -Before you begin, make sure the following libraries are installed with their latest version: +Install torchao with the following command. ```bash # Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation pip install --upgrade torch torchao transformers ``` -By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. +torchao supports int8 weight quantization and int8 dynamic quantization of weights. Create a [`TorchAoConfig`] and specify the quantization type and `group_size` of the weights to quantize. ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer -model_name = "meta-llama/Meta-Llama-3-8B" -# We support int4_weight_only, int8_weight_only and int8_dynamic_activation_int8_weight -# More examples and documentations for arguments can be found in https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques quantization_config = TorchAoConfig("int4_weight_only", group_size=128) -quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) - -tokenizer = AutoTokenizer.from_pretrained(model_name) -input_text = "What are we having for dinner?" 
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") - -# auto-compile the quantized model with `cache_implementation="static"` to get speedup -output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") -print(tokenizer.decode(output[0], skip_special_tokens=True)) - -# benchmark the performance -import torch.utils.benchmark as benchmark - -def benchmark_fn(f, *args, **kwargs): - # Manual warmup - for _ in range(5): - f(*args, **kwargs) - - t0 = benchmark.Timer( - stmt="f(*args, **kwargs)", - globals={"args": args, "kwargs": kwargs, "f": f}, - num_threads=torch.get_num_threads(), - ) - return f"{(t0.blocked_autorange().mean):.3f}" - -MAX_NEW_TOKENS = 1000 -print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) - -bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16) -output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile -print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) - +quantized_model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3-8B", + torch_dtype="auto", + device_map="auto", + quantization_config=quantization_config +) ``` -## Serialization and Deserialization -torchao quantization is implemented with [tensor subclasses](https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor), it only work with huggingface non-safetensor serialization and deserialization. It relies on `torch.load(..., weights_only=True)` to avoid arbitrary user code execution during load time and use [add_safe_globals](https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals) to allowlist some known user functions. +## torch.compile -The reason why it does not support safe tensor serialization is that wrapper tensor subclass allows maximum flexibility so we want to make sure the effort of supporting new format of quantized Tensor is low, while safe tensor optimizes for maximum safety (no user code execution), it also means we have to make sure to manually support new quantization format. +Wrap the quantized model with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster generation. ```py -# save quantized model locally -output_dir = "llama3-8b-int4wo-128" -quantized_model.save_pretrained(output_dir, safe_serialization=False) +import torchao -# push to huggingface hub -# save_to = "{user_id}/llama3-8b-int4wo-128" -# quantized_model.push_to_hub(save_to, safe_serialization=False) +quantized_model = torch.compile(quantized_model, mode="max-autotune") +``` -# load quantized model -ckpt_id = "llama3-8b-int4wo-128" # or huggingface hub model id -loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="cuda") +## Serialization +torchao implements [torch.Tensor subclasses](https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor) for maximum flexibility in supporting new quantized torch.Tensor formats. [Safetensors](https://huggingface.co/docs/safetensors/en/index) serialization and deserialization does not work with torchaco. 
-# confirm the speedup -loaded_quantized_model = torch.compile(loaded_quantized_model, mode="max-autotune") -print("loaded int4wo-128 model:", benchmark_fn(loaded_quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +To avoid arbitrary user code execution, torchao sets `weights_only=True` in [torch.load](https://pytorch.org/docs/stable/generated/torch.load.html) to ensure only tensors are loaded. Any known user functions can be whitelisted with [add_safe_globals](https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals). + +```py +# don't serialize model with Safetensors +output_dir = "llama3-8b-int4wo-128" +quantized_model.save_pretrained("llama3-8b-int4wo-128", safe_serialization=False) ``` + +## Resources + +For a better sense of expected performance, view the [benchmarks](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks) for various models with CUDA and XPU backends. + +Refer to [Other Available Quantization Techniques](https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques) for more examples and documentation. From 5953f7b3e38583d51de561e101463d0a40a08e44 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 19 Dec 2024 16:28:56 -0800 Subject: [PATCH 081/116] fix toctree --- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d38ebf4f6b11..449ae9f61985 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -170,8 +170,8 @@ title: EETQ - local: quantization/fbgemm_fp8 title: FBGEMM - - local: gguf - title: GGUF + - local: gguf + title: GGUF - local: quantization/gptq title: GPTQ - local: quantization/hqq From 20a5d3a345bbe15e9fdb02de2b24ac13acd402dd Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 19 Dec 2024 16:56:21 -0800 Subject: [PATCH 082/116] fix --- docs/source/en/quantization/compressed_tensors.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index 9e8fc03fa10f..569f743639f1 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -48,8 +48,6 @@ pip install -e .
-Search using the compressed-tensors [tag](https://huggingface.co/models?other=compressed-tensors) to find a compatible model on the Hugging Face Hub. - Only models that have already been quantized can be loaded at the moment, and once a model is loaded, it cannot be saved. To quantize a model into the compressed-tensors format, see [llm-compressor](https://github.com/vllm-project/llm-compressor). Alternatively, models can be created independently and serizlied with a compressed-tensors config. ```python From acd222cfcfc863cfe8d64360273e20f20a9f04a3 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 19 Dec 2024 17:11:08 -0800 Subject: [PATCH 083/116] fix --- docs/source/en/quantization/compressed_tensors.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index 569f743639f1..a3b01a1b4489 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -37,7 +37,7 @@ pip install compressed-tensors ```
- ```bash git clone https://github.com/neuralmagic/compressed-tensors @@ -48,6 +48,8 @@ pip install -e .
+Search using the compressed-tensors [tag](https://huggingface.co/models?other=compressed-tensors) to find a compatible model on the Hugging Face Hub. + Only models that have already been quantized can be loaded at the moment, and once a model is loaded, it cannot be saved. To quantize a model into the compressed-tensors format, see [llm-compressor](https://github.com/vllm-project/llm-compressor). Alternatively, models can be created independently and serizlied with a compressed-tensors config. ```python From 675231539121edea72e5365b65d93928fba3bbad Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 20 Dec 2024 16:04:26 -0800 Subject: [PATCH 084/116] quant pt 3 --- docs/source/en/quantization/aqlm.md | 27 ++-- docs/source/en/quantization/awq.md | 132 ++++++++------------ docs/source/en/quantization/bitnet.md | 51 ++------ docs/source/en/quantization/bitsandbytes.md | 111 ++++++---------- docs/source/en/quantization/gptq.md | 102 +++++++-------- 5 files changed, 162 insertions(+), 261 deletions(-) diff --git a/docs/source/en/quantization/aqlm.md b/docs/source/en/quantization/aqlm.md index 2e00d94cfcff..9c9b6ac0715d 100644 --- a/docs/source/en/quantization/aqlm.md +++ b/docs/source/en/quantization/aqlm.md @@ -16,19 +16,17 @@ rendered properly in your Markdown viewer. # AQLM -> [!TIP] -> Try AQLM on [Google Colab](https://colab.research.google.com/drive/1-xZmBRXT5Fm3Ghn4Mwa2KRypORXb855X?usp=sharing)! +Additive Quantization of Language Models ([AQLM](https://arxiv.org/abs/2401.06118)) quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes. -Additive Quantization of Language Models ([AQLM](https://arxiv.org/abs/2401.06118)) is a Large Language Models compression method. It quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes. +AQLM also supports fine-tuning with [LoRA](https://huggingface.co/docs/peft/package_reference/lora) with the [PEFT](https://huggingface.co/docs/peft) library, and is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training. + +Run the command below to install the AQLM library with kernel support for both GPU and CPU inference and training. AQLM only works with Python 3.10+. -Inference support for AQLM is realised in the `aqlm` library. Make sure to install it to run the models (note aqlm works only with python>=3.10): ```bash pip install aqlm[gpu,cpu] ``` -The library provides efficient kernels for both GPU and CPU inference and training. - -The instructions on how to quantize models yourself, as well as all the relevant code can be found in the corresponding GitHub [repository](https://github.com/Vahe1994/AQLM). To run AQLM models simply load a model that has been quantized with AQLM: +Load an AQLM-quantized model with [`~PreTrainedModel.from_pretrained`]. 
```python from transformers import AutoTokenizer, AutoModelForCausalLM @@ -38,20 +36,21 @@ quantized_model = AutoModelForCausalLM.from_pretrained( torch_dtype="auto", device_map="auto" ) -tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf") ``` -## PEFT - -Starting with version `aqlm 1.0.2`, AQLM supports Parameter-Efficient Fine-Tuning in a form of [LoRA](https://huggingface.co/docs/peft/package_reference/lora) integrated into the [PEFT](https://huggingface.co/blog/peft) library. +## Configurations -## AQLM configurations +AQLM quantization setups vary mainly in the number of codebooks used, as well as codebook sizes in bits. The most popular setups and supported inference kernels are shown below. -AQLM quantization setups vary mainly on the number of codebooks used as well as codebook sizes in bits. The most popular setups, as well as inference kernels they support are: - | Kernel | Number of codebooks | Codebook size, bits | Notation | Accuracy | Speedup | Fast GPU inference | Fast CPU inference | |---|---------------------|---------------------|----------|-------------|-------------|--------------------|--------------------| | Triton | K | N | KxN | - | Up to ~0.7x | ✅ | ❌ | | CUDA | 1 | 16 | 1x16 | Best | Up to ~1.3x | ✅ | ❌ | | CUDA | 2 | 8 | 2x8 | OK | Up to ~3.0x | ✅ | ❌ | | Numba | K | 8 | Kx8 | Good | Up to ~4.0x | ❌ | ✅ | + +## Resources + +Run the AQLM demo [notebook](https://colab.research.google.com/drive/1-xZmBRXT5Fm3Ghn4Mwa2KRypORXb855X?usp=sharing) for more examples of how to quantize a model, push a quantized model to the Hub, and more. + +For more example demo notebooks, visit the AQLM [repository](https://github.com/Vahe1994/AQLM). diff --git a/docs/source/en/quantization/awq.md b/docs/source/en/quantization/awq.md index f581c161392f..aa57da0f811f 100644 --- a/docs/source/en/quantization/awq.md +++ b/docs/source/en/quantization/awq.md @@ -16,17 +16,11 @@ rendered properly in your Markdown viewer. # AWQ - - -Try AWQ quantization with this [notebook](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)! - - - -[Activation-aware Weight Quantization (AWQ)](https://hf.co/papers/2306.00978) doesn't quantize all the weights in a model, and instead, it preserves a small percentage of weights that are important for LLM performance. This significantly reduces quantization loss such that you can run models in 4-bit precision without experiencing any performance degradation. +[Activation-aware Weight Quantization (AWQ)](https://hf.co/papers/2306.00978) preserves a small fraction of the weights that are important for LLM performance to compress a model to 4-bits with minimal performance degradation. There are several libraries for quantizing models with the AWQ algorithm, such as [llm-awq](https://github.com/mit-han-lab/llm-awq), [autoawq](https://github.com/casper-hansen/AutoAWQ) or [optimum-intel](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc). Transformers supports loading models quantized with the llm-awq and autoawq libraries. This guide will show you how to load models quantized with autoawq, but the process is similar for llm-awq quantized models. -Make sure you have autoawq installed: +Run the command below to install autoawq ```bash pip install autoawq @@ -34,7 +28,7 @@ pip install autoawq > [!WARNING] > AutoAWQ downgrades Transformers to version 4.47.1. If you want to do inference with AutoAWQ, you may need to reinstall your Transformers' version after installing AutoAWQ. 
-AWQ-quantized models can be identified by checking the `quantization_config` attribute in the model's [config.json](https://huggingface.co/TheBloke/zephyr-7B-alpha-AWQ/blob/main/config.json) file: +Identify an AWQ-quantized model by checking the `quant_method` key in the models [config.json](https://huggingface.co/TheBloke/zephyr-7B-alpha-AWQ/blob/main/config.json) file. ```json { @@ -55,63 +49,60 @@ AWQ-quantized models can be identified by checking the `quantization_config` att } ``` -A quantized model is loaded with the [`~PreTrainedModel.from_pretrained`] method. If you loaded your model on the CPU, make sure to move it to a GPU device first. Use the `device_map` parameter to specify where to place the model: - -```py -from transformers import AutoModelForCausalLM, AutoTokenizer - -model_id = "TheBloke/zephyr-7B-alpha-AWQ" -model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") -``` +Load the AWQ-quantized model with [`~PreTrainedModel.from_pretrained`]. This automatically sets the other weights to fp16 by default for performance reasons. Use the `torch_dtype` parameter to load these other weights in a different format. -Loading an AWQ-quantized model automatically sets other weights to fp16 by default for performance reasons. If you want to load these other weights in a different format, use the `torch_dtype` parameter: +If the model is loaded on the CPU, use the `device_map` parameter to move it to a GPU. ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch -model_id = "TheBloke/zephyr-7B-alpha-AWQ" -model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32) +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/zephyr-7B-alpha-AWQ", + torch_dtype=torch.float32, + device_map="cuda:0" +) ``` -AWQ quantization can also be combined with [FlashAttention-2](../perf_infer_gpu_one#flashattention-2) to further accelerate inference: +Use `attn_implementation` to enable [FlashAttention2](../perf_infer_gpu_one#flashattention-2) to further accelerate inference. ```py from transformers import AutoModelForCausalLM, AutoTokenizer -model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map="cuda:0") +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/zephyr-7B-alpha-AWQ", + attn_implementation="flash_attention_2", + device_map="cuda:0" +) ``` ## Fused modules -Fused modules offers improved accuracy and performance and it is supported out-of-the-box for AWQ modules for [Llama](https://huggingface.co/meta-llama) and [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1) architectures, but you can also fuse AWQ modules for unsupported architectures. - - +Fused modules offer improved accuracy and performance. They are supported out-of-the-box for AWQ modules for [Llama](https://huggingface.co/meta-llama) and [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1) architectures, but you can also fuse AWQ modules for unsupported architectures. -Fused modules cannot be combined with other optimization techniques such as FlashAttention-2. - - +> [!WARNING] +> Fused modules cannot be combined with other optimization techniques such as FlashAttention2. -To enable fused modules for supported architectures, create an [`AwqConfig`] and set the parameters `fuse_max_seq_len` and `do_fuse=True`. The `fuse_max_seq_len` parameter is the total sequence length and it should include the context length and the expected generation length. 
You can set it to a larger value to be safe. +Create an [`AwqConfig`] and set the parameters `fuse_max_seq_len` and `do_fuse=True` to enable fused modules. The `fuse_max_seq_len` parameter is the total sequence length and it should include the context length and the expected generation length. Set it to a larger value to be safe. -For example, to fuse the AWQ modules of the [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model. +The example below fuses the AWQ modules of the [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model. ```python import torch from transformers import AwqConfig, AutoModelForCausalLM -model_id = "TheBloke/Mistral-7B-OpenOrca-AWQ" - quantization_config = AwqConfig( bits=4, fuse_max_seq_len=512, do_fuse=True, ) - -model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0) +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/Mistral-7B-OpenOrca-AWQ", + quantization_config=quantization_config +).to(0) ``` The [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model was benchmarked with `batch_size=1` with and without fused modules. @@ -156,14 +147,14 @@ The speed and throughput of fused and unfused modules were also tested with the -For architectures that don't support fused modules yet, you need to create a custom fusing mapping to define which modules need to be fused with the `modules_to_fuse` parameter. For example, to fuse the AWQ modules of the [TheBloke/Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ) model. +For architectures that don't support fused modules, create an [`AwqConfig`] and define a custom fusing mapping in `modules_to_fuse` to determine which modules need to be fused. + +The example below fuses the AWQ modules of the [TheBloke/Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ) model. ```python import torch from transformers import AwqConfig, AutoModelForCausalLM -model_id = "TheBloke/Yi-34B-AWQ" - quantization_config = AwqConfig( bits=4, fuse_max_seq_len=512, @@ -178,35 +169,46 @@ quantization_config = AwqConfig( } ) -model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, trust_remote_code=True).to(0) +model = AutoModelForCausalLM.from_pretrained( + "TheBloke/Yi-34B-AWQ", + quantization_config=quantization_config +).to(0) ``` -The parameter `modules_to_fuse` should include: +The parameter `modules_to_fuse` should include the following keys. - `"attention"`: The names of the attention layers to fuse in the following order: query, key, value and output projection layer. If you don't want to fuse these layers, pass an empty list. - `"layernorm"`: The names of all the LayerNorm layers you want to replace with a custom fused LayerNorm. If you don't want to fuse these layers, pass an empty list. - `"mlp"`: The names of the MLP layers you want to fuse into a single MLP layer in the order: (gate (dense, layer, post-attention) / up / down layers). - `"use_alibi"`: If your model uses ALiBi positional embedding. - `"num_attention_heads"`: The number of attention heads. -- `"num_key_value_heads"`: The number of key value heads that should be used to implement Grouped Query Attention (GQA). If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. 
+- `"num_key_value_heads"`: The number of key value heads that should be used to implement Grouped Query Attention (GQA). + + | parameter value | attention | + |---|---| + | `num_key_value_heads=num_attention_heads` | Multi-Head Attention | + | `num_key_value_heads=1` | Multi-Query Attention | + | `num_key_value_heads=...` | Grouped Query Attention | + - `"hidden_size"`: The dimension of the hidden representations. +## ExLlamaV2 - -## ExLlama-v2 support - -Recent versions of `autoawq` supports ExLlama-v2 kernels for faster prefill and decoding. To get started, first install the latest version of `autoawq` by running: +[ExLlamaV2](https://github.com/turboderp/exllamav2) kernels support faster prefill and decoding. Run the command below to install the latest version of autoawq with ExLlamaV2 support. ```bash pip install git+https://github.com/casper-hansen/AutoAWQ.git ``` -Get started by passing an `AwqConfig()` with `version="exllama"`. +Set `version="exllama"` in [`AwqConfig`] to enable ExLlamaV2 kernels. -```python +> [!TIP] +> ExLlamaV2 is supported on AMD GPUs. + +```py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig @@ -217,34 +219,18 @@ model = AutoModelForCausalLM.from_pretrained( quantization_config=quantization_config, device_map="auto", ) - -input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device="cuda") -output = model(input_ids) -print(output.logits) - -tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-AWQ") -input_ids = tokenizer.encode("How to make a cake", return_tensors="pt").to(model.device) -output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=50256) -print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` - - -Note this feature is supported on AMD GPUs. - - - +## CPU -## Intel CPU/GPU support - -Recent versions of autoawq supports Intel CPU/GPU with IPEX op optimizations. To get started, install the latest version of autoawq. +[Intel Extension for PyTorch (IPEX)](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/) is designed to enable performance optimizations on Intel hardware. Run the command below to install the latest version of autoawq with IPEX support. ```bash pip install intel-extension-for-pytorch # for IPEX-GPU refer to https://intel.github.io/intel-extension-for-pytorch/xpu/2.5.10+xpu/ pip install git+https://github.com/casper-hansen/AutoAWQ.git ``` -Get started by passing an `AwqConfig()` with `version="ipex"`. +Set `version="ipex"` in [`AwqConfig`] to enable ExLlamaV2 kernels. ```python import torch @@ -258,20 +244,8 @@ model = AutoModelForCausalLM.from_pretrained( quantization_config=quantization_config, device_map=device, ) - -input_ids = torch.randint(0, 100, (1, 128), dtype=torch.long, device=device) -output = model(input_ids) -print(output.logits) - -tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ") -input_ids = tokenizer.encode("How to make a cake", return_tensors="pt").to(device) -pad_token_id = tokenizer.eos_token_id -output = model.generate(input_ids, do_sample=True, max_length=50, pad_token_id=pad_token_id) -print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` - - -This feature is supported on Intel CPUs/GPUs. +## Resources - +Run the AWQ demo [notebook](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY#scrollTo=Wwsg6nCwoThm) for more examples of how to quantize a model, push a quantized model to the Hub, and more. 
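+
+The notebook in Resources also covers quantizing your own model with AutoAWQ before loading it in Transformers. As a quick reference, a minimal sketch based on AutoAWQ's documented `quantize` and `save_quantized` API is shown below; the model id, output path, and config values are only examples.
+
+```py
+from awq import AutoAWQForCausalLM
+from transformers import AutoTokenizer
+
+model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+quant_path = "mistral-7b-instruct-awq"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
+
+# load the unquantized model and tokenizer
+model = AutoAWQForCausalLM.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+# run AWQ calibration and quantize the weights to 4-bit
+model.quantize(tokenizer, quant_config=quant_config)
+
+# save the quantized model so it can be reloaded with Transformers
+model.save_quantized(quant_path)
+tokenizer.save_pretrained(quant_path)
+```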
diff --git a/docs/source/en/quantization/bitnet.md b/docs/source/en/quantization/bitnet.md index 6bd65e8b53a4..5f713a20d3fd 100644 --- a/docs/source/en/quantization/bitnet.md +++ b/docs/source/en/quantization/bitnet.md @@ -16,60 +16,33 @@ rendered properly in your Markdown viewer. # BitNet -[BitNet](https://arxiv.org/abs/2402.17764) replaces traditional Linear layers in Multi-Head Attention and Feed-Forward Networks with specialized layers called BitLinear with ternary (or binary in the older version) precision. The BitLinear layers introduced here quantize the weights using ternary precision (with values of -1, 0, and 1) and quantize the activations to 8-bit precision. - +[BitNet](https://arxiv.org/abs/2402.17764) replaces traditional linear layers in Multi-Head Attention and feed-forward networks with specialized BitLinear layers. The BitLinear layers quantize the weights using ternary precision (with values of -1, 0, and 1) and quantize the activations to 8-bit precision.
Alt Text -
The architecture of BitNet with BitLinear layers
+
The architecture of BitNet with BitLinear layers.
-During training, we start by quantizing the weights into ternary values, using symmetric per tensor quantization. First, we compute the average of the absolute values of the weight matrix and use this as a scale. We then divide the weights by the scale, round the values, constrain them between -1 and 1, and finally rescale them to continue in full precision. - -$$ -scale_w = \frac{1}{\frac{1}{nm} \sum_{ij} |W_{ij}|} -$$ - -$$ -W_q = \text{clamp}_{[-1,1]}(\text{round}(W*scale)) -$$ - -$$ -W_{dequantized} = W_q*scale_w -$$ - -Activations are then quantized to a specified bit-width (e.g., 8-bit) using [absmax](https://arxiv.org/pdf/2208.07339) quantization (symmetric per channel quantization). This involves scaling the activations into a range [−128,127[. The quantization formula is: +BitNet models can't be quantized on the fly. They need to be quantized during pretraining or fine-tuning because it is a Quantization-Aware Training (QAT) technique. During training, the weights are quantized to ternary values with symmetric per tensor quantization. -$$ -scale_x = \frac{127}{|X|_{\text{max}, \, \text{dim}=-1}} -$$ +1. Compute the average of the absolute values of the weight matrix and use as a scale. +2. Divide the weights by the scale, round the values, constrain them between -1 and 1, and rescale them to continue in full precision. +3. Activations are quantized to a specified bit-width (8-bit) using [absmax](https://arxiv.org/pdf/2208.07339) quantization (symmetric per channel quantization). This involves scaling the activations into a range of [−128,127]. -$$ -X_q = \text{clamp}_{[-128,127]}(\text{round}(X*scale)) -$$ +Refer to this [PR](https://github.com/huggingface/nanotron/pull/180) to pretrain or fine-tune a 1.58-bit model with [Nanotron](https://github.com/huggingface/nanotron). For fine-tuning, convert a model from the Hugging Face to Nanotron format. Find the conversion steps in this [PR](https://github.com/huggingface/nanotron/pull/174). -$$ -X_{dequantized} = X_q * scale_x -$$ - -To learn more about how we trained, and fine-tuned bitnet models checkout the blogpost [here](https://huggingface.co/blog/1_58_llm_extreme_quantization) - -## Load a BitNet Model from the Hub -BitNet models can't be quantized on the fly—they need to be pre-trained or fine-tuned with the quantization applied (it's a Quantization aware training technique). Once trained, these models are already quantized and available as packed versions on the hub. - -A quantized model can be load : +Load a BitNet quantized model with [`~PreTrainedModel.from_pretrained`]. ```py from transformers import AutoModelForCausalLM path = "/path/to/model" model = AutoModelForCausalLM.from_pretrained(path, device_map="auto") ``` -## Pre-training / Fine-tuning a BitNet Model -If you're looking to pre-train or fine-tune your own 1.58-bit model using Nanotron, check out this [PR](https://github.com/huggingface/nanotron/pull/180), all you need to get started is there ! +## Kernels -For fine-tuning, you'll need to convert the model from Hugging Face format to Nanotron format (which has some differences). You can find the conversion steps in this [PR](https://github.com/huggingface/nanotron/pull/174). +`@torch.compile` is used to unpack the weights and perform the forward pass. It’s very straightforward to implement and delivers significant speed improvements. Additional optimized kernels will be integrated in future versions. 
-## Kernels +## Resources -In our initial version, we chose to use `@torch.compile` to unpack the weights and perform the forward pass. It’s very straightforward to implement and delivers significant speed improvements. We plan to integrate additional optimized kernels in future versions. \ No newline at end of file +Read [Fine-tuning LLMs to 1.58bit: extreme quantization made easy](https://huggingface.co/blog/1_58_llm_extreme_quantization) to learn more about how BitNet models are trained and fine-tuned. diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md index 368a649bae3b..26070070d51d 100644 --- a/docs/source/en/quantization/bitsandbytes.md +++ b/docs/source/en/quantization/bitsandbytes.md @@ -16,42 +16,24 @@ rendered properly in your Markdown viewer. # bitsandbytes -[bitsandbytes](https://github.com/TimDettmers/bitsandbytes) is the easiest option for quantizing a model to 8 and 4-bit. 8-bit quantization multiplies outliers in fp16 with non-outliers in int8, converts the non-outlier values back to fp16, and then adds them together to return the weights in fp16. This reduces the degradative effect outlier values have on a model's performance. 4-bit quantization compresses a model even further, and it is commonly used with [QLoRA](https://hf.co/papers/2305.14314) to finetune quantized LLMs. +[bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) features the LLM.int8 and QLoRA quantization to enable accessible large language model inference and training. -To use bitsandbytes, make sure you have the following libraries installed: +LLM.int8 matrix multiplication, or 8-bit quantization, is based on vector-wise quantization to quantize most of the weights to 8-bits and treating outliers with 16-bit matrix multiplication to reduce their degradative effect on model accuracy. - - +QLoRA, or 4-bit quantization, compresses a model even further to 4-bits and inserts a small set of trainable low-rank adaptation (LoRA) weights to allowing training. -```bash -pip install transformers accelerate bitsandbytes>0.37.0 -``` - - - +Run the command below to install bitsandbytes. ```bash -pip install bitsandbytes>=0.39.0 -pip install --upgrade accelerate transformers +pip install --upgrade transformers accelerate bitsandbytes ``` - - - - - -bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). - -We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. - - - -Now you can quantize a model by passing a `BitsAndBytesConfig` to [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it supports loading with Accelerate and contains `torch.nn.Linear` layers. +Quantize a model by passing a [`BitsAndBytesConfig`] to [`~PreTrainedModel.from_pretrained`]. This works for any model in any modality, as long as it supports [Accelerate]https://huggingface.co/docs/accelerate/index() and contains [torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layers. 
-Quantizing a model in 8-bit halves the memory-usage, and for large models, set `device_map="auto"` to efficiently use the GPUs available: +Quantizing a model in 8-bit halves the memory-usage, and for large models, set `device_map="auto"` to efficiently distribute the weights across all available GPUs. ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig @@ -64,7 +46,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained( ) ``` -By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file. +By default, all other modules such as [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file. ```py import torch @@ -80,7 +62,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained( model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype ``` -Once a model is quantized to 8-bit, you can't push the quantized weights to the Hub unless you're using the latest version of Transformers and bitsandbytes. If you have the latest versions, then you can push the 8-bit model to the Hub with the [`~PreTrainedModel.push_to_hub`] method. The quantization config.json file is pushed first, followed by the quantized model weights. +Once a model is quantized to 8-bit, you can't push the quantized weights to the Hub unless you're using the latest version of Transformers and bitsandbytes. If you have the latest versions, then you can push the 8-bit model to the Hub with [`~PreTrainedModel.push_to_hub`]. The quantization config.json file is pushed first, followed by the quantized model weights. ```py from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -99,7 +81,7 @@ model.push_to_hub("bloom-560m-8bit") -Quantizing a model in 4-bit reduces your memory-usage by 4x, and for large models, set `device_map="auto"` to efficiently use the GPUs available: +Quantizing a model in 4-bit reduces your memory-usage by 4x, and for large models, set `device_map="auto"` to efficiently distribute the weights across all available GPUs. ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig @@ -112,7 +94,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained( ) ``` -By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file. +By default, all other modules such as [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file. ```py import torch @@ -128,24 +110,21 @@ model_4bit = AutoModelForCausalLM.from_pretrained( model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype ``` -If you have `bitsandbytes>=0.41.3`, you can serialize 4-bit models and push them on Hugging Face Hub. 
Simply call `model.push_to_hub()` after loading it in 4-bit precision. You can also save the serialized 4-bit models locally with `model.save_pretrained()` command. +Make sure you have the latest bitsandbytes version so you can serialize 4-bit models and push them to the Hub with [`~PreTrainedModel.push_to_hub`]. Use [`~PreTrainedModel.save_pretrained`] to save the 4-bit model locally. - +> [!WARNING] +> 8 and 4-bit training is only supported for training *extra* parameters. -Training with 8-bit and 4-bit weights are only supported for training *extra* parameters. - - - -You can check your memory footprint with the `get_memory_footprint` method: +Check your memory footprint with `get_memory_footprint`. ```py print(model.get_memory_footprint()) ``` -Quantized models can be loaded from the [`~PreTrainedModel.from_pretrained`] method without needing to specify the `load_in_8bit` or `load_in_4bit` parameters: +Load quantized models with [`~PreTrainedModel.from_pretrained`] without a `quantization_config`. ```py from transformers import AutoModelForCausalLM, AutoTokenizer @@ -153,19 +132,13 @@ from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto") ``` -## 8-bit (LLM.int8() algorithm) - - - -Learn more about the details of 8-bit quantization in this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration)! +## LLM.int8 - - -This section explores some of the specific features of 8-bit models, such as offloading, outlier thresholds, skipping module conversion, and finetuning. +This section explores some of the specific features of 8-bit quantization, such as offloading, outlier thresholds, skipping module conversion, and finetuning. ### Offloading -8-bit models can offload weights between the CPU and GPU to support fitting very large models into memory. The weights dispatched to the CPU are actually stored in **float32**, and aren't converted to 8-bit. For example, to enable offloading for the [bigscience/bloom-1b7](https://huggingface.co/bigscience/bloom-1b7) model, start by creating a [`BitsAndBytesConfig`]: +8-bit models can offload weights between the CPU and GPU to fit very large models into memory. The weights dispatched to the CPU are stored in **float32** and aren't converted to 8-bit. For example, enable offloading for [bigscience/bloom-1b7](https://huggingface.co/bigscience/bloom-1b7) through [`BitsAndBytesConfig`]. ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig @@ -173,7 +146,7 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True) ``` -Design a custom device map to fit everything on your GPU except for the `lm_head`, which you'll dispatch to the CPU: +Design a custom device map to fit everything on your GPU except for the `lm_head`, which is dispatched to the CPU. ```py device_map = { @@ -185,7 +158,7 @@ device_map = { } ``` -Now load your model with the custom `device_map` and `quantization_config`: +Now load your model with the custom `device_map` and `quantization_config`. ```py model_8bit = AutoModelForCausalLM.from_pretrained( @@ -200,7 +173,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained( An "outlier" is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 
8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning). -To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]: +To find the best threshold for your model, experiment with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]. ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig @@ -222,7 +195,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained( ### Skip module conversion -For some models, like [Jukebox](model_doc/jukebox), you don't need to quantize every module to 8-bit which can actually cause instability. With Jukebox, there are several `lm_head` modules that should be skipped using the `llm_int8_skip_modules` parameter in [`BitsAndBytesConfig`]: +For some models, like [Jukebox](model_doc/jukebox), you don't need to quantize every module to 8-bit because it can actually cause instability. With Jukebox, there are several `lm_head` modules that should be skipped using the `llm_int8_skip_modules` parameter in [`BitsAndBytesConfig`]. ```py from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -243,22 +216,15 @@ model_8bit = AutoModelForCausalLM.from_pretrained( ### Finetuning -With the [PEFT](https://github.com/huggingface/peft) library, you can finetune large models like [flan-t5-large](https://huggingface.co/google/flan-t5-large) and [facebook/opt-6.7b](https://huggingface.co/facebook/opt-6.7b) with 8-bit quantization. You don't need to pass the `device_map` parameter for training because it'll automatically load your model on a GPU. However, you can still customize the device map with the `device_map` parameter if you want to (`device_map="auto"` should only be used for inference). - -## 4-bit (QLoRA algorithm) - - +The [PEFT](https://github.com/huggingface/peft) library supports fine-tuning large models like [flan-t5-large](https://huggingface.co/google/flan-t5-large) and [facebook/opt-6.7b](https://huggingface.co/facebook/opt-6.7b) with 8-bit quantization. You don't need to pass the `device_map` parameter for training because it automatically loads your model on a GPU. However, you can still customize the device map with the `device_map` parameter (`device_map="auto"` should only be used for inference). -Try 4-bit quantization in this [notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf) and learn more about it's details in this [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes). - - - -This section explores some of the specific features of 4-bit models, such as changing the compute data type, using the Normal Float 4 (NF4) data type, and using nested quantization. +## QLoRA +This section explores some of the specific features of 4-bit quantization, such as changing the compute data type, the Normal Float 4 (NF4) data type, and nested quantization. ### Compute data type -To speedup computation, you can change the data type from float32 (the default value) to bf16 using the `bnb_4bit_compute_dtype` parameter in [`BitsAndBytesConfig`]: +Change the data type from float32 (the default value) to bf16 in [`BitsAndBytesConfig`] to speedup computation. 
```py
import torch
@@ -269,7 +235,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
```

### Normal Float 4 (NF4)

-NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models. This can be configured with the `bnb_4bit_quant_type` parameter in the [`BitsAndBytesConfig`]:
+NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models.

```py
from transformers import BitsAndBytesConfig
@@ -286,7 +252,7 @@ For inference, the `bnb_4bit_quant_type` does not have a huge impact on performa

### Nested quantization

-Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
+Nested quantization can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enable gradient accumulation with 4 steps.

```py
from transformers import BitsAndBytesConfig
@@ -299,22 +265,19 @@ double_quant_config = BitsAndBytesConfig(

model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf", torch_dtype="auto", quantization_config=double_quant_config)
```

-## Dequantizing `bitsandbytes` models
+## Dequantizing bitsandbytes models

-Once quantized, you can dequantize the model to the original precision but this might result in a small quality loss of the model. Make sure you have enough GPU RAM to fit the dequantized model.
+Once quantized, use [`~PreTrainedModel.dequantize`] to revert a model to the original precision, but this may result in some quality loss. Make sure you have enough GPU memory to fit the dequantized model.

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

-model_id = "facebook/opt-125m"
-
-model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=BitsAndBytesConfig(load_in_4bit=True))
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", quantization_config=BitsAndBytesConfig(load_in_4bit=True))
model.dequantize()
+```
+
+## Resources

-text = tokenizer("Hello my name is", return_tensors="pt").to(0)
+Learn more about the details of 8-bit quantization in [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration).
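As a quick check that the dequantized model from the section above still generates text, here is a minimal sketch reusing the facebook/opt-125m example (the prompt string and generation length are arbitrary choices, not part of the original example):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# load the 4-bit quantized model and revert it to the original precision
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m", quantization_config=BitsAndBytesConfig(load_in_4bit=True)
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model.dequantize()

# run a short generation to confirm the dequantized weights still work
inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```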
-out = model.generate(**text)
-print(tokenizer.decode(out[0]))
-```
\ No newline at end of file
+Try 4-bit quantization in this [notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf) and learn more about its details in [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md
index 1534a977f343..57e9c10a6a46 100644
--- a/docs/source/en/quantization/gptq.md
+++ b/docs/source/en/quantization/gptq.md
@@ -16,31 +16,9 @@ rendered properly in your Markdown viewer.

# GPTQ

-
+[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This reduces memory usage by almost 4x because the int4 weights are dequantized in a fused kernel rather than in the GPU's global memory. Inference is also faster because the lower bitwidth requires less memory bandwidth. See the short sketch below for the basic per-row quantization idea.

-Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) and learn more about it's details in this [blog post](https://huggingface.co/blog/gptq-integration)!

-

-Both [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error. These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory by almost 4x because the int4 weights are often dequantized in a fused kernel. You can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth.

-[GPTQModel](https://github.com/ModelCloud/GPTQModel) started as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences.

-* Model support: GPTQModel continues to support all of the latest LLM models.
-* Multimodal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models.
-* Platform support: Linux, macOS (Apple Silicon), and Windows 11.
-* Hardware support: NVIDIA CUDA, AMD ROCm, Apple Silicon M1/MPS /CPU, Intel/AMD CPU, and Intel Datacenter Max/Arc GPUs.
-* Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization.
-* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max/Arc GPUs) support.
-* Updated Marlin kernel from Neural Magic optimized for A100 (Ampere).
-* Updated kernels with auto-padding for legacy model support and models with non-uniform in/out-features.
-* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization APIs.
-* User and developer friendly APIs.

-
-[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) will likely be deprecated in the future due the lack of continued support for new models and features.
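To make the description above concrete, here is a toy sketch of per-row 4-bit weight quantization in plain PyTorch. It uses simple round-to-nearest with a per-row absmax scale purely for illustration; it is not the error-minimizing GPTQ procedure and not the AutoGPTQ API, but it shows the storage idea of int4 values plus a per-row scale that are dequantized back to higher precision.

```py
import torch

# Toy illustration only: quantize each row of a random weight matrix to 4-bit
# integers with a per-row absmax scale, then dequantize it back.
weights = torch.randn(4, 8)

scales = weights.abs().amax(dim=1, keepdim=True) / 7            # int4 covers [-8, 7]
int4 = torch.clamp(torch.round(weights / scales), min=-8, max=7).to(torch.int8)
dequantized = (int4 * scales).to(weights.dtype)

print((weights - dequantized).abs().max())                      # per-row rounding error
```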
-
-Before you begin, make sure the following libraries are installed and updated to the latest release:
+Run the commands below to install AutoGPTQ.

```bash
pip install --upgrade accelerate optimum transformers
```

or

pip install auto-gptq --no-build-isolation
```

-To quantize a model (currently only supported for text models), you need to create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset.
+Create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

-model_id = "facebook/opt-125m"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
```

-You could also pass your own dataset as a list of strings, but it is highly recommended to use the same dataset from the GPTQ paper.
-
-```py
-dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
-gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
-```
+> [!TIP]
+> You can pass your own dataset as a list of strings, but it is highly recommended to use the same dataset from the GPTQ paper.
+>
+> ```py
+> dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+> gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+> ```

-Load a model to quantize and pass the `gptq_config` to the [`~AutoModelForCausalLM.from_pretrained`] method. Set `device_map="auto"` to automatically offload the model to a CPU to help fit the model in memory, and allow the model modules to be moved between the CPU and GPU for quantization.
+Load a model to quantize and pass [`GPTQConfig`] to [`~AutoModelForCausalLM.from_pretrained`]. Set `device_map="auto"` to automatically offload the model to a CPU to help fit the model in memory, and allow the model modules to be moved between the CPU and GPU for quantization.

```py
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
+quantized_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", device_map="auto", quantization_config=gptq_config)
```

-If you're running out of memory because a dataset is too large, disk offloading is not supported. If this is the case, try passing the `max_memory` parameter to allocate the amount of memory to use on your device (GPU and CPU):
+If you're running out of memory because a dataset is too large (disk offloading is not supported), try passing the `max_memory` parameter to allocate the amount of memory to use on your device (GPU and CPU).

```py
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "30GiB", 1: "46GiB", "cpu": "30GiB"}, quantization_config=gptq_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "facebook/opt-125m",
+    device_map="auto",
+    max_memory={0: "30GiB", 1: "46GiB", "cpu": "30GiB"},
+    quantization_config=gptq_config
+)
```

-
+> [!WARNING]
+> Depending on your hardware, it can take some time to quantize a model from scratch.
It can take ~5 minutes to quantize the [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) model on a free-tier Google Colab GPU, but it'll take ~4 hours to quantize a 175B parameter model on a NVIDIA A100. Before you quantize a model, it is a good idea to check the Hub if a GPTQ-quantized version of the model already exists. -Depending on your hardware, it can take some time to quantize a model from scratch. It can take ~5 minutes to quantize the [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) model on a free-tier Google Colab GPU, but it'll take ~4 hours to quantize a 175B parameter model on a NVIDIA A100. Before you quantize a model, it is a good idea to check the Hub if a GPTQ-quantized version of the model already exists. - - - -Once your model is quantized, you can push the model and tokenizer to the Hub where it can be easily shared and accessed. Use the [`~PreTrainedModel.push_to_hub`] method to save the [`GPTQConfig`]: +Once a model is quantized, you can use [`~PreTrainedModel.push_to_hub`] to push the model and tokenizer to the Hub where it can be easily shared and accessed. This saves the [`GPTQConfig`]. ```py quantized_model.push_to_hub("opt-125m-gptq") tokenizer.push_to_hub("opt-125m-gptq") ``` -You could also save your quantized model locally with the [`~PreTrainedModel.save_pretrained`] method. If the model was quantized with the `device_map` parameter, make sure to move the entire model to a GPU or CPU before saving it. For example, to save the model on a CPU: +[`~PreTrainedModel.save_pretrained`] saves a quantized model locally. If the model was quantized with the `device_map` parameter, make sure to move the entire model to a GPU or CPU before saving it. The example below saves the model on a CPU. ```py quantized_model.save_pretrained("opt-125m-gptq") @@ -111,7 +91,7 @@ quantized_model.to("cpu") quantized_model.save_pretrained("opt-125m-gptq") ``` -Reload a quantized model with the [`~PreTrainedModel.from_pretrained`] method, and set `device_map="auto"` to automatically distribute the model on all available GPUs to load the model faster without using more memory than needed. +Reload a quantized model with [`~PreTrainedModel.from_pretrained`], and set `device_map="auto"` to automatically distribute the model on all available GPUs to load the model faster without using more memory than needed. ```py from transformers import AutoModelForCausalLM @@ -134,27 +114,39 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de ## ExLlama -[ExLlama](https://github.com/turboderp/exllama) is a CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: +> [!WARNING] +> Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're finetuning a quantized model with PEFT. + +[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). 
The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. + +To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter in [`GPTQConfig`]. ```py import torch from transformers import AutoModelForCausalLM, GPTQConfig gptq_config = GPTQConfig(bits=4, exllama_config={"version":2}) -model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config) +model = AutoModelForCausalLM.from_pretrained( + "{your_username}/opt-125m-gptq", + device_map="auto", + quantization_config=gptq_config +) ``` - - -Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're finetuning a quantized model with PEFT. - - - -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. +The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ 0.4.2+, disable the ExLlama kernel in [`GPTQConfig`]. This overwrites the attributes related to the ExLlama kernels in the quantization config of the `config.json` file. ```py import torch from transformers import AutoModelForCausalLM, GPTQConfig + gptq_config = GPTQConfig(bits=4, use_exllama=False) -model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config) +model = AutoModelForCausalLM.from_pretrained( + "{your_username}/opt-125m-gptq", + device_map="cpu", + quantization_config=gptq_config +) ``` + +## Resources + +Run the GPTQ quantization with PEFT [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) for a hands-on experience, and read [Making LLMs lighter with AutoGPTQ and transformers](https://huggingface.co/blog/gptq-integration) to learn more about the AutoGPTQ integration. From 67729fadb6567bda3dce0b080bca7a83e71a44ff Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Fri, 20 Dec 2024 17:09:17 -0800 Subject: [PATCH 085/116] quant pt 4 --- docs/source/en/_toctree.yml | 2 + docs/source/en/quantization/vptq.md | 83 ++++++++--------------------- 2 files changed, 24 insertions(+), 61 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 449ae9f61985..6480f78fab03 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -182,6 +182,8 @@ title: Quanto - local: quantization/torchao title: torchao + - local: quantization/vptq + title: VPTQ - local: quantization/contribute title: Contribute - title: Deploy to production diff --git a/docs/source/en/quantization/vptq.md b/docs/source/en/quantization/vptq.md index b86e82f0a350..af082c5f2f24 100644 --- a/docs/source/en/quantization/vptq.md +++ b/docs/source/en/quantization/vptq.md @@ -14,34 +14,33 @@ rendered properly in your Markdown viewer. --> -# VPTQ +# VPTQ -> [!TIP] -> Try VPTQ on [Hugging Face](https://huggingface.co/spaces/microsoft/VPTQ)! -> Try VPTQ on [Google Colab](https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb)! -> Know more about VPTQ on [ArXiv](https://arxiv.org/pdf/2409.17066)! 
+[Vector Post-Training Quantization (VPTQ)](https://github.com/microsoft/VPTQ) is a Post-Training Quantization (PTQ) method that leverages vector quantization to quantize LLMs at an extremely low bit-width (<2-bit). VPTQ can compress a 70B, even a 405B model, to 1-2 bits without retraining and still maintain a high degree of accuracy. It is a lightweight quantization algorithm that takes ~17 hours to quantize a 405B model. VPTQ also offers agile inference, with low decoding overhead, high throughput, and a fast Time To First Token (TTFT).

-Vector Post-Training Quantization ([VPTQ](https://github.com/microsoft/VPTQ)) is a novel Post-Training Quantization method that leverages Vector Quantization to high accuracy on LLMs at an extremely low bit-width (<2-bit). VPTQ can compress 70B, even the 405B model, to 1-2 bits without retraining and maintain high accuracy.
+Run the command below to install VPTQ, which provides efficient kernels for inference on NVIDIA and AMD GPUs.

-- Better Accuracy on 1-2 bits, (405B @ <2bit, 70B @ 2bit)
-- Lightweight Quantization Algorithm: only cost ~17 hours to quantize 405B Llama-3.1
-- Agile Quantization Inference: low decode overhead, best throughput, and TTFT
-
-Inference support for VPTQ is released in the `vptq` library. Make sure to install it to run the models:
```bash
pip install vptq
```

-The library provides efficient kernels for NVIDIA/AMD GPU inference.
+The [VPTQ-community](https://huggingface.co/VPTQ-community) provides a collection of VPTQ-quantized models. The model name contains information about its bitwidth (excluding codebook, parameter, and padding overhead). Consider the [Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft) model as an example.
+
+- The model name is Meta-Llama-3.1-70B-Instruct.
+- The vector length is 8 (from `v8`), so each index covers 8 weights.
+- The number of centroids is given by 65536 (2^16).
+- The number of residual centroids is given by 256 (2^8).

-To run VPTQ models simply load a model that has been quantized with VPTQ:
+The equivalent bit-width calculation is given by the following.

-## Inference example
-**Run Llama 3.1 70b on RTX4090 (24G @ ~2bits) in real time**
-![Llama3 1-70b-prompt](https://github.com/user-attachments/assets/d8729aca-4e1d-4fe1-ac71-c14da4bdd97f)
+- index: log2(65536) = 16 bits / 8 weights per vector = 2-bits
+- residual index: log2(256) = 8 bits / 8 weights per vector = 1-bit
+- total bit-width: 2 + 1 = 3-bits

+From here, estimate the model size by multiplying 70B * 3-bits / 8-bits/byte for a total of 26.25GB.

-```python
+Load a VPTQ quantized model with [`~PreTrainedModel.from_pretrained`].
+
+```py
from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
@@ -49,18 +48,13 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
    torch_dtype="auto",
    device_map="auto"
)
-tokenizer = AutoTokenizer.from_pretrained("VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft")
-input_ids = tokenizer("hello, it's me", return_tensors="pt").to("cuda")
-out = model.generate(**input_ids, max_new_tokens=32, do_sample=False)
```

-## Quantize your own model
-VPTQ algorithm early-released at [VPTQ ](https://github.com/microsoft/VPTQ/tree/algorithm),
-and checkout the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md).
+To quantize your own model, refer to the [VPTQ Quantization Algorithm Tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md).

-## Early Results from Tech Report
-VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes.
The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed. +## Benchmarks +VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes. The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed. | Model | bitwidth | W2↓ | C4↓ | AvgQA↑ | tok/s↑ | mem(GB) | cost/h↓ | | ----------- | -------- | ---- | ---- | ------ | ------ | ------- | ------- | @@ -71,41 +65,8 @@ VPTQ achieves better accuracy and higher throughput with lower quantization over | LLaMA-2 70B | 2.07 | 3.93 | 5.72 | 68.6 | 9.7 | 19.54 | 19 | | | 2.11 | 3.92 | 5.71 | 68.7 | 9.7 | 20.01 | 19 | +## Resources +See an example demo of VPTQ on the VPTQ Online Demo [Space](https://huggingface.co/spaces/microsoft/VPTQ) or try running the VPTQ inference [notebook](https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb). -## More Models in [VPTQ-community](https://huggingface.co/VPTQ-community) - -⚠️ The repository only provides a method of model quantization algorithm. - -⚠️ The open-source community VPTQ-community provides models based on the technical report and quantization algorithm. - - - -**Quick Estimation of Model Bitwidth (Excluding Codebook Overhead)**: - -- **Model Naming Convention**: The model's name includes the **vector length** $v$, **codebook (lookup table) size**, and **residual codebook size**. For example, "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft" is "Meta-Llama-3.1-70B-Instruct", where: - - **Vector Length**: 8 - - **Number of Centroids**: 65536 (2^16) - - **Number of Residual Centroids**: 256 (2^8) -- **Equivalent Bitwidth Calculation**: - - **Index**: log2(65536) = 16 / 8 = 2 bits - - **Residual Index**: log2(256) = 8 / 8 = 1 bit - - **Total Bitwidth**: 2 + 1 = 3 bits -- **Model Size Estimation**: 70B * 3 bits / 8 bits per Byte = 26.25 GB - -- **Note**: This estimate does not include the size of the codebook (lookup table), other parameter overheads, and the padding overhead for storing indices. For the detailed calculation method, please refer to **Tech Report Appendix C.2**. 
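For a quick local test outside the notebook, a minimal generation sketch might look like the following. The checkpoint below is the 3-bit 70B model used in the naming example above, which is an assumption on my part; substitute any VPTQ-community model that fits your hardware, keeping in mind that the 70B model needs roughly 26GB of memory at 3-bits.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

# assumes the 3-bit VPTQ-community checkpoint discussed in the naming example
model_id = "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft"

quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Vector quantization is", return_tensors="pt").to(quantized_model.device)
outputs = quantized_model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```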
- - -| Model Series | Collections | (Estimated) Bit per weight | -| :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Llama 3.1 Nemotron 70B Instruct HF | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942) | [4 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-16384-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-256-woft) | -| Llama 3.1 8B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-8b-instruct-without-finetune-66f2b70b1d002ceedef02d2e) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft) [3.5 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-4096-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft) [2.3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft) | -| Llama 3.1 70B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-70b-instruct-without-finetune-66f2bf454d3dd78dfee2ff11) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft) [2.25 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-0-woft) [1.93 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-32768-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k16384-0-woft) | -| Llama 3.1 405B Instruct | [HF 
🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-256-woft) [2 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-65536-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k32768-32768-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-1024-woft) [1.5 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k4096-0-woft) [1.5 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-256-woft) [1.43 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-128-woft) [1.375 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-64-woft) | -| Mistral Large Instruct 2407 (123B) | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16) | [4 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-16384-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-4096-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-256-woft) | -| Qwen 2.5 7B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-7b-instruct-without-finetune-66f3e9866d3167cc05ce954a) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v16-k65536-65536-woft) | -| Qwen 2.5 14B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-14b-instruct-without-finetune-66f827f83c7ffa7931b8376c) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v16-k65536-65536-woft) | -| Qwen 2.5 32B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-32b-instruct-without-finetune-66fe77173bf7d64139f0f613) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft) [2 bits 
(1)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k256-256-woft) | -| Qwen 2.5 72B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-72b-instruct-without-finetune-66f3bf1b3757dfa1ecb481c0) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft) [2.38 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k1024-512-woft) [2.25 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k512-512-woft) [2.25 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-0-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft) [1.94 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-32768-woft) | -| Reproduced from the tech report | [HF 🤗](https://huggingface.co/collections/VPTQ-community/reproduced-vptq-tech-report-baseline-66fbf1dffe741cc9e93ecf04) | Results from the open source community for reference only, please use them responsibly. | -| Hessian and Inverse Hessian Matrix | [HF 🤗](https://huggingface.co/collections/VPTQ-community/hessian-and-invhessian-checkpoints-66fd249a104850d17b23fd8b) | Collected from RedPajama-Data-1T-Sample, following [Quip#](https://github.com/Cornell-RelaxML/quip-sharp/blob/main/quantize_llama/hessian_offline_llama.py) \ No newline at end of file +For more information, read the VPTQ [paper](https://arxiv.org/pdf/2409.17066). From d5f8973725890f83a53183d5b45c3230b8d60b09 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 31 Dec 2024 08:43:28 -0800 Subject: [PATCH 086/116] serialization --- docs/source/en/_toctree.yml | 8 +- docs/source/en/quantization/higgs.md | 62 ++++++----- docs/source/en/serialization.md | 153 ++++----------------------- docs/source/en/tflite.md | 46 ++++---- docs/source/en/torchscript.md | 2 +- 5 files changed, 90 insertions(+), 181 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 6480f78fab03..71d88ba75cc3 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -174,6 +174,8 @@ title: GGUF - local: quantization/gptq title: GPTQ + - local: quantization/higgs + title: HIGGS - local: quantization/hqq title: HQQ - local: quantization/optimum @@ -190,11 +192,11 @@ isExpanded: False sections: - local: serialization - title: Export to ONNX + title: ONNX - local: tflite - title: Export to TFLite + title: LiteRT - local: torchscript - title: Export to TorchScript + title: TorchScript - title: Resources isExpanded: False sections: diff --git a/docs/source/en/quantization/higgs.md b/docs/source/en/quantization/higgs.md index d2aa9c9dc497..ce4f33fdcc86 100644 --- a/docs/source/en/quantization/higgs.md +++ b/docs/source/en/quantization/higgs.md @@ -16,11 +16,30 @@ rendered properly in your Markdown viewer. # HIGGS -HIGGS is a 0-shot quantization algorithm that combines Hadamard preprocessing with MSE-Optimal quantization grids to achieve lower quantization error and SOTA performance. You can find more information in the paper [arxiv.org/abs/2411.17525](https://arxiv.org/abs/2411.17525). 
+[HIGGS](https://arxiv.org/abs/2411.17525) is a zero-shot quantization algorithm that combines Hadamard preprocessing with MSE-Optimal quantization grids to achieve lower quantization error and state-of-the-art performance. -Runtime support for HIGGS is implemented through [FLUTE](https://arxiv.org/abs/2407.10960), and its [library](https://github.com/HanGuo97/flute). +Runtime support for HIGGS is implemented through the [FLUTE](https://github.com/HanGuo97/flute) library. Only the 70B and 405B variants of Llama 3 and Llama 3.0, and the 8B and 27B variants of Gemma 2 are currently supported. HIGGS also doesn't support quantized training and backward passes in general at the moment. -## Quantization Example +Run the command below to install FLUTE. + + + + +```bash +pip install flute-kernel +``` + + + + +```bash +pip install flute-kernel -i https://flute-ai.github.io/whl/cu118 +``` + + + + +Create a [`HiggsConfig`] with the number of bits to quantize a model to. ```python from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig @@ -30,37 +49,32 @@ model = AutoModelForCausalLM.from_pretrained( quantization_config=HiggsConfig(bits=4), device_map="auto", ) - -tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it") - -tokenizer.decode(model.generate( - **tokenizer("Hi,", return_tensors="pt").to(model.device), - temperature=0.5, - top_p=0.80, -)[0]) ``` -## Pre-quantized models +> [!TIP] +> Find models pre-quantized with HIGGS in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/higgs-675308e432fd56b7f6dab94e). -Some pre-quantized models can be found in the [official collection](https://huggingface.co/collections/ISTA-DASLab/higgs-675308e432fd56b7f6dab94e) on Hugging Face Hub. +## torch.compile -## Current Limitations +HIGGS is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html). -**Architectures** +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig -Currently, FLUTE, and HIGGS by extension, **only support Llama 3 and 3.0 of 8B, 70B and 405B parameters, as well as Gemma-2 9B and 27B**. We're working on allowing to run more diverse models as well as allow arbitrary models by modifying the FLUTE compilation procedure. +model = AutoModelForCausalLM.from_pretrained( + "google/gemma-2-9b-it", + quantization_config=HiggsConfig(bits=4), + device_map="auto", +) -**torch.compile** +model = torch.compile(model) +``` -HIGGS is fully compatible with `torch.compile`. Compiling `model.forward`, as described [here](../perf_torch_compile.md), here're the speedups it provides on RTX 4090 for `Llama-3.1-8B-Instruct` (forward passes/sec): +Refer to the table below for a benchmark of forward passes/sec for Llama-3.1-8B-Instruct on a RTX4090. -| Batch Size | BF16 (With `torch.compile`) | HIGGS 4bit (No `torch.compile`) | HIGGS 4bit (With `torch.compile`) | +| Batch Size | BF16 (with `torch.compile`) | HIGGS 4bit (without `torch.compile`) | HIGGS 4bit (with `torch.compile`) | |------------|-----------------------------|----------------------------------|-----------------------------------| | 1 | 59 | 41 | 124 | | 4 | 57 | 42 | 123 | | 16 | 56 | 41 | 120 | - - -**Quantized training** - -Currently, HIGGS doesn't support quantized training (and backward passes in general). We're working on adding support for it. 
\ No newline at end of file diff --git a/docs/source/en/serialization.md b/docs/source/en/serialization.md index 158db928812e..83d8832bac12 100644 --- a/docs/source/en/serialization.md +++ b/docs/source/en/serialization.md @@ -14,69 +14,41 @@ rendered properly in your Markdown viewer. --> -# Export to ONNX +# ONNX -Deploying 🤗 Transformers models in production environments often requires, or can benefit from exporting the models into -a serialized format that can be loaded and executed on specialized runtimes and hardware. +[ONNX](http://onnx.ai) is an open standard that defines a common set of operators and a file format to represent deep learning models in different frameworks, including PyTorch and TensorFlow. When a model is exported to ONNX, the operators construct a computational graph (or *intermediate representation*) which represents the flow of data through the model. Standardized operators and data types makes it easy to switch between frameworks. -🤗 Optimum is an extension of Transformers that enables exporting models from PyTorch or TensorFlow to serialized formats -such as ONNX and TFLite through its `exporters` module. 🤗 Optimum also provides a set of performance optimization tools to train -and run models on targeted hardware with maximum efficiency. +The [Optimum](https://huggingface.co/docs/optimum/index) library exports a model to ONNX with configuration objects which are supported for [many architectures]((https://huggingface.co/docs/optimum/exporters/onnx/overview)) and can be easily extended. If a model isn't supported, feel free to make a [contribution](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) to Optimum. -This guide demonstrates how you can export 🤗 Transformers models to ONNX with 🤗 Optimum, for the guide on exporting models to TFLite, -please refer to the [Export to TFLite page](tflite). +The benefits of exporting to ONNX include the following. -## Export to ONNX +- [Graph optimization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) and [quantization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) for improving inference. +- Use the [`~optimum.onnxruntime.ORTModel`] API to run a model with [ONNX Runtime](https://onnxruntime.ai/). +- Use [optimized inference pipelines](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines) for ONNX models. -[ONNX (Open Neural Network eXchange)](http://onnx.ai) is an open standard that defines a common set of operators and a -common file format to represent deep learning models in a wide variety of frameworks, including PyTorch and -TensorFlow. When a model is exported to the ONNX format, these operators are used to -construct a computational graph (often called an _intermediate representation_) which -represents the flow of data through the neural network. +Export a Transformers model to ONNX with the Optimum CLI or the `optimum.onnxruntime` module. -By exposing a graph with standardized operators and data types, ONNX makes it easy to -switch between frameworks. For example, a model trained in PyTorch can be exported to -ONNX format and then imported in TensorFlow (and vice versa). +## Optimum CLI -Once exported to ONNX format, a model can be: -- optimized for inference via techniques such as [graph optimization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) and [quantization](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization). 
-- run with ONNX Runtime via [`ORTModelForXXX` classes](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort), -which follow the same `AutoModel` API as the one you are used to in 🤗 Transformers. -- run with [optimized inference pipelines](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines), -which has the same API as the [`pipeline`] function in 🤗 Transformers. - -🤗 Optimum provides support for the ONNX export by leveraging configuration objects. These configuration objects come -ready-made for a number of model architectures, and are designed to be easily extendable to other architectures. - -For the list of ready-made configurations, please refer to [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/onnx/overview). - -There are two ways to export a 🤗 Transformers model to ONNX, here we show both: - -- export with 🤗 Optimum via CLI. -- export with 🤗 Optimum with `optimum.onnxruntime`. - -### Exporting a 🤗 Transformers model to ONNX with CLI - -To export a 🤗 Transformers model to ONNX, first install an extra dependency: +Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module. ```bash pip install optimum[exporters] ``` -To check out all available arguments, refer to the [🤗 Optimum docs](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli), -or view help in command line: +> [!TIP] +> Refer to the [Export a model to ONNX with optimum.exporters.onnx](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) guide for all available arguments or with the command below. +> ```bash +> optimum-cli export onnx --help +> ``` -```bash -optimum-cli export onnx --help -``` - -To export a model's checkpoint from the 🤗 Hub, for example, `distilbert/distilbert-base-uncased-distilled-squad`, run the following command: +Set the `--model` argument to export a PyTorch or TensorFlow model from the Hub. ```bash optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/ ``` -You should see the logs indicating progress and showing where the resulting `model.onnx` is saved, like this: +You should see logs indicating the progress and showing where the resulting `model.onnx` is saved. ```bash Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx... @@ -90,20 +62,13 @@ Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx... The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx ``` -The example above illustrates exporting a checkpoint from 🤗 Hub. When exporting a local model, first make sure that you -saved both the model's weights and tokenizer files in the same directory (`local_path`). When using CLI, pass the -`local_path` to the `model` argument instead of the checkpoint name on 🤗 Hub and provide the `--task` argument. -You can review the list of supported tasks in the [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). -If `task` argument is not provided, it will default to the model architecture without any task specific head. +For local models, make sure the model weights and tokenizer files are saved in the same directory, for example `local_path`. 
Pass the directory to the `--model` argument and use `--task` to indicate the [task](https://huggingface.co/docs/optimum/exporters/task_manager) a model can perform. If `--task` isn't provided, the model architecture without a task-specific head is used. ```bash optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/ ``` -The resulting `model.onnx` file can then be run on one of the [many -accelerators](https://onnx.ai/supported-tools.html#deployModel) that support the ONNX -standard. For example, we can load and run the model with [ONNX -Runtime](https://onnxruntime.ai/) as follows: +The `model.onnx` file can be deployed with any [accelerator](https://onnx.ai/supported-tools.html#deployModel) that supports ONNX. The example below demonstrates loading and running a model with ONNX Runtime. ```python >>> from transformers import AutoTokenizer @@ -115,16 +80,9 @@ Runtime](https://onnxruntime.ai/) as follows: >>> outputs = model(**inputs) ``` -The process is identical for TensorFlow checkpoints on the Hub. For instance, here's how you would -export a pure TensorFlow checkpoint from the [Keras organization](https://huggingface.co/keras-io): - -```bash -optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/ -``` +## optimum.onnxruntime -### Exporting a 🤗 Transformers model to ONNX with `optimum.onnxruntime` - -Alternative to CLI, you can export a 🤗 Transformers model to ONNX programmatically like so: +The `optimum.onnxruntime` module supports programmatically exporting a Transformers model. Instantiate a [`~optimum.onnxruntime.ORTModel`] for a task and set `export=True`. Use [`~OptimizedModel.save_pretrained`] to save the ONNX model. ```python >>> from optimum.onnxruntime import ORTModelForSequenceClassification @@ -133,78 +91,9 @@ Alternative to CLI, you can export a 🤗 Transformers model to ONNX programmati >>> model_checkpoint = "distilbert/distilbert-base-uncased-distilled-squad" >>> save_directory = "onnx/" ->>> # Load a model from transformers and export it to ONNX >>> ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True) >>> tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) ->>> # Save the onnx model and tokenizer >>> ort_model.save_pretrained(save_directory) >>> tokenizer.save_pretrained(save_directory) ``` - -### Exporting a model for an unsupported architecture - -If you wish to contribute by adding support for a model that cannot be currently exported, you should first check if it is -supported in [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview), -and if it is not, [contribute to 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) -directly. - -### Exporting a model with `transformers.onnx` - - - -`transformers.onnx` is no longer maintained, please export models with 🤗 Optimum as described above. This section will be removed in the future versions. - - - -To export a 🤗 Transformers model to ONNX with `transformers.onnx`, install extra dependencies: - -```bash -pip install transformers[onnx] -``` - -Use `transformers.onnx` package as a Python module to export a checkpoint using a ready-made configuration: - -```bash -python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/ -``` - -This exports an ONNX graph of the checkpoint defined by the `--model` argument. Pass any checkpoint on the 🤗 Hub or one that's stored locally. 
-The resulting `model.onnx` file can then be run on one of the many accelerators that support the ONNX standard. For example, -load and run the model with ONNX Runtime as follows: - -```python ->>> from transformers import AutoTokenizer ->>> from onnxruntime import InferenceSession - ->>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") ->>> session = InferenceSession("onnx/model.onnx") ->>> # ONNX Runtime expects NumPy arrays as input ->>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") ->>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) -``` - -The required output names (like `["last_hidden_state"]`) can be obtained by taking a look at the ONNX configuration of -each model. For example, for DistilBERT we have: - -```python ->>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig - ->>> config = DistilBertConfig() ->>> onnx_config = DistilBertOnnxConfig(config) ->>> print(list(onnx_config.outputs.keys())) -["last_hidden_state"] -``` - -The process is identical for TensorFlow checkpoints on the Hub. For example, export a pure TensorFlow checkpoint like so: - -```bash -python -m transformers.onnx --model=keras-io/transformers-qa onnx/ -``` - -To export a model that's stored locally, save the model's weights and tokenizer files in the same directory (e.g. `local-pt-checkpoint`), -then export it to ONNX by pointing the `--model` argument of the `transformers.onnx` package to the desired directory: - -```bash -python -m transformers.onnx --model=local-pt-checkpoint onnx/ -``` \ No newline at end of file diff --git a/docs/source/en/tflite.md b/docs/source/en/tflite.md index 09434a81508d..c1ab9618b436 100644 --- a/docs/source/en/tflite.md +++ b/docs/source/en/tflite.md @@ -14,37 +14,39 @@ rendered properly in your Markdown viewer. --> -# Export to TFLite +# LiteRT -[TensorFlow Lite](https://www.tensorflow.org/lite/guide) is a lightweight framework for deploying machine learning models -on resource-constrained devices, such as mobile phones, embedded systems, and Internet of Things (IoT) devices. -TFLite is designed to optimize and run models efficiently on these devices with limited computational power, memory, and -power consumption. -A TensorFlow Lite model is represented in a special efficient portable format identified by the `.tflite` file extension. +[LiteRT](https://ai.google.dev/edge/litert) (previously known as TensorFlow Lite) is a high-performance runtime designed for on-device machine learning. -🤗 Optimum offers functionality to export 🤗 Transformers models to TFLite through the `exporters.tflite` module. -For the list of supported model architectures, please refer to [🤗 Optimum documentation](https://huggingface.co/docs/optimum/exporters/tflite/overview). +The [Optimum](https://huggingface.co/docs/optimum/index) library exports a model to LiteRT for [many architectures]((https://huggingface.co/docs/optimum/exporters/onnx/overview)). -To export a model to TFLite, install the required dependencies: - -```bash -pip install optimum[exporters-tf] -``` +The benefits of exporting to LiteRT include the following. + +- Low-latency, privacy-focused, no internet connectivity required, and reduced model size and power consumption for on-device machine learning. +- Broad platform, model framework, and language support. +- Hardware acceleration for GPUs and Apple Silicon. + +Export a Transformers model to LiteRT with the Optimum CLI. 
-To check out all available arguments, refer to the [🤗 Optimum docs](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model), -or view help in command line: +Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module for LiteRT. ```bash -optimum-cli export tflite --help +pip install optimum[exporters-tf] ``` -To export a model's checkpoint from the 🤗 Hub, for example, `google-bert/bert-base-uncased`, run the following command: +> [!TIP] +> Refer to the [Export a model to TFLite with optimum.exporters.tflite](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) guide for all available arguments or with the command below. +> ```bash +> optimum-cli export tflite --help +> ``` + +Set the `--model` argument to export a from the Hub. ```bash optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/ ``` -You should see the logs indicating progress and showing where the resulting `model.tflite` is saved, like this: +You should see logs indicating the progress and showing where the resulting `model.tflite` is saved. ```bash Validating TFLite model... @@ -57,6 +59,8 @@ The TensorFlow Lite export succeeded with the warning: The maximum absolute diff The exported model was saved at: bert_tflite ``` -The example above illustrates exporting a checkpoint from 🤗 Hub. When exporting a local model, first make sure that you -saved both the model's weights and tokenizer files in the same directory (`local_path`). When using CLI, pass the -`local_path` to the `model` argument instead of the checkpoint name on 🤗 Hub. \ No newline at end of file +For local models, make sure the model weights and tokenizer files are saved in the same directory, for example `local_path`. Pass the directory to the `--model` argument and use `--task` to indicate the [task](https://huggingface.co/docs/optimum/exporters/task_manager) a model can perform. If `--task` isn't provided, the model architecture without a task-specific head is used. + +```bash +optimum-cli export tflite --model local_path --task question-answering google-bert/bert-base-uncased --sequence_length 128 bert_tflite/ +``` diff --git a/docs/source/en/torchscript.md b/docs/source/en/torchscript.md index b62e23468f8f..71bd325e7093 100644 --- a/docs/source/en/torchscript.md +++ b/docs/source/en/torchscript.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. --> -# Export to TorchScript +# TorchScript From 46c8bd680e70c82ed1ef90e09da83cfc349f91d1 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 6 Jan 2025 14:56:40 -0800 Subject: [PATCH 087/116] torchscript --- docs/source/en/torchscript.md | 225 ++++++++++------------------------ 1 file changed, 67 insertions(+), 158 deletions(-) diff --git a/docs/source/en/torchscript.md b/docs/source/en/torchscript.md index 71bd325e7093..ae3c10f77da8 100644 --- a/docs/source/en/torchscript.md +++ b/docs/source/en/torchscript.md @@ -16,106 +16,54 @@ rendered properly in your Markdown viewer. # TorchScript - +[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may the most performant choice. -This is the very beginning of our experiments with TorchScript and we are still -exploring its capabilities with variable-input-size models. 
It is a focus of interest to -us and we will deepen our analysis in upcoming releases, with more code examples, a more -flexible implementation, and benchmarks comparing Python-based codes with compiled -TorchScript. +Transformers can export a model to TorchScript by: - +1. creating dummy inputs to create a *trace* of the model to serialize to TorchScript +2. enabling the `torchscript` parameter in either [`~PretrainedConfig.torchscript`] for a randomly initialized model or [`~PreTrainedModel.from_pretrained`] for a pretrained model -According to the [TorchScript documentation](https://pytorch.org/docs/stable/jit.html): +## Dummy inputs -> TorchScript is a way to create serializable and optimizable models from PyTorch code. +The dummy inputs are used in the forward pass, and as the input values are propagated through each layer, PyTorch tracks the different operations executed on each tensor. The recorded operations are used to create the model trace. Once it is recorded, it is serialized into a TorchScript program. -There are two PyTorch modules, [JIT and -TRACE](https://pytorch.org/docs/stable/jit.html), that allow developers to export their -models to be reused in other programs like efficiency-oriented C++ programs. - -We provide an interface that allows you to export 🤗 Transformers models to TorchScript -so they can be reused in a different environment than PyTorch-based Python programs. -Here, we explain how to export and use our models using TorchScript. - -Exporting a model requires two things: - -- model instantiation with the `torchscript` flag -- a forward pass with dummy inputs - -These necessities imply several things developers should be careful about as detailed -below. - -## TorchScript flag and tied weights - -The `torchscript` flag is necessary because most of the 🤗 Transformers language models -have tied weights between their `Embedding` layer and their `Decoding` layer. -TorchScript does not allow you to export models that have tied weights, so it is -necessary to untie and clone the weights beforehand. - -Models instantiated with the `torchscript` flag have their `Embedding` layer and -`Decoding` layer separated, which means that they should not be trained down the line. -Training would desynchronize the two layers, leading to unexpected results. - -This is not the case for models that do not have a language model head, as those do not -have tied weights. These models can be safely exported without the `torchscript` flag. - -## Dummy inputs and standard lengths - -The dummy inputs are used for a models forward pass. While the inputs' values are -propagated through the layers, PyTorch keeps track of the different operations executed -on each tensor. These recorded operations are then used to create the *trace* of the -model. - -The trace is created relative to the inputs' dimensions. It is therefore constrained by -the dimensions of the dummy input, and will not work for any other sequence length or -batch size. When trying with a different size, the following error is raised: - -``` -`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` -``` - -We recommended you trace the model with a dummy input size at least as large as the -largest input that will be fed to the model during inference. Padding can help fill the -missing values. However, since the model is traced with a larger input size, the -dimensions of the matrix will also be large, resulting in more calculations. 
- -Be careful of the total number of operations done on each input and follow the -performance closely when exporting varying sequence-length models. - -## Using TorchScript in Python - -This section demonstrates how to save and load models as well as how to use the trace -for inference. - -### Saving a model - -To export a `BertModel` with TorchScript, instantiate `BertModel` from the `BertConfig` -class and then save it to disk under the filename `traced_bert.pt`: - -```python +```py from transformers import BertModel, BertTokenizer, BertConfig import torch -enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - -# Tokenizing input text +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" -tokenized_text = enc.tokenize(text) +tokenized_text = tokenizer.tokenize(text) -# Masking one of the input tokens masked_index = 8 tokenized_text[masked_index] = "[MASK]" -indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] -# Creating a dummy input +# creating a dummy input tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([segments_ids]) dummy_input = [tokens_tensor, segments_tensors] +``` + +The trace is created based on the provided inputs dimensions and it can only handle inputs with the same shape as the provided input during tracing. An input with a different size raises the error message shown below. + +```bash +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`. +``` + +Try to create a trace with a dummy input size at least as large as the largest expected input during inference. Padding can help fill missing values for larger inputs. It may be slower though since a larger input size requires more calculations. Be mindful of the total number of operations performed on each input and track the model performance when exporting models with variable sequence lengths. + +## torchscript parameter + +Weights between the `Embedding` and `Decoding` layers are tied in Transformers and TorchScript can't export models with tied weights. Instantiating a model with `torchscript=True`, separates the `Embedding` and `Decoding` layers and they aren't trained any further because it would throw the two layers out of sync which can lead to unexpected results. -# Initializing the model with the torchscript flag -# Flag set to True even though it is not necessary as this model does not have an LM Head. +Models *without* a language model head don't have tied weights and can be safely exported without the `torchscript` parameter. + + + + +```py config = BertConfig( vocab_size_or_config_json_file=32000, hidden_size=768, @@ -125,105 +73,66 @@ config = BertConfig( torchscript=True, ) -# Instantiating the model model = BertModel(config) - -# The model needs to be in evaluation mode model.eval() +``` + + + -# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +```py model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True) +model.eval() +``` + + + + +## Export to TorchScript -# Creating the trace +Create the Torchscript program with [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), and save with [torch.jit.save](https://pytorch.org/docs/stable/generated/torch.jit.save.html). 
+ +```py traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) torch.jit.save(traced_model, "traced_bert.pt") ``` -### Loading a model - -Now you can load the previously saved `BertModel`, `traced_bert.pt`, from disk and use -it on the previously initialised `dummy_input`: +Use [torch.jit.load](https://pytorch.org/docs/stable/generated/torch.jit.load.html) to load the traced model. -```python +```py loaded_model = torch.jit.load("traced_bert.pt") loaded_model.eval() all_encoder_layers, pooled_output = loaded_model(*dummy_input) ``` -### Using a traced model for inference - -Use the traced model for inference by using its `__call__` dunder method: +To use the traced model for inference, use the `__call__` dunder method. -```python +```py traced_model(tokens_tensor, segments_tensors) ``` -## Deploy Hugging Face TorchScript models to AWS with the Neuron SDK - -AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) -instance family for low cost, high performance machine learning inference in the cloud. -The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware -accelerator, specializing in deep learning inferencing workloads. [AWS -Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) is the SDK for -Inferentia that supports tracing and optimizing transformers models for deployment on -Inf1. The Neuron SDK provides: - - -1. Easy-to-use API with one line of code change to trace and optimize a TorchScript - model for inference in the cloud. -2. Out of the box performance optimizations for [improved - cost-performance](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). -3. Support for Hugging Face transformers models built with either - [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) - or - [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). - -### Implications - -Transformers models based on the [BERT (Bidirectional Encoder Representations from -Transformers)](https://huggingface.co/docs/transformers/main/model_doc/bert) -architecture, or its variants such as -[distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) and -[roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) run best on -Inf1 for non-generative tasks such as extractive question answering, sequence -classification, and token classification. However, text generation tasks can still be -adapted to run on Inf1 according to this [AWS Neuron MarianMT -tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -More information about models that can be converted out of the box on Inferentia can be -found in the [Model Architecture -Fit](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) -section of the Neuron documentation. - -### Dependencies - -Using AWS Neuron to convert models requires a [Neuron SDK -environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) -which comes preconfigured on [AWS Deep Learning -AMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). 
- -### Converting a model for AWS Neuron - -Convert a model for AWS NEURON using the same code from [Using TorchScript in -Python](torchscript#using-torchscript-in-python) to trace a `BertModel`. Import the -`torch.neuron` framework extension to access the components of the Neuron SDK through a -Python API: - -```python -from transformers import BertModel, BertTokenizer, BertConfig -import torch +## Deploy to AWS + +TorchScript programs serialized from Transformers can be deployed on [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) instances. The instance is powered by AWS Inferentia chips, a custom hardware accelerator designed for deep learning inference workloads. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) supports tracing Transformers models for deployment on Inf1 instances. + +> [!TIP] +> AWS Neuron requires a [Neuron SDK environment](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/inference-torch-neuron.html#inference-torch-neuron) which is preconfigured on [AWS DLAMI](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +Instead of [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html), use [torch.neuron.trace](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuron/api-compilation-python-api.html) to trace a model and optimize it for Inf1 instances. + +```py import torch.neuron + +torch.neuron.trace(model, [tokens_tensor, segments_tensors]) ``` -You only need to modify the following line: +Refer to the [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html) documentation for more information. -```diff -- torch.jit.trace(model, [tokens_tensor, segments_tensors]) -+ torch.neuron.trace(model, [tokens_tensor, segments_tensors]) -``` +### Model architectures + +BERT-based models - like [DistilBERT](./model_doc/distilbert) or [RoBERTa](./model_doc/roberta) - run best on Inf1 instances for non-generative tasks such as extractive question answering, and sequence or token classification. -This enables the Neuron SDK to trace the model and optimize it for Inf1 instances. +Text generation can be adapted to run on an Inf1 instance as shown in the [Transformers MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html) tutorial. -To learn more about AWS Neuron SDK features, tools, example tutorials and latest -updates, please see the [AWS NeuronSDK -documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). +Refer to the [Inference Samples/Tutorials (Inf1)](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/models/inference-inf1-samples.html#model-samples-inference-inf1) guide for more information about which models can be converted out of the box to run on Inf1 instances. 
From d94f933a78dbb77cf98353c4b56279a11f7929f4 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 7 Jan 2025 12:04:35 -0800 Subject: [PATCH 088/116] scripts --- docs/source/en/_toctree.yml | 22 ++- docs/source/en/run_scripts.md | 314 ++++++++++------------------------ 2 files changed, 97 insertions(+), 239 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 71d88ba75cc3..fb2fa52a0d8a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -135,7 +135,7 @@ title: Multi-GPU debugging - local: perf_train_gpu_many title: Parallelism methods - - title: Hardware-specific training + - title: Hardware sections: - local: perf_train_gpu_one title: GPU @@ -188,7 +188,7 @@ title: VPTQ - local: quantization/contribute title: Contribute -- title: Deploy to production +- title: Export to production isExpanded: False sections: - local: serialization @@ -270,27 +270,25 @@ title: Video-text-to-text - local: run_scripts title: Training scripts - - local: benchmarks - title: Benchmarks + - local: glossary + title: Glossary + - local: philosophy + title: Philosophy - local: notebooks title: Notebooks with examples - local: community title: Community resources - local: troubleshooting title: Troubleshoot -- title: Community +- title: Contribute isExpanded: False sections: - local: contributing - title: How to contribute to Transformers? + title: Contribute to Transformers - local: testing - title: Testing + title: Transformers model tests - local: pr_checks - title: Checks on a Pull Request - - local: philosophy - title: Philosophy - - local: glossary - title: Glossary + title: Pull request checks - title: API isExpanded: False sections: diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md index b7a895591970..8acb0f06e693 100644 --- a/docs/source/en/run_scripts.md +++ b/docs/source/en/run_scripts.md @@ -14,21 +14,19 @@ rendered properly in your Markdown viewer. --> -# Train with a script +# Training scripts -Along with the 🤗 Transformers [notebooks](./notebooks), there are also example scripts demonstrating how to train a model for a task with [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch), [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow), or [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax). +Transformers provides many example training scripts for deep learning frameworks (PyTorch, TensorFlow, Flax) and tasks in [transformers/examples](https://github.com/huggingface/transformers/tree/main/examples). There are additional scripts in [transformers/research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [transformers/legacy](https://github.com/huggingface/transformers/tree/main/examples/legacy), but these aren't actively maintained and requires a specific version of Transformers. -You will also find scripts we've used in our [research projects](https://github.com/huggingface/transformers/tree/main/examples/research_projects) and [legacy examples](https://github.com/huggingface/transformers/tree/main/examples/legacy) which are mostly community contributed. These scripts are not actively maintained and require a specific version of 🤗 Transformers that will most likely be incompatible with the latest version of the library. +Example scripts are only examples and you may need to adapt the script to your use-case. 
To help you with this, most scripts are very transparent in how data is preprocessed, allowing you to edit it as necessary.

-The example scripts are not expected to work out-of-the-box on every problem, and you may need to adapt the script to the problem you're trying to solve. To help you with this, most of the scripts fully expose how data is preprocessed, allowing you to edit it as necessary for your use case.
+For any feature you'd like to implement in an example script, please discuss it on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) before submitting a pull request. While we welcome contributions, it is unlikely we will merge a pull request that adds more functionality at the cost of readability.

-For any feature you'd like to implement in an example script, please discuss it on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) before submitting a Pull Request. While we welcome bug fixes, it is unlikely we will merge a Pull Request that adds more functionality at the cost of readability.
-
-This guide will show you how to run an example summarization training script in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). All examples are expected to work with both frameworks unless otherwise specified.
+This guide will show you how to run an example summarization training script in [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) and [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization).

## Setup

-To successfully run the latest version of the example scripts, you have to **install 🤗 Transformers from source** in a new virtual environment:
+Install Transformers from source in a new virtual environment to run the latest version of the example script.

```bash
git clone https://github.com/huggingface/transformers
@@ -36,48 +34,13 @@ cd transformers
pip install .
```

-For older versions of the example scripts, click on the toggle below:
-
-
- Examples for older versions of 🤗 Transformers - -
- -Then switch your current clone of 🤗 Transformers to a specific version, like v3.5.1 for example: +Run the command below to checkout a script from a specific or older version of Transformers. ```bash git checkout tags/v3.5.1 ``` -After you've setup the correct library version, navigate to the example folder of your choice and install the example specific requirements: +After you've setup the correct version, navigate to the example folder of your choice and install the example specific requirements. ```bash pip install -r requirements.txt @@ -85,13 +48,35 @@ pip install -r requirements.txt ## Run a script - - -The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset with the [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/google-t5/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. +Start with a smaller dataset by including the `max_train_samples`, `max_eval_samples`, and `max_predict_samples` parameters to truncate the dataset to a maximum number of samples. This helps ensure training works as expected before committing to the entire dataset which can take hours to complete. + +> [!WARNING] +> Not all example scripts support the `max_predict_samples` parameter. Run the command below to check whether a script supports it or not. +> ```bash +> examples/pytorch/summarization/run_summarization.py -h +> ``` + +The example below fine-tunes [T5-small](https://huggingface.co/google-t5/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail) dataset. T5 requires an additional `source_prefix` parameter to prompt it to summarize. + + + + +The example script downloads and preprocesses a dataset, and then fine-tunes it with [`Trainer`] with a supported model architecture. + +Resuming training from a checkpoint is very useful if training is interrupted because you don't have to start over again. There are two ways to resume training from a checkpoint. + +* `--output dir previous_output_dir` resumes training from the latest checkpoint stored in `output_dir`. Remove the `--overwrite_output_dir` parameter if you're using this method. +* `--resume_from_checkpoint path_to_specific_checkpoint` resumes training from a specific checkpoint folder. + +Share your model on the [Hub](https://huggingface.co/) with the `--push_to_hub` parameter. It creates a repository and uploads the model to the folder name specified in `--output_dir`. You could also use the `--push_to_hub_model_id` parameter to specify the repository name. 
```bash python examples/pytorch/summarization/run_summarization.py \ --model_name_or_path google-t5/t5-small \ + # remove the `max_train_samples`, `max_eval_samples` and `max_predict_samples` if everything works + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ --do_train \ --do_eval \ --dataset_name cnn_dailymail \ @@ -100,83 +85,47 @@ python examples/pytorch/summarization/run_summarization.py \ --output_dir /tmp/tst-summarization \ --per_device_train_batch_size=4 \ --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - - -The example script downloads and preprocesses a dataset from the 🤗 [Datasets](https://huggingface.co/docs/datasets/) library. Then the script fine-tunes a dataset using Keras on an architecture that supports summarization. The following example shows how to fine-tune [T5-small](https://huggingface.co/google-t5/t5-small) on the [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. The T5 model requires an additional `source_prefix` argument due to how it was trained. This prompt lets T5 know this is a summarization task. - -```bash -python examples/tensorflow/summarization/run_summarization.py \ - --model_name_or_path google-t5/t5-small \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 16 \ - --num_train_epochs 3 \ - --do_train \ - --do_eval + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + # remove if using `output_dir previous_output_dir` + # --overwrite_output_dir \ + --output_dir previous_output_dir \ + # --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate \ ``` - - - -## Distributed training and mixed precision -The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supports distributed training and mixed precision, which means you can also use it in a script. To enable both of these features: +For mixed precision and distributed training, include the following parameters and launch training with [torchrun](https://pytorch.org/docs/stable/elastic/run.html). -- Add the `fp16` or `bf16` argument to enable mixed precision. XPU devices only supports `bf16` for mixed precision training. -- Set the number of GPUs to use with the `nproc_per_node` argument. +* Add the `fp16` or `bf16` parameters to enable mixed precision training. XPU devices only supports `bf16`. +* Add the `nproc_per_node` parameter to set number of GPUs to train with. ```bash torchrun \ --nproc_per_node 8 pytorch/summarization/run_summarization.py \ --fp16 \ - --model_name_or_path google-t5/t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate + ... + ... ``` -TensorFlow scripts utilize a [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) for distributed training, and you don't need to add any additional arguments to the training script. The TensorFlow script will use multiple GPUs by default if they are available. - -## Run a script on a TPU - - - -Tensor Processing Units (TPUs) are specifically designed to accelerate performance. 
PyTorch supports TPUs with the [XLA](https://www.tensorflow.org/xla) deep learning compiler (see [here](https://github.com/pytorch/xla/blob/master/README.md) for more details). To use a TPU, launch the `xla_spawn.py` script and use the `num_cores` argument to set the number of TPU cores you want to use. +PyTorch supports TPUs, hardware designed to accelerate performance, through the [PyTorch/XLA](https://github.com/pytorch/xla/blob/master/README.md) package. Launch the `xla_spawn.py` script and use `num _cores` to set the number of TPU cores to train with. ```bash -python xla_spawn.py --num_cores 8 \ - summarization/run_summarization.py \ +python xla_spawn.py --num_cores 8 pytorch/summarization/run_summarization.py \ --model_name_or_path google-t5/t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate + ... + ... ``` - - -Tensor Processing Units (TPUs) are specifically designed to accelerate performance. TensorFlow scripts utilize a [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) for training on TPUs. To use a TPU, pass the name of the TPU resource to the `tpu` argument. + +
+ ```bash -python run_summarization.py \ - --tpu name_of_tpu_resource \ +python examples/tensorflow/summarization/run_summarization.py \ --model_name_or_path google-t5/t5-small \ + # remove the `max_train_samples`, `max_eval_samples` and `max_predict_samples` if everything works + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ --dataset_name cnn_dailymail \ --dataset_config "3.0.0" \ --output_dir /tmp/tst-summarization \ @@ -184,33 +133,46 @@ python run_summarization.py \ --per_device_eval_batch_size 16 \ --num_train_epochs 3 \ --do_train \ - --do_eval + --do_eval \ +``` + +TensorFlow uses the [MirroredStrategy](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) for distributed training and doesn't require adding any additional parameters. The script uses multiple GPUs by default if they are available. + +For TPU training, TensorFlow scripts use the [TPUStrategy](https://www.tensorflow.org/guide/distributed_training#tpustrategy). Pass the TPU resource name to the `--tpu` parameter. + +```bash +python run_summarization.py \ + --tpu name_of_tpu_resource \ + ... + ... ``` - - -## Run a script with 🤗 Accelerate + + -🤗 [Accelerate](https://huggingface.co/docs/accelerate) is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. Make sure you have 🤗 Accelerate installed if you don't already have it: +## Accelerate + +[Accelerate](https://huggingface.co/docs/accelerate) is designed to simplify distributed training while offering complete visibility into the PyTorch training loop. If you're planning on training with a script with Accelerate, use the `_no_trainer.py` version of the script. + +Install Accelerate from source to ensure you have the latest version. -> Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts ```bash pip install git+https://github.com/huggingface/accelerate ``` -Instead of the `run_summarization.py` script, you need to use the `run_summarization_no_trainer.py` script. 🤗 Accelerate supported scripts will have a `task_no_trainer.py` file in the folder. Begin by running the following command to create and save a configuration file: +Run the [accelerate config](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) command to answer a few questions about your training setup. This creates and saves a config file about your system. ```bash accelerate config ``` -Test your setup to make sure it is configured correctly: +You can use [accelerate test](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-test) to ensure your system is properly configured. ```bash accelerate test ``` -Now you are ready to launch the training: +Run [accelerate launch](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) to start training. ```bash accelerate launch run_summarization_no_trainer.py \ @@ -218,18 +180,18 @@ accelerate launch run_summarization_no_trainer.py \ --dataset_name cnn_dailymail \ --dataset_config "3.0.0" \ --source_prefix "summarize: " \ - --output_dir ~/tmp/tst-summarization + --output_dir ~/tmp/tst-summarization \ ``` -## Use a custom dataset +## Custom dataset -The summarization script supports custom datasets as long as they are a CSV or JSON Line file. 
When you use your own dataset, you need to specify several additional arguments: +The summarization scripts supports custom datasets as long as they are a CSV or JSONL file. When using your own dataset, you need to specify the following additional parameters. -- `train_file` and `validation_file` specify the path to your training and validation files. -- `text_column` is the input text to summarize. -- `summary_column` is the target text to output. +* `train_file` and `validation_file` specify the path to your training and validation files. +* `text_column` is the input text to summarize. +* `summary_column` is the target text to output. -A summarization script using a custom dataset would look like this: +An example command for summarizing a custom dataset is shown below. ```bash python examples/pytorch/summarization/run_summarization.py \ @@ -245,107 +207,5 @@ python examples/pytorch/summarization/run_summarization.py \ --overwrite_output_dir \ --per_device_train_batch_size=4 \ --per_device_eval_batch_size=4 \ - --predict_with_generate + --predict_with_generate \ ``` - -## Test a script - -It is often a good idea to run your script on a smaller number of dataset examples to ensure everything works as expected before committing to an entire dataset which may take hours to complete. Use the following arguments to truncate the dataset to a maximum number of samples: - -- `max_train_samples` -- `max_eval_samples` -- `max_predict_samples` - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path google-t5/t5-small \ - --max_train_samples 50 \ - --max_eval_samples 50 \ - --max_predict_samples 50 \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` - -Not all example scripts support the `max_predict_samples` argument. If you aren't sure whether your script supports this argument, add the `-h` argument to check: - -```bash -examples/pytorch/summarization/run_summarization.py -h -``` - -## Resume training from checkpoint - -Another helpful option to enable is resuming training from a previous checkpoint. This will ensure you can pick up where you left off without starting over if your training gets interrupted. There are two methods to resume training from a checkpoint. - -The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path google-t5/t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --output_dir previous_output_dir \ - --predict_with_generate -``` - -The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder. 
- -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path google-t5/t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --resume_from_checkpoint path_to_specific_checkpoint \ - --predict_with_generate -``` - -## Share your model - -All scripts can upload your final model to the [Model Hub](https://huggingface.co/models). Make sure you are logged into Hugging Face before you begin: - -```bash -huggingface-cli login -``` - -Then add the `push_to_hub` argument to the script. This argument will create a repository with your Hugging Face username and the folder name specified in `output_dir`. - -To give your repository a specific name, use the `push_to_hub_model_id` argument to add it. The repository will be automatically listed under your namespace. - -The following example shows how to upload a model with a specific repository name: - -```bash -python examples/pytorch/summarization/run_summarization.py \ - --model_name_or_path google-t5/t5-small \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --source_prefix "summarize: " \ - --push_to_hub \ - --push_to_hub_model_id finetuned-t5-cnn_dailymail \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=4 \ - --overwrite_output_dir \ - --predict_with_generate -``` \ No newline at end of file From 20ebe8a2583e0d551f6f87d61f30800ccf56154c Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 8 Jan 2025 12:27:55 -0800 Subject: [PATCH 089/116] tpu --- docs/source/en/perf_train_tpu_tf.md | 353 +++++++++++++++++++++------- 1 file changed, 273 insertions(+), 80 deletions(-) diff --git a/docs/source/en/perf_train_tpu_tf.md b/docs/source/en/perf_train_tpu_tf.md index 8c64dab189a6..caf95b69c939 100644 --- a/docs/source/en/perf_train_tpu_tf.md +++ b/docs/source/en/perf_train_tpu_tf.md @@ -15,101 +15,320 @@ rendered properly in your Markdown viewer. # TPU - +TPU, Tensor Processing Unit, is a type of hardware designed to accelerate tensor computations for training and inference. TPUs are generally accessed through Google's cloud services, but smaller TPUs are also available for free from [Google Colab](https://colab.research.google.com/notebooks/tpu.ipynb) or [Kaggle](https://www.kaggle.com/docs/tpu). -If you don't need long explanations and just want TPU code samples to get started with, check out [our TPU example notebook!](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) +This guide focuses on training a Keras model for sequence classification on a TPU from Google Colab. Make sure the TPU runtime is enabled by going to **Runtime > Change runtime type** and selecting a TPU. - +Run the command below to install the latest version of Transformers and [Datasets](https://huggingface.co/docs/datasets). -### What is a TPU? +```py +!pip install --U transformers datasets +``` + +Create an instance of [tf.distribute.cluster_resolver.TPUClusterResolver](https://www.tensorflow.org/api_docs/python/tf/distribute/cluster_resolver/TPUClusterResolver), and then connect to the remote cluster and initialize the TPUs. 
+ +```py +import tensorflow as tf + +resolver = tf.distribute.cluster_resolver.TPUClusterResolver() +tf.config.experimental_connect_to_cluster(resolver) +tf.tpu.experimental.initialize_tpu_system(resolver) +``` + +There are various distribution strategies for running your model on multiple TPUs. The [tpu.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) offers synchronized distributed training. + +```py +strategy = tf.distribute.TPUStrategy(resolver) +``` + +Load and tokenize a dataset - this example uses [CoLA](https://huggingface.co/datasets/nyu-mll/glue/viewer/cola) from the GLUE benchmark - and pad all samples to the maximum length so it is easier to load as an array and to avoid XLA compilation issues. + +```py +from transformers import AutoTokenizer +from datasets import load_dataset +import numpy as np + +dataset = load_dataset("glue", "cola")["train"] +tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") + +train_data = tokenizer( + dataset["sentence"], + padding="max_length", + truncation=True, + max_length=128, + return_tensors="np", +) +train_data = dict(train_data) +train_labels = np.array(dataset["label"]) +``` + +The model **must** be created inside [Strategy.scope](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy#scope) in order to replicate the model layers on each TPU device. + +```py +from transformers import TFAutoModelForSequenceClassification + +with strategy.scope(): + model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint) + model.compile(optimizer="adam") +``` -A TPU is a **Tensor Processing Unit.** They are hardware designed by Google, which are used to greatly speed up the tensor computations within neural networks, much like GPUs. They can be used for both network training and inference. They are generally accessed through Google’s cloud services, but small TPUs can also be accessed directly for free through Google Colab and Kaggle Kernels. +TPUs only accept [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) inputs unlike the Keras [fit](https://keras.io/api/models/model_training_apis/#fit-method) method which accepts a broader range of inputs. -Because [all TensorFlow models in 🤗 Transformers are Keras models](https://huggingface.co/blog/tensorflow-philosophy), most of the methods in this document are generally applicable to TPU training for any Keras model! However, there are a few points that are specific to the HuggingFace ecosystem (hug-o-system?) of Transformers and Datasets, and we’ll make sure to flag them up when we get to them. +```py +BATCH_SIZE = 8 * strategy.num_replicas_in_sync -### What kinds of TPU are available? +tf_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels)) +tf_dataset = tf_dataset.shuffle(len(tf_dataset)) +tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True) +``` + +Finally, call [fit](https://keras.io/api/models/model_training_apis/#fit-method) to start training. + +```py +model.fit(tf_dataset) +``` -New users are often very confused by the range of TPUs, and the different ways to access them. The first key distinction to understand is the difference between **TPU Nodes** and **TPU VMs.** +## Large datasets + +The dataset created above pads every sample to the maximum length and loads the whole dataset into memory. This may not be possible if you're working with larger datasets. 
When training on large datasets, you may want to create a [tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) instead of stream the data. + +### tf.TFRecord + +[tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) is the standard [tf.data](https://www.tensorflow.org/guide/data) format for storing training data. For very large training jobs, it's worth preprocessing your data and storing it in the `tf.TFRecord` format and building a `tf.data` pipeline on top. Refer to the table below to help you decide whether `tf.TFRecord` is helpful for you. + +| pros | cons | +|---|---| +| works on all TPU instances | costs associated with cloud storage | +| supports huge datasets and massive throughput | some data types (images) can take a lot of space to store | +| suitable for training on entire TPU pods | | +| preprocessing is done in advance, maximizing training speed | | + +Preprocess and tokenize the dataset before writing it to a `tf.TFRecord` to avoid writing every time the data is loaded. + +An exception is made for *train-time augmentations*, because augmentations applied after writing to a `tf.TFRecord` results in the same augmentation for each epoch. Instead, apply augmentations in the `tf.data` pipeline that loads the data. + +> [!TIP] +> In practice, you probably won't be able to load the entire dataset in memory. Load a chunk of the dataset at a time and convert it to `TFRecord`, and repeat until the entire dataset is in the `TFRecord` format. Then you can use a list of all the files to create a `TFRecordDataset`. The example below demonstrates a single file for simplicity. + +```py +tokenized_data = tokenizer( + dataset["sentence"], + padding="max_length", + truncation=True, + max_length=128, + return_tensors="np", +) +labels = dataset["label"] + +with tf.io.TFRecordWriter("dataset.tfrecords") as file_writer: + for i in range(len(labels)): + features = { + "input_ids": tf.train.Feature( + int64_list=tf.train.Int64List(value=tokenized_data["input_ids"][i]) + ), + "attention_mask": tf.train.Feature( + int64_list=tf.train.Int64List(value=tokenized_data["attention_mask"][i]) + ), + "labels": tf.train.Feature( + int64_list=tf.train.Int64List(value=[labels[i]]) + ), + } + features = tf.train.Features(feature=features) + example = tf.train.Example(features=features) + record_bytes = example.SerializeToString() + file_writer.write(record_bytes) +``` + +Build a [TFRecordDataset](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset) using the saved filename to load it. + +```py +def decode_fn(sample): + features = { + "input_ids": tf.io.FixedLenFeature((128,), dtype=tf.int64), + "attention_mask": tf.io.FixedLenFeature((128,), dtype=tf.int64), + "labels": tf.io.FixedLenFeature((1,), dtype=tf.int64), + } + return tf.io.parse_example(sample, features) + +# TFRecordDataset can handle gs:// paths +tf_dataset = tf.data.TFRecordDataset(["gs://matt-tf-tpu-tutorial-datasets/cola/dataset.tfrecords"]) +tf_dataset = tf_dataset.map(decode_fn) +tf_dataset = tf_dataset.shuffle(len(dataset)).batch(BATCH_SIZE, drop_remainder=True) +tf_dataset = tf_dataset.apply( + tf.data.experimental.assert_cardinality(len(labels) // BATCH_SIZE) +) +``` -When you use a **TPU Node**, you are effectively indirectly accessing a remote TPU. You will need a separate VM, which will initialize your network and data pipeline and then forward them to the remote node. When you use a TPU on Google Colab, you are accessing it in the **TPU Node** style. 
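The example above reads a single `TFRecord` file for simplicity. If you followed the tip above and wrote the dataset out in several chunks, a sketch like the one below can load every shard at once; the `dataset-*.tfrecords` shard names and the bucket path are assumptions for illustration, not something this guide prescribes.

```py
import tensorflow as tf

# hypothetical shard layout: dataset-00000.tfrecords, dataset-00001.tfrecords, ...
shard_paths = tf.io.gfile.glob("gs://matt-tf-tpu-tutorial-datasets/cola/dataset-*.tfrecords")

# read the shards in parallel and reuse the decode_fn and BATCH_SIZE defined earlier
tf_dataset = tf.data.TFRecordDataset(shard_paths, num_parallel_reads=tf.data.AUTOTUNE)
tf_dataset = tf_dataset.map(decode_fn)
tf_dataset = tf_dataset.shuffle(10_000).batch(BATCH_SIZE, drop_remainder=True)
```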
+The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method. -Using TPU Nodes can have some quite unexpected behaviour for people who aren’t used to them! In particular, because the TPU is located on a physically different system to the machine you’re running your Python code on, your data cannot be local to your machine - any data pipeline that loads from your machine’s internal storage will totally fail! Instead, data must be stored in Google Cloud Storage where your data pipeline can still access it, even when the pipeline is running on the remote TPU node. +```py +model.fit(tf_dataset) +``` - +### Stream from raw data -If you can fit all your data in memory as `np.ndarray` or `tf.Tensor`, then you can `fit()` on that data even when using Colab or a TPU Node, without needing to upload it to Google Cloud Storage. +Data can be stored in its native format and preprocessed in a [tf.data](https://www.tensorflow.org/guide/data) pipeline as the data is loaded. This approach isn't supported for many models with complex tokenization schemes, but some models like BERT are supported because their tokenization can be compiled. Refer to the table below to help you decide whether this approach is helpful for you. - +| pros | cons | +|---|---| +| suitable for highly compressed big data in native format (images, audio) | requires writing a full preprocessing pipeline | +| convenient if raw data is available in a public cloud bucket | complex preprocessing on-the-fly can hurt throughput | +| works on all TPU instances if data is stored in Google Cloud | must place data in cloud storage if not already there | +| | not as suitable for text data because writing a tokenization pipeline is hard (use `TFRecord` for text) | - +The example below demonstrates streaming data for an image model. -**🤗Specific Hugging Face Tip🤗:** The methods `Dataset.to_tf_dataset()` and its higher-level wrapper `model.prepare_tf_dataset()` , which you will see throughout our TF code examples, will both fail on a TPU Node. The reason for this is that even though they create a `tf.data.Dataset` it is not a “pure” `tf.data` pipeline and uses `tf.numpy_function` or `Dataset.from_generator()` to stream data from the underlying HuggingFace `Dataset`. This HuggingFace `Dataset` is backed by data that is on a local disc and which the remote TPU Node will not be able to read. +Load an image dataset and get a list of the underlying image file paths and labels. - +```py +from datasets import load_dataset -The second way to access a TPU is via a **TPU VM.** When using a TPU VM, you connect directly to the machine that the TPU is attached to, much like training on a GPU VM. TPU VMs are generally easier to work with, particularly when it comes to your data pipeline. All of the above warnings do not apply to TPU VMs! +image_dataset = load_dataset("beans", split="train") +filenames = image_dataset["image_file_path"] +labels = image_dataset["labels"] +``` -This is an opinionated document, so here’s our opinion: **Avoid using TPU Node if possible.** It is more confusing and more difficult to debug than TPU VMs. It is also likely to be unsupported in future - Google’s latest TPU, TPUv4, can only be accessed as a TPU VM, which suggests that TPU Nodes are increasingly going to become a “legacy” access method. However, we understand that the only free TPU access is on Colab and Kaggle Kernels, which uses TPU Node - so we’ll try to explain how to handle it if you have to! 
Check the [TPU example notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) for code samples that explain this in more detail. +Convert the local filenames in the dataset into `gs://` paths in Google Cloud Storage. -### What sizes of TPU are available? +```py +# strip everything but the category directory and filenames +base_filenames = ['/'.join(filename.split('/')[-2:]) for filename in filenames] +# prepend the Google Cloud base path to everything instead +gs_paths = ["gs://matt-tf-tpu-tutorial-datasets/beans/"+filename for filename in base_filenames] -A single TPU (a v2-8/v3-8/v4-8) runs 8 replicas. TPUs exist in **pods** that can run hundreds or thousands of replicas simultaneously. When you use more than a single TPU but less than a whole pod (for example, a v3-32), your TPU fleet is referred to as a **pod slice.** +# create tf_dataset +tf_dataset = tf.data.Dataset.from_tensor_slices( + {"filename": gs_paths, "labels": labels} +) +tf_dataset = tf_dataset.shuffle(len(tf_dataset)) +``` -When you access a free TPU via Colab, you generally get a single v2-8 TPU. +Transformers preprocessing classes like [`AutoImageProcessor`] are framework-agnostic and can't be compiled into a pipeline by `tf.data`. To get around this, get the normalization values (`mean` and `std`) from the [`AutoImageProcessor`] and use them in the `tf.data` pipeline. -### I keep hearing about this XLA thing. What’s XLA, and how does it relate to TPUs? +```py +from transformers import AutoImageProcessor -XLA is an optimizing compiler, used by both TensorFlow and JAX. In JAX it is the only compiler, whereas in TensorFlow it is optional (but mandatory on TPU!). The easiest way to enable it when training a Keras model is to pass the argument `jit_compile=True` to `model.compile()`. If you don’t get any errors and performance is good, that’s a great sign that you’re ready to move to TPU! +processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +image_size = (processor.size["height"], processor.size["width"]) +image_mean = processor.image_mean +image_std = processor.image_std +``` -Debugging on TPU is generally a bit harder than on CPU/GPU, so we recommend getting your code running on CPU/GPU with XLA first before trying it on TPU. You don’t have to train for long, of course - just for a few steps to make sure that your model and data pipeline are working like you expect them to. +Use these normalization values to create a function to load and preprocess the images. - +```py +BATCH_SIZE = 8 * strategy.num_replicas_in_sync -XLA compiled code is usually faster - so even if you’re not planning to run on TPU, adding `jit_compile=True` can improve your performance. Be sure to note the caveats below about XLA compatibility, though! +def decode_fn(sample): + image_data = tf.io.read_file(sample["filename"]) + image = tf.io.decode_jpeg(image_data, channels=3) + image = tf.image.resize(image, image_size) + array = tf.cast(image, tf.float32) + array /= 255.0 + array = (array - image_mean) / image_std + array = tf.transpose(array, perm=[2, 0, 1]) + return {"pixel_values": array, "labels": sample["labels"]} - +tf_dataset = tf_dataset.map(decode_fn) +tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True) +print(tf_dataset.element_spec) +``` - +The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method. 
-**Tip born of painful experience:** Although using `jit_compile=True` is a good way to get a speed boost and test if your CPU/GPU code is XLA-compatible, it can actually cause a lot of problems if you leave it in when actually training on TPU. XLA compilation will happen implicitly on TPU, so remember to remove that line before actually running your code on a TPU! +```py +from transformers import TFAutoModelForImageClassification - +with strategy.scope(): + model = TFAutoModelForImageClassification.from_pretrained(image_model_checkpoint) + model.compile(optimizer="adam") -### How do I make my model XLA compatible? +model.fit(tf_dataset) +``` -In many cases, your code is probably XLA-compatible already! However, there are a few things that work in normal TensorFlow that don’t work in XLA. We’ve distilled them into three core rules below: +### Stream dataset with prepare_tf_dataset - +[`~TFPreTrainedModel.prepare_tf_dataset`] creates a `tf.data` pipeline that loads samples from [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). The pipeline uses [tf.numpy_function]() or [`~datasets.Dataset.from_generator`], which can't be compiled by TensorFlow, to access the underlying `tf.data.Dataset`. It also won't work on a Colab TPU or TPU Nodes because the pipeline stream data from a local disk. Refer to the table below to help you decide whether this approach is helpful for you. -**🤗Specific HuggingFace Tip🤗:** We’ve put a lot of effort into rewriting our TensorFlow models and loss functions to be XLA-compatible. Our models and loss functions generally obey rule #1 and #2 by default, so you can skip over them if you’re using `transformers` models. Don’t forget about these rules when writing your own models and loss functions, though! +| pros | cons | | | | +|---|---|---|---|---| +| simple code | only works on TPU VM | | | | +| same approach on TPU/GPU | data must be available as a Hugging Face Dataset | | | | +| dataset doesn't have to fit in memory | data must fit on local storage | | | | +| supports variable padding | data loading may be a bottleneck on a big TPU pod slice | | | | - +[`~TFPreTrainedModel.prepare_tf_dataset`] only works on **TPU VM**. Add the tokenizer output as columns in the dataset since the dataset is stored on disk, which means it can handle data larger than the available memory. Use [`~TFPreTrainedModel.prepare_tf_dataset`] to stream data from the dataset by wrapping it with a `tf.data` pipeline. -#### XLA Rule #1: Your code cannot have “data-dependent conditionals” +```py +def tokenize_function(examples): + return tokenizer( + examples["sentence"], padding="max_length", truncation=True, max_length=128 + ) +# add the tokenizer output to the dataset as new columns +dataset = dataset.map(tokenize_function) -What that means is that any `if` statement cannot depend on values inside a `tf.Tensor`. For example, this code block cannot be compiled with XLA! +# prepare_tf_dataset() chooses columns that match the models input names +tf_dataset = model.prepare_tf_dataset( + dataset, batch_size=BATCH_SIZE, shuffle=True, tokenizer=tokenizer +) +``` -```python +The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method. 
+ +```py +from transformers import AutoTokenizer, TFAutoModelForSequenceClassification + +with strategy.scope(): + model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint) + model.compile(optimizer="adam") + +model.fit(tf_dataset) +``` + +## TPU types + +There are two types of TPUs, a TPU Node and a TPU VM. + +A TPU Node indirectly accesses a remote TPU. It requires a separate VM to initialize your network and data pipeline and then forwards it to the remote node. Google Colab TPUs are an example of a TPU Node. You can't use local data because the TPU is remotely located, and data must be stored in Google Cloud Storage where the data pipeline can access it. + +TPU VM are connected directly to the machine the TPU is located on, and they are generally easier to work with, especially when it comes to your data pipeline. + +> [!TIP] +> We recommend avoiding TPU Nodes if possible because it is more difficult to debug than TPU VMs. TPU Nodes may also be unsupported in the future and become a legacy access method. + +A single TPU (v2-8, v3-8, v4-8) runs 8 replicas. TPUs can exist in **pods** which run hundreds or even thousands of replicas simultaneously. When you only use a portion of a pod, it is referred to as a **pod slice**. On Google Colab, you'll typically get a single v2-8 TPU. + +## XLA + +[XLA](https://openxla.org/xla) is a linear algebra compiler for high-performance execution and it is used by default to improve performance on TPUs. + +Before executing your code on a TPU, it's a good idea to try it first on a CPU or GPU because it is easier to debug. You can train for a few steps to make sure the model and data pipeline work as expected. Set `jit_compile=True` in the [compile](https://keras.io/api/models/model_training_apis/#compile-method) method to enable XLA compilation (but remember to remove this line of code before running on a TPU). + +The section below outlines three rules for making your code XLA-compatible. Transformers enforce the first two rules for models and loss functions by default, but don't forget about them if you're writing your own models and loss functions. + +### Data dependent conditionals + +Any `if` statements cannot depend on values inside a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor). The code below can't be compiled by XLA. + +```py if tf.reduce_sum(tensor) > 10: tensor = tensor / 2.0 ``` -This might seem very restrictive at first, but most neural net code doesn’t need to do this. You can often get around this restriction by using `tf.cond` (see the documentation [here](https://www.tensorflow.org/api_docs/python/tf/cond)) or by removing the conditional and finding a clever math trick with indicator variables instead, like so: +To compile with XLA, use [tf.cond](https://www.tensorflow.org/api_docs/python/tf/cond) or remove the conditional and use indicator variables instead as shown below. -```python +```py sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32) tensor = tensor / (1.0 + sum_over_10) ``` -This code has exactly the same effect as the code above, but by avoiding a conditional, we ensure it will compile with XLA without problems! - -#### XLA Rule #2: Your code cannot have “data-dependent shapes” +### Data dependent shapes -What this means is that the shape of all of the `tf.Tensor` objects in your code cannot depend on their values. For example, the function `tf.unique` cannot be compiled with XLA, because it returns a `tensor` containing one instance of each unique value in the input. 
The shape of this output will obviously be different depending on how repetitive the input `Tensor` was, and so XLA refuses to handle it! +The shape of a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) cannot depend on their values. For example, [tf.unique](https://www.tensorflow.org/api_docs/python/tf/unique) can't be compiled because it returns a tensor containing an instance of each unique value in the input. The shape of this output depends on how repetitive the input [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) is. -In general, most neural network code obeys rule #2 by default. However, there are a few common cases where it becomes a problem. One very common one is when you use **label masking**, setting your labels to a negative value to indicate that those positions should be ignored when computing the loss. If you look at NumPy or PyTorch loss functions that support label masking, you will often see code like this that uses [boolean indexing](https://numpy.org/doc/stable/user/basics.indexing.html#boolean-array-indexing): +This is an issue during **label masking**, where labels are set to a negative value to indicate they should be ignored when computing the loss. The code below can't be compiled by XLA because the shape of `masked_outputs` and `masked_labels` depend on how many positions are masked. -```python +```py label_mask = labels >= 0 masked_outputs = outputs[label_mask] masked_labels = labels[label_mask] @@ -117,46 +336,20 @@ loss = compute_loss(masked_outputs, masked_labels) mean_loss = torch.mean(loss) ``` -This code is totally fine in NumPy or PyTorch, but it breaks in XLA! Why? Because the shape of `masked_outputs` and `masked_labels` depends on how many positions are masked - that makes it a **data-dependent shape.** However, just like for rule #1, we can often rewrite this code to yield exactly the same output without any data-dependent shapes. +To compile with XLA, avoid the data-dependent shapes by computing the loss for every position and zeroing out the masked positions in both the numerator and denominator when calculating the mean. Convert `tf.bool` to `tf.float32` as an indicator variable to make your code XLA-compatible. -```python +```py label_mask = tf.cast(labels >= 0, tf.float32) loss = compute_loss(outputs, labels) -loss = loss * label_mask # Set negative label positions to 0 +loss = loss * label_mask mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask) ``` -Here, we avoid data-dependent shapes by computing the loss for every position, but zeroing out the masked positions in both the numerator and denominator when we calculate the mean, which yields exactly the same result as the first block while maintaining XLA compatibility. Note that we use the same trick as in rule #1 - converting a `tf.bool` to `tf.float32` and using it as an indicator variable. This is a really useful trick, so remember it if you need to convert your own code to XLA! - -#### XLA Rule #3: XLA will need to recompile your model for every different input shape it sees - -This is the big one. What this means is that if your input shapes are very variable, XLA will have to recompile your model over and over, which will create huge performance problems. This commonly arises in NLP models, where input texts have variable lengths after tokenization. In other modalities, static shapes are more common and this rule is much less of a problem. - -How can you get around rule #3? 
The key is **padding** - if you pad all your inputs to the same length, and then use an `attention_mask`, you can get the same results as you’d get from variable shapes, but without any XLA issues. However, excessive padding can cause severe slowdown too - if you pad all your samples to the maximum length in the whole dataset, you might end up with batches consisting endless padding tokens, which will waste a lot of compute and memory! - -There isn’t a perfect solution to this problem. However, you can try some tricks. One very useful trick is to **pad batches of samples up to a multiple of a number like 32 or 64 tokens.** This often only increases the number of tokens by a small amount, but it hugely reduces the number of unique input shapes, because every input shape now has to be a multiple of 32 or 64. Fewer unique input shapes means fewer XLA compilations! - - - -**🤗Specific HuggingFace Tip🤗:** Our tokenizers and data collators have methods that can help you here. You can use `padding="max_length"` or `padding="longest"` when calling tokenizers to get them to output padded data. Our tokenizers and data collators also have a `pad_to_multiple_of` argument that you can use to reduce the number of unique input shapes you see! - - - -### How do I actually train my model on TPU? - -Once your training is XLA-compatible and (if you’re using TPU Node / Colab) your dataset has been prepared appropriately, running on TPU is surprisingly easy! All you really need to change in your code is to add a few lines to initialize your TPU, and to ensure that your model and dataset are created inside a `TPUStrategy` scope. Take a look at [our TPU example notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) to see this in action! +### Recompile different input shapes -### Summary +XLA recompiles your model if input shapes are variable which create huge performance problems. It is especially common in text models because input texts have variable lengths after tokenization. -There was a lot in here, so let’s summarize with a quick checklist you can follow when you want to get your model ready for TPU training: +> [!WARNING] +> Execessive padding can also severely slow down training because requires more compute and memory to process. -- Make sure your code follows the three rules of XLA -- Compile your model with `jit_compile=True` on CPU/GPU and confirm that you can train it with XLA -- Either load your dataset into memory or use a TPU-compatible dataset loading approach (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) -- Migrate your code either to Colab (with accelerator set to “TPU”) or a TPU VM on Google Cloud -- Add TPU initializer code (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) -- Create your `TPUStrategy` and make sure dataset loading and model creation are inside the `strategy.scope()` (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb)) -- Don’t forget to take `jit_compile=True` out again when you move to TPU! -- 🙏🙏🙏🥺🥺🥺 -- Call `model.fit()` -- You did it! \ No newline at end of file +To avoid different shapes, use padding to pad all your inputs to the same length and use an `attention_mask`. Try padding batches of samples to a multiple of 32 or 64 tokens. 
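As a rough sketch, padding a batch to a multiple of 64 with a Hugging Face tokenizer might look like the following (the checkpoint and example texts are illustrative, not taken from this guide):

```py
from transformers import AutoTokenizer

# any checkpoint works here; bert-base-uncased is only an example
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
texts = ["a short example", "a somewhat longer example sentence that needs more padding"]

batch = tokenizer(
    texts,
    padding="longest",        # pad to the longest sample in the batch
    pad_to_multiple_of=64,    # round the padded length up to a multiple of 64
    return_tensors="tf",
)
# batch["input_ids"] and batch["attention_mask"] now only take shapes whose
# sequence length is a multiple of 64, so XLA sees far fewer unique shapes
```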
Use the parameters `padding="max_length"`, `padding="longest"`, or `pad_to_multiple_of` to help with padding. This often increases the number of tokens by a small amount, but it significantly reduces the number of unique input shapes because every input shape is a multiple of 32 or 64. Fewer unique input shapes requires fewer recompilation. \ No newline at end of file From e18e75538d69b4c85d72dd4bbdb5ba47fc284816 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Fri, 10 Jan 2025 17:10:20 -0800 Subject: [PATCH 090/116] review --- docs/source/en/_toctree.yml | 10 +- docs/source/en/custom_models.md | 32 +-- docs/source/en/how_to_hack_models.md | 149 +++-------- docs/source/en/index.md | 364 +-------------------------- docs/source/en/installation.md | 24 +- docs/source/en/model_sharing.md | 63 ++--- docs/source/en/models.md | 64 +++-- docs/source/en/quicktour.md | 86 +++---- 8 files changed, 188 insertions(+), 604 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index fb2fa52a0d8a..f6e3b34e8e51 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -12,15 +12,15 @@ - title: Models sections: - local: models - title: Load + title: Loading - local: custom_models - title: Create a custom model + title: Customizing models - local: how_to_hack_models - title: Customize model components + title: Customizing model components - local: model_sharing - title: Share + title: Sharing - local: add_new_model - title: Add a new model + title: Adding a new model - local: modular_transformers title: Modular transformers - local: task_summary diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md index c81b99bdc06e..6ae5099a0312 100644 --- a/docs/source/en/custom_models.md +++ b/docs/source/en/custom_models.md @@ -14,23 +14,23 @@ rendered properly in your Markdown viewer. --> -# Customize +# Customizing models -Transformers models are easily customizable. Models are fully contained in the [model](https://github.com/huggingface/transformers/tree/main/src/transformers/models) subfolder of the Transformers repository. Each folder contains a `modeling.py` and a `configuration.py` file. Copy these files to start customizing a model. +Transformers models are designed to be customizable. A models code is fully contained in the [model](https://github.com/huggingface/transformers/tree/main/src/transformers/models) subfolder of the Transformers repository. Each folder contains a `modeling.py` and a `configuration.py` file. Copy these files to start customizing a model. > [!TIP] -> It may be easier to start from scratch if you're creating an entirely new model. For models that are very similar to an existing one in Transformers, it is faster to reuse or subclass the same configuration and model class. +> It may be easier to start from scratch if you're creating an entirely new model. But for models that are very similar to an existing one in Transformers, it is faster to reuse or subclass the same configuration and model class. -This guide will show you how to customize a ResNet model, enable [AutoClass](./models#autoclass) API support, and share it on the Hub. +This guide will show you how to customize a ResNet model, enable [AutoClass](./models#autoclass) support, and share it on the Hub. ## Configuration A configuration, given by the base [`PretrainedConfig`] class, contains all the necessary information to build a model. This is where you'll configure the attributes of the custom ResNet model. 
Different attributes gives different ResNet model types. -The three main rules for customizing a configuration are: +The main rules for customizing a configuration are: 1. A custom configuration must subclass [`PretrainedConfig`]. This ensures a custom model has all the functionality of a Transformers' model such as [`~PretrainedConfig.from_pretrained`], [`~PretrainedConfig.save_pretrained`], and [`~PretrainedConfig.push_to_hub`]. -2. The [`PretrainedConfig`] `__init__` must accept any `kwargs`, and they must be passed to the superclass `__init__`. [`PretrainedConfig`] has more fields than the ones you're setting in your custom configuration, so when you load a configuration with [`~PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass. +2. The [`PretrainedConfig`] `__init__` must accept any `kwargs` and they must be passed to the superclass `__init__`. [`PretrainedConfig`] has more fields than the ones set in your custom configuration, so when you load a configuration with [`~PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass. > [!TIP] > It is useful to check the validity of some of the parameters. In the example below, a check is implemented to ensure `block_type` and `stem_type` belong to one of the predefined values. @@ -74,7 +74,7 @@ class ResnetConfig(PretrainedConfig): super().__init__(**kwargs) ``` -Save the configuration to a JSON file with [`PretrainedConfig.save_pretrained`]. This file is stored in your custom model folder, `custom-resnet`. +Save the configuration to a JSON file in your custom model folder, `custom-resnet`, with [`~PretrainedConfig.save_pretrained`]. ```py resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) @@ -85,9 +85,9 @@ resnet50d_config.save_pretrained("custom-resnet") With the custom ResNet configuration, you can now create and customize the model. The model subclasses the base [`PreTrainedModel`] class. Like [`PretrainedConfig`], inheriting from [`PreTrainedModel`] and initializing the superclass with the configuration extends Transformers' functionalities such as saving and loading to the custom model. -Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the models sublayers, instead of breaking the `config` object into multiple arguments that are individually passed to the sublayers. +Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the model sublayers, instead of breaking the `config` object into multiple arguments that are individually passed to the sublayers. -Writing models this way produces simpler code with a clear *source of truth* for any hyperparameters. It is also easier to reuse code from other Transformers' models. +Writing models this way produces simpler code with a clear source of truth for any hyperparameters. It also makes it easier to reuse code from other Transformers' models. You'll create two ResNet models, a barebones ResNet model that outputs the hidden states and a ResNet model with an image classification head. @@ -176,7 +176,7 @@ Instantiate the custom model class with the configuration. resnet50d = ResnetModelForImageClassification(resnet50d_config) ``` -At this point, you can load pretrained weights into the model or train it from scratch. 
You'll load pretrained weights in this guide. +At this point, you can load pretrained weights into the model or train it from scratch. In this guide, you'll load pretrained weights. Load the pretrained weights from the [timm](https://hf.co/docs/timm/index) library, and then transfer those weights to the custom model with [load_state_dict](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict). @@ -187,11 +187,11 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True) resnet50d.model.load_state_dict(pretrained_model.state_dict()) ``` -## AutoClass support +## AutoClass -The [AutoClass](./models#model-classes) API is a shortcut for automatically loading the correct architecture for a given model. It may be convenient to enable this for users when loading your custom model. +The [AutoClass](./models#model-classes) API is a shortcut for automatically loading the correct architecture for a given model. It is convenient to enable this for users loading your custom model. -Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. With the [`~AutoConfig.register`] method, add the custom configuration and model to the [AutoClass](./models#model-classes) API. +Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. Use the [`~AutoConfig.register`] method to add the custom configuration and model to the [AutoClass](./models#model-classes) API. > [!TIP] > The first argument to [`AutoConfig.register`] must match the `model_type` attribute in the custom configuration class, and the first argument to [`AutoModel.register`] must match the `config_class` of the custom model class. @@ -265,7 +265,7 @@ pretrained_model = timm.create_model("resnet50d", pretrained=True) resnet50d.model.load_state_dict(pretrained_model.state_dict()) ``` -The model is ready to be pushed to the Hub now. Login to your Hugging Face account from the command line or notebook. +The model is ready to be pushed to the Hub now. Log in to your Hugging Face account from the command line or notebook. @@ -292,6 +292,6 @@ Call [`~PreTrainedModel.push_to_hub`] on the model to upload the model to the Hu resnet50d.push_to_hub("custom-resnet50d") ``` -The pretrained weights, configuration in JSON format, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now under a namespace and specified directory [here](https://hf.co/sgugger/custom-resnet50d). +The pretrained weights, configuration, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now in a [repository](https://hf.co/sgugger/custom-resnet50d) under your namespace. -Because a custom model doesn't use the same modeling code as a Transformers' model, you need to add `trust_remode_code=True` in the [`~PreTrainedModel.from_pretrained`] method to load it. Refer to the load [custom models](./models#custom-models) section for more information. +Because a custom model doesn't use the same modeling code as a Transformers' model, you need to add `trust_remode_code=True` in [`~PreTrainedModel.from_pretrained`] to load it. Refer to the load [custom models](./models#custom-models) section for more information. 
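For example, a minimal sketch of loading the custom model shared above through the AutoClass API — assuming the `sgugger/custom-resnet50d` repository from this guide and the `trust_remote_code` parameter of [`~PreTrainedModel.from_pretrained`]:

```py
from transformers import AutoModelForImageClassification

# trust_remote_code=True lets Transformers execute the custom modeling code
# stored in the repository alongside the weights
model = AutoModelForImageClassification.from_pretrained(
    "sgugger/custom-resnet50d", trust_remote_code=True
)
```

Once loaded, the model behaves like any other Transformers image classification model.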
diff --git a/docs/source/en/how_to_hack_models.md b/docs/source/en/how_to_hack_models.md index 5e2aa8297bcf..550ac85d4f50 100644 --- a/docs/source/en/how_to_hack_models.md +++ b/docs/source/en/how_to_hack_models.md @@ -13,68 +13,19 @@ rendered properly in your Markdown viewer. --> -# How to Hack Any Transformers Model +# Customizing model components -The [🤗 Transformers](https://github.com/huggingface/transformers) library offers a collection of pre-trained models and tools for natural language processing, vision, and beyond. While these models cover a wide range of applications, you might encounter use cases that aren't supported out of the box. Customizing models can unlock new possibilities, such as adding new layers, altering architectures, or optimizing attention mechanisms. This guide will show you how to modify existing Transformers models to fit your specific needs. The great thing is, you don’t have to step away from the Transformers framework to make these changes. You can actually modify models directly in Transformers and still take advantage of features like the [Trainer API](https://huggingface.co/docs/transformers/main/en/main_classes/trainer), [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel), and efficient fine-tuning with tools like [PEFT](https://huggingface.co/docs/peft/index). +Another way to customize a model is to modify their components, rather than writing a new model entirely, allowing you to tailor a model to your specific use case. For example, you can add new layers or optimize the attention mechanism of an architecture. Customizations are applied directly to a Transformers model so that you can continue to use features such as [`Trainer`], [`PreTrainedModel`], and the [PEFT](https://huggingface.co/docs/peft/en/index) library. -In this guide, we’ll walk you through how to customize existing Transformers models to meet your requirements—without losing the benefits of the ecosystem. +This guide will show you how to customize a models attention mechanism in order to apply [Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) to it. -You'll learn how to: +## Attention class -- Modify a model's architecture by changing its attention mechanism. -- Apply techniques like Low-Rank Adaptation (LoRA) to specific model components. +[Segment Anything](./model_doc/sam) is an image segmentation model, and it combines the query-key-value (`qkv`) projection in its attention mechanims. To reduce the number of trainable parameters and computational overhead, you can apply LoRA to the `qkv` projection. This requires splitting the `qkv` projection so that you can separately target the `q` and `v` with LoRA. -We encourage you to contribute your own hacks and share them here with the community! +1. Create a custom attention class, `SamVisionAttentionSplit`, by subclassing the original `SamVisionAttention` class. In the `__init__`, delete the combined `qkv` and create a separate linear layer for `q`, `k` and `v`. -## Efficient Development Workflow - -When modifying model code, you'll often need to test your changes without restarting your Python session. 
The `clear_import_cache()` utility helps with this workflow, especially during model development and contribution when you need to frequently test and compare model outputs: - -```python -from transformers import AutoModel -model = AutoModel.from_pretrained("bert-base-uncased") - -# Make modifications to the transformers code... - -# Clear the cache to reload the modified code -from transformers.utils.import_utils import clear_import_cache -clear_import_cache() - -# Reimport to get the changes -from transformers import AutoModel -model = AutoModel.from_pretrained("bert-base-uncased") # Will use updated code -``` - -This is particularly useful when: -- Iteratively modifying model architectures -- Debugging model implementations -- Testing changes during model development -- Comparing outputs between original and modified versions -- Working on model contributions - -The `clear_import_cache()` function removes all cached Transformers modules and allows Python to reload the modified code. This enables rapid development cycles without constantly restarting your environment. - -This workflow is especially valuable when implementing new models, where you need to frequently compare outputs between the original implementation and your Transformers version (as described in the [Add New Model](https://huggingface.co/docs/transformers/add_new_model) guide). - -## Example: Modifying the Attention Mechanism in the Segment Anything Model (SAM) - -The **Segment Anything Model (SAM)** is a state-of-the-art model for image segmentation. In its default implementation, SAM uses a combined query-key-value (`qkv`) projection in its attention mechanism. However, you might want to fine-tune only specific components of the attention mechanism, such as the query (`q`) and value (`v`) projections, to reduce the number of trainable parameters and computational resources required. - -### Motivation - -By splitting the combined `qkv` projection into separate `q`, `k`, and `v` projections, you can apply techniques like **LoRA** (Low-Rank Adaptation) to only the `q` and `v` projections. This approach allows you to: - -- Fine-tune fewer parameters, reducing computational overhead. -- Potentially achieve better performance by focusing on specific components. -- Experiment with different adaptation strategies in the attention mechanism. - -### Implementation - -#### **Step 1: Create a Custom Attention Class** - -Next, subclass the original `SamVisionAttention` class and modify it to have separate `q`, `k`, and `v` projections. - -```python +```py import torch import torch.nn as nn from transformers.models.sam.modeling_sam import SamVisionAttention @@ -82,30 +33,39 @@ from transformers.models.sam.modeling_sam import SamVisionAttention class SamVisionAttentionSplit(SamVisionAttention, nn.Module): def __init__(self, config, window_size): super().__init__(config, window_size) + # remove combined qkv del self.qkv - # Separate q, k, v projections + # separate q, k, v projections self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook) +``` + +2. The `_split_qkv_load_hook` function splits the pretrained `qkv` weights into separate `q`, `k`, and `v` weights when loading the model to ensure compatibility with any pretrained model. 
+```py def split_q_k_v_load_hook(self, state_dict, prefix, *args): keys_to_delete = [] for key in list(state_dict.keys()): if "qkv." in key: - # Split q, k, v from the combined projection + # split q, k, v from the combined projection q, k, v = state_dict[key].chunk(3, dim=0) - # Replace with individual q, k, v projections + # replace with individual q, k, v projections state_dict[key.replace("qkv.", "q.")] = q state_dict[key.replace("qkv.", "k.")] = k state_dict[key.replace("qkv.", "v.")] = v - # Mark the old qkv key for deletion + # mark the old qkv key for deletion keys_to_delete.append(key) - # Remove old qkv keys + # remove old qkv keys for key in keys_to_delete: del state_dict[key] +``` + +3. In the `forward` pass, `q`, `k`, and `v` are computed separately while the rest of the attention mechanism remains the same. +```py def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: batch_size, height, width, _ = hidden_states.shape qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1) @@ -133,78 +93,49 @@ class SamVisionAttentionSplit(SamVisionAttention, nn.Module): return outputs ``` -**Explanation:** - -- **Separate Projections:** The combined `qkv` projection is removed, and separate `q`, `k`, and `v` linear layers are created. -- **Weight Loading Hook:** The `_split_qkv_load_hook` method splits the pre-trained `qkv` weights into separate `q`, `k`, and `v` weights when loading the model. This ensures compatibility with any pre-trained model. -- **Forward Pass:** Queries, keys, and values are computed separately, and the attention mechanism proceeds as usual. +Assign the custom `SamVisionAttentionSplit` class to the original models `SamVisionAttention` module to replace it. All instances of `SamVisionAttention` in the model is replaced with the split attention version. -#### **Step 2: Replace the Original Attention Class** +Load the model with [`~PreTrainedModel.from_pretrained`]. -Replace the original `SamVisionAttention` class with your custom class so that the model uses the modified attention mechanism. - -```python +```py from transformers import SamModel from transformers.models.sam import modeling_sam -# Replace the attention class in the modeling_sam module +# replace the attention class in the modeling_sam module modeling_sam.SamVisionAttention = SamVisionAttentionSplit -# Load the pre-trained SAM model +# load the pretrained SAM model model = SamModel.from_pretrained("facebook/sam-vit-base") ``` -**Explanation:** - -- **Class Replacement:** By assigning your custom class to `modeling_sam.SamVisionAttention`, any instances of `SamVisionAttention` in the model will use the modified version. Thus when you call `SamModel`, it will use the newly defined `SamVisionAttentionSplit`. -- **Model Loading:** The model is loaded using `from_pretrained`, and the custom attention mechanism is integrated. +## LoRA -#### **Step 3: Apply LoRA to Specific Projections** +With separate `q`, `k`, and `v` projections, apply LoRA to `q` and `v`. -With separate `q`, `k`, and `v` projections, you can now apply LoRA to specific components, such as the `q` and `v` projections. +Create a [`~peft.LoraConfig`] and specify the rank `r`, `lora_alpha`, `lora_dropout`, `task_type`, and most importantly, the modules to target. 
-```python +```py from peft import LoraConfig, get_peft_model config = LoraConfig( r=16, lora_alpha=32, - target_modules=["q", "v"], # Apply LoRA to q and v projections + # apply LoRA to q and v + target_modules=["q", "v"], lora_dropout=0.1, task_type="mask-generation" ) - -# Apply LoRA to the model -model = get_peft_model(model, config) ``` -**Explanation:** - -- **LoRA Configuration:** The `LoraConfig` specifies the rank `r`, scaling factor `lora_alpha`, target modules (`"q"` and `"v"`), dropout, and task type. -- **Applying LoRA:** The `get_peft_model` function applies LoRA to the specified modules in the model. -- **Parameter Reduction:** By focusing on `q` and `v`, you reduce the number of trainable parameters, leading to faster training and lower memory usage. - -#### **Step 4: Verify the Number of Trainable Parameters** - -It's simple to verify the number of trainable parameters and see what impact your modification had. +Pass the model and [`~peft.LoraConfig`] to [`~peft.get_peft_model`] to apply LoRA to the model. -```python -model.print_trainable_parameters() -``` - -**Expected Output:** - -``` -trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447 -trainable params: 912,384 || all params: 94,647,856 || trainable%: 0.9640 # with k +```py +model = get_peft_model(model, config) ``` -## Contributing Your Own Hacks - -Modifying pre-trained models can open up new avenues for research and application. By understanding and adjusting the internal mechanisms of models like SAM, you can tailor them to your specific needs, optimize performance, and experiment with new ideas. +Call [`~peft.PeftModel.print_trainable_parameters`] to view the number of parameters you're training as a result versus the total number of parameters. -If you've developed your own hacks for Transformers models and would like to share them, consider contributing to this doc. - -- **Open a Pull Request:** Share your code changes and improvements directly in the repository. -- **Write Documentation:** Provide clear explanations and examples of your modifications. -- **Engage with the Community:** Discuss your ideas and get feedback from other developers and researchers by opening an issue. \ No newline at end of file +```py +model.print_trainable_parameters() +"trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447" +``` \ No newline at end of file diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 3eb9fc9d398b..8120e12937ae 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -15,28 +15,26 @@ rendered properly in your Markdown viewer. # Transformers -Transformers is a library of pretrained natural language processing, computer vision, audio, and multimodal models. +Transformers is a library of pretrained natural language processing, computer vision, audio, and multimodal models for inference and training. Use Transformers to train models on your data, build inference applications, and generate text with large language models. -It supports [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/), [Flax](https://flax.readthedocs.io/en/latest/) and provides inference and training APIs to get started with pretrained models right away. - -Join us on the [Hugging Face Hub](https://huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb), or [forum](https://discuss.huggingface.co/) today! +Explore the [Hugging Face Hub](https://huggingface.com) today to find a model and use Transformers to help you get started right away. 
## Features -Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of its main features include: +Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of the main features include: -- [`Pipeline`]: A high-level API that supports optimized inference for many machine learning tasks like text generation, image segmentation, automatic speech recognition, document question answering, and more. -- [`Trainer`]: An extensive API that supports many features such as mixed precision, torch.compile, and FlashAttention for training and distributed training for PyTorch models. -- [`~GenerationMixin.generate`]: A generation API for large language models (LLMs) and vision language models (VLMs) that supports streaming and many decoding strategies. +- [Pipeline](./pipeline_tutorial): Simple and optimized inference class for many machine learning tasks like text generation, image segmentation, automatic speech recognition, document question answering, and more. +- [Trainer](./trainer): A comprehensive trainer that supports features such as mixed precision, torch.compile, and FlashAttention for training and distributed training for PyTorch models. +- [generate](./llm_tutorial): Fast text generation with large language models (LLMs) and vision language models (VLMs), including support for streaming and multiple decoding strategies. ## Design > [!TIP] -> For a more detailed explanation of Transformers' design principles, learn more in our [Philosophy](./philosophy). +> Read our [Philosophy](./philosophy) to learn more about Transformers' design principles. -Transformers is designed for developers and machine learning engineers and researchers alike. Its main design principles are: +Transformers is designed for developers and machine learning engineers and researchers. Its main design principles are: -1. Easy and fast to use: Every model is implemented from only three main classes (configuration, model, and preprocessor) and can be quickly used for inference or training with just [`Pipeline`] or [`Trainer`]. +1. Fast and easy to use: Every model is implemented from only three main classes (configuration, model, and preprocessor) and can be quickly used for inference or training with [`Pipeline`] or [`Trainer`]. 2. Pretrained models: Reduce your carbon footprint, compute cost and time by using a pretrained model instead of training an entirely new one. Each pretrained model is reproduced as closely as possible to the original model and offers state-of-the-art performance.
@@ -45,346 +43,4 @@ Transformers is designed for developers and machine learning engineers and resea
-## Supported models and frameworks - -Check the table below to see whether a model supports PyTorch, TensorFlow, or JAX. - - - -| Model | PyTorch support | TensorFlow support | Flax Support | -|:------------------------------------------------------------------------:|:---------------:|:------------------:|:------------:| -| [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ | -| [ALIGN](model_doc/align) | ✅ | ❌ | ❌ | -| [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ | -| [Aria](model_doc/aria) | ✅ | ❌ | ❌ | -| [AriaText](model_doc/aria_text) | ✅ | ❌ | ❌ | -| [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ | -| [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ | -| [Bamba](model_doc/bamba) | ✅ | ❌ | ❌ | -| [Bark](model_doc/bark) | ✅ | ❌ | ❌ | -| [BART](model_doc/bart) | ✅ | ✅ | ✅ | -| [BARThez](model_doc/barthez) | ✅ | ✅ | ✅ | -| [BARTpho](model_doc/bartpho) | ✅ | ✅ | ✅ | -| [BEiT](model_doc/beit) | ✅ | ❌ | ✅ | -| [BERT](model_doc/bert) | ✅ | ✅ | ✅ | -| [Bert Generation](model_doc/bert-generation) | ✅ | ❌ | ❌ | -| [BertJapanese](model_doc/bert-japanese) | ✅ | ✅ | ✅ | -| [BERTweet](model_doc/bertweet) | ✅ | ✅ | ✅ | -| [BigBird](model_doc/big_bird) | ✅ | ❌ | ✅ | -| [BigBird-Pegasus](model_doc/bigbird_pegasus) | ✅ | ❌ | ❌ | -| [BioGpt](model_doc/biogpt) | ✅ | ❌ | ❌ | -| [BiT](model_doc/bit) | ✅ | ❌ | ❌ | -| [Blenderbot](model_doc/blenderbot) | ✅ | ✅ | ✅ | -| [BlenderbotSmall](model_doc/blenderbot-small) | ✅ | ✅ | ✅ | -| [BLIP](model_doc/blip) | ✅ | ✅ | ❌ | -| [BLIP-2](model_doc/blip-2) | ✅ | ❌ | ❌ | -| [BLOOM](model_doc/bloom) | ✅ | ❌ | ✅ | -| [BORT](model_doc/bort) | ✅ | ✅ | ✅ | -| [BridgeTower](model_doc/bridgetower) | ✅ | ❌ | ❌ | -| [BROS](model_doc/bros) | ✅ | ❌ | ❌ | -| [ByT5](model_doc/byt5) | ✅ | ✅ | ✅ | -| [CamemBERT](model_doc/camembert) | ✅ | ✅ | ❌ | -| [CANINE](model_doc/canine) | ✅ | ❌ | ❌ | -| [Chameleon](model_doc/chameleon) | ✅ | ❌ | ❌ | -| [Chinese-CLIP](model_doc/chinese_clip) | ✅ | ❌ | ❌ | -| [CLAP](model_doc/clap) | ✅ | ❌ | ❌ | -| [CLIP](model_doc/clip) | ✅ | ✅ | ✅ | -| [CLIPSeg](model_doc/clipseg) | ✅ | ❌ | ❌ | -| [CLVP](model_doc/clvp) | ✅ | ❌ | ❌ | -| [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ | -| [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ | -| [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ | -| [Cohere2](model_doc/cohere2) | ✅ | ❌ | ❌ | -| [ColPali](model_doc/colpali) | ✅ | ❌ | ❌ | -| [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ | -| [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ | -| [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ | -| [ConvNeXTV2](model_doc/convnextv2) | ✅ | ✅ | ❌ | -| [CPM](model_doc/cpm) | ✅ | ✅ | ✅ | -| [CPM-Ant](model_doc/cpmant) | ✅ | ❌ | ❌ | -| [CTRL](model_doc/ctrl) | ✅ | ✅ | ❌ | -| [CvT](model_doc/cvt) | ✅ | ✅ | ❌ | -| [DAB-DETR](model_doc/dab-detr) | ✅ | ❌ | ❌ | -| [DAC](model_doc/dac) | ✅ | ❌ | ❌ | -| [Data2VecAudio](model_doc/data2vec) | ✅ | ❌ | ❌ | -| [Data2VecText](model_doc/data2vec) | ✅ | ❌ | ❌ | -| [Data2VecVision](model_doc/data2vec) | ✅ | ✅ | ❌ | -| [DBRX](model_doc/dbrx) | ✅ | ❌ | ❌ | -| [DeBERTa](model_doc/deberta) | ✅ | ✅ | ❌ | -| [DeBERTa-v2](model_doc/deberta-v2) | ✅ | ✅ | ❌ | -| [Decision Transformer](model_doc/decision_transformer) | ✅ | ❌ | ❌ | -| [Deformable DETR](model_doc/deformable_detr) | ✅ | ❌ | ❌ | -| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ | -| [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ | -| [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ | -| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ | -| [DETA](model_doc/deta) | ✅ | ❌ | ❌ | -| [DETR](model_doc/detr) | ✅ | ❌ | ❌ | -| [DialoGPT](model_doc/dialogpt) | ✅ 
| ✅ | ✅ | -| [DiffLlama](model_doc/diffllama) | ✅ | ❌ | ❌ | -| [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ | -| [DINOv2](model_doc/dinov2) | ✅ | ❌ | ✅ | -| [DINOv2 with Registers](model_doc/dinov2_with_registers) | ✅ | ❌ | ❌ | -| [DistilBERT](model_doc/distilbert) | ✅ | ✅ | ✅ | -| [DiT](model_doc/dit) | ✅ | ❌ | ✅ | -| [DonutSwin](model_doc/donut) | ✅ | ❌ | ❌ | -| [DPR](model_doc/dpr) | ✅ | ✅ | ❌ | -| [DPT](model_doc/dpt) | ✅ | ❌ | ❌ | -| [EfficientFormer](model_doc/efficientformer) | ✅ | ✅ | ❌ | -| [EfficientNet](model_doc/efficientnet) | ✅ | ❌ | ❌ | -| [ELECTRA](model_doc/electra) | ✅ | ✅ | ✅ | -| [Emu3](model_doc/emu3) | ✅ | ❌ | ❌ | -| [EnCodec](model_doc/encodec) | ✅ | ❌ | ❌ | -| [Encoder decoder](model_doc/encoder-decoder) | ✅ | ✅ | ✅ | -| [ERNIE](model_doc/ernie) | ✅ | ❌ | ❌ | -| [ErnieM](model_doc/ernie_m) | ✅ | ❌ | ❌ | -| [ESM](model_doc/esm) | ✅ | ✅ | ❌ | -| [FairSeq Machine-Translation](model_doc/fsmt) | ✅ | ❌ | ❌ | -| [Falcon](model_doc/falcon) | ✅ | ❌ | ❌ | -| [Falcon3](model_doc/falcon3) | ✅ | ❌ | ✅ | -| [FalconMamba](model_doc/falcon_mamba) | ✅ | ❌ | ❌ | -| [FastSpeech2Conformer](model_doc/fastspeech2_conformer) | ✅ | ❌ | ❌ | -| [FLAN-T5](model_doc/flan-t5) | ✅ | ✅ | ✅ | -| [FLAN-UL2](model_doc/flan-ul2) | ✅ | ✅ | ✅ | -| [FlauBERT](model_doc/flaubert) | ✅ | ✅ | ❌ | -| [FLAVA](model_doc/flava) | ✅ | ❌ | ❌ | -| [FNet](model_doc/fnet) | ✅ | ❌ | ❌ | -| [FocalNet](model_doc/focalnet) | ✅ | ❌ | ❌ | -| [Funnel Transformer](model_doc/funnel) | ✅ | ✅ | ❌ | -| [Fuyu](model_doc/fuyu) | ✅ | ❌ | ❌ | -| [Gemma](model_doc/gemma) | ✅ | ❌ | ✅ | -| [Gemma2](model_doc/gemma2) | ✅ | ❌ | ❌ | -| [GIT](model_doc/git) | ✅ | ❌ | ❌ | -| [GLM](model_doc/glm) | ✅ | ❌ | ❌ | -| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ | -| [GOT-OCR2](model_doc/got_ocr2) | ✅ | ❌ | ❌ | -| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ | -| [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ | -| [GPT NeoX Japanese](model_doc/gpt_neox_japanese) | ✅ | ❌ | ❌ | -| [GPT-J](model_doc/gptj) | ✅ | ✅ | ✅ | -| [GPT-Sw3](model_doc/gpt-sw3) | ✅ | ✅ | ✅ | -| [GPTBigCode](model_doc/gpt_bigcode) | ✅ | ❌ | ❌ | -| [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ | -| [Granite](model_doc/granite) | ✅ | ❌ | ❌ | -| [GraniteMoeMoe](model_doc/granitemoe) | ✅ | ❌ | ❌ | -| [GraniteMoeSharedMoe](model_doc/granitemoeshared) | ✅ | ❌ | ❌ | -| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | -| [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ | -| [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | -| [Helium](model_doc/helium) | ✅ | ❌ | ❌ | -| [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | -| [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | -| [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | -| [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | -| [I-JEPA](model_doc/ijepa) | ✅ | ❌ | ❌ | -| [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ | -| [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ | -| [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ | -| [Idefics3VisionTransformer](model_doc/idefics3_vision) | ❌ | ❌ | ❌ | -| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | -| [Informer](model_doc/informer) | ✅ | ❌ | ❌ | -| [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | -| [InstructBlipVideo](model_doc/instructblipvideo) | ✅ | ❌ | ❌ | -| [Jamba](model_doc/jamba) | ✅ | ❌ | ❌ | -| [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ | -| [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ | -| [KOSMOS-2](model_doc/kosmos-2) | ✅ | ❌ | ❌ | -| [LayoutLM](model_doc/layoutlm) | ✅ | ✅ | ❌ | -| [LayoutLMv2](model_doc/layoutlmv2) | ✅ | ❌ | ❌ | -| [LayoutLMv3](model_doc/layoutlmv3) | ✅ | ✅ | ❌ | -| [LayoutXLM](model_doc/layoutxlm) | ✅ | ❌ | ❌ | -| 
[LED](model_doc/led) | ✅ | ✅ | ❌ | -| [LeViT](model_doc/levit) | ✅ | ❌ | ❌ | -| [LiLT](model_doc/lilt) | ✅ | ❌ | ❌ | -| [LLaMA](model_doc/llama) | ✅ | ❌ | ✅ | -| [Llama2](model_doc/llama2) | ✅ | ❌ | ✅ | -| [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ | -| [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ | -| [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ | -| [LLaVa-NeXT-Video](model_doc/llava_next_video) | ✅ | ❌ | ❌ | -| [LLaVA-Onevision](model_doc/llava_onevision) | ✅ | ❌ | ❌ | -| [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ | -| [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ | -| [LUKE](model_doc/luke) | ✅ | ❌ | ❌ | -| [LXMERT](model_doc/lxmert) | ✅ | ✅ | ❌ | -| [M-CTC-T](model_doc/mctct) | ✅ | ❌ | ❌ | -| [M2M100](model_doc/m2m_100) | ✅ | ❌ | ❌ | -| [MADLAD-400](model_doc/madlad-400) | ✅ | ✅ | ✅ | -| [Mamba](model_doc/mamba) | ✅ | ❌ | ❌ | -| [mamba2](model_doc/mamba2) | ✅ | ❌ | ❌ | -| [Marian](model_doc/marian) | ✅ | ✅ | ✅ | -| [MarkupLM](model_doc/markuplm) | ✅ | ❌ | ❌ | -| [Mask2Former](model_doc/mask2former) | ✅ | ❌ | ❌ | -| [MaskFormer](model_doc/maskformer) | ✅ | ❌ | ❌ | -| [MatCha](model_doc/matcha) | ✅ | ❌ | ❌ | -| [mBART](model_doc/mbart) | ✅ | ✅ | ✅ | -| [mBART-50](model_doc/mbart50) | ✅ | ✅ | ✅ | -| [MEGA](model_doc/mega) | ✅ | ❌ | ❌ | -| [Megatron-BERT](model_doc/megatron-bert) | ✅ | ❌ | ❌ | -| [Megatron-GPT2](model_doc/megatron_gpt2) | ✅ | ✅ | ✅ | -| [MGP-STR](model_doc/mgp-str) | ✅ | ❌ | ❌ | -| [Mimi](model_doc/mimi) | ✅ | ❌ | ❌ | -| [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ | -| [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ | -| [Mllama](model_doc/mllama) | ✅ | ❌ | ❌ | -| [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ | -| [MMS](model_doc/mms) | ✅ | ✅ | ✅ | -| [MobileBERT](model_doc/mobilebert) | ✅ | ✅ | ❌ | -| [MobileNetV1](model_doc/mobilenet_v1) | ✅ | ❌ | ❌ | -| [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ | -| [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ | -| [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ | -| [ModernBERT](model_doc/modernbert) | ✅ | ❌ | ❌ | -| [Moonshine](model_doc/moonshine) | ✅ | ❌ | ❌ | -| [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ | -| [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ | -| [MPT](model_doc/mpt) | ✅ | ❌ | ❌ | -| [MRA](model_doc/mra) | ✅ | ❌ | ❌ | -| [MT5](model_doc/mt5) | ✅ | ✅ | ✅ | -| [MusicGen](model_doc/musicgen) | ✅ | ❌ | ❌ | -| [MusicGen Melody](model_doc/musicgen_melody) | ✅ | ❌ | ❌ | -| [MVP](model_doc/mvp) | ✅ | ❌ | ❌ | -| [NAT](model_doc/nat) | ✅ | ❌ | ❌ | -| [Nemotron](model_doc/nemotron) | ✅ | ❌ | ❌ | -| [Nezha](model_doc/nezha) | ✅ | ❌ | ❌ | -| [NLLB](model_doc/nllb) | ✅ | ❌ | ❌ | -| [NLLB-MOE](model_doc/nllb-moe) | ✅ | ❌ | ❌ | -| [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ | -| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ | -| [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ | -| [OLMo2](model_doc/olmo2) | ✅ | ❌ | ❌ | -| [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ | -| [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ | -| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ | -| [OpenAI GPT](model_doc/openai-gpt) | ✅ | ✅ | ❌ | -| [OpenAI GPT-2](model_doc/gpt2) | ✅ | ✅ | ✅ | -| [OpenLlama](model_doc/open-llama) | ✅ | ❌ | ❌ | -| [OPT](model_doc/opt) | ✅ | ✅ | ✅ | -| [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ | -| [OWLv2](model_doc/owlv2) | ✅ | ❌ | ❌ | -| [PaliGemma](model_doc/paligemma) | ✅ | ❌ | ❌ | -| [PatchTSMixer](model_doc/patchtsmixer) | ✅ | ❌ | ❌ | -| [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | -| [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ | -| [PEGASUS-X](model_doc/pegasus_x) | ✅ | ❌ | ❌ | -| [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ | -| [Persimmon](model_doc/persimmon) | ✅ | ❌ | ❌ | -| 
[Phi](model_doc/phi) | ✅ | ❌ | ❌ | -| [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ | -| [Phimoe](model_doc/phimoe) | ✅ | ❌ | ❌ | -| [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ | -| [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ | -| [Pixtral](model_doc/pixtral) | ✅ | ❌ | ❌ | -| [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ | -| [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ | -| [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ | -| [ProphetNet](model_doc/prophetnet) | ✅ | ❌ | ❌ | -| [PVT](model_doc/pvt) | ✅ | ❌ | ❌ | -| [PVTv2](model_doc/pvt_v2) | ✅ | ❌ | ❌ | -| [QDQBert](model_doc/qdqbert) | ✅ | ❌ | ❌ | -| [Qwen2](model_doc/qwen2) | ✅ | ❌ | ❌ | -| [Qwen2_5_VL](model_doc/qwen2_5_vl) | ✅ | ❌ | ❌ | -| [Qwen2Audio](model_doc/qwen2_audio) | ✅ | ❌ | ❌ | -| [Qwen2MoE](model_doc/qwen2_moe) | ✅ | ❌ | ❌ | -| [Qwen2VL](model_doc/qwen2_vl) | ✅ | ❌ | ❌ | -| [RAG](model_doc/rag) | ✅ | ✅ | ❌ | -| [REALM](model_doc/realm) | ✅ | ❌ | ❌ | -| [RecurrentGemma](model_doc/recurrent_gemma) | ✅ | ❌ | ❌ | -| [Reformer](model_doc/reformer) | ✅ | ❌ | ❌ | -| [RegNet](model_doc/regnet) | ✅ | ✅ | ✅ | -| [RemBERT](model_doc/rembert) | ✅ | ✅ | ❌ | -| [ResNet](model_doc/resnet) | ✅ | ✅ | ✅ | -| [RetriBERT](model_doc/retribert) | ✅ | ❌ | ❌ | -| [RoBERTa](model_doc/roberta) | ✅ | ✅ | ✅ | -| [RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm) | ✅ | ✅ | ✅ | -| [RoCBert](model_doc/roc_bert) | ✅ | ❌ | ❌ | -| [RoFormer](model_doc/roformer) | ✅ | ✅ | ✅ | -| [RT-DETR](model_doc/rt_detr) | ✅ | ❌ | ❌ | -| [RT-DETR-ResNet](model_doc/rt_detr_resnet) | ✅ | ❌ | ❌ | -| [RT-DETRv2](model_doc/rt_detr_v2) | ✅ | ❌ | ❌ | -| [RWKV](model_doc/rwkv) | ✅ | ❌ | ❌ | -| [SAM](model_doc/sam) | ✅ | ✅ | ❌ | -| [SeamlessM4T](model_doc/seamless_m4t) | ✅ | ❌ | ❌ | -| [SeamlessM4Tv2](model_doc/seamless_m4t_v2) | ✅ | ❌ | ❌ | -| [SegFormer](model_doc/segformer) | ✅ | ✅ | ❌ | -| [SegGPT](model_doc/seggpt) | ✅ | ❌ | ❌ | -| [SEW](model_doc/sew) | ✅ | ❌ | ❌ | -| [SEW-D](model_doc/sew-d) | ✅ | ❌ | ❌ | -| [SigLIP](model_doc/siglip) | ✅ | ❌ | ❌ | -| [Speech Encoder decoder](model_doc/speech-encoder-decoder) | ✅ | ❌ | ✅ | -| [Speech2Text](model_doc/speech_to_text) | ✅ | ✅ | ❌ | -| [SpeechT5](model_doc/speecht5) | ✅ | ❌ | ❌ | -| [Splinter](model_doc/splinter) | ✅ | ❌ | ❌ | -| [SqueezeBERT](model_doc/squeezebert) | ✅ | ❌ | ❌ | -| [StableLm](model_doc/stablelm) | ✅ | ❌ | ❌ | -| [Starcoder2](model_doc/starcoder2) | ✅ | ❌ | ❌ | -| [SuperGlue](model_doc/superglue) | ✅ | ❌ | ❌ | -| [SuperPoint](model_doc/superpoint) | ✅ | ❌ | ❌ | -| [SwiftFormer](model_doc/swiftformer) | ✅ | ✅ | ❌ | -| [Swin Transformer](model_doc/swin) | ✅ | ✅ | ❌ | -| [Swin Transformer V2](model_doc/swinv2) | ✅ | ❌ | ❌ | -| [Swin2SR](model_doc/swin2sr) | ✅ | ❌ | ❌ | -| [SwitchTransformers](model_doc/switch_transformers) | ✅ | ❌ | ❌ | -| [T5](model_doc/t5) | ✅ | ✅ | ✅ | -| [T5v1.1](model_doc/t5v1.1) | ✅ | ✅ | ✅ | -| [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ | -| [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ | -| [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | -| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ | -| [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | -| [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | -| [TimmWrapperModel](model_doc/timm_wrapper) | ✅ | ❌ | ❌ | -| [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ | -| [Transformer-XL](model_doc/transfo-xl) | ✅ | ✅ | ❌ | -| [TrOCR](model_doc/trocr) | ✅ | ❌ | ❌ | -| [TVLT](model_doc/tvlt) | ✅ | ❌ | ❌ | -| [TVP](model_doc/tvp) | ✅ | ❌ | ❌ | -| [UDOP](model_doc/udop) | ✅ | ❌ | ❌ | -| [UL2](model_doc/ul2) | ✅ | ✅ | ✅ | -| 
[UMT5](model_doc/umt5) | ✅ | ❌ | ❌ | -| [UniSpeech](model_doc/unispeech) | ✅ | ❌ | ❌ | -| [UniSpeechSat](model_doc/unispeech-sat) | ✅ | ❌ | ❌ | -| [UnivNet](model_doc/univnet) | ✅ | ❌ | ❌ | -| [UPerNet](model_doc/upernet) | ✅ | ❌ | ❌ | -| [VAN](model_doc/van) | ✅ | ❌ | ❌ | -| [VideoLlava](model_doc/video_llava) | ✅ | ❌ | ❌ | -| [VideoMAE](model_doc/videomae) | ✅ | ❌ | ❌ | -| [ViLT](model_doc/vilt) | ✅ | ❌ | ❌ | -| [VipLlava](model_doc/vipllava) | ✅ | ❌ | ❌ | -| [Vision Encoder decoder](model_doc/vision-encoder-decoder) | ✅ | ✅ | ✅ | -| [VisionTextDualEncoder](model_doc/vision-text-dual-encoder) | ✅ | ✅ | ✅ | -| [VisualBERT](model_doc/visual_bert) | ✅ | ❌ | ❌ | -| [ViT](model_doc/vit) | ✅ | ✅ | ✅ | -| [ViT Hybrid](model_doc/vit_hybrid) | ✅ | ❌ | ❌ | -| [VitDet](model_doc/vitdet) | ✅ | ❌ | ❌ | -| [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ | -| [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | -| [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | -| [ViTPose](model_doc/vitpose) | ✅ | ❌ | ❌ | -| [ViTPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | -| [VITS](model_doc/vits) | ✅ | ❌ | ❌ | -| [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | -| [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | -| [Wav2Vec2-BERT](model_doc/wav2vec2-bert) | ✅ | ❌ | ❌ | -| [Wav2Vec2-Conformer](model_doc/wav2vec2-conformer) | ✅ | ❌ | ❌ | -| [Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme) | ✅ | ✅ | ✅ | -| [WavLM](model_doc/wavlm) | ✅ | ❌ | ❌ | -| [Whisper](model_doc/whisper) | ✅ | ✅ | ✅ | -| [X-CLIP](model_doc/xclip) | ✅ | ❌ | ❌ | -| [X-MOD](model_doc/xmod) | ✅ | ❌ | ❌ | -| [XGLM](model_doc/xglm) | ✅ | ✅ | ✅ | -| [XLM](model_doc/xlm) | ✅ | ✅ | ❌ | -| [XLM-ProphetNet](model_doc/xlm-prophetnet) | ✅ | ❌ | ❌ | -| [XLM-RoBERTa](model_doc/xlm-roberta) | ✅ | ✅ | ✅ | -| [XLM-RoBERTa-XL](model_doc/xlm-roberta-xl) | ✅ | ❌ | ❌ | -| [XLM-V](model_doc/xlm-v) | ✅ | ✅ | ✅ | -| [XLNet](model_doc/xlnet) | ✅ | ✅ | ❌ | -| [XLS-R](model_doc/xls_r) | ✅ | ✅ | ✅ | -| [XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2) | ✅ | ✅ | ✅ | -| [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ | -| [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ | -| [Zamba](model_doc/zamba) | ✅ | ❌ | ❌ | -| [Zamba2](model_doc/zamba2) | ✅ | ❌ | ❌ | -| [ZoeDepth](model_doc/zoedepth) | ✅ | ❌ | ❌ | - - +Join us on the Hugging Face [Hub](https://huggingface.co/), [Discord](https://discord.com/invite/JfAtkvEtRb), or [forum](https://discuss.huggingface.co/) to collaborate and build models, datasets, and applications together. diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index a5df5a2c08c3..8a3d9c11e7c1 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -24,7 +24,7 @@ Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [Te ## Virtual environment -A virtual environment can help you manage different projects and avoid compatibility issues between dependencies. Take a look at the [Install packages in a virtual environment using pip and venv](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) guide if you're unfamiliar with Python virtual environments. +A virtual environment helps manage different projects and avoids compatibility issues between dependencies. Take a look at the [Install packages in a virtual environment using pip and venv](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) guide if you're unfamiliar with Python virtual environments. Create a virtual environment in your project directory. @@ -82,7 +82,7 @@ pip install 'transformers[torch]'
-For M1 ARM-based hardware, you need to install CMake and pkg-config first. +For Apple M1 hardware, you need to install CMake and pkg-config first. ```bash brew install cmake @@ -105,7 +105,7 @@ pip install 'transformers[flax]'
-Test whether your install was successful with the following command to quickly inference with a pretrained model. It should return a label and score for the provided text. +Test whether the install was successful with the following command. It should return a label and score for the provided text. ```bash python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('hugging face is the best'))" @@ -114,7 +114,7 @@ python -c "from transformers import pipeline; print(pipeline('sentiment-analysis ### Source install -Installing from source installs the *latest* version rather than a *stable* version of the library. This ensures you get the most up-to-date changes in the library, which is useful for experimenting with the latest features or fixing a bug that hasn't been officially released in the stable version yet. +Installing from source installs the *latest* version rather than the *stable* version of the library. It ensures you have the most up-to-date changes in Transformers and it's useful for experimenting with the latest features or fixing a bug that hasn't been officially released in the stable version yet. The downside is that the latest version may not always be stable. If you encounter any problems, please open a [GitHub Issue](https://github.com/huggingface/transformers/issues) so we can fix it as soon as possible. @@ -124,7 +124,7 @@ Install from source with the following command. pip install git+https://github.com/huggingface/transformers ``` -Check if the install was successful with the command below to quickly inference with a pretrained model. It should return a label and score for the provided text. +Check if the install was successful with the command below. It should return a label and score for the provided text. ```bash python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('hugging face is the best'))" @@ -159,19 +159,17 @@ git pull conda install conda-forge::transformers ``` -## Setup +## Set up -After installation, you can configure the Transformers cache location or setup the library for offline usage if you want. +After installation, you can configure the Transformers cache location or set up the library for offline usage. ### Cache directory When you load a pretrained model with [`~PreTrainedModel.from_pretrained`], the model is downloaded from the Hub and locally cached. -Every time you load a model, it checks whether the cached model is up-to-date. If it's the same, then the local model is loaded. If it's not the same, the newer model is downloaded and cached. This ensures you always have the latest model version. +Every time you load a model, it checks whether the cached model is up-to-date. If it's the same, then the local model is loaded. If it's not the same, the newer model is downloaded and cached. -The default directory given by the shell environment variable `TRANSFORMERS_CACHE` is `~/.cache/huggingface/hub`. - -On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\hub`. +The default directory given by the shell environment variable `TRANSFORMERS_CACHE` is `~/.cache/huggingface/hub`. On Windows, the default directory is `C:\Users\username\.cache\huggingface\hub`. Cache a model in a different directory by changing the path in the following shell environment variables (listed by priority). 
@@ -183,7 +181,7 @@ Older versions of Transformers uses the shell environment variables `PYTORCH_TRA ### Offline mode -To use Transformers in an offline or firewalled environment requires having the downloaded and cached files ahead of time. Download a model repository from the Hub with the [`~huggingface_hub.snapshot_download`] method. +To use Transformers in an offline or firewalled environment requires the downloaded and cached files ahead of time. Download a model repository from the Hub with the [`~huggingface_hub.snapshot_download`] method. > [!TIP] > Refer to the [Download files from the Hub](https://hf.co/docs/huggingface_hub/guides/download) guide for more options for downloading files from the Hub. You can download files from specific revisions, download from the CLI, and even filter which files to download from a repository. @@ -201,7 +199,7 @@ HF_HUB_OFFLINE=1 \ python examples/pytorch/language-modeling/run_clm.py --model_name_or_path meta-llama/Llama-2-7b-hf --dataset_name wikitext ... ``` -Another option for only loading cached files is to set the `local_files_only` parameter to `True` in [`~PreTrainedModel.from_pretrained`]. +Another option for only loading cached files is to set `local_files_only=True` in [`~PreTrainedModel.from_pretrained`]. ```py from transformers import LlamaForCausalLM diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index 9a1fddf9a4f5..a6ebdfb39657 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -14,18 +14,18 @@ rendered properly in your Markdown viewer. --> -# Share +# Sharing The Hugging Face [Hub](https://hf.co/models) is a platform for sharing, discovering, and consuming models of all different types and sizes. We highly recommend sharing your model on the Hub to push open-source machine learning forward for everyone! This guide will show you how to share a model to the Hub from Transformers. -## Setup +## Set up -To share a model to the Hub, you need a Hugging Face [account](https://hf.co/join). Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and login to your account from either the CLI or a notebook. +To share a model to the Hub, you need a Hugging Face [account](https://hf.co/join). Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) (stored in the [cache](./installation#cache-directory) by default) and login to your account from either the command line or notebook. - + ```bash huggingface-cli login @@ -47,35 +47,35 @@ notebook_login() -Each model repository supports versioning, commit history, and visualizing diffs. +Each model repository features versioning, commit history, and diff visualization.
-The repository's built-in versioning is based on [Git](https://git-scm.com/) and [Git Large File Storage (LFS)](https://git-lfs.github.com/). Version control enables revisions, a way to specify a model version with a commit hash, tag or branch. +Versioning is based on [Git](https://git-scm.com/) and [Git Large File Storage (LFS)](https://git-lfs.github.com/), and it enables revisions, a way to specify a model version with a commit hash, tag or branch. -For example, specify the `revision` parameter in [`~PreTrainedModel.from_pretrained`] to load a specific model version. +For example, use the `revision` parameter in [`~PreTrainedModel.from_pretrained`] to load a specific model version from a commit hash. ```py model = AutoModel.from_pretrained( - "julien-c/EsperBERTo-small", revision="v2.0.1" + "julien-c/EsperBERTo-small", revision="4c77982" ) ``` -Model repositories also support [gating](https://hf.co/docs/hub/models-gated) for more control over who can access a model. Gating is common for allowing a select group of users to preview a research model before it's made public. +Model repositories also support [gating](https://hf.co/docs/hub/models-gated) to control who can access a model. Gating is common for allowing a select group of users to preview a research model before it's made public.
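If you've been granted access to a gated model, a rough sketch of loading it looks like the following. Pass your User Access Token with the `token` parameter; the repository name below is a placeholder.

```py
# Sketch: load a gated checkpoint you have been granted access to.
# "org/gated-model" is a placeholder; omit token if you're already logged in with huggingface-cli.
from transformers import AutoModel

model = AutoModel.from_pretrained("org/gated-model", token="hf_your_access_token")
```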
-The model repository also includes an inference [widget](https://hf.co/docs/hub/models-widgets) for users to directly interact with a model on the Hub.
+A model repository also includes an inference [widget](https://hf.co/docs/hub/models-widgets) for users to directly interact with a model on the Hub.

-Check out the Hub [Models](https://hf.co/docs/hub/models) documentation to learn more about.
+Check out the Hub [Models](https://hf.co/docs/hub/models) documentation for more information.

## Model framework conversion

-Reach a wider audience by making a model available in PyTorch, TensorFlow, and Flax. While users can still load a model if they're using a different framework, it is slower because Transformers needs to convert the checkpoint on the fly. It is faster to convert the checkpoint beforehand.
+Reach a wider audience by making a model available in PyTorch, TensorFlow, and Flax. While users can still load a model if they're using a different framework, it is slower because Transformers needs to convert the checkpoint on the fly. It is faster to convert the checkpoint first.



@@ -83,7 +83,7 @@ Reach a wider audience by making a model available in PyTorch, TensorFlow, and F
Set `from_tf=True` to convert a checkpoint from TensorFlow to PyTorch and then save it.

```py
-import DistilBertForSequenceClassification
+from transformers import DistilBertForSequenceClassification

pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
pt_model.save_pretrained("path/to/awesome-name-you-picked")
@@ -95,7 +95,7 @@ pt_model.save_pretrained("path/to/awesome-name-you-picked")

Set `from_pt=True` to convert a checkpoint from PyTorch to TensorFlow and then save it.

```py
-import TFDistilBertForSequenceClassification
+from transformers import TFDistilBertForSequenceClassification

tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
tf_model.save_pretrained("path/to/awesome-name-you-picked")
@@ -107,6 +107,7 @@ tf_model.save_pretrained("path/to/awesome-name-you-picked")
Set `from_pt=True` to convert a checkpoint from PyTorch to Flax and then save it.

```py
+from transformers import FlaxDistilBertForSequenceClassification
flax_model = FlaxDistilBertForSequenceClassification.from_pretrained(
    "path/to/awesome-name-you-picked", from_pt=True
)
@@ -116,17 +117,17 @@ flax_model.save_pretrained("path/to/awesome-name-you-picked")



-## Upload a model
+## Uploading a model

-There are several ways to upload a model to the Hub depending on your workflow preference. You can push a model with the [`Trainer`], call the [`~PreTrainedModel.push_to_hub`] method directly on a model, or use the Hub's web interface.
+There are several ways to upload a model to the Hub depending on your workflow preference. You can push a model with [`Trainer`], use a callback for TensorFlow models, call [`~PreTrainedModel.push_to_hub`] directly on a model, or use the Hub web interface.

### Trainer

-The [`Trainer`], Transformers' training API, can push a model directly to the Hub after training. Set `push_to_hub=True` in the [`TrainingArguments`] class and pass it to the [`Trainer`]. Once training is complete, call [`~transformers.Trainer.push_to_hub`] to upload the model.
+[`Trainer`] can push a model directly to the Hub after training. Set `push_to_hub=True` in [`TrainingArguments`] and pass it to [`Trainer`]. Once training is complete, call [`~transformers.Trainer.push_to_hub`] to upload the model. 
-The [`~transformers.Trainer.push_to_hub`] method automatically adds useful information like training hyperparameters and results to the model card. +[`~transformers.Trainer.push_to_hub`] automatically adds useful information like training hyperparameters and results to the model card. ```py from transformers import TrainingArguments, Trainer @@ -142,9 +143,9 @@ trainer = Trainer( trainer.push_to_hub() ``` -#### TensorFlow models +### PushToHubCallback -For TensorFlow models, add the [`PushToHubCallback`] to [fit](https://keras.io/api/models/model_training_apis/#fit-method). +For TensorFlow models, add the [`PushToHubCallback`] to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method. ```py from transformers import PushToHubCallback @@ -157,9 +158,9 @@ model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, cal ### PushToHubMixin -The [`PushToHubMixin`] provides the functionality for pushing a model or tokenizer to the Hub. +The [`~utils.PushToHubMixin`] provides functionality for pushing a model or tokenizer to the Hub. -Call [`~PushToHubMixin.push_to_hub`] directly on a model to upload it to the Hub. It creates a repository under your namespace with the model name specified in [`~PushToHubMixin.push_to_hub`]. +Call [`~utils.PushToHubMixin.push_to_hub`] directly on a model to upload it to the Hub. It creates a repository under your namespace with the model name specified in [`~utils.PushToHubMixin.push_to_hub`]. ```py model.push_to_hub("my-awesome-model") @@ -171,15 +172,15 @@ Other objects like a tokenizer or TensorFlow model are also pushed to the Hub in tokenizer.push_to_hub("my-awesome-model") ``` -Your Hugging Face profile should now display the newly created model repository. Navigate to the **Files** tab to see all the uploaded files +Your Hugging Face profile should now display the newly created model repository. Navigate to the **Files** tab to see all the uploaded files. -Refer to the [Upload files to the Hub](https://hf.co/docs/hub/how-to-upstream) guide for more details about pushing files to the Hub. +Refer to the [Upload files to the Hub](https://hf.co/docs/hub/how-to-upstream) guide for more information about pushing files to the Hub. ### Hub web interface -For a no-code approach, upload a model with the Hub's web interface. +The Hub web interface is a no-code approach for uploading a model. -Create a new repository by selecting [**New Model**](https://huggingface.co/new). +1. Create a new repository by selecting [**New Model**](https://huggingface.co/new).
@@ -190,11 +191,11 @@ Add some information about your model:
- Select the **owner** of the repository. This can be yourself or any of the organizations you belong to.
- Pick a name for your model, which will also be the repository name.
- Choose whether your model is public or private.
-- Specify the license usage for your model.
+- Set the license.

-Click on **Create model** to create the model repository.
+2. Click on **Create model** to create the model repository.

-Now select the **Files** tab and click on the **Add file** button to drag-and-drop a file to your repository. Add a commit message and click on **Commit changes to `main`** to commit the file.
+3. Select the **Files** tab and click on the **Add file** button to drag-and-drop a file to your repository. Add a commit message and click on **Commit changes to main** to commit the file.
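If you'd rather script this step, roughly the same upload can be done with the `huggingface_hub` client; the file and repository names below are placeholders.

```py
# Sketch: programmatic counterpart to the drag-and-drop upload described above.
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="model.safetensors",  # local file to upload
    path_in_repo="model.safetensors",     # destination path in the repository
    repo_id="your-username/my-awesome-model",
    commit_message="Add model weights",
)
```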
@@ -202,13 +203,13 @@ Now select the **Files** tab and click on the **Add file** button to drag-and-dr ## Model card -[Model cards](https://hf.co/docs/hub/model-cards#model-cards) inform users about a model's performance, limitations, potential biases, and ethical considerations. It is highly recommended to add a model card to your repository! +[Model cards](https://hf.co/docs/hub/model-cards#model-cards) inform users about a models performance, limitations, potential biases, and ethical considerations. It is highly recommended to add a model card to your repository! A model card is a `README.md` file in your repository. Add this file by: - manually creating and uploading a `README.md` file - clicking on the **Edit model card** button in the repository -Take a look at the Llama 3.1 [model card](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) for an example of the type of information to include on a model card. +Take a look at the Llama 3.1 [model card](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) for an example of what to include on a model card. Learn more about other model card metadata (carbon emissions, license, link to paper, etc.) available in the [Model Cards](https://hf.co/docs/hub/model-cards#model-cards) guide. diff --git a/docs/source/en/models.md b/docs/source/en/models.md index 24fecc318836..5263c6c11885 100644 --- a/docs/source/en/models.md +++ b/docs/source/en/models.md @@ -14,22 +14,22 @@ rendered properly in your Markdown viewer. --> -# Load +# Loading -Transformers provides many pretrained models that are ready to use with just a single line of code. It requires a model class and the [`~PreTrainedModel.from_pretrained`] method. +Transformers provides many pretrained models that are ready to use with a single line of code. It requires a model class and the [`~PreTrainedModel.from_pretrained`] method. -To load a model, call [`~PreTrainedModel.from_pretrained`] to download and load the model weights and configuration stored on the Hugging Face [Hub](https://hf.co/models). +Call [`~PreTrainedModel.from_pretrained`] to download and load a models weights and configuration stored on the Hugging Face [Hub](https://hf.co/models). > [!TIP] -> The [`~PreTrainedModel.from_pretrained`] method loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load. +> [`~PreTrainedModel.from_pretrained`] loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load. ```py from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype="auto", device_map="auto") ``` -This guide will briefly explain how models are loaded, the different ways you can load a model, how to overcome memory issues for really big models, and how to load custom models. +This guide explains how models are loaded, the different ways you can load a model, how to overcome memory issues for really big models, and how to load custom models. 
## Models and configurations

@@ -38,12 +38,12 @@ All models have a `configuration.py` file with specific attributes like the numb


> [!TIP]
-> An *architecture* refers to the model's skeleton and a *checkpoint* refers to the model's weights for a given architecture. For example, [BERT](./model_doc/bert) is an architecture while [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) is a checkpoint. You'll see the term *model* used interchangeably for architecture and checkpoint.
+> An *architecture* refers to the model's skeleton and a *checkpoint* refers to the model's weights for a given architecture. For example, [BERT](./model_doc/bert) is an architecture while [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) is a checkpoint. You'll see the term *model* used interchangeably with architecture and checkpoint.

There are two general types of models you can load:

-1. A barebones model like [`AutoModel`] or [`LlamaModel`] that outputs hidden states.
-2. A model with a specific *head* attached to the model, like [`AutoModelForCausalLM`] or [`LlamaForCausalLM`], for performing specific tasks.
+1. A barebones model, like [`AutoModel`] or [`LlamaModel`], that outputs hidden states.
+2. A model with a specific *head* attached, like [`AutoModelForCausalLM`] or [`LlamaForCausalLM`], for performing specific tasks.

For each model type, there is a separate class for each machine learning framework (PyTorch, TensorFlow, Flax). Pick the corresponding prefix for the framework you're using.

@@ -54,8 +54,8 @@ For each model type, there is a separate class for each machine learning framewo
from transformers import AutoModelForCausalLM, MistralForCausalLM

# load with AutoClass or model-specific class
-model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
-model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
+model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype="auto", device_map="auto")
+model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype="auto", device_map="auto")
```


@@ -94,9 +94,9 @@ There are two model classes, the [AutoModel](./model_doc/auto) class and a model


-The [AutoModel](./model_doc/auto) class is a convenient way to load an architecture without needing to know the exact model class name because there are many architectures. It automatically selects the correct model class based on the configuration file. You only need to know the task and checkpoint you want to use.
+The [AutoModel](./model_doc/auto) class is a convenient way to load an architecture without needing to know the exact model class name because there are many models available. It automatically selects the correct model class based on the configuration file. You only need to know the task and checkpoint you want to use.

-The AutoClass makes it easy to switch between models or tasks, as long as the architecture is supported for a given task.
+Easily switch between models or tasks, as long as the architecture is supported for a given task.

For example, the same model can be used for separate tasks.

@@ -109,7 +109,7 @@ model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-2-7
model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-2-7b-hf")
```

-In other cases, you may want to quickly try out several models for a task.
+In other cases, you may want to quickly try out several different models for a task. 
```py from transformers import AutoModelForCausalLM @@ -123,7 +123,7 @@ model = AutoModelForCausalLM.from_pretrained("google/gemma-7b") -The AutoModel class builds on top of model-specific classes. All model classes that support a specific task are mapped to their respective `AutoModelFor` task class. +The [AutoModel](./model_doc/auto) class builds on top of model-specific classes. All model classes that support a specific task are mapped to their respective `AutoModelFor` task class. If you already know which model class you want to use, then you could use its model-specific class directly. @@ -136,7 +136,7 @@ model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") -## Big models +## Large models Large pretrained models require a lot of memory to load. The loading process involves: @@ -146,17 +146,17 @@ Large pretrained models require a lot of memory to load. The loading process inv You need enough memory to hold two copies of the model weights (random and pretrained) which may not be possible depending on your hardware. In distributed training environments, this is even more challenging because each process loads a pretrained model. -Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, leveraging Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types. +Transformers reduces some of these memory-related challenges with fast initialization, sharded checkpoints, Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature, and supporting lower bit data types. ### Fast initialization A PyTorch model is instantiated with random weights, or "empty" tensors, that take up space in memory without filling it. -Transformers boosts loading speed and avoids random weight initialization with the [_fast_init](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter if the pretrained weights are correctly initialized. This parameter is set to `True` by default. +Transformers boosts loading speed by skipping random weight initialization with the [_fast_init](https://github.com/huggingface/transformers/blob/c9f6e5e35156e068b227dd9b15521767f6afd4d2/src/transformers/modeling_utils.py#L2710) parameter if the pretrained weights are correctly initialized. This parameter is set to `True` by default. ### Sharded checkpoints -Transformers' [`~PreTrainedModel.save_pretrained`] method automatically shards checkpoints larger than 10GB. +The [`~PreTrainedModel.save_pretrained`] method automatically shards checkpoints larger than 10GB. Each shard is loaded sequentially after the previous shard is loaded, limiting memory usage to only the model size and the largest shard size. @@ -193,9 +193,7 @@ with tempfile.TemporaryDirectory() as tmp_dir: load_sharded_checkpoint(model, tmp_dir) ``` -#### Model metadata - -Transformers' [`~PreTrainedModel.save_pretrained`] method creates an index file that maps parameter names to the files they're stored in. The index file has two keys, `metadata` and `weight_map`. +The [`~PreTrainedModel.save_pretrained`] method creates an index file that maps parameter names to the files they're stored in. The index file has two keys, `metadata` and `weight_map`. 
```py import json @@ -234,9 +232,9 @@ index["weight_map"] -The [`~PreTrainedModel.from_pretrained`] method is supercharged by Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature. +[`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature. -Big Model Inference creates a *model skeleton* on PyTorch's [meta](https://pytorch.org/docs/main/meta.html) device. The meta device doesn't store any real data, only the metadata. +Big Model Inference creates a *model skeleton* on the PyTorch [meta](https://pytorch.org/docs/main/meta.html) device. The meta device doesn't store any real data, only the metadata. Randomly initialized weights are only created when the pretrained weights are loaded to avoid maintaining two copies of the model in memory at the same time. The maximum memory usage is only the size of the model. @@ -247,7 +245,7 @@ Big Model Inference's second feature relates to how weights are loaded and dispa Both features combined reduces memory usage and loading times for big pretrained models. -Set the [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) parameter to `"auto"` to enable Big Model Inference. This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`. +Set [device_map](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3061) to `"auto"` to enable Big Model Inference. This also sets the [low_cpu_mem_usage](https://github.com/huggingface/transformers/blob/026a173a64372e9602a16523b8fae9de4b0ff428/src/transformers/modeling_utils.py#L3028) parameter to `True`, such that not more than 1x the model size is used in CPU memory. ```py from transformers import AutoModelForCausalLM @@ -255,9 +253,9 @@ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", device_map="auto") ``` -To manually assign layers to devices, create a `device_map`. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device. +You can also manually assign layers to a device in `device_map`. It should map all model parameters to a device, but you don't have to detail where all the submodules of a layer go if the entire layer is on the same device. -Access the `hf_device_map` attribute to see how the model is distributed across devices. +Access the `hf_device_map` attribute to see how a model is distributed across devices. ```py device_map = {"model.layers.1": 0, "model.layers.14": 1, "model.layers.31": "cpu", "lm_head": "disk"} @@ -266,9 +264,9 @@ model.hf_device_map ### Model data type -PyTorch model weights are initialized as torch.float32. To load a model in a different data type, like torch.float16, it requires additional memory to load the model again in the desired data type. +PyTorch model weights are initialized in `torch.float32` by default. Loading a model in a different data type, like `torch.float16`, requires additional memory because the model is loaded again in the desired data type. 
-Explicitly set the [torch_dtype](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype) parameter to directly initialize the model in the desired data type instead of loading the weights twice (torch.float32, torch.float16). You could also set `torch_dtype="auto"` to automatically load the weights with the most optimal memory pattern (the data type is derived from the model weights).
+Explicitly set the [torch_dtype](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype) parameter to directly initialize the model in the desired data type instead of loading the weights twice (`torch.float32` then `torch.float16`). You could also set `torch_dtype="auto"` to automatically load the weights in the data type they are stored in.



@@ -303,9 +301,9 @@ model = AutoModel.from_config(my_config)

## Custom models

-Custom models use Transformers' configuration and modeling classes, supports the [AutoClass](#autoclass) API, and are loaded with [`~PreTrainedModel.from_pretrained`]. What makes custom models different is the modeling code is not from Transformers.
+Custom models build on Transformers' configuration and modeling classes, support the [AutoClass](#autoclass) API, and are loaded with [`~PreTrainedModel.from_pretrained`]. The difference is that the modeling code is *not* from Transformers.

-Extra precaution should be taken when loading a custom model. While the Hub includes [malware scanning](https://hf.co/docs/hub/security-malware#malware-scanning) for every repository, extra care should still be taken when loading a custom model to avoid inadvertently executing malicious code.
+Take extra precautions when loading a custom model. While the Hub includes [malware scanning](https://hf.co/docs/hub/security-malware#malware-scanning) for every repository, you should still be careful to avoid inadvertently executing malicious code.

Set `trust_remote_code=True` in [`~PreTrainedModel.from_pretrained`] to load a custom model.

@@ -315,7 +313,7 @@ from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
```

-For an extra layer of security, load a custom model from a specific revision to make sure the model code hasn't changed. The commit hash can be copied from the model's [commit history](https://hf.co/sgugger/custom-resnet50d/commits/main).
+As an extra layer of security, load a custom model from a specific revision to avoid loading model code that may have changed. The commit hash can be copied from the model's [commit history](https://hf.co/sgugger/custom-resnet50d/commits/main).

```py
commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
model = AutoModelForImageClassification.from_pretrained(
@@ -324,4 +322,4 @@ model = AutoModelForImageClassification.from_pretrained(
 )
```

-Learn more about how to create a custom model in [Customize](./custom_models).
+Refer to the [Customize models](./custom_models) guide for more information.
diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md
index 8c603433e0b8..81fcacf20756 100755
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@@ -18,21 +18,19 @@ rendered properly in your Markdown viewer.

[[open-in-colab]]

-Get up and running with Transformers, a library of pretrained models!
+Transformers is designed to be fast and easy to use so that everyone can start learning or building with transformer models.

-There are only three classes to instantiate any model and two APIs for inference or training. 
By limiting the number of user-facing abstractions, Transformers is easier to learn and faster to use. - -Whether you're a developer or a machine learning engineer, this quickstart introduces you to Transformers' key features and shows you how easy it is to: +The number of user-facing abstractions is limited to only three classes for instantiating a model, and two APIs for inference or training. This quickstart introduces you to Transformers' key features and shows you how to: - load a pretrained model -- run inference with the [`Pipeline`] API -- train a model with the [`Trainer`] API +- run inference with [`Pipeline`] +- fine-tune a model with [`Trainer`] -## Setup +## Set up -To start, we recommend creating a Hugging Face [account](https://hf.co/join). This allows you to host and access version controlled models, datasets, and [Spaces](https://hf.co/spaces) on the [Hugging Face Hub](https://hf.co/docs/hub/index), a collaborative platform for discovery and building. +To start, we recommend creating a Hugging Face [account](https://hf.co/join). An account lets you host and access version controlled models, datasets, and [Spaces](https://hf.co/spaces) on the Hugging Face [Hub](https://hf.co/docs/hub/index), a collaborative platform for discovery and building. -Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and login to your account. +Create a [User Access Token](https://hf.co/docs/hub/security-tokens#user-access-tokens) and log in to your account. ```py from huggingface_hub import notebook_login @@ -40,7 +38,7 @@ from huggingface_hub import notebook_login notebook_login() ``` -Make sure your preferred machine learning framework is installed. +Install a machine learning framework. @@ -59,7 +57,7 @@ Make sure your preferred machine learning framework is installed. -Install an up-to-date version of Transformers and some additional libraries from the Hugging Face ecosystem for accessing datasets and vision models, evaluating training, and optimizing training for large models. +Then install an up-to-date version of Transformers and some additional libraries from the Hugging Face ecosystem for accessing datasets and vision models, evaluating training, and optimizing training for large models. ```bash !pip install -U transformers datasets evaluate accelerate timm @@ -72,34 +70,37 @@ Each pretrained model inherits from three base classes. | **Class** | **Description** | |---|---| | [`PretrainedConfig`] | A file that specifies a models attributes such as the number of attention heads or vocabulary size. | -| [`PreTrainedModel`] | A model (or architecture) defined by the model attributes from the configuration file. A pretrained model only returns the raw hidden states. For a specific task, use the appropriate model head to convert the raw hidden states into a meaningful result (e.g., [`LlamaModel`] versus [`LlamaForCausalLM`]). | +| [`PreTrainedModel`] | A model (or architecture) defined by the model attributes from the configuration file. A pretrained model only returns the raw hidden states. For a specific task, use the appropriate model head to convert the raw hidden states into a meaningful result (for example, [`LlamaModel`] versus [`LlamaForCausalLM`]). | | Preprocessor | A class for converting raw inputs (text, images, audio, multimodal) into numerical inputs to the model. For example, [`PreTrainedTokenizer`] converts text into tensors and [`ImageProcessingMixin`] converts pixels into tensors. 
| We recommend using the [AutoClass](./model_doc/auto) API to load models and preprocessors because it automatically infers the appropriate architecture for each task and machine learning framework based on the name or path to the pretrained weights and configuration file. -Use the [`~PreTrainedModel.from_pretrained`] method to load the weights and configuration file from the Hub into the model and preprocessor class. +Use [`~PreTrainedModel.from_pretrained`] to load the weights and configuration file from the Hub into the model and preprocessor class. -When you load a model, especially a large language model (LLM), setting `device_map="auto"` automatically allocates the model weights to your fastest device(s) first which is typically the GPU. +When you load a model, configure the following parameters to ensure the model is optimally loaded. + +- `device_map="auto"` automatically allocates the model weights to your fastest device first, which is typically the GPU. +- `torch_dtype="auto"` directly initializes the model weights in the data type they're stored in, which can help avoid loading the weights twice (PyTorch loads weights in torch.float32 by default). ```py from transformers import AutoModelForCausalLM, AutoTokenizer -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype="auto", device_map="auto") tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") ``` -Tokenize the text and return PyTorch tensors with the tokenizer. To accelerate inference, move the model to a GPU if it's available. +Tokenize the text and return PyTorch tensors with the tokenizer. Move the model to a GPU if it's available to accelerate inference. ```py -model_inputs = tokenizer(["Hugging Face is a"], return_tensors="pt").to("cuda") +model_inputs = tokenizer(["The secret to baking a good cake is "], return_tensors="pt").to("cuda") ``` The model is now ready for inference or training. -For inference, pass the tokenized inputs to the [`~GenerationMixin.generate`] API to generate text. Decode the token ids back into text with the [`~PreTrainedTokenizerBase.batch_decode`] method. +For inference, pass the tokenized inputs to [`~GenerationMixin.generate`] to generate text. Decode the token ids back into text with [`~PreTrainedTokenizerBase.batch_decode`]. ```py generated_ids = model.generate(**model_inputs, max_length=30) @@ -120,12 +121,12 @@ tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl") Tokenize the text and return TensorFlow tensors with the tokenizer. ```py -model_inputs = tokenizer(["Hugging Face is a"], return_tensors="tf") +model_inputs = tokenizer(["The secret to baking a good cake is "], return_tensors="tf") ``` The model is now ready for inference or training. -For inference, call the [`~GenerationMixin.generate`] API to generate text and the [`~PreTrainedTokenizerBase.batch_decode`] method to convert the token ids back into text. +For inference, pass the tokenized inputs to [`~GenerationMixin.generate`] to generate text. Decode the token ids back into text with [`~PreTrainedTokenizerBase.batch_decode`]. ```py generated_ids = model.generate(**model_inputs, max_length=30) @@ -136,21 +137,22 @@ tokenizer.batch_decode(generated_ids)[0] -For training, skip ahead to the [Trainer API](#trainer-api) section. +> [!TIP] +> Skip ahead to the [Trainer](#trainer-api) section to learn how to fine-tune a model. 
## Pipeline -The [`Pipeline`] is the most convenient way to inference with a pretrained model. It supports many tasks such as text generation, image segmentation, automatic speech recognition, document question answering, and more. +The [`Pipeline`] class is the most convenient way to inference with a pretrained model. It supports many tasks such as text generation, image segmentation, automatic speech recognition, document question answering, and more. > [!TIP] -> Check out the [Pipeline](./main_classes/pipelines) API reference for a complete list of available tasks. +> Refer to the [Pipeline](./main_classes/pipelines) API reference for a complete list of available tasks. -Create a [`Pipeline`] object and select a task. By default, the [`Pipeline`] downloads and caches a default pretrained model for a given task. To choose a specific model, pass the model name to the `model` parameter. +Create a [`Pipeline`] object and select a task. By default, [`Pipeline`] downloads and caches a default pretrained model for a given task. Pass the model name to the `model` parameter to choose a specific model. -Set `device="cuda"`, if it's available, to accelerate inference with a GPU. +Set `device="cuda"` to accelerate inference with a GPU. ```py from transformers import pipeline @@ -158,7 +160,7 @@ from transformers import pipeline pipeline = pipeline("text-generation", model="meta-llama/Llama-2-7b-hf", device="cuda") ``` -Prompt the [`Pipeline`] with some initial text to generate more text. +Prompt [`Pipeline`] with some initial text to generate more text. ```py pipeline("The secret to baking a good cake is ", max_length=50) @@ -176,7 +178,7 @@ from transformers import pipeline pipeline = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic", device="cuda") ``` -Pass an image (a URL or local path to the image) to the [`Pipeline`]. +Pass an image - a URL or local path to the image - to [`Pipeline`].
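As an illustration, the call can be as simple as the snippet below; it reuses the `pipeline` object created above, and the image URL is only an example.

```py
# Example: run the image segmentation pipeline created above on a hosted image.
segments = pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
print([segment["label"] for segment in segments])
```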
@@ -201,7 +203,7 @@ from transformers import pipeline pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device="cuda") ``` -Pass an audio file to the [`Pipeline`]. +Pass an audio file to [`Pipeline`]. ```py pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") @@ -213,9 +215,9 @@ pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") ## Trainer -The [`Trainer`] is an optimized training and evaluation loop for PyTorch models. It abstracts away a lot of the standard boilerplate usually involved in manually writing a training loop. You can start training faster and focus on training design choices. You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset. +[`Trainer`] is a complete training and evaluation loop for PyTorch models. It abstracts away a lot of the boilerplate usually involved in manually writing a training loop, so you can start training faster and focus on training design choices. You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset. -Customize the training process with the [`TrainingArguments`] class. It provides many options for training, evaluation, and more. The training process can be as complex or simple as you want or need. Experiment with training hyperparameters and features like batch size, learning rate, mixed precision, torch.compile, and more. Or if you prefer, just use the default settings to quickly produce a baseline. +Use the [`TrainingArguments`] class to customize the training process. It provides many options for training, evaluation, and more. Experiment with training hyperparameters and features like batch size, learning rate, mixed precision, torch.compile, and more to meet your training needs. You could also use the default training parameters to quickly produce a baseline. Load a model, tokenizer, and dataset for training. @@ -236,7 +238,7 @@ def tokenize_dataset(dataset): dataset = dataset.map(tokenize_dataset, batched=True) ``` -Load a data collator to create batches of data, and pass the tokenizer to it. +Load a data collator to create batches of data and pass the tokenizer to it. ```py from transformers import DataCollatorWithPadding @@ -244,7 +246,7 @@ from transformers import DataCollatorWithPadding data_collator = DataCollatorWithPadding(tokenizer=tokenizer) ``` -Next, create an instance of [`TrainingArguments`] with the training features and hyperparameters you want. +Next, set up [`TrainingArguments`] with the training features and hyperparameters. ```py from transformers import TrainingArguments @@ -259,7 +261,7 @@ training_args = TrainingArguments( ) ``` -Finally, pass all these separate components to [`Trainer`] and call the [`~Trainer.train`] method to start. +Finally, pass all these separate components to [`Trainer`] and call [`~Trainer.train`] to start. ```py from transformers import Trainer @@ -276,7 +278,7 @@ trainer = Trainer( trainer.train() ``` -Use the [`~Trainer.push_to_hub`] method to share your model and tokenizer to the Hub. +Share your model and tokenizer to the Hub with [`~Trainer.push_to_hub`]. ```py trainer.push_to_hub() @@ -287,7 +289,7 @@ Congratulations, you just trained your first model with Transformers! ### TensorFlow > [!WARNING] -> Not all pretrained models are available in TensorFlow. Check which ones are implemented in [Supported models and frameworks](./index#supported-models-and-frameworks). 
+> Not all pretrained models are available in TensorFlow. Refer to a models API doc to check whether a TensorFlow implementation is supported. [`Trainer`] doesn't work with TensorFlow models, but you can still train a Transformers model implemented in TensorFlow with [Keras](https://keras.io/). Transformers TensorFlow models are a standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model), which is compatible with Keras' [compile](https://keras.io/api/models/model_training_apis/#compile-method) and [fit](https://keras.io/api/models/model_training_apis/#fit-method) methods. @@ -327,12 +329,10 @@ model.fit(tf_dataset) ## Next steps -Great work on completing the quickstart! - -Now that you have a better understanding of the library and what it offers, it's time to keep exploring and learning what interests you the most. +Now that you have a better understanding of Transformers and what it offers, it's time to keep exploring and learning what interests you the most. -- Base classes: Learn more about the base classes, and the configuration, model and processor classes that inherit from it. This will help you understand how to create your own custom models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. -- Inference: Explore the [`Pipeline`] API further, inference with LLMs, chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. -- Training: Study the [`Trainer`] API in more detail, as well as distributed training and optimizing training on specific hardware. -- Quantization: Reduce memory and storage requirements with quantization and speed up inference by representing weights with fewer bits. -- Resources: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! +- **Base classes**: Learn more about the configuration, model and processor classes. This will help you understand how to create and customize models, preprocess different types of inputs (audio, images, multimodal), and how to share your model. +- **Inference**: Explore the [`Pipeline`] further, inference and chatting with LLMs, agents, and how to optimize inference with your machine learning framework and hardware. +- **Training**: Study the [`Trainer`] in more detail, as well as distributed training and optimizing training on specific hardware. +- **Quantization**: Reduce memory and storage requirements with quantization and speed up inference by representing weights with fewer bits. +- **Resources**: Looking for end-to-end recipes for how to train and inference with a model for a specific task? Check out the task recipes! From bf38bdccb921834c9ab96287d62c12c604cb4aea Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 13 Jan 2025 16:35:05 -0800 Subject: [PATCH 091/116] model addition timeline --- docs/source/en/add_new_model.md | 166 ++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 74 deletions(-) diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 1ef99d91bc0c..702f04db64f4 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -13,9 +13,9 @@ rendered properly in your Markdown viewer. --> -# Add a new model +# Adding a new model -Transformers is fortunate to have a passionate community of developers and researchers contributing models to the library. As an open-source first project, we're invested in empowering the community to actively add models. 
+Many of the models in Transformers are contributed by developers and researchers. As an open-source first project, we're invested in empowering the community to actively and independently add more models. When you add a model to Transformers, you'll learn: @@ -25,31 +25,16 @@ When you add a model to Transformers, you'll learn: - how to efficiently test large models - how to use Python utilities like [Black](https://black.readthedocs.io/en/stable/) and [Ruff](https://docs.astral.sh/ruff/) to create clean and readable code -It is a challenging, but also rewarding process! +It is a challenging but rewarding process. -This guide will walk you through adding an example "BrandNewBert" PyTorch model to Transformers. - -## New model addition - -Open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue to add a specific model. - -> [!TIP] -> Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub if you're open to adding any model. - -Now is a good time to get familiar with BrandNewBert. It is helpful to read a models research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading. - -- What type of model is BrandNewBert? Is it a encoder, decoder, or encoder-decoder model? -- What tasks can BrandNewBert be used for? -- What makes BrandNewBert different from other models? -- What models in Transformers are most similar to BrandNewBert? -- What tokenizer does BrandNewBert use? +This guide will walk you through adding an example BrandNewBert PyTorch model to Transformers. Before you begin, it is a good idea to familiarize yourself with the library. ## Transformers overview -Transformers is an opinionated library with its own unique philosophy and design choices. These choices help us scale Transformers while maintaining a sustainable level of maintenance. +Transformers is an opinionated library with its own unique philosophy and design choices. These choices help us sustainably scale and maintain Transformers. > [!TIP] -> Learn more about our design principles on the [Philosophy](./philosophy) page. +> Learn more about our design principles on the [Philosophy](./philosophy) doc. Some of these design choices are: @@ -65,13 +50,11 @@ This section describes how the model and configuration classes interact and the All Transformers' models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the models blueprint. - - -To keep the code readable, there is never more than two levels of abstraction for any model. The example model here, BrandNewBert, traces its inheritance from `BrandNewBertPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] because it allows a model to be loaded and saved with [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`]. +There is never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewBert, inherits from `BrandNewBertPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods. 
Other important functions like the forward method are defined in the `modeling.py` file. -Specific model heads (for example, for sequence classification or language modeling) should use the base model as a component that is called in the forward pass rather than inherting from it. This keeps abstraction low. +Specific model heads (for example, sequence classification or language modeling) should call the base model in the forward pass rather than inherting from it to keep abstraction low. New models require a configuration, for example `BrandNewBertConfig`, that is stored as an attribute of [`PreTrainedModel`]. @@ -80,15 +63,15 @@ model = BrandNewBertModel.from_pretrained("username/brand_new_bert") model.config ``` -Like [`PreTrainedModel`], [`PretrainedConfig`] provides the [`~PretrainedConfig.from_pretrained`] and [`~PretrainedConfig.save_pretrained`] methods. +[`PretrainedConfig`] provides the [`~PretrainedConfig.from_pretrained`] and [`~PretrainedConfig.save_pretrained`] methods. -When you use [`PreTrainedModel.save_pretrained`], it automatically calls [`~PretrainedConfig.save_pretrained`] so that both the model and configuration are saved together. +When you use [`PreTrainedModel.save_pretrained`], it automatically calls [`PretrainedConfig.save_pretrained`] so that both the model and configuration are saved together. A model is saved to a `model.safetensors` file and a configuration is saved to a `config.json` file. ### Code style -Transformers prefers a clean and readable code style over a more abstracted one. Some of the code style choices include: +Transformers prefers a clean and readable code over a more abstracted code style. Some of the code style choices include: - The forward pass is written in the `modeling.py` file, completely independent of other models in the library. To reuse a block from another model, copy the code and paste it with a `# Copied from` comment above it. For example, the `RobertaSelfAttention` class is copied from the `BertSelfAttention` class. @@ -97,32 +80,43 @@ Transformers prefers a clean and readable code style over a more abstracted one. class RobertaSelfAttention(nn.Module): ``` - Refer to the [Check copies](./pr_checks#check-copies) section for more information about the `# Copied from` comment. + Refer to the [Check copies](./pr_checks#check-copies) docs for more information about the `# Copied from` comment. -- The code should be accessible to users from a non-native English background. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One letter variables names are highly discouraged unless it's an index in a for loop. +- The code should be accessible to non-English users. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One letter variables names are highly discouraged unless it's an index in a for loop. -- Explicit code is preferred over shorter code even if it's longer. +- Explicit code is preferred - even if it's longer - over shorter code. - Avoid subclassing [nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html). Subclass [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) instead so the code can be quickly debugged with print statements or breakpoints. -- Function signatures should be type-annotated. Otherwise, good variable names are preferred because they're more readable and understandable. +- Function signatures should be type-annotated. 
Otherwise, use good variable names so they're more understandable.

-## Add a new model
+## New model addition issue

-With some background knowledge about your model and the Transformers library, you're ready to add BrandNewBert now!
+Open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue to add a specific model.

> [!TIP]
-> Each contributor has a unique style and workflow for porting models to Transformers. It may be helpful to take a look at how [GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) and [WMT19](https://huggingface.co/blog/porting-fsmt) were ported.
+> Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub to view and add any existing model requests.

-Some final tips to keep in mind are:
+Now is a good time to get familiar with BrandNewBert. It is helpful to read a model's research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading.
+
+- What type of model is BrandNewBert? Is it an encoder, decoder, or encoder-decoder model?
+- What tasks can BrandNewBert be used for?
+- What makes BrandNewBert different from other models?
+- What models in Transformers are most similar to BrandNewBert?
+- What tokenizer does BrandNewBert use?
+
+In addition to learning more about your model, use the tips below to help you add a model faster.
+
+> [!TIP]
+> Each contributor has a unique style and workflow for porting models to Transformers. It may be helpful to take a look at how [GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) and [WMT19](https://huggingface.co/blog/porting-fsmt) were ported.

- Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. [Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this.
-- This is an engineering challenge more than a scientific one. Focus on the more practical aspects (set up an efficient debugging environment for example) instead of theoretical ones.
+- This is more of an engineering than a science challenge. Focus on the more practical aspects (setting up an efficient debugging environment, for example) instead of the theoretical aspects of the model.
- Don't be shy to ask for help! We are here to support you. 🤗

-### Dev environment
+## Dev environment

-Click on the **Fork** button on the [Transformers](https://github.com/huggingface/transformers) repository to create your own copy of it to work on. Then clone the repository to your local disk and add the base repository as the remote.
+Click on the **Fork** button on the [Transformers](https://github.com/huggingface/transformers) repository to create your own copy to work on. Clone the repository to your local disk and add the base repository as the remote.

```bash
git clone https://github.com/[your Github handle]/transformers.git
@@ -130,7 +124,7 @@ cd transformers
git remote add upstream https://github.com/huggingface/transformers.git
```

-Create a virtual environment and do an [editable install](./installation#editable-install) of the library with the "dev" or development dependencies. 
+Create a virtual environment and perform an [editable install](./installation#editable-install) of the library with the "dev" or development dependencies. ```bash python -m venv .env @@ -138,7 +132,7 @@ source .env/bin/activate pip install -e ".[dev]" ``` -Due to the number of optional dependencies as Transformers grows, this command may fail. In that case, install the "quality" dependencies. Also make sure you have a deep learning framework installed. +Due to the number of optional dependencies as Transformers grows, this command may fail. In this case, install the "quality" dependencies. Also make sure you have a deep learning framework installed. ```bash pip install -e ".[quality]" @@ -163,9 +157,9 @@ There are two possible debugging environments for running the original model, a > [!WARNING] > We don't recommend setting up a GPU environment to run the original model because it can be expensive. Instead, work in a CPU environment first to verify the model works in Transformers. Once it does, then you can verify it on a GPU. -Notebooks are great for executing code cell-by-cell which can help split logical components from one another. It can also accelerate debugging cycles because intermediate results can be stored. Notebooks can also be shared when working with other contributors. +Notebooks are great for executing code cell-by-cell which can help split logical components from one another. It can also accelerate debugging cycles because intermediate results can be stored. You can also share notebooks when working with other contributors. -The downside of notebooks is that if you aren't used to them, it may take some time to get used to. +The downside is that if you aren't used to them, it may take some time to get used to. > [!TIP] > If the model architecture is identical to an existing model, skip ahead to add a [conversion script](#conversion-script), because you can reuse the architecture of the existing model. @@ -176,9 +170,9 @@ Run the command below to start and complete the questionnaire with some basic in transformers-cli add-new-model-like ``` -### Create a pull request +## Create a pull request -Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request "[WIP] Add BrandNewBert" so it's clear that this is a work in progress. +Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request **[WIP] Add BrandNewBert** so it's clear that this is a work in progress. Create a branch with a descriptive name from your main branch. @@ -201,7 +195,7 @@ Push any changes to your branch and click on **Compare & pull request** to open git push -u origin a-descriptive-name-for-my-changes ``` -Include relevant Hugging Face team members GitHub handles in the pull request for questions, feedback, comments, and reviews. Direct team members to specific parts of the code you want by clicking on the **Files changed** tab, and then clicking on **+** to the left of the line number to add a comment. When a question or problem is solved, click on **Resolve** to indicate the issue is resolved. This keeps the conversation organized and clean. +Include relevant Hugging Face team members by adding their GitHub handles in the pull request for questions, feedback, comments, and reviews. 
Direct team members to specific parts of the code you want by clicking on the **Files changed** tab, and then clicking on **+** to the left of the line number to add a comment. When a question or problem is solved, click on **Resolve** to indicate the issue is resolved. This keeps the conversation organized and clean. Remember to periodically commit and push your work, and update your work with the current main branch. @@ -210,9 +204,9 @@ git fetch upstream git merge upstream/main ``` -### Run original checkpoint +## Original checkpoint -Before you start working on your model implementation, you should work on the original model implementation first to understand how it works. +Take some time to work on the original model implementation first to understand how it works. This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone! @@ -235,7 +229,9 @@ input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids original_output = model.predict(input_ids) ``` -If you run into issues, you'll need to choose one of the following debugging decomposition strategies depending on the original models codebase. +### Debugging + +If you run into issues, you'll need to choose one of the following debugging strategies depending on the original models codebase. @@ -300,23 +296,25 @@ Here are some tips for an efficient debugging environment. Once you're able to run the original checkpoint, you're ready to start adapting the model code for Transformers. -### Adapt the model code +## Adapt the model code The `transformers-cli add-new-model-like` command should have generated a model and configuration file. - `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` - `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py` -The automatically generated code in the `modeling.py` file will have the same architecture as BERT if you answered it's an encoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on. +The automatically generated code in the `modeling.py` file has the same architecture as BERT if you answered it's an encoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on. -At this point, your code doesn't have to be clean or even fully correct! It is more efficiently to quickly create a first draft and then iteratively improve on it. The only thing that matters is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the the `__init__` method works. +### Model initialization + +At this point, your code doesn't have to be clean or even fully correct, It is more efficient to quickly create a first draft and then iteratively improve on it. 
The most important thing is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the `__init__` method works. ```py from transformers import BrandNewBert, BrandNewBertConfig model = BrandNewBert(BrandNewBertConfig()) ``` -Random initialization occurs in BrandNewBertPreTrainedModel's `_init_weights` method. All leaf modules are initialized depending on the configuration's variables. +Random initialization occurs in the `_init_weights` method of `BrandNewBertPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables. ```py def _init_weights(self, module): @@ -352,7 +350,7 @@ def _init_weights(self, module): module.bias.data.zero_() ``` -### Conversion script +### Convert checkpoints to Transformers The original checkpoint must be converted to a Transformers compatible checkpoint. @@ -448,7 +446,7 @@ assert ( logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") ``` -When the shape or name don't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because BrandNewBerts parameters don't exactly match the original models parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first. +When the shape or name don't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because the `BrandNewBert` parameters don't exactly match the original models parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first. ### Implement the forward pass @@ -460,7 +458,7 @@ input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] output = model(input_ids).last_hidden_states ``` -Don't be discouraged if your forward pass isn't identical with the output from the original model or if it returns an error! Check that the forward pass doesn't throw any errors. This is often because the dimensions are wrong (dimensionality mismatch) or because the wrong data type is used ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)). +Don't be discouraged if your forward pass isn't identical with the output from the original model or if it returns an error. Check that the forward pass doesn't throw any errors. This is often because the dimensions are wrong or because the wrong data type is used ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)). Your output should have a precision of *1e-3*. Ensure the output shapes and output values are identical. Common reasons for why the outputs aren't identical include: @@ -490,7 +488,7 @@ And if you're stuck or struggling with this step, don't hesitate to ask for help ### Add model tests -While the model works, you still need to add tests to ensure it is compatible with Transformers and all the tests pass. Tests are important because they help users understand your work by looking at specific tests, and because they prevent your model from breaking in the future if any changes are made. +While the model works, you still need to add tests to ensure it is compatible with Transformers. 
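The core property these tests protect is numerical equivalence with the original implementation. A minimal sketch of that check is shown below; every class name, path, and tensor here is a placeholder for this hypothetical model, and `original_output` is assumed to be saved from the debugging script run in the original repository.

```py
# Sketch of an output-equivalence check (placeholder names and values).
# `original_output` is assumed to come from the original repository's forward pass.
import torch
from transformers import BrandNewBertModel

model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
model.eval()

input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
with torch.no_grad():
    output = model(input_ids).last_hidden_state

# 1e-3 is the precision target mentioned earlier in this guide
torch.testing.assert_close(output, original_output, rtol=1e-3, atol=1e-3)
```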
Tests are important because they help users understand your work by looking at specific tests, and because they prevent your model from breaking in the future if any changes are made. [Cookiecutter](https://cookiecutter.readthedocs.io/en/stable/) should have added a test file for your model. Run the test file below to make sure all common tests pass. @@ -498,7 +496,7 @@ While the model works, you still need to add tests to ensure it is compatible wi pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py ``` -The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewBertModelIntegrationTests`, was added by Cookiecutter and just needs to be filled out. To ensure it passes, run the following command. +The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewBertModelIntegrationTests`, was added by Cookiecutter and should be filled out. To ensure it passes, run the following command. @@ -519,14 +517,14 @@ SET RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_be All features unique to BrandNewBert should be tested in a separate test under `BrandNewBertModelTester/BrandNewBertModelTest`. This test is often overlooked, but it is extremely important because: -- it helps transfer knowledge you acquired during the process to the community by showing how the novel features of the new model works +- it helps transfer knowledge you acquired during the process to the community by showing how the models novel features work - future contributors can quickly test changes to the model by running these special tests -### Implement tokenizer +## Implement tokenizer With the model out of the way, time to focus on the tokenizer. The tokenizer should be identical or very similar to an existing tokenizer in Transformers. -Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to this. +Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to the code below. ```py input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." @@ -534,7 +532,7 @@ model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") input_ids = model.tokenize(input_str) ``` -You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look something like this. +You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look similar to the following. ```py from transformers import BrandNewBertTokenizer @@ -546,25 +544,25 @@ input_ids = tokenizer(input_str).input_ids When both implementations have the same `input_ids`, add a tokenizer test file. This file is analogous to the modeling test files. 
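Before writing the test file, a direct comparison is a quick sanity check. In the sketch below, `original_input_ids` stands in for the ids produced by the script in the original repository, and `input_ids` for the ids from the Transformers tokenizer shown above.

```py
# Sketch: both variables come from the two scripts shown above (placeholder names)
assert input_ids == original_input_ids, "Tokenizer outputs diverge from the original implementation"
```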
The tokenizer test files should contain a couple of hardcoded integration tests. -### Run integration tests +## Integration tests -Now that you have a model and tokenizer, add end-to-end integration tests using both the model and tokenizer to `tests/models/brand_new_bert/test_modeling_brand-new_bert.py`. +Now that you have a model and tokenizer, add end-to-end integration tests for the model and tokenizer to `tests/models/brand_new_bert/test_modeling_brand-new_bert.py`. The test should provide a meaningful text-to-text example to show the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair. -If the checkpoint hasn't been finetuned on a downstream task, then the model tests will suffice. +If the checkpoint hasn't been fine-tuned on a downstream task, then the model tests are sufficient. -Finally, try to make sure your tests can run on a GPU by adding `.to(self.device)` statements to the models internal tensors. Don't worry if you don't have access to a GPU, we can take care of that for you if that's the case. +Finally, try to make sure your tests can run on a GPU by adding `.to(self.device)` statements to the models internal tensors. If you don't have access to a GPU, we can take care of that for you. -### Add documentation +## Add documentation Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. Cookiecutter added a template file, `docs/source/model_doc/brand_new_bert.md`, that you can fill out with information about your model. This is generally a user's first interaction with a model, so the documentation should be clear and concise. It is often very useful to add examples of how the model should be used. -Make sure docstrings are added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert/py` and includes all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings. +Make sure docstrings are added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and includes all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings. -### Refactor +## Refactor Time to tidy things up and make sure the code style is consistent with the rest of the library. Run the following command to automatically fix incorrect styles. @@ -582,12 +580,12 @@ There may be other failing tests or checks (missing docstring or incorrect namin After ensuring the code runs correctly, you may want to refactor it to make it more readable or cleaner. -### Upload to the Hub +## Upload to the Hub Convert and upload all checkpoints to the [Hub](https://hf.co/models). Add a model card to provide more transparency and context about the model. The model card should highlight specific characteristics of a checkpoint, how the model was trained, and code examples of how to use it. > [!TIP] -> In many cases, adding an interactive notebook users can run is a great way to showcase how to use the model for inference or finetune it on a downstream task. While not mandatory, including a notebook can drive greater adoption of your model. 
+> In many cases, adding an interactive notebook users can run is a great way to showcase how to use the model for inference or fine-tune it on a downstream task. While not required, including a notebook can drive greater adoption of your model. You should also consult with the Transformers team to decide on an appropriate name for the model, and getting the required access rights to upload the model. @@ -597,12 +595,32 @@ Use the [`~PreTrainedModel.push_to_hub`] method to upload the model. brand_new_bert.push_to_hub("brand_new_bert") ``` -Refer to the [Share](./model_sharing) guide for more information about uploading models to the Hub. +Refer to the [Sharing](./model_sharing) guide for more information about uploading models to the Hub. -### Merge your model +## Merge your model You're finally ready to merge your pull request and officially add the model to Transformers! Make sure all the tests are passing and all comments and feedback have been addressed. Congratulations on adding a new model to Transformers! 🥳 -This is a very significant contribution. Your work here makes Transformers more accessible to developers and researchers around the world. You should be proud of your contribution and share your accomplishment with the community! \ No newline at end of file +This is a very significant contribution. Your work makes Transformers more accessible to developers and researchers around the world. You should be proud of your contribution and share your accomplishment with the community! + +## Model addition timeline + +There are four timelines for model additions depending on the model contributor and community demand for an architecture. + +- **day-0 integration**: If you plan on having a Transformers-first release, this is a great option because we can ensure the documentation is clear and optimize your model as much as possible (quantization, FlashAttention, KV-cache, etc.). We can also help you add the model, provide early reviews and make sure it works as expected. + + Reach out to transformers@huggingface.co a few days (preferably weeks) in advance, especially if an architecture is particularly novel, to ensure model integration. We'll work together on a private fork of Transformers until your checkpoint and release is ready. + +- **same week integration**: Models with significant requests/demand are usually added the same week if the model author doesn't reach out. + + Use the [issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml) to request a specific model to add. The more activity on the issue, the faster and more likely we'll integrate it. + +- **post-release integration**: Models without popular requests/demand or if we don't have the bandwidth to integrate it are added post-release. + + This is a good opportunity if you're interested in contributing a model to Transformers. Take a look at open issues tagged with ["New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22). Feel free to give the most requested models a try first to multiply the impact of your contribution. We'll be there to help you each step of the way! + +- **Hub-first release**: Transformers [remote-code](./models#custom-models) feature allows Transformers-based projects to be shared directly on the Hub. This is a good option if you don't have the bandwidth to add a model directly to Transformers. 
+ + If a model ends up being very popular, then it's very likely that we'll integrate it in Transformers ourselves to enable better support (documentation, maintenance, optimization, etc.) for it. A Hub-first release is the most frictionless way to add a model. \ No newline at end of file From 67eeed6d7f009f794c51fc3f94d3ca0de4901f20 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 14 Jan 2025 17:18:45 -0800 Subject: [PATCH 092/116] modular --- docs/source/en/_toctree.yml | 2 +- docs/source/en/modular_transformers.md | 617 +++---------------------- 2 files changed, 64 insertions(+), 555 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f6e3b34e8e51..b6579514a95b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -22,7 +22,7 @@ - local: add_new_model title: Adding a new model - local: modular_transformers - title: Modular transformers + title: Modular Transformers - local: task_summary title: What 🤗 Transformers can do - local: tasks_explained diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index dca1282bcf99..ca7bbf9376f3 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -1,88 +1,55 @@ -# Modular transformers +# Modular Transformers -`transformers` is an opinionated framework; our philosophy is defined in the following [conceptual guide](./philosophy). +Modular Transformers lowers the bar for contributing models and significantly reduces the code required to add a model by allowing imports and inheritance. -The core of that philosophy is exemplified by the [single model, single file](https://huggingface.co/blog/transformers-design-philosophy) -aspect of the library. This component's downside is that it limits the inheritance and importability of components from -files to others in the toolkit. +One of Transformers' core design feature is the [single model, single file](https://huggingface.co/blog/transformers-design-philosophy) policy. Model components - such as attention layers - are repeated across many files and any independent implementations tend to diverge as fixes and changes are applied to specific parts of the code. -As a result, model components tend to be repeated across many files. There are as many attention layers defined -in `transformers` as there are models, and a significant number of those are identical to each other. -The unfortunate consequence is that independent implementations tend to diverge as fixes and changes get applied -to specific parts of the code. +The [`# Copied from`](./pr_checks#check-copies) statements prevents the code from diverging, and it is enforced by our continuous integration tests and local commands. The downside is that this approach is tedious and adds significantly more lines of code, most of which is boilerplate. -In order to balance this issue, we introduced the concept of "copies" across the library. By adding a comment indicating -that code is a copy of another, we can enforce through CI and local commands that copies do not diverge. However, -while the complexity is low, this is often quite tedious to do. +## Motivation -And, finally, this contributes to adding a significant overhead to contributing models which we would like to remove. -This approach often requires model contributions to add modeling code (~1k lines), processor (~500 lines), tests, docs, -etc. Model contribution PRs rarely add less than 3-5k lines of code, with much of this code being boilerplate. 
+Modular Transformers addresses these issues by adding a *modular* file to a model folder. The modular file can import code from other models and inherit code from other classes unlike traditional modeling and processing files. -This raises the bar for contributions, and with Modular Transformers, we're aiming to lower the bar to a much more -acceptable point. +> [!TIP] +> Modular Transformers isn't meant to replace the modeling code, and if your model isn't based on an existing model, you'll need to add a `modeling.py` file manually. -If you plan to add a model to `transformers` make sure you read [How to add a model to 🤗 Transformers?](https://huggingface.co/docs/transformers/add_new_model). -For any kind of contributions, see [CONTRIBUTING.md](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md). +A modular file contains model, processor, and configuration class code that would otherwise be in separate files under the single model, single file policy. -## What is it? +Model users still import and use the single-file interface they've grown familiar with. In doing so, we hope to enable simpler contributions while sticking to our philosophy. -Modular Transformers introduces the concept of a "modular" file to a model folder. This modular file accepts code -that isn't typically accepted in modeling/processing files, as it allows importing from neighbouring models as well -as inheritance from classes to others. +## Create a modeling.py file -This modular file defines models, processors, and the configuration class that would otherwise be defined in their -respective modules. +A linter "unravels" the modular file into a `modeling.py` file to preserve the single model, single file directory structure (modeling, processor, etc.). Inheritance is flattened to only a **single** level. -Finally, this feature introduces a new `linter` which will "unravel" the modular file into the "single model, single -file" directory structure. These files will get auto-generated every time the script is run; reducing the required -contributions to the modular file, and therefore only to the changes between the contributed model and others. - -Model users will end up importing and using the single-file interface, so no change is expected here. Doing this, we -hope to combine the best of both worlds: enabling simple contributions while sticking to our philosophy. - -This is therefore a replacement for the `# Copied from` markers, and previously contributed models can be expected to -be moved to the new Modular Transformers format in the coming months. - -### Details - -To generate a single file from the modular file, run the following command. +Run the command below to automatically generate a `modeling.py` file from a modular file. ```bash python utils/modular_model_converter.py --files-to-parse src/transformers/models//modular_.py ``` -The "linter", which unravels the inheritance and creates all single-files from the modular file, will flatten the -inheritance while trying to be invisible to Python users. At this time, the linter flattens a **single** level of -inheritance. - For example: -- If a configuration class inherits from another and adds/deletes an argument, the generated file will either directly - reference it (in case of addition) or completely remove it (in case of deletion). -- If a class inherits from another, for example: `class GemmaModel(LlamaModel):`, dependencies are automatically - inferred. All submodules will be automatically added from the superclass. 
-- If you define new functions in the `modular` and use them inside classes, the linter will automatically infer the -You should be able to write everything (the tokenizer, the image processor, the model, the config) in this `modular` -file, and the corresponding files will be created for you. +- If a configuration class inherits from another class, but adds and deletes an argument, the generated file directly references it if an argument is added or completely removes it if an argument is deleted. +- If a class inherits from another, like `GemmaModel(LlamaModel)`, the dependencies are automatically inferred. All submodules are also automatically inferred from the superclass. +- If a new function is defined in the modular file and used inside classes, the linter automatically infers these as well. -### Enforcement +You should be able to write everything (tokenizer, image processor, model, config, etc.) in a modular and their corresponding single-files are generated. -Run the command below to ensure the generated content matches `modular_.py` +Run the command below to ensure the generated content matches `modular_.py`. ```bash python utils/check_modular_conversion.py --files src/transformers/models//modular_.py ``` -### Examples +The example below demonstrates how a model can be added with significantly fewer lines of code with Modular Transformers. -Here is a quick example with BERT and RoBERTa. The two models are intimately related: their modeling implementation -differs solely by a change in the embedding layer. +### BERT and RoBERTa -Instead of redefining the model entirely, here is what the `modular_roberta.py` file looks like for the modeling & -configuration classes (for the sake of the example, the tokenizer is ignored at this time as very different). +BERT and RoBERTa, two very similar models, differ solely in how the embedding layer is implemented. -```python +Instead of redefining the model entirely, consider the `modular_roberta.py` file shown below for the modeling and configuration classes (the tokenizer isn't shown in this example). + +```py from torch import nn from ..bert.configuration_bert import BertConfig from ..bert.modeling_bert import ( @@ -91,11 +58,11 @@ from ..bert.modeling_bert import ( BertForMaskedLM ) -# The RoBERTa config is identical to BERT's config +# RoBERTa and BERT config is identical class RobertaConfig(BertConfig): model_type = 'roberta' -# We redefine the embeddings here to highlight the padding ID difference, and we redefine the position embeddings +# Redefine the embeddings to highlight the padding id difference, and redefine the position embeddings class RobertaEmbeddings(BertEmbeddings): def __init__(self, config): super().__init__(config()) @@ -105,379 +72,51 @@ class RobertaEmbeddings(BertEmbeddings): config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx ) -# The RoBERTa model is identical to the BERT model, except for the embedding layer. 
-# We redefine the embeddings above, so here there is no need to do additional work +# RoBERTa and BERT model is identical except for the embedding layer, which is defined above, so no need for additional changes here class RobertaModel(BertModel): def __init__(self, config): super().__init__(config) self.embeddings = RobertaEmbeddings(config) -# The heads now only need to redefine the model inside to the correct `RobertaModel` +# The model heads now only need to redefine the model inside to `RobertaModel` class RobertaForMaskedLM(BertForMaskedLM): def __init__(self, config): super().__init__(config) self.model = RobertaModel(config) ``` -## What it is not - -It is not a replacement for the modeling code (yet?), and if your model is not based on anything else that ever existed, then you can add a `modeling` file as usual. Similarly, if you cannot easily inherit your `configuration` (or `tokenization` or `processing`) file from another model's similar file, you can add that filetype directly (even though defining it in the modular file would work, it would clutter it). - - -## Real world example breakdown - -As explained, modular allows you to use regular Python inheritance from any other model's code in the library, in order to define your own. For this reason, it will work better/be easier if you first browse the library a bit to find models close to yours, in order to inherit from them. For example, are you using a sliding window in the `Attention` class? Then start by checking models that are well known to use it, e.g. `Mistral`, or `Qwen2`! Are you using interleaved `RotaryEmbedding` modules? Check out `Cohere`, `Cohere2` and `Glm` models! Otherwise a very strong starting point is to check out `Llama`. And if you are doing a bit of all of that at once, then you can mix and match! - -Here are some common properties that your model might be using, and corresponding modeling files to check as an example: -- Mixture of expert: `SwitchTransformers` or `Mixtral` -- Interleaved (and/or partial) rotary embedding: `Glm`, `Phi` -- State space models: - - Hybrid with attention: `Jamba` , `Bamba`, `Zamba` - - Mamba2: `Mamba2` -- Recurrent hidden states: `Gemma2` -- Different sliding window attention/full attention patterns per layer: `Gemma2`, `Cohere2` -- Clipping of QKV: `Olmo` -- Normalization of QK: `Olmo2`, `Cohere` -- Fused QKV (not recommended): `Phi3` - -At Hugging Face, we feel that learning by example is usually (one of) the best way, so we will now go over a typical modular file, and the different features our linter provides (and its limitations)! 🤗 Let's use a real world example with Olmo2 model, which I feel provides a very good illustration of the modular mechanisms. The original file can be found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modular_olmo2.py). For simplicity, we will go over it class by class, and repeat the modular's definition of ech class. For reference, the modeling and configuration of Olmo (v1) on which we will inherit a lot can be found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo/modeling_olmo.py) and [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo/configuration_olmo.py) respectively. The final modeling of Olmo2 (generated by running our linter on the modular we will describe below) can be found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py) - -Let's break it down! 
- - -### Config class - -Here is the `Config` definition in modular: - -```py -from ..olmo.configuration_olmo import OlmoConfig - -class Olmo2Config(OlmoConfig): - r""" - This is the configuration class to store the configuration of a [`Olmo2Model`]. - """ - - def __init__( - self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - rms_norm_eps=1e-5, - **kwargs, - ): - super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - use_cache=use_cache, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - **kwargs, - ) - - self.rms_norm_eps = rms_norm_eps - del self.clip_qkv -``` - -Here, we correctly identified that the `Config` in Olmo2 is similar to Olmo's, up to a few details: -1. The default value of most arguments has changed -2. we have a new argument, `rms_norm_eps` -3. the argument `clip_qkv` is not used anymore - -To solve points 1. and 2., simply overwriting the `__init__` function with the new default arguments and adding the new one is enough, as you would expect when you want to overwrite a method in Python! Of course you also need to assign the new attribute `rms_norm_eps` to `self` in the `__init__`'s body. -For point 3., we use the special syntax `del self.clip_qkv`, which, has you can expect, removed the assignment of this attribute in the unravelled code (after the conversion with the linter). - -Now, there is a subtility here: as you can see, we used `super().__init__(...)`. Usually, in Python, it is simply used to call the parent's `__init__`. In modular terms, however, it has a _slightly_ different meaning. When we find a call such as `super().my_function(...)` in the modular file, the linter will take the body of the `my_function` function in the parent, and unravel it where the call to `super().my_function(...)` occured. Then, the `del self.clip_qkv` statement will remove the reference to `self.clip_qkv` from the unravelled body. Thus `del self.xxx` can only work in pair with `super().my_function(...)`, and should always be placed after it (but you can add whatever you want _before_ calling `super()`, and it will be placed, as you can expect, before the parent's body). - -### Norm class - -Here is the `Norm` class: - -```py -from ..llama.modeling_llama import LlamaRMSNorm - -class Olmo2RMSNorm(LlamaRMSNorm): - pass -``` - -What to say here, it is pretty explicit isn't it? We do not modify anything from the `LlamaRMSNorm` definition. Thus the linter will unravel exactly the content of the parent (`LlamaRMSNorm`). Only change will be that every reference to "llama" on the docstrings, type hints, and comments (basically everywhere) will be changed to references to "olmo2" for consistency! 
- -### Attention class - -Here is the `Attention` class: - -```py -from ..llama.modeling_llama import eager_attention_forward -from ..olmo.modeling_olmo import OlmoAttention, apply_rotary_pos_emb - - -# Olmo2 attention is identical to OLMo attention except: -# - Norm is applied to attention queries and keys. -# - No qkv clipping. -class Olmo2Attention(OlmoAttention): - def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None): - super().__init__(config, layer_idx=layer_idx) - self.q_norm = Olmo2RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps) - self.k_norm = Olmo2RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: Tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], - past_key_value: Optional[Cache] = None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - - query_states = self.q_norm(self.q_proj(hidden_states)) - key_states = self.k_norm(self.k_proj(hidden_states)) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(hidden_shape).transpose(1, 2) - key_states = key_states.view(hidden_shape).transpose(1, 2) - value_states = value_states.view(hidden_shape).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' - ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - **kwargs, - ) - - attn_output = attn_output.reshape(*input_shape, -1).contiguous() - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights -``` - -Now, what's happening here? In the `__init__`, we call `super().__init__(...)`, thus copying the parent's definition, then add 2 new layers of the `Olmo2RMSNorm` we just added previously. Indeed, those were not present in the original `Olmo` (v1) model. So, now, we also have to overwrite the `forward` method to use these 2 new layers right? Indeed, if you check carefully, the definition of `forward` is identical to `Olmo`'s, but we added a pass with the norm layers just before projecting with `q_proj` and `k_proj`. However, to help us, we directly imported the functions `eager_attention_forward` from llama, and `apply_rotary_pos_emb` from olmo. 
The linter will then automatically add these imported functions in the final `modeling_olmo2.py` file, by copying their definitions from the source (imported) files. And it will even add the `rotate_half` and `repeat_kv` functions (which are used inside `apply_rotary_pos_embed` and `eager_attention_forward` respectively) by figuring out the dependency automatically. Neat, right? -Note that we had to redefine this class, because we did not find any model defining the `Attention` layer with the added `RMSNorm` layer anywhere else in the library! Otherwise, we would have simply inherited from this model instead as we did for the `RMSNorm`! - -### The DecoderLayer class - -Here is the `DecoderLayer` class: - -```py -from ..olmo.modeling_olmo import OlmoDecoderLayer - -# The OLMo2 layers are identical to those of the OLMo model except: -# - RMSNorm is used instead of standard layer norm. -# - Norm is applied after attention/feedforward rather than before. -class Olmo2DecoderLayer(OlmoDecoderLayer): - def __init__(self, config: Olmo2Config, layer_idx: int): - super().__init__(config, layer_idx=layer_idx) - self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.self_attn = Olmo2Attention(config=config, layer_idx=layer_idx) - del self.input_layernorm - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - residual = hidden_states - - # Self Attention - hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - **kwargs, - ) - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.mlp(hidden_states) - hidden_states = self.post_feedforward_layernorm(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - if output_attentions: - outputs += (self_attn_weights,) - - return outputs -``` - -At this point, you should start to pick up what is happening for this class. We switched the type of norm in the `__init__` by overwriting `self.post_attention_layernorm` after the call to `super().__init__(...)`, thus going from a `LayerNorm` in the parent class, to our `RMSNorm` in this class. Then we simply deleted the `self.input_layernorm` attribute, and replaced it by `self.post_feedforward_layernorm`, because the name was not making sense anymore as we apply it after in `Olmo2` instead of before in `Olmo`. For this reason, we also need to overwrite the `forward` method, to reflect the logic change. 
- -Note however that if we had only switched `self.post_attention_layernorm` and `self.input_layernorm` from `LayerNorm`s to `RMSNorm`s (without the name and logic change of `elf.input_layernorm`), we would not have had to redefine the `forward` method! +If you don't use the defined dependency, you'll receive the following error. -### The Model class - -```py -from ..olmo.modeling_olmo import OlmoModel - -# The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of -# standard layer norm for the output norm. -class Olmo2Model(OlmoModel): - def __init__(self, config: Olmo2Config): - super().__init__(config) - self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.layers = nn.ModuleList( - [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) ``` - -Here, this is exactly what I was pointing out before: we simply change the _type_ of the `self.norm` attribute (going from `LayerNorn` in `Olmo` to `RMSNorm` in `Olmo2`). Since this change does not reflect the logic of the `forward` method (the name of the layer and where it is used is identical to the parent's), then we do not even need to overwrite it! It will be unravelled automatically! Note that we redefined `self.layers` for the sake of being explicit, but this is not even strictly required here as the definition is similar to what is found in `Olmo` (v1). - -### Finally... The ForCausalLM class - -Finally, here is the definition of the `ForCausalLM`: - -```py -from ..olmo.modeling_olmo import OlmoForCausalLM - -class Olmo2ForCausalLM(OlmoForCausalLM): - pass +ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used when you define `BertModel`, as it is one of it's direct dependencies. Make sure you use it in the `__init__` function. ``` -As for the `RMSNorm`, it is exactly similar to the parent's in logic, so we do not have anything to do, the linter will all figure it out by itself. Almost disappointing, no? - +## Removing attributes and functions - -### But... What about the MLP, RotaryEmbedding and PreTrainedModel classes? - -Indeed, if you inspect the file [modeling_olmo2.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py) which is created by running the linter on `modular_olmo2.py`, you will notice that it also creates `Olmo2MLP`, `Olmo2RotaryEmbedding`, and `Olmo2PreTrainedModel` classes, that we did not define explicitly in `modular_olmo2.py`. - -Well, it is one of the main feature of our modular linter. Similarly to how some functions were added automatically with the `Attention` class (without directly importing them), classes that are a dependency of one of the class inherited class and which are not explicitly defined in the modular file, will be added automatically as part of the dependeny tracing. For example, in `OlmoDecoderLayer`, there is an attribute defined as `self.mlp = OlmoMLP(config)`. Because we never explicitly redefined a class named `Olmo2MLP` in `modular_olmo2.py`, the linter automatically created a class `Olmo2MLP`, similar to `OlmoMLP`. This is exactly the same as if we had done: +Use `del` to remove attributes that aren't used in your model or if you don't want to include it in the unravelled `modeling.py` file. The example [`GemmaModel`] below removes the `embed_tokens` from the original [`LlamaModel`] it inherits from. 
```py -from ..olmo.modeling_olmo import OlmoMLP - -class Olmo2MLP(OlmoMLP): - pass +class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): + def __init__(self, config): | def __init__(self, config): + super().__init__(self, eos_token) | super().__init__(config) + del self.embed_tokens | self.padding_idx = config.pad_token_id + | self.vocab_size = config.vocab_size + | + | self.layers = nn.ModuleList( + | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + | ) + | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + | self.rotary_emb = LlamaRotaryEmbedding(config=config) + | self.gradient_checkpointing = False + | + | # Initialize weights and apply final processing + | self.post_init() ``` -but we did not even bother, because we _know_ this class is supposed to be exactly similar, and we never needed it anywhere else in the `modular_olmo2.py` file. In contrast, the class `Olmo2RMSNorm` was needed to (re)define the norms both in the `Attention` and `DecoderLayer` classes. The same logic is true for the `Olmo2PreTrainedModel` and `Olmo2RotaryEmbedding` classes. - -Note however that if not redefined, classes will be copied from the file in which an inherited module uses them first. So if you wanted e.g. `Olmo2MLP` to inherit from, say, `MistralMLP` instead of `OlmoMLP` (here it was `OlmoMLP` because it was first implicitly used in `Olmo2DecoderLayer`, which inherited from `OlmoDecoderLayer`), you would need to be explicit and do: +Remove a function by writing it with a `raise AttributeError("")` to mimic the behavior you actually want when you remove a parent function in Python. ```py -# switch to mistral definition -from ..mistral.modeling_mistral import MistralMLP - -class Olmo2MLP(MistralMLP): - pass -``` - -## Advanced usage - -Now that you should have a good grasp of how modular works, let's see some more advanced use cases and features you can use. - -### Removing attributes which are not just assignments - -As we have seen before, after using `super().__init__()`, we can use `del self.attribute` to remove a specific attribute which was defined in the parent. What if this attribute was used elsewhere though? Meaning it was not just "defined to be stored" as in the config for example. For example, consider the following case: - -```py -class DummyModel(nn.Module): - - def __init__(self, config: DummyConfig): - super().__init__() - self.attribute = config.attribute - if self.attribute: - # do more stuff with `self.attribute` here - ... -``` - -Then inheriting from this `DummyModel` and doing - -```py -class MyNewDummyModel(DummyModel): - - def __init__(self, config: MyNewDummyConfig): - super().__init__(config) - del self.attribute -``` - -is not supported, because it will only suppress the assignment, i.e. the line `self.attribute = config.attribute` will disappear, but the `if` statement will stay and reference the attribute. We tried to make it work by suppressing every mentions of the attribute, however it it not a sound solution in the general case (it can lead to very surprising effects and remove other important parts) and is therefore not possible. - -But what if I still want to inherit from `DummyModel`? How to properly do it? How to use `super().__init__()` without copy/pasting the parent then? 
This brings us to the next point: - -### Avoiding super() special meaning - -Say you still want to inherit from `DummyModel` (because it is convenient for some other methods) but you do want to remove the `self.attribute`. How to properly override the `__init__` method, while calling `super()` but without unravelling the parent's code? Well, then be explicit about which class `super()`'s you are calling! If we want to call the `nn.Module`'s `super()` for example, we can do the following (unravelled code on the right): - -```py -class MyNewDummyModel(DummyModel, nn.Module): | class MyNewDummyModel(nn.Module): - | - def __init__(self, config: MyNewDummyConfig): | def __init__(self, config: MyNewDummyConfig): - nn.Module.__init__(config) | super().__init__() - self.foo = config.foo | self.foo = config.foo - ... | ... -``` - -### Deleting unused methods - -Removing a class method is pretty similar to remove an attribute, you just need to overwrite it with a `raise AttributeError("")` to mimick the behaviour you actually want when you remove a parent function in python. For example, the following will remove the methods in the unravelled code: - -```python class GemmaTokenizer(LlamaTokenizer): ... @@ -488,11 +127,11 @@ class GemmaTokenizer(LlamaTokenizer): raise AttributeError("Not needed for Gemma") ``` -### Define new functions +## Define new functions -Of course, if you define a new function in the `modular` file, and use it inside an inherited class, say +New functions can be defined in the modular file and used inside a class. The new function - and recursively, any other new function called in its body - is automatically copy-pasted in the file where it is used. -```python +```py def my_new_function(*args, **kwargs): # Do something here pass @@ -501,161 +140,31 @@ class DummyModel(LlamaModel): def forward(*args, **kwargs): # Call the function example = my_new_function(*args, **kwargs) - # continue here -``` - -the `my_new_function` function (and, recursively, any other functions called in its body) will be automatically added to the unravelled code even if it is not present in the parent's file (here Llama). - -### Decorators - -By default, if you inherit from a class and override a method which has 1 (or more) decorators in the parent's method, the decorators will be added as well in the unravelled code, _but only if you do not add any yourself_. Otherwise, it will of course use whatever decorator your redefined. - -That, is, imagine the following parent class - -```py -class DummyModel(nn.Module): - ... - - @decorator(...) - def forward(...) - # do stuff here + # Continue here ``` -Then, if you simply override the method it will produce (modular on the left, unravelled code on the right): +## Calling super() -```py -class NewModel(DummyModel): | class NewModel(nn.Module): - ... | ... - | - def forward(...): | @decorator(...) - ... | def forward(...): - | ... -``` +You don't have to unravel a call to `super()` or if you want to differentiate which `super().__init__()` call you're doing. -That is, it keeps the parent's decorators by default. However, if you do: +The example below shows how you only need to add `eos_token` to the `__init__` instead of calling `super().__init__(eos_token)`. ```py -class NewModel(DummyModel): | class NewModel(nn.Module): - ... | ... - | - @my_new_decorator(...) | @my_new_decorator(...) - def forward(...): | def forward(...): - ... | ... 
+class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): + def __init__(self, eos_token=""): | def __init__(self): + eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) + PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) ``` -Then it keeps you own new decorator. +## Special naming -### The super_kwargs special case +Special naming for classes is also supported, which is useful for composite models. -In the above case about decorators, what if the `forward` method is really long, and I just want to switch the decorators? Do I really have to redefine it all and copy/paste the body just for the decorator? Fortunately, no. If you followed until this point, you now that you can use `super().forward(...)`, and it will unravel the parent's body automatically. But what if there are plenty of arguments in the function's signature, and we are very lazy? For that use-case, we introduced the special syntax `**super_kwargs` in the overriden method signature. It basically mean: "unravel all the parent's signature arguments here". For example, a common signature in the `ForCausalLM` model is the following (copied from llama's modeling): +The example below shows how you can use `GemmaVisionModel` even though it's not the same as the modular Gemma model. ```py -class LlamaForCausalLM(nn.Module): - ... - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - num_logits_to_keep: int = 0, - **kwargs: Unpack[KwargsForCausalLM], - ) -> Union[Tuple, CausalLMOutputWithPast]: - ... -``` - -As you can see, this is a rather long and complicated signature. But if you do the following (as usual, modular on the left, unravelled code by the linter on the right): - -```py -class NewModelForCausalLM(LlamaForCausalLM): | class LlamaForCausalLM(nn.Module): - ... | ... - | - @my_new_decorator | @my_new_decorator - def forward(self, **super_kwargs): | def forward( - super().forward(**super_kwargs) | self, - | input_ids: torch.LongTensor = None, - | attention_mask: Optional[torch.Tensor] = None, - | position_ids: Optional[torch.LongTensor] = None, - | past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None, - | inputs_embeds: Optional[torch.FloatTensor] = None, - | labels: Optional[torch.LongTensor] = None, - | use_cache: Optional[bool] = None, - | output_attentions: Optional[bool] = None, - | output_hidden_states: Optional[bool] = None, - | return_dict: Optional[bool] = None, - | cache_position: Optional[torch.LongTensor] = None, - | num_logits_to_keep: int = 0, - | **kwargs: Unpack[KwargsForCausalLM], - | ) -> Union[Tuple, CausalLMOutputWithPast]: - | ... -``` - -and the `**super_kwargs` syntax unravelled all the arguments, while the `super().forward()` syntax unravelled the whole body! 
As you can see, this is great combo when you just want to switch the decorators, as it is very easy to use, and make it explicit that the only change you want to apply is the decorator. - -However, we want to make it clear that the `**super_kwargs` syntax is not a replacement to being explicit when you redefine your methods: if you actually overwrite the method (i.e. you do not call `super().method()`), then we want you to explicitly write the signature as you would usually. This is only a short-cut when switching decorators, and a few other niche cases. - -### The DOCSTRING variables - -Usually, if whatever object is defned both in the modular file and the modeling file from which we inherit, then the definition of the modular takes precedence. However, this is not the case for assignments containing the pattern `DOCSTRING`. Indeed, we usually have variables defined as `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. These are just very big blocks of, well, docstrings... But they are (almost) always exactly the same up to the model name! And modular automatically rewrite the names everywhere! For this reason, assignments containing the pattern will _always_ use the definition found in the source file instead of the modular file. This is extremely handy if we need the variable reference somewhere (e.g. to redefine a decorator) but we do not want to clutter the modular file with 100 lines of docstrings which are always the same. It allows to do the following (taken from [modular_starcoder2.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/starcoder2/modular_starcoder2.py#L146)) - -```py -STARCODER2_INPUTS_DOCSTRING = None # will be automatically redefined - -class Starcoder2Model(MistralModel): - ... - - @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) - def forward(...) - ... -``` - -and here, the linter will correctly take the same definition of the docstring as in `Mistral`, without having to clutter the modular file! - -## Limitations - -Now, let's go over some of the limitations of modular. - -### Special naming (essentially for multimodal models) - -Because our linter automatically renames everything when inheriting from a class (defining `class NewModelMLP(LlamaMLP)` will rename every mention of `Llama` to `NewModel`, and recursively for all dependencies grabbed), it has somewhat strict rules when it comes to naming. For consistency reasons, we require that you always use the same class name prefix when inheriting different classes from the same file. For example, doing: - -```py -class MyModelIncredibleMLP(LlamaMLP): - ... - -class MyModelDecoderLayer(LlamaDecoderLayer): - ... -``` - -is not recommended, first because it breaks standards in the library and we do not like it, and second because the linter will not know how to rename potential high-order dependencies (should we use `MyModelIncredible`, or `MyModel`?). - -If there are no dependencies to grab implicitly however (see [this section](#dependencies) to understand implicit dependencies), local renaming (for a single class) will not be an issue and the linter will not complain. But make sure to explicitly redefine every other mentions of the class with the new name pattern! For example in the example above, all mentions of `LlamaMLP` in other modules inherited should be explicitly replaced by mentions to `MyModelIncredibleMLP`, otherwise the linter may add a new and unwanted `MyModelMLP` class! 
- -In any way, if there is an ambiguous case detected, the linter will raise a warning such as - -``` -We detected multiple prefix names when inheriting from transformers.models.llama.modeling_llama: ('Emu3Text', 'Emu3'). We will only use the most used 'Emu3' prefix when grabbing args and dependencies. Make sure to subclass the intermediate classes with the prefix you want (if different from 'Emu3') or use a single prefix in all the modular (best). -``` - -explaining what is happening, and which prefix is used by default for grabbing dependencies. As explained, if you see automatic dependencies appear with a prefix but you want another one, then explicitly rename these classes locally with a simple `pass` class, such as - -```py -class Emu3TextMLP(LlamaMLP): +class GemmaVisionModel(CLIPModel): pass ``` -Such warnings and renaming patterns complications usually only arise when defining multimodel models, when you want to define e.g. the text part of your model from an existing model, but want to add the part `Text` to the class names to make it clear what they refer to in the multimodal setup. - -### Automatic docstrings issue (mostly for Configs) - When inheriting a Config class and adding or deleting some attributes, it may be tempting to only redefine the new attributes in the docstring, and hoping that modular will do the rest. And similarly when deleting an argument, do nothing and hope that modular will remove itself from the docstring. However, due to current limitations of our linter, this is not yet supported. Thus, if you are in this case, you need to directly put the whole docstring (as it should appear in the end, with the correct arguments and default values) directly in the modular file under the class definition. \ No newline at end of file From 553e9246bc511b9b2f7ad64358b530245ec218bc Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 15 Jan 2025 16:58:50 -0800 Subject: [PATCH 093/116] more reviews --- docs/source/en/_toctree.yml | 2 +- docs/source/en/backbones.md | 30 +++++------ docs/source/en/fast_tokenizers.md | 78 +++++++++++++++++----------- docs/source/en/feature_extractors.md | 26 +++------- docs/source/en/image_processors.md | 62 +++++++++++++++++----- docs/source/en/processors.md | 10 ++-- docs/source/en/tiktoken.md | 33 +++++++++--- 7 files changed, 153 insertions(+), 88 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index b6579514a95b..797a640f3f9d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -36,7 +36,7 @@ - local: fast_tokenizers title: Tokenizers - local: tiktoken - title: TikToken + title: tiktoken - local: image_processors title: Image processors - local: backbones diff --git a/docs/source/en/backbones.md b/docs/source/en/backbones.md index c43acd907135..792b0b0d38f1 100644 --- a/docs/source/en/backbones.md +++ b/docs/source/en/backbones.md @@ -16,13 +16,13 @@ rendered properly in your Markdown viewer. # Backbones -For some higher-level computer visions tasks such as object detection or image segmentation, it is common to use several models together to generate a prediction. These networks combine a *backbone*, neck, and head. The backbone extracts useful features from an input image into a feature map, the neck combines and processes the feature maps, and the head uses them to make a prediction. +Higher-level computer visions tasks, such as object detection or image segmentation, use several models together to generate a prediction. 
A separate model is used for the *backbone*, neck, and head. The backbone extracts useful features from an input image into a feature map, the neck combines and processes the feature maps, and the head uses them to make a prediction.
-Load a backbone with [`~AutoBackbone.from_pretrained`]. +Load a backbone with [`~PretrainedConfig.from_pretrained`] and use the `out_indices` parameter to determine which layer, given by the index, to extract a feature map from. ```py from transformers import AutoBackbone @@ -34,10 +34,10 @@ This guide describes the backbone class, backbones from the [timm](https://hf.co ## Backbone classes -There are two backbone classes for Transformers' models. +There are two backbone classes. -- [`BackboneMixin`] allows you to load a backbone and includes functions for extracting the feature maps and indices. -- [`BackboneConfigMixin`] allows you to set the feature map and indices of a backbone configuration. +- [`~transformers.utils.BackboneMixin`] allows you to load a backbone and includes functions for extracting the feature maps and indices. +- [`~transformers.utils.BackboneConfigMixin`] allows you to set the feature map and indices of a backbone configuration. Refer to the [Backbone](./main_classes/backbones) API documentation to check which models support a backbone. @@ -46,11 +46,11 @@ There are two ways to load a Transformers backbone, [`AutoBackbone`] and a model -The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~AutoBackbone.from_pretrained`] as a backbone if it's supported. +The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~PretrainedConfig.from_pretrained`] as a backbone if it's supported. Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you know the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they refer to the same layer. -When you don't use `out_indices` or `out_features`, the backbone returns the feature map from the last layer. Specify `out_indices=(1,)` to get the feature map from the first layer. +When `out_indices` or `out_features` isn't used, the backbone returns the feature map from the last layer. The example code below uses `out_indices=(1,)` to get the feature map from the first layer.
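
A minimal sketch of both options is shown below (the `stage1` name used with `out_features` is an assumption based on Swin's stage naming and may differ for other backbones):

```py
from transformers import AutoBackbone

# select the first stage by index (index 0 is typically the stem)
model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))

# or select the same stage by name; if both parameters are passed, they must refer to the same layer
model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_features=["stage1"])
```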
@@ -65,11 +65,11 @@ model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", o -When you know a model supports a backbone, you can load the backbone and neck directly into the models configuration. Then pass the configuration to the model to initialize it for a task. +When you know a model supports a backbone, you can load the backbone and neck directly into the models configuration. Pass the configuration to the model to initialize it for a task. -For example, load a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head. +The example below loads a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head. -Set the `backbone` parameter to the pretrained model to load the model configuration class. Toggle the `use_pretrained_backbone` parameter to determine whether you want to use pretrained or randomly initialized weights. +Set `backbone` to a pretrained model and `use_pretrained_backbone=True` to use pretrained weights instead of randomly initialized weights. ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation @@ -78,7 +78,7 @@ config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbon model = MaskFormerForInstanceSegmentation(config) ``` -Another option is to separately load the backbone configuration and then pass it to the `backbone_config` paramater in the model configuration. +Another option is to separately load the backbone configuration and then pass it to `backbone_config` in the model configuration. ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig @@ -98,7 +98,7 @@ model = MaskFormerForInstanceSegmentation(config) [timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes. -Set `use_timm_backbone=True` to load pretrained timm weights. The `use_pretrained_backbone` parameter can be toggled to use pretrained or randomly initialized weights. +Set `use_timm_backbone=True` to load pretrained timm weights, and `use_pretrained_backbone` to use pretrained or randomly initialized weights. ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation @@ -115,7 +115,7 @@ from transformers import TimmBackboneConfig backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True) ``` -Pass the backbone configuration to the model configuration and then instantiate the model head, [`MaskFormerForInstanceSegmentation`], with the backbone. +Pass the backbone configuration to the model configuration and instantiate the model head, [`MaskFormerForInstanceSegmentation`], with the backbone. ```py from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation @@ -126,9 +126,9 @@ model = MaskFormerForInstanceSegmentation(config) ## Feature extraction -The backbone is used for image feature extraction. Pass an image through the backbone to get the feature maps. +The backbone is used to extract image features. Pass an image through the backbone to get the feature maps. -Load and preprocess an image, and then pass it to the backbone. +Load and preprocess an image and pass it to the backbone. The example below extracts the feature maps from the first layer. 
```py from transformers import AutoImageProcessor, AutoBackbone diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index fe3ff4f69e92..fdd05e1e85dc 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -16,14 +16,14 @@ rendered properly in your Markdown viewer. # Tokenizers -Tokenizers convert text into an array of numbers known as tensors, which are the inputs to a text model. There are several tokenizer types, but they all share the same purpose. Split text into smaller words or subwords (tokens) according to some rules, and convert them into numbers (input ids). A tokenizer also returns an attention mask to indicate which tokens should be attended to. +Tokenizers convert text into an array of numbers known as tensors, the inputs to a text model. There are several tokenizer algorithms, but they all share the same purpose. Split text into smaller words or subwords (tokens) according to some rules, and convert them into numbers (input ids). A Transformers tokenizer also returns an attention mask to indicate which tokens should be attended to. > [!TIP] -> Learn about the most popular tokenization algorithms on the [Summary of the tokenizers](./tokenizer_summary) page. +> Learn about the most popular tokenization algorithms on the [Summary of the tokenizers](./tokenizer_summary) doc. -To load a tokenizer, call [`~PreTrainedTokenizer.from_pretrained`] to load the tokenizer and its configuration from the Hugging Face [Hub](https://hf.co). The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files. This method accepts a Hub model repository name or a local directory. +Call [`~PreTrainedTokenizer.from_pretrained`] to load a tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) or a local directory. The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files. -Apply the tokenizer to a string of text to return the input ids and attention mask. Set the framework tensor type to return with the `return_tensors` parameter. +Pass a string of text to the tokenizer to return the input ids and attention mask, and set the framework tensor type to return with the `return_tensors` parameter. ```py from transformers import AutoTokenizer @@ -36,13 +36,13 @@ tokenizer("We are very happy to show you the 🤗 Transformers library", return_ } ``` -Whatever tokenizer you use, make sure the tokenizer vocabulary is the same as the pretrained models tokenizer vocabulary. This is especially important if you're using a custom tokenizer with a different vocabulary than the one generated by a pretrained models tokenizer. +Whichever tokenizer you use, make sure the tokenizer vocabulary is the same as the pretrained models tokenizer vocabulary. This is especially important if you're using a custom tokenizer with a different vocabulary from the pretrained models tokenizer. This guide provides a brief overview of the tokenizer classes and how to preprocess text with it. ## Tokenizer classes -All tokenizers inherit from a [`PreTrainedTokenizerBase`] class that provides common methods for all tokenizers like [`~PreTrainedTokenizerBase.from_pretrained`] and [`~PreTrainedTokenizerBase.batch_decode`]. From this base class, there are two main tokenizer classes. 
+All tokenizers inherit from a [`PreTrainedTokenizerBase`] class that provides common methods for all tokenizers like [`~PreTrainedTokenizerBase.from_pretrained`] and [`~PreTrainedTokenizerBase.batch_decode`]. There are two main tokenizer classes that build on top of the base class. - [`PreTrainedTokenizer`] is a Python implementation, for example [`LlamaTokenizer`]. - [`PreTrainedTokenizerFast`] is a fast Rust-based implementation from the [Tokenizers](https://hf.co/docs/tokenizers/index) library, for example [`LlamaTokenizerFast`]. @@ -52,9 +52,9 @@ There are two ways you can load a tokenizer, with [`AutoTokenizer`] or a model-s -The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, an [`AutoTokenizer`] tries to load a fast tokenizer if it's available for a given model, otherwise, it loads the Python implementation. +The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, [`AutoTokenizer`] tries to load a fast tokenizer if it's available, otherwise, it loads the Python implementation. -Use the [`~PreTrainedTokenizer.from_pretrained`] method to load a tokenizer. +Use [`~PreTrainedTokenizer.from_pretrained`] to load a tokenizer. ```py from transformers import AutoTokenizer @@ -67,7 +67,7 @@ tokenizer("We are very happy to show you the 🤗 Transformers library.", return } ``` -Load your own tokenizer by passing its vocabulary file to the [`~AutoTokenizer.from_pretrained`] method. +Load your own tokenizer by passing its vocabulary file to [`~AutoTokenizer.from_pretrained`]. ```py from transformers import AutoTokenizer @@ -78,7 +78,10 @@ tokenizer = AutoTokenizer.from_pretrained("./model_directory/my_vocab_file.txt") -Each pretrained model is associated with a tokenizer and the specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class. Check a model's API documentation to check whether a fast tokenizer is supported for a model. +Each pretrained model is associated with a tokenizer and the specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class. + +> [!TIP] +> Refer to a models API documentation to check whether a fast tokenizer is supported. ```py from transformers import GemmaTokenizer @@ -107,19 +110,36 @@ tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt") +## Multimodal tokenizers + +In addition to text tokens, multimodal tokenizers also holds tokens from other modalities as a part of its attributes for easy access. + +To add these special tokens to a tokenizer, pass them as a dictionary to the `extra_special_tokens` parameter in [`~AutoTokenizer.from_pretrained`]. The example below adds the `image_token` to a vision-language model. + +Save the tokenizer so you can reuse it with direct access to the `image_token`, `boi_token`, and `eoi_token`. + +```py +vision_tokenizer = AutoTokenizer.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + extra_special_tokens={"image_token": "", "boi_token": "", "eoi_token": ""} +) +print(vision_tokenizer.image_token, vision_tokenizer.image_token_id) +("", 32000) + +vision_tokenizer.save_pretrained("./path/to/tokenizer") +``` + ## Fast tokenizers [`PreTrainedTokenizerFast`] or *fast tokenizers* are Rust-based tokenizers from the [Tokenizers](https://hf.co/docs/tokenizers) library. 
It is significantly faster at batched tokenization and provides additional alignment methods compared to the Python-based tokenizers. -If you're using [`AutoTokenizer`], it automatically loads a fast tokenizer if it's supported for a given model. Otherwise, you need to explicitly load the fast tokenizer. +[`AutoTokenizer`] automatically loads a fast tokenizer if it's supported. Otherwise, you need to explicitly load the fast tokenizer. This section will show you how to train a fast tokenizer and reuse it in Transformers. -### Train - -To train a Byte-Pair Encoding (BPE) tokenizer, create an instance of a [`~tokenizers.Tokenizer`] and [`~tokenizers.trainers.BpeTrainer`] and define the unknown token and special tokens. +To train a Byte-Pair Encoding (BPE) tokenizer, create a [`~tokenizers.Tokenizer`] and [`~tokenizers.trainers.BpeTrainer`] class and define the unknown token and special tokens. ```py from tokenizers import Tokenizer @@ -138,22 +158,20 @@ from tokenizers.pre_tokenizers import Whitespace tokenizer.pre_tokenizer = Whitespace() ``` -Pass the text files and trainer to the tokenizer and call [`~tokenizers.Tokenizer.train`] to train the tokenizer. +Call [`~tokenizers.Tokenizer.train`] on the text files and trainer to start training. ```py files = [...] tokenizer.train(files, trainer) ``` -Use the [`~tokenizers.Tokenizer.save`] method to save the tokenizers configuration and vocabulary to a JSON file. +Use [`~tokenizers.Tokenizer.save`] to save the tokenizers configuration and vocabulary to a JSON file. ```py tokenizer.save("tokenizer.json") ``` -### Load - -To load and use the tokenizer object in Transformers, pass it to the `tokenizer_object` parameter in [`PreTrainedTokenizerFast`]. +Now you can load and reuse the tokenizer object in Transformers by passing it to the `tokenizer_object` parameter in [`PreTrainedTokenizerFast`]. ```py from transformers import PreTrainedTokenizerFast @@ -173,7 +191,7 @@ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") -A Transformers model expects the input as a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter. +A Transformers model expects the input to be a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter. ```py from transformers import AutoTokenizer @@ -186,14 +204,12 @@ tokenizer("We are very happy to show you the 🤗 Transformers library.", return } ``` -When passing a string of text to a tokenizer, there are actually two steps the tokenizer performs to convert the text into input ids. - - +The tokenization process of converting text into input ids is completed in two steps. -In the first step, a string of text is split into tokens. How the text is split depends on the tokenization algorithm. Call [`~PreTrainedTokenizer.tokenize`] to tokenize the text. +In the first step, a string of text is split into tokens by the [`~PreTrainedTokenizer.tokenize`] function. How the text is split depends on the tokenization algorithm. ```py tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library") @@ -230,7 +246,7 @@ print(decoded_string) ### Special tokens -Special tokens are used by the tokenizer to provide the model with some additional information about the text. +Special tokens provide the model with some additional information about the text. 
For example, if you compare the tokens obtained from passing text directly to the tokenizer and from [`~PreTrainedTokenizer.convert_tokens_to_ids`], you'll notice some additional tokens are added. @@ -256,7 +272,7 @@ Not all models need special tokens, but if they do, a tokenizer automatically ad It is faster and more efficient to preprocess *batches* of text instead of a single sentence at a time. Fast tokenizers are especially good at parallelizing tokenization. -Pass a list of the string text to the tokenizer. +Pass a list of string text to the tokenizer. ```py batch_sentences = [ @@ -271,7 +287,9 @@ print(encoded_inputs) [[2, 1860, 1212, 1105, 2257, 14457, 235336], [2, 4454, 235303, 235251, 1742, 693, 9242, 1105, 2257, 14457, 235269, 48782, 235265], [2, 1841, 1105, 29754, 37453, 235336]], - 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]] + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1]] } ``` @@ -280,7 +298,7 @@ print(encoded_inputs) > [!TIP] > Learn about additional padding strategies in the [Padding and truncation](./pad_truncation) guide. -Examine the `input_ids` and you'll notice each element has a different length. This is an issue because Transformers expects the elements to have the same lengths so it can pack them into a batch. Sequences with uneven lengths can't be batched. +In the output above, the `input_ids` have different lengths. This is an issue because Transformers expects them to have the same lengths so it can pack them into a batch. Sequences with uneven lengths can't be batched. Padding adds a special *padding token* to ensure all sequences have the same length. Set `padding=True` to pad the sequences to the longest sequence length in the batch. @@ -296,9 +314,9 @@ The tokenizer added the special padding token `0` to the left side (*left paddin > [!TIP] > Learn about additional truncation strategies in the [Padding and truncation](./pad_truncation) guide. -Models are only able to process sequences up to a certain length. If you try to process a sequence longer than a model can handle, it'll crash. +Models are only able to process sequences up to a certain length. If you try to process a sequence longer than a model can handle, it crashes. -Truncation removes tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the maximum length accepted by the model. Or you can set the maximum length yourself with the `max_length` parameter. +Truncation removes tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the maximum length accepted by the model. You can also set the maximum length yourself with the `max_length` parameter. ```py encoded_inputs = tokenizer(batch_sentences, max_length=8, truncation=True, return_tensors="pt") diff --git a/docs/source/en/feature_extractors.md b/docs/source/en/feature_extractors.md index 09f817569b88..6cc202057697 100644 --- a/docs/source/en/feature_extractors.md +++ b/docs/source/en/feature_extractors.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. Feature extractors preprocess audio data into the correct format for a given model. It takes the raw audio signal and converts it into a tensor that can be fed to a model. The tensor shape depends on the model, but the feature extractor will correctly preprocess the audio data for you given the model you're using. 
Feature extractors also include methods for padding, truncation, and resampling. -To load a feature extractor, call [`~AutoFeatureExtractor.from_pretrained`] to load the feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models). The feature extractor and preprocessor configuration is saved in a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json) file. This method loads a feature extractor from a Hub model repository name or local directory +Call [`~AutoFeatureExtractor.from_pretrained`] to load a feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models) or local directory. The feature extractor and preprocessor configuration is saved in a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json) file. Pass the audio signal, typically stored in `array`, to the feature extractor and set the `sampling_rate` parameter to the pretrained audio models sampling rate. It is important the sampling rate of the audio data matches the sampling rate of the data a pretrained audio model was trained on. @@ -32,15 +32,13 @@ processed_sample -2.8888427e-03, 9.4472744e-05, 9.4472744e-05], dtype=float32)]} ``` -The feature extractor returns an input, `input_values`, that is ready for the model to accept. +The feature extractor returns an input, `input_values`, that is ready for the model to consume. This guide walks you through the feature extractor classes and how to preprocess audio data. ## Feature extractor classes -Transformers feature extractors inherit from the [`SequenceFeatureExtractor`] class, which subclasses [`FeatureExtractionMixin`]. - - +Transformers feature extractors inherit from the base [`SequenceFeatureExtractor`] class which subclasses [`FeatureExtractionMixin`]. - [`SequenceFeatureExtractor`] provides a method to [`~SequenceFeatureExtractor.pad`] sequences to a certain length to avoid uneven sequence lengths. - [`FeatureExtractionMixin`] provides [`~FeatureExtractionMixin.from_pretrained`] and [`~FeatureExtractionMixin.save_pretrained`] to load and save a feature extractor. @@ -63,7 +61,7 @@ feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny") -Every pretrained audio model has a specific associated feature extractor for correctly processing audio data. When you load a feature extractor, it retrieves the feature extractors configuration (feature size, chunk length, etc.) from [`preprocessor_config.json`](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json). +Every pretrained audio model has a specific associated feature extractor for correctly processing audio data. When you load a feature extractor, it retrieves the feature extractors configuration (feature size, chunk length, etc.) from [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json). A feature extractor can be loaded directly from its model-specific class. @@ -80,13 +78,13 @@ feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny A feature extractor expects the input as a PyTorch tensor of a certain shape. The exact input shape can vary depending on the specific audio model you're using. 
-For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` which is a tensor of shape (batch_size, feature_size, sequence_length) but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` which is a tensor of shape (batch_size, sequence_length). +For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` to be a tensor of shape `(batch_size, feature_size, sequence_length)` but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` to be a tensor of shape `(batch_size, sequence_length)`. The feature extractor generates the correct input shape for whichever audio model you're using. A feature extractor also sets the sampling rate (the number of audio signal values taken per second) of the audio files. The sampling rate of your audio data must match the sampling rate of the dataset a pretrained model was trained on. This value is typically given in the model card. -Load a dataset and feature extractor. +Load a dataset and feature extractor with [`~FeatureExtractionMixin.from_pretrained`]. ```py from datasets import load_dataset, Audio @@ -104,7 +102,7 @@ array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414, 0. , 0. ]) ``` -The feature extractor preprocesses `array` into the expected input format for a given audio model. Set the appropriate sampling rate with the `sampling_rate` parameter. +The feature extractor preprocesses `array` into the expected input format for a given audio model. Use the `sampling_rate` parameter to set the appropriate sampling rate. ```py processed_dataset = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000) @@ -138,11 +136,6 @@ def preprocess_function(examples): return inputs processed_dataset = preprocess_function(dataset[:5]) -``` - -The sequence lengths are the same now. - -```py processed_dataset["input_values"][0].shape (86699,) @@ -168,11 +161,6 @@ def preprocess_function(examples): return inputs processed_dataset = preprocess_function(dataset[:5]) -``` - -The sequence lengths are now 50000. - -```py processed_dataset["input_values"][0].shape (50000,) diff --git a/docs/source/en/image_processors.md b/docs/source/en/image_processors.md index faa3919e660a..2e5e466cd5d2 100644 --- a/docs/source/en/image_processors.md +++ b/docs/source/en/image_processors.md @@ -16,12 +16,12 @@ rendered properly in your Markdown viewer. # Image processors -An image processor converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on. +Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on. - [`~BaseImageProcessor.center_crop`] to resize an image - [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values -Load an image processor with [`~ImageProcessingMixin.from_pretrained`]. This loads the image processors configuration (image size, whether to normalize and rescale, etc.) from a vision model on the Hugging Face [Hub](https://hf.co). 
The specific image processor configuration for each pretrained model is saved in a [preprocessor_config.json](https://huggingface.co/google/vit-base-patch16-224/blob/main/preprocessor_config.json) file. This method accepts a Hub model repository name or a local directory. +Use [`~ImageProcessingMixin.from_pretrained`] to load an image processors configuration (image size, whether to normalize and rescale, etc.) from a vision model on the Hugging Face [Hub](https://hf.co) or local directory. The configuration for each pretrained model is saved in a [preprocessor_config.json](https://huggingface.co/google/vit-base-patch16-224/blob/main/preprocessor_config.json) file. ```py from transformers import AutoImageProcessor @@ -29,7 +29,7 @@ from transformers import AutoImageProcessor image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") ``` -Pass an image to the image processor to transform it into pixel values. Set `return_tensors="pt"` to return PyTorch tensors, and feel free to print out the inputs to see what the image looks like as a tensor. +Pass an image to the image processor to transform it into pixel values, and set `return_tensors="pt"` to return PyTorch tensors. Feel free to print out the inputs to see what the image looks like as a tensor. ```py from PIL import Image @@ -44,23 +44,21 @@ This guide covers the image processor class and how to preprocess images for vis ## Image processor classes - - -Transformers image processors inherit from the [`BaseImageProcessor`] class which provides the [`~BaseImageProcessor.center_crop`], [`~BaseImageProcessor.normalize`], and [`~BaseImageProcessor.rescale`] operations.. There are two types of image processors. +Image processors inherit from the [`BaseImageProcessor`] class which provides the [`~BaseImageProcessor.center_crop`], [`~BaseImageProcessor.normalize`], and [`~BaseImageProcessor.rescale`] functions. There are two types of image processors. - [`BaseImageProcessor`] is a Python implementation. -- [`BaseImageProcessorFast`] is a faster [torchvision](https://pytorch.org/vision/stable/index.html) backed version. For a batch of torch.Tensor inputs, this can be up to 33x faster. This is not available for all vision models at the moment. Refer to a models API documentation to check if it is supported. +- [`BaseImageProcessorFast`] is a faster [torchvision-backed](https://pytorch.org/vision/stable/index.html) version. For a batch of [torch.Tensor](https://pytorch.org/docs/stable/tensors.html) inputs, this can be up to 33x faster. [`BaseImageProcessorFast`] is not available for all vision models at the moment. Refer to a models API documentation to check if it is supported. Each image processor subclasses the [`ImageProcessingMixin`] class which provides the [`~ImageProcessingMixin.from_pretrained`] and [`~ImageProcessingMixin.save_pretrained`] methods for loading and saving image processors. -There are two ways you can load an image processor, [`AutoImageProcessor`] and a model-specific image processor. +There are two ways you can load an image processor, with [`AutoImageProcessor`] or a model-specific image processor. The [AutoClass](./model_doc/auto) API provides a convenient method to load an image processor without directly specifying the model the image processor is associated with. -Use [`~AutoImageProcessor.from_pretrained`] to load an image processor. Set `use_fast=True` to load a fast image processor if it's supported for a model. 
+Use [`~AutoImageProcessor.from_pretrained`] to load an image processor, and set `use_fast=True` to load a fast image processor if it's supported. ```py from transformers import AutoImageProcessor @@ -71,9 +69,9 @@ image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-22 -Each image processor is associated with a specific pretrained vision model, and the image processor's configuration contains the model's expected size and whether to normalize and resize. +Each image processor is associated with a specific pretrained vision model, and the image processors configuration contains the models expected size and whether to normalize and resize. -The image processor can be loaded directly from the model-specific class. Check a model's API documentation to see whether it supports a fast image processor. +The image processor can be loaded directly from the model-specific class. Check a models API documentation to see whether it supports a fast image processor. ```py from transformers import ViTImageProcessor @@ -92,6 +90,46 @@ image_processor = ViTImageProcessorFast.from_pretrained("google/vit-base-patch16 +## Fast image processors + +[`BaseImageProcessorFast`] is based on [torchvision](https://pytorch.org/vision/stable/index.html) and is significantly faster, especially when processing on a GPU. This class can be used as a drop-in replacement for [`BaseImageProcessor`] if it's available for a model because it has the same design. Make sure [torchvision](https://pytorch.org/get-started/locally/#mac-installation) is installed, and set the `use_fast` parameter to `True`. + +```py +from transformers import AutoImageProcessor + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) +``` + +Control which device processing is performed on with the `device` parameter. Processing is performed on the same device as the input by default if the inputs are tensors, otherwise they are processed on the CPU. The example below places the fast processor on a GPU. + +```py +from torchvision.io import read_image +from transformers import DetrImageProcessorFast + +images = read_image("image.jpg") +processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50") +images_processed = processor(images, return_tensors="pt", device="cuda") +``` + +
+Benchmarks + +The benchmarks are obtained from an [AWS EC2 g5.2xlarge](https://aws.amazon.com/ec2/instance-types/g5/) instance with a NVIDIA A10G Tensor Core GPU. + +
+ ## Preprocess Transformers' vision models expects the input as PyTorch tensors of pixel values. An image processor handles the conversion of images to pixel values, which is represented by the batch size, number of channels, height, and width. To achieve this, an image is resized (center cropped) and the pixel values are normalized and rescaled to the models expected values. @@ -110,7 +148,7 @@ from datasets import load_dataset dataset = load_dataset("food101", split="train[:100]") ``` -From the [transforms](https://pytorch.org/vision/stable/transforms.html) module, use the [Compose](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) API to chain together [RandomResizedCrop](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [ColorJitter](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html). These transforms randomly crop and resize an image, and randomly adjusts the colors of an image. +From the [transforms](https://pytorch.org/vision/stable/transforms.html) module, use the [Compose](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) API to chain together [RandomResizedCrop](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) and [ColorJitter](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html). These transforms randomly crop and resize an image, and randomly adjusts an images colors. The image size to randomly crop to can be retrieved from the image processor. For some models, an exact height and width are expected while for others, only the `shortest_edge` is required. diff --git a/docs/source/en/processors.md b/docs/source/en/processors.md index 48ce5ec5faf0..9e0083eac882 100644 --- a/docs/source/en/processors.md +++ b/docs/source/en/processors.md @@ -20,7 +20,7 @@ Multimodal models require a preprocessor capable of handling inputs that combine For example, [PaliGemma](./model_doc/paligemma) is a vision-language model that uses the [SigLIP](./model_doc/siglip) image processor and the [Llama](./model_doc/llama) tokenizer. A [`ProcessorMixin`] class wraps both of these preprocessor types, providing a single and unified processor class for a multimodal model. -To load a processor, call [`~ProcessorMixin.from_pretrained`]. Pass the input type to the processor to generate the expected model inputs, the input ids and pixel values. +Call [`~ProcessorMixin.from_pretrained`] to load a processor. Pass the input type to the processor to generate the expected model inputs, input ids and pixel values. ```py from transformers import AutoProcessor, PaliGemmaForConditionalGeneration @@ -41,7 +41,7 @@ This guide describes the processor class and how to preprocess multimodal inputs ## Processor classes -All processors inherit from the [`ProcessorMixin`] class which provides methods like [`~ProcessorMixin.from_pretrained`], [`~ProcessorMixin.save_pretrained`], and [`~ProcessorMixin.push_to_hub`] for loading, saving, and sharing processors to the Hub repsectively. +All processors inherit from the [`ProcessorMixin`] class which provides methods like [`~ProcessorMixin.from_pretrained`], [`~ProcessorMixin.save_pretrained`], and [`~ProcessorMixin.push_to_hub`] for loading, saving, and sharing processors to the Hub. There are two ways to load a processor, with an [`AutoProcessor`] and with a model-specific processor class. 
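
A rough sketch of both options is shown below (the Whisper checkpoint is only illustrative):

```py
from transformers import AutoProcessor, WhisperProcessor

# the AutoClass API infers the correct processor class from the checkpoint
processor = AutoProcessor.from_pretrained("openai/whisper-tiny")

# or load the processor directly from its model-specific class
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
```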
@@ -103,13 +103,13 @@ dataset[0]["text"] 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' ``` -Remember to resample the sampling rate to match the model's requirements. +Remember to resample the sampling rate to match the pretrained models required sampling rate. ```py dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) ``` -Load a processor and pass the audio `array` to the `audio` parameter and pass the `text` column to the `text` parameter. +Load a processor and pass the audio `array` and `text` columns to it. ```py from transformers import AutoProcessor @@ -122,7 +122,7 @@ def prepare_dataset(example): return example ``` -Apply the `prepare_dataset` function to the dataset to preprocess it. The processor returns the `input_features` for the `audio` column and `labels` for the text column. +Apply the `prepare_dataset` function to preprocess the dataset. The processor returns `input_features` for the `audio` column and `labels` for the text column. ```py prepare_dataset(dataset[0]) diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md index 8bd4ba9eafe2..37bb05775360 100644 --- a/docs/source/en/tiktoken.md +++ b/docs/source/en/tiktoken.md @@ -14,13 +14,13 @@ rendered properly in your Markdown viewer. --> -# Tiktoken +# tiktoken -[Tiktoken](https://github.com/openai/tiktoken) is a [byte-pair encoding (BPE)](./tokenizer_summary#byte-pair-encoding-bpe) tokenizer by OpenAI, inclyding several tokenization schemes or encodings for how text should be tokenized. +[tiktoken](https://github.com/openai/tiktoken) is a [byte-pair encoding (BPE)](./tokenizer_summary#byte-pair-encoding-bpe) tokenizer by OpenAI. It includes several tokenization schemes or encodings for how text should be tokenized. -There are currently two models trained and released with tiktoken, GPT2 and Llama3. Transformers supports loading these models with the [tokenizer.model](https://hf.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) tiktoken file. The tiktoken file is automatically converted into Transformers Rust-based [`PreTrainedTokenizerFast`]. +There are currently two models trained and released with tiktoken, GPT2 and Llama3. Transformers supports models with a [tokenizer.model](https://hf.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) tiktoken file. The tiktoken file is automatically converted into Transformers Rust-based [`PreTrainedTokenizerFast`]. -Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify where the `tokenizer.model` file is located. +Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify where the `tokenizer.model` tiktoken file is located. ```py from transformers import AutoTokenizer @@ -28,11 +28,32 @@ from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original") ``` -You can visualize how the tiktoken tokenizer works for Llama3 with the Tokenizer Playground below. +## Create a tiktoken tokenizer + +The tiktoken `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json` (the appropriate format for [`PreTrainedTokenizerFast`]). 
+ +Generate the tiktoken `tokenizer.model` file with the [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) function, and convert it to `tokenizer.json` with [convert_tiktoken_to_fast](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/integrations/tiktoken.py#L8). + +```py +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# Load your custom encoding or the one provided by OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +The resulting `tokenizer.json` file is saved to the specified directory and loaded with [`~PreTrainedTokenizerFast.from_pretrained`]. + +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` + +Visualize how the tiktoken tokenizer works by selecting Llama3 in the Tokenizer Playground below. From 7b2edc9e2fcf05af123a05f0182650211e5567f4 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 21 Jan 2025 09:55:49 -0800 Subject: [PATCH 094/116] reviews --- docs/source/en/_toctree.yml | 14 +-- docs/source/en/add_new_pipeline.md | 58 ++++++++---- docs/source/en/cache_explanation.md | 24 ++++- docs/source/en/generation_features.md | 82 +++++++++++++++++ docs/source/en/generation_strategies.md | 90 ++++++++++++++----- docs/source/en/kv_cache.md | 8 +- docs/source/en/llm_optims.md | 115 +++--------------------- docs/source/en/llm_tutorial.md | 90 ++++++++++++++++--- docs/source/en/pipeline_gradio.md | 2 +- docs/source/en/pipeline_tutorial.md | 63 ++++++------- docs/source/en/pipeline_webserver.md | 20 ++--- docs/source/en/tasks/prompting.md | 32 +++---- docs/source/en/tiktoken.md | 2 +- docs/source/en/training.md | 93 +++++++++++++++++++ 14 files changed, 467 insertions(+), 226 deletions(-) create mode 100644 docs/source/en/generation_features.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 797a640f3f9d..972646f305a0 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -61,21 +61,23 @@ - local: pipeline_webserver title: Web server inference - local: add_new_pipeline - title: Add a new pipeline + title: Adding a new pipeline - title: LLMs sections: + - local: llm_tutorial + title: Text generation + - local: generation_strategies + title: Generation strategies - local: tasks/prompting + - local: generation_features + title: Generation features title: Prompt engineering - local: llm_optims - title: Optimize inference + title: Optimizing inference - local: kv_cache title: KV cache strategies - local: cache_explanation title: Caching - - local: llm_tutorial - title: Text generation - - local: generation_strategies - title: Generation strategies - local: llm_tutorial_optimization title: Getting the most out of LLMs - local: perplexity diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index d7ec9b21261a..c0029146b207 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -13,21 +13,21 @@ rendered properly in your Markdown viewer. --> -# Add a new pipeline +# Adding a new pipeline -You can make [`Pipeline`] your own by subclassing it, and then implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it. 
+Make [`Pipeline`] your own by subclassing it and implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it. This guide will walk you through the process of adding a new pipeline to Transformers. ## Design choices -At a bare minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline. +At a minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline. Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because it's more compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in Python is the simplest and best option because it's easier to work with. -Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types, for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data. +Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data. -## Implement a pipeline +## Create a pipeline With an input and output decided, you can start implementing [`Pipeline`]. Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods. @@ -52,7 +52,7 @@ def preprocess(self, inputs, maybe_arg=2): return {"model_input": model_input} ``` -1. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward`, and everything else belongs in either `preprocess` or `postprocess`. +1. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`. ```py def _forward(self, model_inputs): @@ -68,7 +68,7 @@ def postprocess(self, model_outputs, top_k=5): return best_class ``` -1. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during the initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with an extra parameters! This keeps the default arguments in the function definition which is always more *natural*. +1. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. 
`_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural. For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`. @@ -84,7 +84,7 @@ def _sanitize_parameters(self, **kwargs): return preprocess_kwargs, {}, postprocess_kwargs ``` -Now the pipeline can return the top most likely labels if they choose to. +Now the pipeline can return the top most likely labels if a user chooses to. ```py from transformers import pipeline @@ -122,26 +122,23 @@ PIPELINE_REGISTRY.register_pipeline( Share your pipeline with the community on the [Hub](https://hf.co) or you can add it directly to Transformers. -It's faster to upload your pipeline code to the Hub because it doesn't require any review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure the [`Pipeline`] works. +It's faster to upload your pipeline code to the Hub because it doesn't require a review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure your [`Pipeline`] works. ### Upload to the Hub Add your pipeline code to the Hub in a Python file. -For example, a custom pipeline for sentence pair classification might look the following code below. +For example, a custom pipeline for sentence pair classification might look like the following code below. The implementation works for PyTorch and TensorFlow models. ```py import numpy as np - from transformers import Pipeline - def softmax(outputs): maxes = np.max(outputs, axis=-1, keepdims=True) shifted_exp = np.exp(outputs - maxes) return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) - class PairClassificationPipeline(Pipeline): def _sanitize_parameters(self, **kwargs): preprocess_kwargs = {} @@ -166,6 +163,37 @@ class PairClassificationPipeline(Pipeline): return {"label": label, "score": score, "logits": logits} ``` +Save the code in a file named `pair_classification.py`, and import and register it as shown below. + +```py +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification + +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) +``` + +The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a models `config.json` file. + +```json + "custom_pipelines": { + "pair-classification": { + "impl": "pair_classification.PairClassificationPipeline", + "pt": [ + "AutoModelForSequenceClassification" + ], + "tf": [ + "TFAutoModelForSequenceClassification" + ], + } + }, +``` + Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub. 
The Python file containing the code is copied to the Hub, and the pipelines model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace. ```py @@ -189,11 +217,11 @@ Adding a custom pipeline to Transformers requires adding tests to make sure ever Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py). -Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline. For example, take a look at the text classification pipeline test. +Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline. The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models. -You'll also notice `ANY` is used throughout the `run_pipeline_test` function. The models are random, so you can't check the actual values. Using `ANY` allows the test to just match the output of the pipeline type instead. +You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead. Finally, you should also implement the following 4 tests. diff --git a/docs/source/en/cache_explanation.md b/docs/source/en/cache_explanation.md index 613e89275759..9ac1f711deff 100644 --- a/docs/source/en/cache_explanation.md +++ b/docs/source/en/cache_explanation.md @@ -37,7 +37,7 @@ When you use Transformers' [`Cache`] class, the attention module performs severa 2. When the `forward` method is called iteratively, it's crucial that the attention mask shape matches the combined length of the past and current kv pairs. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is typically handled internally in [`~GenerationMixin.generate`], but if you want to implement your own generation loop with [`Cache`], keep this in mind! The attention mask should hold the past and current token values. -3. It is also important to be aware of the `cache_position`. This is important if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. This indicates the input positions in a sequence. 
`cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10]). +3. It is also important to be aware of the `cache_position`. This is important if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. This indicates the input positions in a sequence. `cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10])`. The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token. @@ -72,3 +72,25 @@ for _ in range(max_new_tokens): print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]) "[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA," ``` + +## Legacy cache format + +Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format has is dynamic because it grows as text is generated, similar to [`DynamicCache`]. + +If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format. + +```py +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto") +inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device) + +# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache +# in the the legacy format +generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5) + +cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values) +legacy_format_cache = cache.to_legacy_cache() +``` \ No newline at end of file diff --git a/docs/source/en/generation_features.md b/docs/source/en/generation_features.md new file mode 100644 index 000000000000..110d4f76b2bb --- /dev/null +++ b/docs/source/en/generation_features.md @@ -0,0 +1,82 @@ + + +# Generation features + +The [`~GenerationMixin.generate`] API supports a couple features for building applications on top of it. + +This guide will show you how to use these features. + +## Streaming + +Streaming starts returning text as soon as it is generated so you don't have to wait to see the entire generated response all at once. It is important in user-facing applications because it reduces perceived latency and allows users to see the generation progression. + +
+ +> [!TIP] +> Learn more about streaming in the [Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/conceptual/streaming) docs. + +Create an instance of [`TextStreamer`] with the tokenizer. Pass [`TextStreamer`] to the `streamer` parameter in [`~GenerationMixin.generate`] to stream the output one word at a time. + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer + +tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") +model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") +inputs = tokenizer(["The secret to baking a good cake is "], return_tensors="pt") +streamer = TextStreamer(tokenizer) + +_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20) +``` + +The `streamer` parameter is compatible with any class with a [`~TextStreamer.put`] and [`~TextStreamer.end`] method. [`~TextStreamer.put`] pushes new tokens and [`~TextStreamer.end`] flags the end of generation. You can create your own streamer class as long as they include these two methods, or you can use Transformers' basic streamer classes. + +## Watermarking + +Watermarking is useful for detecting whether text is generated. The [watermarking strategy](https://hf.co/papers/2306.04634) in Transformers randomly "colors" a subset of the tokens green. When green tokens are generated, they have a small bias added to their logits, and a higher probability of being generated. You can detect generated text by comparing the proportion of green tokens to the amount of green tokens typically found in human-generated text. + +Watermarking is supported for any generative model in Transformers and doesn't require an extra classfication model to detect the watermarked text. + +Create a [`WatermarkingConfig`] with the bias value to add to the logits and watermarking algorithm. The example below uses the `"selfhash"` algorithm, where the green token selection only depends on the current token. Pass the [`WatermarkingConfig`] to [`~GenerationMixin.generate`]. + +> [!TIP] +> [`WatermarkDetector`] detects the proportion of green tokens in generated text, which is why it is recommended to strip the prompt text, if it is much longer than the generated text. Padding can also have an effect on [`WatermarkDetector`]. + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig + +model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") +tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") +tokenizer.pad_token_id = tokenizer.eos_token_id +tokenizer.padding_side = "left" + +inputs = tokenizer(["This is the beginning of a long story", "Alice and Bob are"], padding=True, return_tensors="pt") +input_len = inputs["input_ids"].shape[-1] + +watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash") +out = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=False, max_length=20) +``` + +Create an instance of [`WatermarkDetector`] and pass the model output to it to detect whether the text is machine-generated. The [`WatermarkDetector`] must have the same [`WatermarkingConfig`] used during generation. 
+ +```py +detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config) +detection_out = detector(out, return_dict=True) +detection_out.prediction +array([True, True]) +``` diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 5ab8bd05534e..6899cbf61645 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -20,9 +20,9 @@ A decoding strategy informs how a model should select the next generated token. This guide will help you understand the different decoding strategies available in Transformers and how and when to use them. -## Default decoding strategy +## Greedy search -Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in the [`GenerationConfig`], this strategy generates a maximum of 20 tokens. +Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 tokens. Greedy search works well for tasks with relatively short outputs. However, it breaks down when generating longer sequences because it begins to repeat itself. @@ -146,7 +146,7 @@ outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams= [Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more. -Currently, only greed search and multinomial sampling are supported with speculative decoding. Batched inputs aren't supported either. +Currently, only greedy search and multinomial sampling are supported with speculative decoding. Batched inputs aren't supported either. Enable speculative decoding with the `assistant_model` parameter. You'll notice the fastest speed up with an assistant model that is much smaller than the main model. Add `do_sample=True` to enable token validation with resampling. @@ -166,27 +166,21 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True) 'Hugging Face is an open-source company that provides a platform for developers to build and deploy machine' ``` - - -If you're using a `pipeline` object, all you need to do is to pass the assistant checkpoint under `assistant_model` +Speculative decoding is also supported in [`Pipeline`] with the `assistant_model` parameter. ```python ->>> from transformers import pipeline ->>> import torch - ->>> pipe = pipeline( -... "text-generation", -... model="meta-llama/Llama-3.1-8B", -... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD -... torch_dtype=torch.bfloat16 -... 
) ->>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False) ->>> pipe_output[0]["generated_text"] -'Once upon a time, 3D printing was a niche technology that was only' -``` - - +from transformers import pipeline +import torch +pipe = pipeline( + "text-generation", + model="meta-llama/Llama-3.1-8B", + assistant_model="meta-llama/Llama-3.2-1B", + torch_dtype=torch.bfloat16 +) +pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False) +pipe_output[0]["generated_text"] +```
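To see how much of a difference the assistant model makes on your own hardware, you can time generation with and without it. The snippet below is a rough sketch rather than part of the original guide; it reuses the Llama 3.1/3.2 checkpoints from the example above, and the timings only give a ballpark figure (a fair comparison would warm up first and average several runs).

```py
import time

import torch
from transformers import pipeline

prompt = "Once upon a time, "

def time_generation(**extra_pipeline_kwargs):
    # note: the main model is reloaded for each measurement to keep the sketch simple
    pipe = pipeline(
        "text-generation",
        model="meta-llama/Llama-3.1-8B",
        torch_dtype=torch.bfloat16,
        **extra_pipeline_kwargs,
    )
    start = time.perf_counter()
    pipe(prompt, max_new_tokens=50, do_sample=False)
    return time.perf_counter() - start

# baseline: greedy decoding with the main model only
baseline = time_generation()
# assisted: the smaller model drafts candidate tokens for the main model to verify
assisted = time_generation(assistant_model="meta-llama/Llama-3.2-1B")
print(f"baseline: {baseline:.2f}s, assisted: {assisted:.2f}s")
```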
@@ -229,13 +223,59 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True) 'Hugging Face is an open-source company that provides a platform for developers to build and deploy machine learning models. It offers a variety of tools' ``` +### Self-speculative decoding + +Early exiting uses the earlier hidden states from the language modeling head as inputs, effectively skipping layers to yield a lower quality output. The lower quality output is used as the assistant output and self-speculation is applied to fix the output using the remaining layers. The final generated result from this self-speculative method is the same (or has the same distribution) as the original models generation. + +The assistant model is also part of the target model, so the caches and weights can be shared, resulting in lower memory requirements. + +For a model trained with early exit, pass `assistant_early_exit` to [`~GenerationMixin.generate`]. + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +prompt = "Alice and Bob" +checkpoint = "facebook/layerskip-llama3.2-1B" + +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +inputs = tokenizer(prompt, return_tensors="pt") + +model = AutoModelForCausalLM.from_pretrained(checkpoint) +outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +``` + +### Universal assisted decoding + +Universal assisted decoding (UAD) enables the main and assistant models to use different tokenizers. The main models input tokens are re-encoded into assistant model tokens. Candidate tokens are generated in the assistant encoding which are re-encoded into the main model candidate tokens. The candidate tokens are verified as explained in [speculative decoding](#speculative-decoding). + +Re-encoding involves decoding token ids into text and encoding the text with a different tokenizer. To prevent tokenization discrepancies during re-encoding, UAD finds the longest common sub-sequence between the source and target encodings to ensure the new tokens include the correct prompt suffix. + +Add the `tokenizer` and `assistant_tokenizer` parameters to [`~GenerationMixin.generate`] to enable UAD. + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer + +prompt = "Alice and Bob" + +assistant_tokenizer = AutoTokenizer.from_pretrained("double7/vicuna-68m") +tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") +inputs = tokenizer(prompt, return_tensors="pt") + +model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b") +assistant_model = AutoModelForCausalLM.from_pretrained("double7/vicuna-68m") +outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer) +tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] +``` + ## DoLa -[Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit diffferences between the final and early layers. As a result, factual knowledge localized to a particular layers are amplified. DoLa is not recommended for smaller models like GPT-2. +[Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. 
This strategy works by contrasting the logit diffferences between the final and early layers. As a result, factual knowledge localized to particular layers are amplified. DoLa is not recommended for smaller models like GPT-2. Enable DoLa with the following parameters. -- `dola_layers` are the candidate layers to be contrasted with the final layer. It can be a string with `low` or `high` to contrast the lower or higher parts of a layer. `high` is recommended for short-answer tasks like TruthfulQA. `low` is recommended for long-answer reasoning tasks like GSM8K, StrategyQA, FACTOR, and VicunaQA. +- `dola_layers` are the candidate layers to be contrasted with the final layer. It can be a string (`low` or `high`) to contrast the lower or higher parts of a layer. `high` is recommended for short-answer tasks like TruthfulQA. `low` is recommended for long-answer reasoning tasks like GSM8K, StrategyQA, FACTOR, and VicunaQA. When a model has tied word embeddings, layer 0 is skipped and it begins from layer 2. @@ -284,3 +324,7 @@ tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tok
+ +## Resources + +Read the [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) blog post for an explanation of how common decoding strategies work. diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index 44505f8f8c2f..36f82fb3dc9a 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -16,9 +16,9 @@ rendered properly in your Markdown viewer. # KV cache strategies -The key-value (KV) vectors are used to calculate attention scores, and for autoregressive models, the KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time. +The key-value (KV) vectors are used to calculate attention scores. For autoregressive models, KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time. -A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. For a more in-depth explanation about how a cache works, refer to [Caching](./cache_explanation.md). +A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation.md) doc for a more detailed explanation about how a cache works. Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case. @@ -221,7 +221,7 @@ tokenizer.batch_decode(out, skip_special_tokens=True)[0] ### Offloaded static cache -The [`OffloadedStaticCache`] is very similar to the [OffloadedCache](#offloaded-cache) except the cache size is set to a maximum cache size. Otherwise, [`OffladedStaticCache`] only keeps the current layer cache on the GPU and the rest are moved to the CPU. +The [`OffloadedStaticCache`] is very similar to the [OffloadedCache](#offloaded-cache) except the cache size is set to a maximum cache size. Otherwise, [`OffloadedStaticCache`] only keeps the current layer cache on the GPU and the rest are moved to the CPU. Enable [`OffloadedStaticCache`] by configuring `cache_implementation="offloaded_static"` in [`~GenerationMixin.generate`]. @@ -241,7 +241,7 @@ Cache offloading requires a CUDA GPU. ### Sliding window cache -[`SlidingWindowCache`] implements a sliding window over the previos kv pairs, and only keeps the last `sliding_window` tokens. This cache type is designed to only work with models that support *sliding window attention*, such as [Mistral](./model_doc/mistral). Older kv states are discarded and replaced by new kv states. +[`SlidingWindowCache`] implements a sliding window over the previous kv pairs, and only keeps the last `sliding_window` tokens. This cache type is designed to only work with models that support *sliding window attention*, such as [Mistral](./model_doc/mistral). Older kv states are discarded and replaced by new kv states. 
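The guide goes on to describe enabling this cache through `cache_implementation="sliding_window"` in [`~GenerationMixin.generate`]. The snippet below is a minimal sketch of that call; the Mistral checkpoint is an assumption, chosen because the text above names Mistral as a model with sliding window attention.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto"
)
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

# only the last `sliding_window` key-value pairs are kept in the cache
out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```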
Enable [`SlidingWindowCache`] by configuring `cache_implementation="sliding_window"` in [`~GenerationMixin.generate`]. diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index e2e7747343b7..7c9bc154ab6e 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -9,24 +9,24 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -# Optimize inference +# Optimizing inference -Inference with large language models (LLMs) can be challenging because they have to store and handle billions of parameters. To load a 70B parameter [Llama 2](https://hf.co/meta-llama/Llama-2-70b-hf) model, it requires 256GB of memory for full precision weights and 128GB of memory for half-precision weights. For comparison, the most powerful GPUs today - the A100 and H100 - only have 80GB of memory. +Inference with large language models (LLMs) can be challenging because they have to store and handle billions of parameters. To load a 70B parameter [Llama 2](https://hf.co/meta-llama/Llama-2-70b-hf) model, it requires 256GB of memory for full precision weights and 128GB of memory for half-precision weights. The most powerful GPUs today - the A100 and H100 - only have 80GB of memory. On top of the memory requirements, inference is slow because LLMs are called repeatedly to generate the next token. The input sequence increases as generation progresses, which takes longer and longer to process. This guide will show you how to optimize LLM inference to accelerate generation and reduce memory usage. > [!TIP] -> Try out [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a Hugging Face library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference. +> Try out [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a Hugging Face library dedicated to deploying and serving highly optimized LLMs for inference. ## Static kv-cache and torch.compile LLMs compute key-value (kv) values for each input token, and it performs the same kv computation each time because the generated output becomes part of the input. However, performing the same kv computation every time is not very efficient. -A *kv-cache* stores the past keys and values instead of recomputing them each time. But the kv-cache is dynamic and it grows with each generation step which prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization method that fuses PyTorch code into optimized kernels. +A *kv-cache* stores the past keys and values instead of recomputing them each time. As a result, the kv-cache is dynamic and it grows with each generation step which prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization method that fuses PyTorch code into optimized kernels. -The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value, which allows you to combine it with [torch.compile](./perf_torch_compile) for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. 
+The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value, so you can combine it with [torch.compile](./perf_torch_compile) for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. > [!WARNING] > Follow this [issue](https://github.com/huggingface/transformers/issues/28981) to track which models (Llama, Gemma, Mistral, etc.) support a static kv-cache and torch.compile. @@ -151,7 +151,7 @@ To enable static kv-cache and [torch.compile](./perf_torch_compile) with [`Stati 1. Initialize [`StaticCache`] before using the model for inference to configure parameters like the maximum batch size and sequence length. 2. Call [torch.compile](./perf_torch_compile) on the model to compile the forward pass with the static kv-cache. -3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. +3. se SDPBackend.MATH in the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. ```py from torch.nn.attention import SDPBackend, sdpa_kernel @@ -190,7 +190,7 @@ text
-Compiling the entire [`~GenerationMixin.generate`] function also compiles the input preparation logit processor operations, and more in addition to the forward pass. With this approach, you don't need to initialize [`StaticCache`] or set the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) parameter. +Compiling the entire [`~GenerationMixin.generate`] function also compiles the input preparation logit processor operations, and more, in addition to the forward pass. With this approach, you don't need to initialize [`StaticCache`] or set the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) parameter. ```py from transformers import AutoTokenizer, AutoModelForCausalLM @@ -220,7 +220,7 @@ This usage pattern is more appropriate for unique hardware or use cases, but the -## Decoding +## Decoding strategies Decoding can also be optimized to accelerate generation. You can use a lightweight assistant model to generate candidate tokens faster than the LLM itself or you can use a variant of this decoding strategy that works especially well for input-grounded tasks. @@ -234,7 +234,7 @@ For each input token, the model weights are loaded each time during the forward To get the largest speed up, the assistant model should be a lot smaller than the LLM so that it can generate tokens quickly. The assistant and LLM model must also share the same tokenizer to avoid re-encoding and decoding tokens. > [!WARNING] -> Speculative decoding is only supported for the greedy search and sampling decoding strategies, and it also doesn't support batched inputs. +> Speculative decoding is only supported for the greedy search and sampling decoding strategies, and it doesn't support batched inputs. Enable speculative decoding by loading an assistant model and passing it to [`~GenerationMixin.generate`]. @@ -355,99 +355,6 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` -### Fine-Tuning with torch.compile and Padding-Free Data Collation - -In addition to optimizing inference, you can also enhance the training efficiency of large language models by leveraging torch.compile during fine-tuning and using a padding-free data collator. This approach can significantly speed up training and reduce computational overhead. 
- -Here's how you can fine-tune a Llama model using SFTTrainer from the TRL library, with torch_compile enabled and a padding-free data collator: - -``` -#################### IMPORTS ################### - -import math -import datasets -import dataclasses -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - TrainingArguments -) -from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM - -#################### MODEL LOADING WITH FLASH ATTENTION ################### - -model_name = "meta-llama/Llama-3.2-1B" -model = AutoModelForCausalLM.from_pretrained( - model_name, - attn_implementation="flash_attention_2" # Enables FlashAttention-2 -) -tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - -#################### DATA PREPROCESSING (PADDING-FREE) ################### - -response_template = "\n### Label:" -response_template_ids = tokenizer.encode( - response_template, add_special_tokens=False -)[2:] # Exclude special tokens - -data_collator = DataCollatorForCompletionOnlyLM( - response_template_ids=response_template_ids, - tokenizer=tokenizer, - ignore_index=-100, - padding_free=True # Enables padding-free collation -) - -def format_dataset(example): - return { - "output": example["output"] + tokenizer.eos_token - } - -data_files = {"train": "path/to/dataset"} # Replace with your dataset path -json_dataset = datasets.load_dataset("json", data_files=data_files) -formatted_train_dataset = json_dataset["train"].map(format_dataset) - -################# TRAINING CONFIGURATION ############################ - -train_args = TrainingArguments( - num_train_epochs=5, - per_device_train_batch_size=4, - per_device_eval_batch_size=4, - gradient_accumulation_steps=4, - learning_rate=1e-5, - weight_decay=0.0, - warmup_ratio=0.03, - lr_scheduler_type="cosine", - logging_steps=1, - include_tokens_per_second=True, - save_strategy="epoch", - output_dir="output", - torch_compile=True, # Enables torch.compile - torch_compile_backend="inductor", - torch_compile_mode="default" -) - -# Convert TrainingArguments to SFTConfig -transformer_train_arg_fields = [x.name for x in dataclasses.fields(SFTConfig)] -transformer_kwargs = { - k: v - for k, v in train_args.to_dict().items() - if k in transformer_train_arg_fields -} -training_args = SFTConfig(**transformer_kwargs) - -####################### FINE-TUNING ##################### - -trainer = SFTTrainer( - model=model, - tokenizer=tokenizer, - train_dataset=formatted_train_dataset, - data_collator=data_collator, - dataset_text_field="output", - args=training_args, -) -trainer.train() -``` - ### PyTorch scaled dot product attention Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation. @@ -473,7 +380,9 @@ with sdpa_kernel(SDPBackend.FLASH_ATTENTION): ## Quantization -Quantization reduces the size of model weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by GPU memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can increase latency slightly (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. 
+Quantization reduces the size of model weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by GPU memory. + +If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can increase latency slightly (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. > [!TIP] > There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index 3ef1222b4cb2..55ca7f425c9d 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -18,30 +18,29 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Text generation is one of the most popular applications of large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token. +Text generation is the most popular application for large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token. In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. -This guide will show you the basics of text generation with the [`~GenerationMixin.generate`] API and some common pitfalls to avoid. +This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid. -## Generate +## Default generate Before you begin, it's helpful to install [bitsandbytes](https://hf.co/docs/bitsandbytes/index) to quantize really large models to reduce their memory usage. ```bash -!pip install transformers bitsandbytes>0.39.0 -q +!pip install -U transformers bitsandbytes ``` Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more. -Load a LLM with [`~PreTrainedModel.from_pretrained`] and add the following two parameters to lessen the memory requirements. +Load a LLM with [`~PreTrainedModel.from_pretrained`] and add the following two parameters to reduce the memory requirements. -- `device_map="auto` enables Accelerate's [Big Model Inference](./models#big-model-inference) feature for automatically initiating the model skeleton and loading and dispatching the model weights across all available devices, starting with the fastest device (GPU). +- `device_map="auto"` enables Accelerates' [Big Model Inference](./models#big-model-inference) feature for automatically initiating the model skeleton and loading and dispatching the model weights across all available devices, starting with the fastest device (GPU). - `quantization_config` is a configuration object that defines the quantization settings. 
This examples uses bitsandbytes as the quantization backend (see the [Quantization](./quantization/overview) section for more available backends) and it loads the model in [4-bits](./quantization/bitsandbytes). ```py from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -# load model and set up quantization configuration quantization_config = BitsAndBytesConfig(load_in_4bit=True) model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto", quantization_config=quantization_config) ``` @@ -52,23 +51,92 @@ Tokenize your input, and set the [`~PreTrainedTokenizer.padding_side`] parameter > Process more than one prompt at a time by passing a list of strings to the tokenizer. Batch the inputs to improve throughput at a small cost to latency and memory. ```py -# tokenize input tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left") model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda") ``` -Pass the inputs to [`~GenerationMixin.generate`] to generate tokens, and then [`~PreTrainedTokenizer.batch_decode`] the generated tokens back to text. +Pass the inputs to [`~GenerationMixin.generate`] to generate tokens, and [`~PreTrainedTokenizer.batch_decode`] the generated tokens back to text. ```py -# generate and decode back to text generated_ids = model.generate(**model_inputs) tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] "A list of colors: red, blue, green, yellow, orange, purple, pink," ``` +## Generation configuration + +All generation settings are contained in [`GenerationConfig`]. In the example above, the generation settings are derived from the `generation_config.json` file of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1). A default decoding strategy is used when no configuration is saved with a model. + +Inspect the configuration through the `generation_config` attribute. It only shows values that are different from the default configuration, in this case, the `bos_token_id` and `eos_token_id`. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto") +model.generation_config +GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2 +} +``` + +You can customize [`~GenerationMixin.generate`] by overriding the parameters and values in [`GenerationConfig`]. Some of the most commonly adjusted parameters are [`~GenerationConfig.max_new_tokens`], [`~GenerationConfig.num_beams`], [`~GenerationConfig.do_sample`], and [`~GenerationConfig.num_return_sequences`]. + +```py +# enable beam search sampling strategy +model.generate(**inputs, num_beams=4, do_sample=True) +``` + +[`~GenerationMixin.generate`] can also be extended with external libraries or custom code. The `logits_processor` parameter accepts custom [`LogitsProcessor`] instances for manupulating the next token probability distribution. `stopping_criteria` supports custom [`StoppingCriteria`] to stop text generation. Check out the [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo) for more examples of external [`~GenerationMixin.generate`]-compatible extensions. + +Refer to the [Generation strategies](./generation_strategies) guide to learn more about search, sampling, and decoding strategies. + +### Saving + +Create an instance of [`GenerationConfig`] and specify the decoding parameters you want. 
+ +```py +from transformers import AutoModelForCausalLM, GenerationConfig + +model = AutoModelForCausalLM.from_pretrained("my_account/my_model") +generation_config = GenerationConfig( + max_new_tokens=50, do_sample=True, top_k=50, eos_token_id=model.config.eos_token_id +) +``` + +Use [`~GenerationConfig.save_pretrained`] to save a specific generation configuration and set the `push_to_hub` parameter to `True` to upload it to the Hub. + +```py +generation_config.save_pretrained("my_account/my_model", push_to_hub=True) +``` + +Leave the `config_file_name` parameter empty. This parameter should be used when storing multiple generation configurations in a single directory. It gives you a way to specify which generation configuration to load. You can create different configurations for different generative tasks (creative text generation with sampling, summarization with beam search) for use with a single model. + +```py +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig + +tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") +model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small") + +translation_generation_config = GenerationConfig( + num_beams=4, + early_stopping=True, + decoder_start_token_id=0, + eos_token_id=model.config.eos_token_id, + pad_token=model.config.pad_token_id, +) + +translation_generation_config.save_pretrained("/tmp", config_file_name="translation_generation_config.json", push_to_hub=True) + +generation_config = GenerationConfig.from_pretrained("/tmp", config_file_name="translation_generation_config.json") +inputs = tokenizer("translate English to French: Configuration files are easy to use!", return_tensors="pt") +outputs = model.generate(**inputs, generation_config=generation_config) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +``` + ## Pitfalls -The section below covers some common issues that you may encounter during text generation and how to solve them. +The section below covers some common issues you may encounter during text generation and how to solve them. ## Wrong output length diff --git a/docs/source/en/pipeline_gradio.md b/docs/source/en/pipeline_gradio.md index 7a917d1fa865..0cd65665d33d 100644 --- a/docs/source/en/pipeline_gradio.md +++ b/docs/source/en/pipeline_gradio.md @@ -48,5 +48,5 @@ The Space below is created with the code above and hosted on Spaces. src="https://stevhliu-gradio-pipeline-demo.hf.space" frameborder="0" width="850" - height="450" + height="850" > diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index 7460743e3bcc..e6857ce29708 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Pipeline -The [`Pipeline`] is a simple but powerful inference API that is readily available for a variety of machine learning tasks with any model from the Hugging Face [Hub](https://hf.co/models). Tailor the [`Pipeline`] to your task with certain task specific parameters, such as adding timestamps to an automatic speech recognition (ASR) pipeline for transcribing meeting notes. [`Pipeline`] supports GPUs, Apple silicon, and half-precision weights to accelerate inference and save memory. +The [`Pipeline`] is a simple but powerful inference API that is readily available for a variety of machine learning tasks with any model from the Hugging Face [Hub](https://hf.co/models). 
+ +Tailor the [`Pipeline`] to your task with task specific parameters such as adding timestamps to an automatic speech recognition (ASR) pipeline for transcribing meeting notes. [`Pipeline`] supports GPUs, Apple Silicon, and half-precision weights to accelerate inference and save memory. @@ -45,11 +47,11 @@ pipeline(["the secret to baking a really good cake is ", "a baguette is "]) [{'generated_text': 'a baguette is 100% bread.\n\na baguette is 100%'}]] ``` -This guide will introduce you to the [`Pipeline`], demonstrate its features, and show you how to configure its various parameters. +This guide will introduce you to the [`Pipeline`], demonstrate its features, and show how to configure its various parameters. ## Tasks -[`Pipeline`] is compatible with many machine learning tasks across different modalities. You just need to pass an appropriate input to the pipeline and it will handle the rest. +[`Pipeline`] is compatible with many machine learning tasks across different modalities. Pass an appropriate input to the pipeline and it will handle the rest. Here are some examples of how to use [`Pipeline`] for different tasks and modalities. @@ -111,13 +113,13 @@ pipeline( ## Parameters -At a minimum, a [`Pipeline`] only requires a task identifier, model, and the appropriate input. But there are many parameters available to configure the pipeline with, from task-specific parameters to optimizing performance. +At a minimum, [`Pipeline`] only requires a task identifier, model, and the appropriate input. But there are many parameters available to configure the pipeline with, from task-specific parameters to optimizing performance. -This section will walk you through some of the more important parameters. +This section introduces you to some of the more important parameters. ### Device -[`Pipeline`] is compatible with many hardware types, including GPUs, CPUs, Apple silicon, and more. This is configured with the `device` parameter. By default, [`Pipeline`] runs on a CPU which is given by `device=-1`. +[`Pipeline`] is compatible with many hardware types, including GPUs, CPUs, Apple Silicon, and more. Configure the hardware type with the `device` parameter. By default, [`Pipeline`] runs on a CPU which is given by `device=-1`. @@ -134,7 +136,7 @@ pipeline("the secret to baking a really good cake is ") You could also let [Accelerate](https://hf.co/docs/accelerate/index), a library for distributed training, automatically choose how to load and store the model weights on the appropriate device. This is especially useful if you have multiple devices. Accelerate loads and stores the model weights on the fastest device first, and then moves the weights to other devices (CPU, hard drive) as needed. Set `device_map="auto"` to let Accelerate choose the device. > [!TIP] -> Make sure you have [Accelerate](https://hf.co/docs/accelerate/basic_tutorials/install) installed. +> Make sure have [Accelerate](https://hf.co/docs/accelerate/basic_tutorials/install) is installed. > > ```py > !pip install -U accelerate @@ -164,9 +166,9 @@ pipeline("the secret to baking a really good cake is ") ### Batch inference -[`Pipeline`] can also process batches of inputs with the `batch_size` parameter. Batch inference may improve speed, especially on a GPU, but it isn't guaranteed to. Other variables such as hardware, data, and the model itself can affect whether batch inference improves speed. For this reason, batch inference is disabled by default. +[`Pipeline`] can also process batches of inputs with the `batch_size` parameter. 
Batch inference may improve speed, especially on a GPU, but it isn't guaranteed. Other variables such as hardware, data, and the model itself can affect whether batch inference improves speed. For this reason, batch inference is disabled by default. -In this example, when there are 4 inputs and `batch_size` is set to 2, [`Pipeline`] passes a batch of 2 inputs to the model at a time. +In the example below, when there are 4 inputs and `batch_size` is set to 2, [`Pipeline`] passes a batch of 2 inputs to the model at a time. ```py from transformers import pipeline @@ -179,7 +181,7 @@ pipeline(["the secret to baking a really good cake is", "a baguette is", "paris [{'generated_text': 'hotdogs are a staple of the american diet. they are a great source of protein and can'}]] ``` -Another good use case for batch inference is when you stream data in [`Pipeline`]. +Another good use case for batch inference is for streaming data in [`Pipeline`]. ```py from transformers import pipeline @@ -193,25 +195,25 @@ for out in pipeline(KeyDataset(dataset, "text"), batch_size=8, truncation="only_ print(out) ``` -Here are some general rules of thumb for determining whether batch inference can help improve performance. +Keep the following general rules of thumb in mind for determining whether batch inference can help improve performance. 1. The only way to know for sure is to measure performance on your model, data, and hardware. 2. Don't batch inference if you're constrained by latency (a live inference product for example). 3. Don't batch inference if you're using a CPU. 4. Don't batch inference if you don't know the `sequence_length` of your data. Measure performance, iteratively add to `sequence_length`, and include out-of-memory (OOM) checks to recover from failures. -5. Do batch inference if your `sequence_length` is regular, and keep pushing it until you reach an OOM error. The larger the GPU, the more likely batch inference is to be beneficial. +5. Do batch inference if your `sequence_length` is regular, and keep pushing it until you reach an OOM error. The larger the GPU, the more helpful batch inference is. 6. Do make sure you can handle OOM errors if you decide to do batch inference. ### Task-specific parameters -The [`Pipeline`] accepts any parameters that are supported by each individual task pipeline. Make sure to check out each individual task pipeline to see what type of parameters are available. If you can't find a parameter that would be useful for your use case, please feel free to open a GitHub [issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml) to request it! +[`Pipeline`] accepts any parameters that are supported by each individual task pipeline. Make sure to check out each individual task pipeline to see what type of parameters are available. If you can't find a parameter that is useful for your use case, please feel free to open a GitHub [issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml) to request it! -Here are some examples of enabling these task-specific parameters in [`Pipeline`]. +The examples below demonstrate some of the task-specific parameters available. -The [`AutomaticSpeechRecognitionPipeline.__call__`] method has a `return_timestamps` parameter that returns when each word was spoken by setting it to `"word"`. This parameter can be passed along to [`Pipeline`]. 
+Pass the `return_timestamps="word"` parameter to [`Pipeline`] to return when each word was spoken. ```py from transformers import pipeline @@ -245,9 +247,9 @@ pipeline(audio="https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/ml -The [`~TextGenerationPipeline.__call__`] method has a `return_full_text` parameter that determines whether to return the full text or only the generated text. Set it to `False` to only return the generated text. +Pass `return_full_text=False` to [`Pipeline`] to only return the generated text instead of the full text (prompt and generated text). -[`~TextGenerationPipeline.__call__`] also additional keyword arguments from the [`~GenerationMixin.generate`] method, which itself takes generation configuration parameters from [`GenerationConfig`]. To return more than one generated sequence, set `num_return_sequences` to a value greater than 1. Pass this parameter to [`Pipeline`]. +[`~TextGenerationPipeline.__call__`] also supports additional keyword arguments from the [`~GenerationMixin.generate`] method. To return more than one generated sequence, set `num_return_sequences` to a value greater than 1. ```py from transformers import pipeline @@ -270,36 +272,27 @@ There are some instances where you need to process data in chunks. - for some data types, a single input (for example, a really long audio file) may need to be chunked into multiple parts before it can be processed - for some tasks, like zero-shot classification or question answering, a single input may need multiple forward passes which can cause issues with the `batch_size` parameter -The [`ChunkPipeline`] class is designed to handle these use cases. Both pipeline classes are used in the same way, but since [`ChunkPipeline`] can automatically handle batching you don't need to worry about the number of forward passes your inputs trigger. Instead, you can optimize `batch_size` independently of the inputs. - -Here is how it differs from a regular [`Pipeline`]. +The [ChunkPipeline](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/pipelines/base.py#L1387) class is designed to handle these use cases. Both pipeline classes are used in the same way, but since [ChunkPipeline](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/pipelines/base.py#L1387) can automatically handle batching, you don't need to worry about the number of forward passes your inputs trigger. Instead, you can optimize `batch_size` independently of the inputs. - - +The example below shows how it differs from [`Pipeline`]. ```py +# ChunkPipeline all_model_outputs = [] for preprocessed in pipeline.preprocess(inputs): model_outputs = pipeline.model_forward(preprocessed) all_model_outputs.append(model_outputs) outputs =pipeline.postprocess(all_model_outputs) -``` - - - -```py +# Pipeline preprocessed = pipeline.preprocess(inputs) model_outputs = pipeline.forward(preprocessed) outputs = pipeline.postprocess(model_outputs) ``` - - - ## Large datasets -For inference on large datasets, you can iterate directly over the dataset. This avoids immediately allocating memory for the entire dataset, and you don't need to worry about creating batches yourself. As mentioned in the [Batch inference](#batch-inference) section, you can try using the `batch_size` parameter to see if it improves performance. +For inference with large datasets, you can iterate directly over the dataset itself. 
This avoids immediately allocating memory for the entire dataset, and you don't need to worry about creating batches yourself. Try [Batch inference](#batch-inference) with the `batch_size` parameter to see if it improves performance. ```py from transformers.pipelines.pt_utils import KeyDataset @@ -333,14 +326,14 @@ for out in pipeline(data()): !pip install -U accelerate ``` -As mentioned in the [Device](#device) section, the `device_map="auto"` setting is useful for automatically distributing the model across the fastest devices (GPUs) first before dispatching to other slower devices if available (CPU, hard drive). +The `device_map="auto"` setting is useful for automatically distributing the model across the fastest devices (GPUs) first before dispatching to other slower devices if available (CPU, hard drive). -[`Pipeline`] supports half-precision weights, torch.float16, which can be significantly faster and save memory. Performance loss is negligible for most models, especially for larger models. If your hardware supports it, you can enable torch.bfloat16 instead for more range. +[`Pipeline`] supports half-precision weights (torch.float16), which can be significantly faster and save memory. Performance loss is negligible for most models, especially for larger ones. If your hardware supports it, you can enable torch.bfloat16 instead for more range. > [!TIP] -> Inputs are internally converted to torch.float16, and it only works for models with a PyTorch backend. +> Inputs are internally converted to torch.float16 and it only works for models with a PyTorch backend. -Lastly, [`Pipeline`] also accepts quantized models to really reduce memory usage even further. Make sure you have the [bitsandbytes](https://hf.co/docs/bitsandbytes/installation) library installed first, and then add `load_in_8bit=True` to `model_kwargs` in the pipeline. +Lastly, [`Pipeline`] also accepts quantized models to reduce memory usage even further. Make sure you have the [bitsandbytes](https://hf.co/docs/bitsandbytes/installation) library installed first, and then add `load_in_8bit=True` to `model_kwargs` in the pipeline. ```py import torch diff --git a/docs/source/en/pipeline_webserver.md b/docs/source/en/pipeline_webserver.md index b081d3af615d..5782cd13f5b3 100644 --- a/docs/source/en/pipeline_webserver.md +++ b/docs/source/en/pipeline_webserver.md @@ -16,11 +16,11 @@ rendered properly in your Markdown viewer. # Web server inference -A web server is basically a system that waits for requests and serves them as they come in. This means you can use [`Pipeline`] as an inference engine on the web server, since you can use an iterator (similar to how you would [iterate over a dataset](./pipeline_tutorial#large-datasets)) to handle each incoming request. +A web server is a system that waits for requests and serves them as they come in. This means you can use [`Pipeline`] as an inference engine on a web server, since you can use an iterator (similar to how you would [iterate over a dataset](./pipeline_tutorial#large-datasets)) to handle each incoming request. Designing a web server with [`Pipeline`] is unique though because they're fundamentally different. Web servers are multiplexed (multithreaded, async, etc.) to handle multiple requests concurrently. [`Pipeline`] and its underlying model on the other hand are not designed for parallelism because they take a lot of memory. It's best to give a [`Pipeline`] all the available resources when they're running or for a compute intensive job. 
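The iterator point above can be sketched without any web framework: a plain Python generator stands in for text pulled off a request queue, and [`Pipeline`] consumes it the same way it iterates over a dataset. The task and checkpoint below (`fill-mask` with `google-bert/bert-base-uncased`) are assumptions for illustration only.

```py
from transformers import pipeline

pipe = pipeline(task="fill-mask", model="google-bert/bert-base-uncased")

def incoming_requests():
    # stand-in for strings pulled off a real request queue
    yield "Paris is the [MASK] of France."
    yield "The Eiffel Tower is located in [MASK]."

# the pipeline consumes the generator lazily, one request at a time
for output in pipe(incoming_requests()):
    print(output[0]["sequence"])
```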
-This guide shows how to work around this difference by using the web server to handle the light load of receiving and sending requests, and having a single thread to handle the heavier load of running [`Pipeline`]. +This guide shows how to work around this difference by using a web server to handle the lighter load of receiving and sending requests, and having a single thread to handle the heavier load of running [`Pipeline`]. ## Create a server @@ -77,7 +77,7 @@ Start the server with the following command. uvicorn server:app ``` -The server can be queried now with a POST request. +Query the server with a POST request. ```bash curl -X POST -d "Paris is the [MASK] of France." http://localhost:8000/ @@ -105,12 +105,12 @@ curl -X POST -d "Paris is the [MASK] of France." http://localhost:8000/ ## Queuing requests -The server's queuing mechanism can be used for some interesting applications such as dynamic batching. With dynamic batching, you can accumulate several requests first before processing them with the [`Pipeline`]. +The server's queuing mechanism can be used for some interesting applications such as dynamic batching. Dynamic batching accumulates several requests first before processing them with [`Pipeline`]. The example below is written in pseudocode for readability rather than performance, in particular, you'll notice that: 1. There is no batch size limit. -2. The timeout is reset on every queue fetch, so you could end up waiting much longer than the `timeout` value before processing a request. This would also delay the first inference request by that amount of time. The web server always waits 1ms even if the queue is empty, which is inefficient, because you could be using that time to start inference. It could make sense though if batching is essential to your use case. +2. The timeout is reset on every queue fetch, so you could end up waiting much longer than the `timeout` value before processing a request. This would also delay the first inference request by that amount of time. The web server always waits 1ms even if the queue is empty, which is inefficient, because that time can be used to start inference. It could make sense though if batching is essential to your use case. It would be better to have a single 1ms deadline, instead of resetting it on every fetch. @@ -133,21 +133,21 @@ for rq, out in zip(queues, outs): ## Error checking -There are many things that can go wrong in production. You could run out-of-memory, out of space, fail to load a model, have an incorrect model configuration, have an incorrect query, and so much more! +There are many things that can go wrong in production. You could run out-of-memory, out of space, fail to load a model, have an incorrect model configuration, have an incorrect query, and so much more. -Adding `try...except` statements could be helpful to return these errors to the user for debugging. Keep in mind that this could pose a security risk though if you shouldn't be revealing certain information. +Adding `try...except` statements is helpful for returning these errors to the user for debugging. Keep in mind this could be a security risk if you shouldn't be revealing certain information. ## Circuit breaking -It is better to return errors when the server is overloaded instead of forcing a user to wait indefinitely. Try to return a 503 or 504 error instead of making a user wait for a really long time. +Try to return a 503 or 504 error when the server is overloaded instead of forcing a user to wait indefinitely. 
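The sketch below shows what such a check could look like. It is an illustrative variant rather than the server code from this guide: the Starlette app, the queue name `q`, and the `MAX_QUEUE_SIZE` threshold are all assumptions, and the blocking `pipe(text)` call in the loop is kept deliberately simple (see the note on blocking the main thread later in this guide).

```py
import asyncio

from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route
from transformers import pipeline

MAX_QUEUE_SIZE = 32  # illustrative threshold, tune for your hardware
q = asyncio.Queue()
pipe = pipeline(task="fill-mask", model="google-bert/bert-base-uncased")

async def homepage(request):
    payload = (await request.body()).decode()
    if q.qsize() >= MAX_QUEUE_SIZE:
        # shed load with a 503 instead of letting requests pile up indefinitely
        return JSONResponse({"error": "server overloaded, try again later"}, status_code=503)
    response_q = asyncio.Queue()
    await q.put((payload, response_q))
    return JSONResponse(await response_q.get())

async def server_loop():
    while True:
        text, response_q = await q.get()
        # blocking call, kept simple for the sketch
        await response_q.put(pipe(text))

async def startup():
    asyncio.create_task(server_loop())

app = Starlette(routes=[Route("/", homepage, methods=["POST"])], on_startup=[startup])
```

Assuming the file is saved as `server.py`, it runs the same way as before with `uvicorn server:app`.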
-It is relatively simple to implement these error types since it's only a single queue. You should look at the queue size to determine when to start returning errors before your server fails under load. +It is relatively simple to implement these error types since it's only a single queue. Take a look at the queue size to determine when to start returning errors before your server fails under load. ## Block the main thread PyTorch is not async aware, so computation will block the main thread from running. -For this reason, it's better to run PyTorch on its own separate thread or process. When inference of a single request is especially long (> 1s), it's even more important because it means every query during inference must wait 1s before even receiving an error. +For this reason, it's better to run PyTorch on its own separate thread or process. When inference of a single request is especially long (more than 1s), it's even more important because it means every query during inference must wait 1s before even receiving an error. ## Dynamic batching diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md index 0b7f9a917ff7..e2e688629284 100644 --- a/docs/source/en/tasks/prompting.md +++ b/docs/source/en/tasks/prompting.md @@ -18,9 +18,9 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Prompt engineering or prompting, refers to using natural language to improve large language model (LLM) performance on a variety of tasks. LLMs have tremendous capacity as a result of their training and size, such that a prompt can steer the model towards generating a desired output. In many cases ([but not all](#finetuning)), you don't need a finetuned model for a task, you just need a good prompt. +Prompt engineering or prompting, uses natural language to improve large language model (LLM) performance on a variety of tasks. A prompt can steer the model towards generating a desired output. In many cases, you don't even need a [fine-tuned](#finetuning) model for a task. You just need a good prompt. -Try prompting a LLM to classify some text. When you create a prompt, it's very important to provide specific instructions about the task you want to perform and what the result should look like. +Try prompting a LLM to classify some text. When you create a prompt, it's important to provide very specific instructions about the task and what the result should look like. ```py from transformers import pipeline @@ -41,13 +41,13 @@ Sentiment: Positive ``` -The challenge lies in designing prompts that produces the results you're expecting, which can be tricky, because language is so incredibly nuanced and expressive. +The challenge lies in designing prompts that produces the results you're expecting because language is so incredibly nuanced and expressive. This guide covers prompt engineering best practices, techniques, and examples for how to solve language and reasoning tasks. ## Best practices -1. Try to pick the latest models for the best performance. Keep in mind that LLMs can come in two flavors, [base](https://hf.co/mistralai/Mistral-7B-v0.1) and [instruction-tuned](https://hf.co/mistralai/Mistral-7B-Instruct-v0.1) (or chat). +1. Try to pick the latest models for the best performance. Keep in mind that LLMs can come in two variants, [base](https://hf.co/mistralai/Mistral-7B-v0.1) and [instruction-tuned](https://hf.co/mistralai/Mistral-7B-Instruct-v0.1) (or chat). 
Base models are excellent at completing text given an initial prompt, but they're not as good at following instructions. Instruction-tuned models are specifically trained versions of the base models on instructional or conversational data. This makes instruction-tuned models a better fit for prompting. @@ -60,11 +60,11 @@ This guide covers prompt engineering best practices, techniques, and examples fo 4. Clearly separate instructions from the text of interest. -5. Be specific and descriptive about the task and the desired output, including for example, its format, length, style, and language. Avoid ambiguous and vague descriptions and instructions. +5. Be specific and descriptive about the task and the desired output, including for example, its format, length, style, and language. Avoid ambiguous descriptions and instructions. 6. Instructions should focus on "what to do" rather than "what not to do". -7. Help lead the model generate the correct output by writing the first word or even the first sentence. +7. Lead the model to generate the correct output by writing the first word or even the first sentence. 8. Try other techniques like [few-shot](#few-shot) and [chain-of-thought](#chain-of-thought) to improve results. @@ -76,13 +76,13 @@ This guide covers prompt engineering best practices, techniques, and examples fo Crafting a good prompt alone, also known as zero-shot prompting, may not be enough to get the results you want. You may need to try a few prompting techniques to get the best performance. -This section covers a few of these techniques. +This section covers a few prompting techniques. ### Few-shot -Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to the model to see how it affects performance. +Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. -The example below provides the model with 1 example (1-shot) of the output format, a date in MM/DD/YYYY format, it should return. +The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return. ```py from transformers import pipeline @@ -109,7 +109,7 @@ The downside of few-shot prompting is that you need to create lengthier prompts Chain-of-thought (CoT) is effective at generating more coherent and well-reasoned outputs by providing a series of prompts that help a model "think" more thoroughly about a topic. -The example below provides the model with several prompts that forces it to work through several intermediate reasoning steps. +The example below provides the model with several prompts to work through intermediate reasoning steps. ```py from transformers import pipeline @@ -137,24 +137,24 @@ If you eat 6 muffins, how many are left? Answer: 6 ``` -Like [few-shot](#few-shot) prompting, the downside of CoT is that it requires more effort to design a series of prompts that help the model reason through a complex task and the prompt length increases latency. 
+Like [few-shot](#few-shot) prompting, the downside of CoT is that it requires more effort to design a series of prompts that help the model reason through a complex task and prompt length increases latency. -## Finetuning +## Fine-tuning -While prompting is a powerful way to work with LLMs, there are scenarios where a finetuned model or even finetuning a model works better. +While prompting is a powerful way to work with LLMs, there are scenarios where a fine-tuned model or even fine-tuning a model works better. -Here are some examples scenarios where a finetuned model makes sense. +Here are some examples scenarios where a fine-tuned model makes sense. - Your domain is extremely different from what a LLM was pretrained on, and extensive prompting didn't produce the results you want. - Your model needs to work well in a low-resource language. - Your model needs to be trained on sensitive data that have strict regulatory requirements. - You're using a small model due to cost, privacy, infrastructure, or other constraints. -In all of these scenarios, ensure that you have a large enough domain-specific dataset to train your model with, have enough time and resources, and the cost of finetuning is worth it. Otherwise, you may be better off trying to optimize your prompt! +In all of these scenarios, ensure that you have a large enough domain-specific dataset to train your model with, have enough time and resources, and the cost of fine-tuning is worth it. Otherwise, you may be better off trying to optimize your prompt. ## Examples -Here are some examples of prompting a LLM for different tasks. +The examples below demonstrate prompting a LLM for different tasks. diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md index 37bb05775360..4cddd02a8804 100644 --- a/docs/source/en/tiktoken.md +++ b/docs/source/en/tiktoken.md @@ -55,5 +55,5 @@ Visualize how the tiktoken tokenizer works by selecting Llama3 in the Tokenizer src="https://xenova-the-tokenizer-playground.static.hf.space" frameborder="0" width="850" - height="600" + height="850" > diff --git a/docs/source/en/training.md b/docs/source/en/training.md index f31ab8717f6f..a666c4938307 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -173,3 +173,96 @@ model.fit(tf_dataset) ## Resources Refer to the Transformers [examples](https://github.com/huggingface/transformers/tree/main/examples) for more detailed training scripts on various tasks. You can also check out the [notebooks](./notebooks) for interactive examples. + +### Fine-Tuning with torch.compile and Padding-Free Data Collation + +In addition to optimizing inference, you can also enhance the training efficiency of large language models by leveraging torch.compile during fine-tuning and using a padding-free data collator. This approach can significantly speed up training and reduce computational overhead. 
+ +Here's how you can fine-tune a Llama model using SFTTrainer from the TRL library, with torch_compile enabled and a padding-free data collator: + +``` +#################### IMPORTS ################### + +import math +import datasets +import dataclasses +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + TrainingArguments +) +from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM + +#################### MODEL LOADING WITH FLASH ATTENTION ################### + +model_name = "meta-llama/Llama-3.2-1B" +model = AutoModelForCausalLM.from_pretrained( + model_name, + attn_implementation="flash_attention_2" # Enables FlashAttention-2 +) +tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + +#################### DATA PREPROCESSING (PADDING-FREE) ################### + +response_template = "\n### Label:" +response_template_ids = tokenizer.encode( + response_template, add_special_tokens=False +)[2:] # Exclude special tokens + +data_collator = DataCollatorForCompletionOnlyLM( + response_template_ids=response_template_ids, + tokenizer=tokenizer, + ignore_index=-100, + padding_free=True # Enables padding-free collation +) + +def format_dataset(example): + return { + "output": example["output"] + tokenizer.eos_token + } + +data_files = {"train": "path/to/dataset"} # Replace with your dataset path +json_dataset = datasets.load_dataset("json", data_files=data_files) +formatted_train_dataset = json_dataset["train"].map(format_dataset) + +################# TRAINING CONFIGURATION ############################ + +train_args = TrainingArguments( + num_train_epochs=5, + per_device_train_batch_size=4, + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, + learning_rate=1e-5, + weight_decay=0.0, + warmup_ratio=0.03, + lr_scheduler_type="cosine", + logging_steps=1, + include_tokens_per_second=True, + save_strategy="epoch", + output_dir="output", + torch_compile=True, # Enables torch.compile + torch_compile_backend="inductor", + torch_compile_mode="default" +) + +# Convert TrainingArguments to SFTConfig +transformer_train_arg_fields = [x.name for x in dataclasses.fields(SFTConfig)] +transformer_kwargs = { + k: v + for k, v in train_args.to_dict().items() + if k in transformer_train_arg_fields +} +training_args = SFTConfig(**transformer_kwargs) + +####################### FINE-TUNING ##################### + +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=formatted_train_dataset, + data_collator=data_collator, + dataset_text_field="output", + args=training_args, +) +trainer.train() +``` \ No newline at end of file From cf71516edd4d9c41fde5705a3c783d2f09d583d0 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 21 Jan 2025 10:18:56 -0800 Subject: [PATCH 095/116] fix toctree --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 972646f305a0..df4870d3f37b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -68,9 +68,9 @@ title: Text generation - local: generation_strategies title: Generation strategies - - local: tasks/prompting - local: generation_features title: Generation features + - local: tasks/prompting title: Prompt engineering - local: llm_optims title: Optimizing inference From a06a55887c47aa8856b263ffea1da630a3d76255 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 21 Jan 2025 17:32:40 -0800 Subject: [PATCH 096/116] reviews reviews --- docs/source/en/_toctree.yml | 2 + 
docs/source/en/chat_extras.md | 12 +- docs/source/en/chat_templating_multimodal.md | 183 +++++++++++++++++++ docs/source/en/chat_templating_writing.md | 120 +++++++++++- docs/source/en/conversations.md | 22 ++- docs/source/en/perf_torch_compile.md | 12 +- 6 files changed, 322 insertions(+), 29 deletions(-) create mode 100644 docs/source/en/chat_templating_multimodal.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index df4870d3f37b..7034113389db 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -88,6 +88,8 @@ title: Chat pipeline - local: chat_templating title: Templates + - local: chat_templating_multimodal + title: Multimodal templates - local: chat_templating_writing title: Template writing - local: chat_extras diff --git a/docs/source/en/chat_extras.md b/docs/source/en/chat_extras.md index 697f80d218ff..13f89eb39fc8 100644 --- a/docs/source/en/chat_extras.md +++ b/docs/source/en/chat_extras.md @@ -60,7 +60,7 @@ def get_current_wind_speed(location: str) -> float: tools = [get_current_temperature, get_current_wind_speed] ``` -Load a model and tokenizer that supports tool use like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B), but you can also consider a larger model like [Command-R](./model_doc/cohere) and [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it. +Load a model and tokenizer that supports tool-use like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B), but you can also consider a larger model like [Command-R](./model_doc/cohere) and [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it. ```py import torch @@ -80,7 +80,7 @@ messages = [ ] ``` -Use [`~PreTrainedTokenizerBase.apply_chat_template`] on the messages and pass the list of tools to the `tools` parameter. Then you can pass the inputs to the model for generation. +Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Then you can pass the inputs to the model for generation. ```py inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") @@ -144,11 +144,11 @@ print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) -### Schema +## Schema -[`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the model chat template. A LLM never sees the code inside the function. In other words, a LLM doesn't care how the model works technically, it only cares about function **definition** and **arguments**. +[`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the chat template. A LLM never sees the code inside the function. In other words, a LLM doesn't care how the model works technically, it only cares about function **definition** and **arguments**. -The JSON schema is automatically generated behind the scenes as long as your function follows the rules listed earlier above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually convert a schema for more visibility or debugging. 
+The JSON schema is automatically generated behind the scenes as long as your function follows the [rules](#tools) listed earlier above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually convert a schema for more visibility or debugging. ```py from transformers.utils import get_json_schema @@ -191,7 +191,7 @@ print(schema) } ``` -You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to definie precise schemas for more complex functions. +You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to define precise schemas for more complex functions. > [!WARNING] > Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions for example with nested arguments. diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md new file mode 100644 index 000000000000..509cf452819b --- /dev/null +++ b/docs/source/en/chat_templating_multimodal.md @@ -0,0 +1,183 @@ + + +# Multimodal templates + +Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`. + +Multimodal templates are included in the [Processor](./processors) class and requires an additional `type` key for specifying whether the included content is an image, video, or text. + +This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template + +## ImageTextToTextPipeline + +[`ImageTextToTextPipeline`] is a high-level image and text generation class with a “chat mode”. Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format). + +Start by building a chat history with the following two roles. + +- `system` describes how the model should behave and respond when you’re chatting with it. This role isn’t supported by all chat models. +- `user` is where you enter your first message to the model. + +```py +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": "What are these?"}, + ], + }, +] +``` + +Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. + +> [!TIP] +> [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible. 
+ +```python +import torch +from transformers import pipeline + +pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16) +pipeline(text=messages, max_new_tokens=50, return_full_text=False) +[{'input_text': [{'role': 'system', + 'content': [{'type': 'text', + 'text': 'You are a friendly chatbot who always responds in the style of a pirate'}]}, + {'role': 'user', + 'content': [{'type': 'image', + 'url': 'http://images.cocodataset.org/val2017/000000039769.jpg'}, + {'type': 'text', 'text': 'What are these?'}]}], + 'generated_text': 'The image shows two cats lying on a pink surface, which appears to be a cushion or a soft blanket. The cat on the left has a striped coat, typical of tabby cats, and is lying on its side with its head resting on the'}] +``` + +## Image inputs + +For multimodal models that accept images like [LLaVA](./model_doc/llava), include the following in `content` as shown below. + +- `"type": "image"` means the content is an image. +- `"url": ""` is a link to the image, but it could also be a file path (`"path"`). Images are automatically loaded, processed, and prepared into pixel values as inputs to the model. + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf") + +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": "What are these?"}, + ], + }, +] +``` + +Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content and return the `input_ids` and `pixel_values`. + +```py +processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt") +print(processed_chat.keys()) +``` + +These inputs are now ready to be used in [`~GenerationMixin.generate`]. + +## Video inputs + +Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs). + +- `"type": "video"` means the content is a video. +- `"url": ""` is a link to the video, , but it could also be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord). + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"}, + {"type": "text", "text": "What do you see in this video?"}, + ], + }, +] +``` + +Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`]. 
+ +- `num_frames` controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling. +- `video_load_backend` refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html). + +The example below uses Decord as the backend because it is a bit faster than PyAV. + +```python +processed_chat = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + num_frames=32, + video_load_backend="decord", +) +print(processed_chat.keys()) +``` + +These inputs are now ready to be used in [`~GenerationMixin.generate`]. + +## Template configuration + +You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details. + +For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json). + +```jinja +{% for message in messages %} +{% if loop.index0 == 0 %}{{ bos_token }}{% endif %} +{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} +{% if message['content'] is string %} +{{ message['content'] }} +{% else %} +{% for content in message['content'] %} +{% if content['type'] == 'image' %} +{{ '<|image|>' }} +{% elif content['type'] == 'text' %} +{{ content['text'] }} +{% endif %} +{% endfor %} +{% endif %} +{{ '<|eot_id|>' }} +{% endfor %} +{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} +``` diff --git a/docs/source/en/chat_templating_writing.md b/docs/source/en/chat_templating_writing.md index dd793e611b2d..fbcec9f71c01 100644 --- a/docs/source/en/chat_templating_writing.md +++ b/docs/source/en/chat_templating_writing.md @@ -19,8 +19,8 @@ rendered properly in your Markdown viewer. A chat template is a [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) template stored in the tokenizers [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. A chat template performs the following three roles. 1. Print the role enclosed in `<|` and `|>` (`<|user|>`, `<|assistant|>`, etc.). -2. Print the message followed by an end-of-sequence (EOS) token. -3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model knows to generate an assistant response. +2. Print the message followed by an end-of-sequence (`EOS`) token. +3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model generates an assistant response. 
An example template is shown below. @@ -34,7 +34,7 @@ An example template is shown below. {%- endif %} ``` -The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and some template writing tips. +The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and includes template writing tips. ## Create a template @@ -60,7 +60,7 @@ template = template.replace("SYS", "SYSTEM") # Change the system token tokenizer.chat_template = template # Set the new template ``` -The template is saved in the `tokenizer_config.json` file. Save it to the Hub with [`~PushToHubMixin.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model. +The template is saved in the `tokenizer_config.json` file. Upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model. ```py tokenizer.push_to_hub("model_name") @@ -109,7 +109,7 @@ There are two callable functions available inside a template. ### Compatibility with non-Python Jinja -Jinka is implemented in multiple languages. and they generally have the same syntax. Writing a template in Python allows you to use Python methods such as [lower](https://docs.python.org/3/library/stdtypes.html#str.lower) on strings or [items](https://docs.python.org/3/library/stdtypes.html#dict.items) on dicts. But this won't work if the template is used in a non-Python implementation, for example, when deploying with Javascript or Rust. +Jinja is implemented in multiple languages and they generally have the same syntax. Writing a template in Python allows you to use Python methods such as [lower](https://docs.python.org/3/library/stdtypes.html#str.lower) on strings or [items](https://docs.python.org/3/library/stdtypes.html#dict.items) on dicts. But this won't work if the template is used in a non-Python implementation, for example, when deploying with Javascript or Rust. Make the changes below to ensure compatibility across all Jinja implementations. @@ -119,7 +119,7 @@ Make the changes below to ensure compatibility across all Jinja implementations. ### Big templates -Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates than can be more than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues. +Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file corresponds exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues. Write the template in a separate file and extract it to the chat template. @@ -133,9 +133,115 @@ You could also load an edited template back into the tokenizer. tokenizer.chat_template = open("template.jinja").read() ``` +## Templates for tools + +There isn't a specific format for writing templates for tools but it is best to follow the standard API. 
This ensures the template is widely accessible across models without requiring users to write custom code to use tools with your model. + +> [!WARNING] +> Formatting such as whitespace and special tokens are model-specific. Make sure everything exactly matches the format a model was trained with. + +The following section lists elements of the standard API for writing templates for tools. + +### Tool definitions + +Transformers chat template methods allow a user to pass tools as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. The `tools` variable in a template always takes a list of JSON schemas. + +The specific tokens and tool descriptions should match the ones your model was trained with. Your model doesn't need to understand the JSON schema input because your template can translate the JSON schema into your models format. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers, but the Command-R tool template accepts JSON schemas. The template internally converts types and renders the input tools as Python headers. + +```json +{ + "type": "function", + "function": { + "name": "multiply", + "description": "A function that multiplies two numbers", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number", + "description": "The first number to multiply" + }, + "b": { + "type": "number", + "description": "The second number to multiply" + } + }, + "required": ["a", "b"] + } + } +} +``` + +An example for handling tool definitions in a chat template is shown below. The specific tokens and tool descriptions should be changed to match the ones a model was trained with. + +``` +{%- if tools %} + {%- for tool in tools %} + {{- '' + tool['function']['name'] + '\n' }} + {%- for argument in tool['function']['parameters']['properties'] %} + {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} + {%- endfor %} + {{- '\n' }} + {%- endif %} +{%- endif %} +``` + +### Tool calls + +Tool calls, if present, is a list with the `"assistant”` role. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element. + +```json +{ + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "multiply", + "arguments": { + "a": 5, + "b": 6 + } + } + } + ] +} +``` + +A common pattern for handling tool calls is shown below. + +``` +{%- if message['role'] == 'assistant' and 'tool_calls' in message %} + {%- for tool_call in message['tool_calls'] %} + {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} + {%- endif %} + {%- endfor %} +{%- endif %} +``` + +### Tool responses + +Tool responses are a message dict with the `role`, `name` (name of the function) and `content` (result of the tool call) keys. + +```json +{ + "role": "tool", + "name": "multiply", + "content": "30" +} +``` + +Not all the keys need to be used in the tool response. For example, if a model doesn’t expect the function name to be included in the tool response, then you can just include the `role` and `content`. + +``` +{%- if message['role'] == 'tool' %} + {{- "" + message['content'] + "" }} +{%- endif %} +``` + ## Contribute -Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. 
If it works as expected, then you can upload it to the Hub with with [`~PushToHubMixin.push_to_hub`].
+Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then you can upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`].
 
 Even if you're not the model owner, it is still helpful to add a template for a model with an empty chat template or a model that is using a default class template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template.
diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md
index 5548063fa606..627d72465a47 100644
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@@ -16,18 +16,18 @@ rendered properly in your Markdown viewer.
 
 # Chat pipeline
 
-Chat models are conversational models that you can send and receive messages with. There are many chat models available to choose from, but in general, larger models tend to be more capable though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Some mixture-of-expert (MoE) models have names like "8x7B" or "141B-A35B" which basically means it's a 57B and 141B parameter model. Without quantization, you'll need ~2 bytes of memory per parameter. To reduce memory requirements, try quantizing the model.
+Chat models are conversational models you can send and receive messages from. There are many chat models available to choose from, but in general, larger models tend to be better though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Mixture-of-experts (MoE) models have names like "8x7B" or "141B-A35B", which means they're 56B and 141B parameter models. You can try quantizing larger models to reduce memory requirements, otherwise you'll need ~2 bytes of memory per parameter.
 
 Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to further help you identify the best chat models for your use case. Models that are specialized in certain domains (medical, legal text, non-English languages, etc.) may sometimes outperform larger general purpose models.
 
 > [!TIP]
 > Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)!
 
-This guide shows you how to build and format a conversation, and how to quickly start chatting with a model with the [`TextGenerationPipeline`].
+This guide shows you how to build and format a conversation, and how to quickly start chatting with a model with [`TextGenerationPipeline`].
 
 ## TextGenerationPipeline
 
-The [`TextGenerationPipeline`] is a high-level text generation API with a "chat mode". Chat mode is turned on when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
+[`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
 
 To start, build a chat history with the following two roles.
 
@@ -41,7 +41,7 @@ chat = [
 ]
 
-Create a [`TextGenerationPipeline`] and pass the `chat` to it.
For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. +Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. ```py import torch @@ -75,7 +75,7 @@ So, there you have it, pal! That's my expert advice on what to do in New York. N excuse me, I've got some oil changes to attend to. (winks) ``` -Use the `append` method on `chat` to respond to the model's message. +Use the `append` method on `chat` to respond to the models message. ```py chat = response[0]["generated_text"] @@ -102,12 +102,12 @@ But, hey, that's what makes art, art, right? (laughs) ## Performance -Transformers load models in full precision by default, and for a 8B model, this requires ~32GB of memory! Reduce a model's memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index). +Transformers load models in full precision by default, and for a 8B model, this requires ~32GB of memory! Reduce memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index). > [!TIP] -> Refer to the [Quantization](./quantization/overview) section for more information about different quantization backends. +> Refer to the [Quantization](./quantization/overview) docs for more information about the different quantization backends available. -Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipelines `model_kwargs` parameter. +Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipelines `model_kwargs` parameter. The example below quantizes a model to 8-bits. ```py from transformers import pipeline, BitsAndBytesConfig @@ -116,7 +116,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True) pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config}) ``` -In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for each generated token. For a 16GB model, this means 16GB must be read from memory for every generated token. +In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for every generated token. For a 16GB model, 16GB must be read from memory for every generated token. The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types. 
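As a rough back-of-the-envelope illustration of this relationship, the sketch below estimates the upper bound on tokens/sec from a model size and a memory bandwidth figure. The 16GB model size and 2TB/sec bandwidth are only example values taken from the numbers above, not measurements.

```py
# rough upper bound: tokens/sec ≈ total memory bandwidth / bytes read per generated token
# example values only -- substitute your own hardware numbers
model_size_gb = 16            # e.g. an 8B parameter model at ~2 bytes per parameter
bandwidth_gb_per_sec = 2000   # e.g. a data center GPU with ~2TB/sec of bandwidth

max_tokens_per_sec = bandwidth_gb_per_sec / model_size_gb
print(f"~{max_tokens_per_sec:.0f} tokens/sec upper bound")  # ~125 tokens/sec
```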
@@ -126,7 +126,9 @@ The number of generated tokens/sec is proportional to the total memory bandwidth | specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec | | data center GPU (NVIDIA A100/H100) | 2-3TB/sec | -The easiest solution for improving generation speed is to either reduce the model size in memory with quantization or use hardware with higher memory bandwidth. You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed. +The easiest solution for improving generation speed is to either quantize a model or use hardware with higher memory bandwidth. + +You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed. > [!TIP] > Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token. diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md index 941bd343e7ae..e7bef363e039 100644 --- a/docs/source/en/perf_torch_compile.md +++ b/docs/source/en/perf_torch_compile.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # torch.compile -[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) compiles PyTorch code into optimized kernels that significantly speed up inference. This feature relies on TorchDynamo to compile the code into graphs and TorchInductor to further compile the graphs into optimized kernels. It is a powerful optimization tool and in many cases, it only requires adding a single line of code. +[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) compiles PyTorch code into optimized kernels that significantly speed up inference. This feature relies on [TorchDynamo](https://pytorch.org/docs/stable/torch.compiler_dynamo_overview.html) to compile the code into graphs and [TorchInductor](https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747) to further compile the graphs into optimized kernels. It is a powerful optimization tool, and in many cases, only requires adding a single line of code. Wrap a model with torch.compile to compile and return an optimized model. @@ -29,15 +29,15 @@ compiled_model = torch.compile(model) > [!TIP] > The initial call to torch.compile is slow because the model needs to be compiled. Subsequent calls to the compiled model are much faster because it doesn't need to compile again. -There are several parameters you can use to customize the compilation process. Two of the more important ones are listed below. 
For a full list of parameters, refer to the [torch.compile documentation](https://pytorch.org/docs/stable/generated/torch.compile.html). +There are several parameters to customize the compilation process. Two of the more important ones are listed below. For a full list of parameters, refer to the torch.compile [documentation](https://pytorch.org/docs/stable/generated/torch.compile.html). ## Modes -The `mode` parameter offers several performance options for compiling, and you should try different modes to see which one works best for your use case. +The `mode` parameter offers several performance options for compiling. Try different modes to see which one works best for your use case. - `default` is a balanced option between speed and memory. - `reduce-overhead` reduces the Python overhead at the expense of a little more memory, but it can be faster. -- `max-autotune` offers the fastest speed, but compiling the code takes longer. +- `max-autotune` offers the fastest speed, but compilation takes longer. ```py from transformers import AutoModelForCausalLM @@ -57,9 +57,9 @@ model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True) ``` -## Benchmark results +## Benchmarks -Refer to the table below for performance benchmarks comparing the mean inference time in milliseconds with torch.compile enabled and disabled across various GPUs and batch sizes on the same image. +Refer to the table below for performance benchmarks comparing the mean inference time in milliseconds with torch.compile enabled and disabled across various GPUs and batch sizes on the same image for different vision tasks. Select **Subset** in the table below to switch between different GPUs, as well as benchmarks on [PyTorch nightly](https://download.pytorch.org/whl/nightly/cu118) 2.1.0dev and torch.compile with `reduce-overhead` mode enabled. From ca90e0ad08c1396237fffcf7b96fd6638b960569 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 22 Jan 2025 10:18:52 -0800 Subject: [PATCH 097/116] continue reviews --- docs/source/en/agents.md | 3 + docs/source/en/multilingual.md | 179 ------------------------- docs/source/en/perf_infer_cpu.md | 2 +- docs/source/en/perf_infer_gpu_multi.md | 50 +++---- docs/source/en/perf_infer_gpu_one.md | 23 ++-- docs/source/en/tf_xla.md | 6 +- docs/source/en/tools.md | 3 + 7 files changed, 39 insertions(+), 227 deletions(-) delete mode 100644 docs/source/en/multilingual.md diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index 10d163c3a742..2a061eba3385 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -14,6 +14,9 @@ rendered properly in your Markdown viewer. --> +> [!TIP] +> Agents and tools are being spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. These docs will be deprecated in the future! + # Agents [[open-in-colab]] diff --git a/docs/source/en/multilingual.md b/docs/source/en/multilingual.md deleted file mode 100644 index 30a63eea28c8..000000000000 --- a/docs/source/en/multilingual.md +++ /dev/null @@ -1,179 +0,0 @@ - - -# Multilingual models for inference - -[[open-in-colab]] - -There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. Not *all* multilingual model usage is different though. 
Some models, like [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased), can be used just like a monolingual model. This guide will show you how to use multilingual models whose usage differs for inference. - -## XLM - -XLM has ten different checkpoints, only one of which is monolingual. The nine remaining model checkpoints can be split into two categories: the checkpoints that use language embeddings and those that don't. - -### XLM with language embeddings - -The following XLM models use language embeddings to specify the language used at inference: - -- `FacebookAI/xlm-mlm-ende-1024` (Masked language modeling, English-German) -- `FacebookAI/xlm-mlm-enfr-1024` (Masked language modeling, English-French) -- `FacebookAI/xlm-mlm-enro-1024` (Masked language modeling, English-Romanian) -- `FacebookAI/xlm-mlm-xnli15-1024` (Masked language modeling, XNLI languages) -- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (Masked language modeling + translation, XNLI languages) -- `FacebookAI/xlm-clm-enfr-1024` (Causal language modeling, English-French) -- `FacebookAI/xlm-clm-ende-1024` (Causal language modeling, English-German) - -Language embeddings are represented as a tensor of the same shape as the `input_ids` passed to the model. The values in these tensors depend on the language used and are identified by the tokenizer's `lang2id` and `id2lang` attributes. - -In this example, load the `FacebookAI/xlm-clm-enfr-1024` checkpoint (Causal language modeling, English-French): - -```py ->>> import torch ->>> from transformers import XLMTokenizer, XLMWithLMHeadModel - ->>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024") ->>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024") -``` - -The `lang2id` attribute of the tokenizer displays this model's languages and their ids: - -```py ->>> print(tokenizer.lang2id) -{'en': 0, 'fr': 1} -``` - -Next, create an example input: - -```py ->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 -``` - -Set the language id as `"en"` and use it to define the language embedding. The language embedding is a tensor filled with `0` since that is the language id for English. This tensor should be the same size as `input_ids`. - -```py ->>> language_id = tokenizer.lang2id["en"] # 0 ->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) - ->>> # We reshape it to be of size (batch_size, sequence_length) ->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) -``` - -Now you can pass the `input_ids` and language embedding to the model: - -```py ->>> outputs = model(input_ids, langs=langs) -``` - -The [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) script can generate text with language embeddings using the `xlm-clm` checkpoints. - -### XLM without language embeddings - -The following XLM models do not require language embeddings during inference: - -- `FacebookAI/xlm-mlm-17-1280` (Masked language modeling, 17 languages) -- `FacebookAI/xlm-mlm-100-1280` (Masked language modeling, 100 languages) - -These models are used for generic sentence representations, unlike the previous XLM checkpoints. 
- -## BERT - -The following BERT models can be used for multilingual tasks: - -- `google-bert/bert-base-multilingual-uncased` (Masked language modeling + Next sentence prediction, 102 languages) -- `google-bert/bert-base-multilingual-cased` (Masked language modeling + Next sentence prediction, 104 languages) - -These models do not require language embeddings during inference. They should identify the language from the -context and infer accordingly. - -## XLM-RoBERTa - -The following XLM-RoBERTa models can be used for multilingual tasks: - -- `FacebookAI/xlm-roberta-base` (Masked language modeling, 100 languages) -- `FacebookAI/xlm-roberta-large` (Masked language modeling, 100 languages) - -XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. - -## M2M100 - -The following M2M100 models can be used for multilingual translation: - -- `facebook/m2m100_418M` (Translation) -- `facebook/m2m100_1.2B` (Translation) - -In this example, load the `facebook/m2m100_418M` checkpoint to translate from Chinese to English. You can set the source language in the tokenizer: - -```py ->>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." - ->>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") ->>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -``` - -Tokenize the text: - -```py ->>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") -``` - -M2M100 forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: - -```py ->>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' -``` - -## MBart - -The following MBart models can be used for multilingual translation: - -- `facebook/mbart-large-50-one-to-many-mmt` (One-to-many multilingual machine translation, 50 languages) -- `facebook/mbart-large-50-many-to-many-mmt` (Many-to-many multilingual machine translation, 50 languages) -- `facebook/mbart-large-50-many-to-one-mmt` (Many-to-one multilingual machine translation, 50 languages) -- `facebook/mbart-large-50` (Multilingual translation, 50 languages) -- `facebook/mbart-large-cc25` - -In this example, load the `facebook/mbart-large-50-many-to-many-mmt` checkpoint to translate Finnish to English. You can set the source language in the tokenizer: - -```py ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." ->>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." 
- ->>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") ->>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -``` - -Tokenize the text: - -```py ->>> encoded_en = tokenizer(en_text, return_tensors="pt") -``` - -MBart forces the target language id as the first generated token to translate to the target language. Set the `forced_bos_token_id` to `en` in the `generate` method to translate to English: - -```py ->>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) ->>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) -"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." -``` - -If you are using the `facebook/mbart-large-50-many-to-one-mmt` checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md index 9254e9a9b3d6..7522a013d0d9 100644 --- a/docs/source/en/perf_infer_cpu.md +++ b/docs/source/en/perf_infer_cpu.md @@ -49,7 +49,7 @@ pred = onnx_qa(question, context) > [!WARNING] > BetterTransformer isn't supported for all models. Check this [list](https://hf.co/docs/optimum/bettertransformer/overview#supported-models) to see whether a model supports BetterTransformer. -BetterTransformer is available through Optimum with the [`~PreTrainedModel.to_bettertransformer`] method. +BetterTransformer is available through Optimum with [`~PreTrainedModel.to_bettertransformer`]. ```py from transformers import AutoModelForCausalLM diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 7f5d52363e4d..19bb9f4394be 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -13,65 +13,49 @@ rendered properly in your Markdown viewer. --> -# Multi-GPU inference +# Distributed GPU inference -Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication. +[Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice. -To enable tensor parallel, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]: +> [!TIP] +> Tensor parallelism is currently only supported for [Llama](./model_doc/llama). Open a GitHub issue or pull request to add tensor parallelism support for another model. -```python +Set `tp_plan="auto"` in [`~AutoModel.from_pretrained`] to enable tensor parallelism for inference. 
+
+```py
 import os
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-# Initialize distributed
+# initialize distributed environment
 rank = int(os.environ["RANK"])
 device = torch.device(f"cuda:{rank}")
 torch.distributed.init_process_group("nccl", device_id=device)
 
-# Retrieve tensor parallel model
+# enable tensor parallelism
 model = AutoModelForCausalLM.from_pretrained(
-    model_id,
+    "meta-llama/Meta-Llama-3-8B-Instruct",
     tp_plan="auto",
 )
 
-# Prepare input tokens
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+# prepare input tokens
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
 prompt = "Can I help"
 inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
 
-# Distributed run
+# distributed run
 outputs = model(inputs)
 ```
 
-You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU:
+Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes, one per GPU.
 
-```
+```bash
 torchrun --nproc-per-node 4 demo.py
 ```
 
-PyTorch tensor parallel is currently supported for the following models:
-* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
-* [Gemma](https://huggingface.co/docs/transformers/en/model_doc/gemma), [Gemma2](https://huggingface.co/docs/transformers/en/model_doc/gemma2)
-* [Granite](https://huggingface.co/docs/transformers/en/model_doc/granite)
-* [Mistral](https://huggingface.co/docs/transformers/en/model_doc/mistral)
-* [Qwen2](https://huggingface.co/docs/transformers/en/model_doc/qwen2), [Qwen2MoE](https://huggingface.co/docs/transformers/en/model_doc/qwen2_moe), [Qwen2-VL](https://huggingface.co/docs/transformers/v4.48.0/en/model_doc/qwen2_vl)
-* [Starcoder2](https://huggingface.co/docs/transformers/en/model_doc/starcoder2)
-* [Cohere](https://huggingface.co/docs/transformers/en/model_doc/cohere), [Cohere2](https://huggingface.co/docs/transformers/en/model_doc/cohere2)
-* [GLM](https://huggingface.co/docs/transformers/en/model_doc/glm)
-* [Mixtral](https://huggingface.co/docs/transformers/en/model_doc/mixtral)
-* [OLMo](https://huggingface.co/docs/transformers/en/model_doc/olmo), [OLMo2](https://huggingface.co/docs/transformers/en/model_doc/olmo2)
-* [Phi](https://huggingface.co/docs/transformers/en/model_doc/phi), [Phi-3](https://huggingface.co/docs/transformers/en/model_doc/phi3)
-
-You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request.
-
-### Expected speedups
-
-You can benefit from considerable speedups for inference, especially for inputs with large batch size or long sequences.
+You can benefit from considerable speed ups for inference, especially for inputs with large batch size or long sequences.
 
-For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows:
+For a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512 and various batch sizes, you can expect the following speed ups.
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 62fbb35d46f5..4bb34acf6c9f 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # GPU -GPUs are the standard hardware for machine learning because they're optimized for memory bandwidth and parallelism. With the increasing sizes of modern models, it's more important than ever to make sure GPUs are capable of efficiently handling them and able to deliver the best possible performance. +GPUs are the standard hardware for machine learning because they're optimized for memory bandwidth and parallelism. With the increasing sizes of modern models, it's more important than ever to make sure GPUs are capable of efficiently handling and delivering the best possible performance. This guide will demonstrate a few ways to optimize inference on a GPU. The optimization methods shown below can be combined with each other to achieve even better performance, and they also work for distributed GPUs. @@ -34,7 +34,7 @@ pip install bitsandbytes accelerate For text generation with 8-bit quantization, you should use [`~GenerationMixin.generate`] instead of the high-level [`Pipeline`] API. The [`Pipeline`] returns slower performance because it isn't optimized for 8-bit models, and some sampling strategies (nucleus sampling) also aren't supported. -Set up a [`BitsAndBytesConfg`] and set `load_in_8bit=True` to load a model in 8-bit precision. The [`BitsAndBytesConfig`] is passed to the `quantization_config` parameter in [`~PreTrainedModel.from_pretrained`]. +Set up a [`BitsAndBytesConfig`] and set `load_in_8bit=True` to load a model in 8-bit precision. The [`BitsAndBytesConfig`] is passed to the `quantization_config` parameter in [`~PreTrainedModel.from_pretrained`]. Allow Accelerate to automatically distribute the model across your available hardware by setting [device_map="auto"](https://hf.co/docs/accelerate/concept_guides/big_model_inference#designing-a-device-map). @@ -67,9 +67,9 @@ Learn in more detail the concepts underlying 8-bit quantization in the [Gentle I -Set up a [`BitsAndBytesConfg`] and set `load_in_4bit=True` to load a model in 4-bit precision. The [`BitsAndBytesConfig`] is passed to the `quantization_config` parameter in [`~PreTrainedModel.from_pretrained`]. +Set up a [`BitsAndBytesConfig`] and set `load_in_4bit=True` to load a model in 4-bit precision. The [`BitsAndBytesConfig`] is passed to the `quantization_config` parameter in [`~PreTrainedModel.from_pretrained`]. -Allow Accelerate to automatically distribute the model across your available hardware by setting device_map=“auto”. +Allow Accelerate to automatically distribute the model across your available hardware by setting `device_map=“auto”`. Place all inputs on the same device as the model. 
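For reference, a minimal sketch of the 4-bit setup described in the hunk above might look like the following. The checkpoint and prompt are only placeholders borrowed from the other examples in this guide, not part of the original hunk.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit quantization config, passed to the quantization_config parameter
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    device_map="auto",                        # let Accelerate place the model
    quantization_config=quantization_config,
)

# place all inputs on the same device as the model
inputs = tokenizer("Hello, my llama is cute", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```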
@@ -119,7 +119,7 @@ from optimum.onnxruntime import ORTModelForSequenceClassification
 
 ort_model = ORTModelForSequenceClassification.from_pretrained(
   "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
-# export=True,
+  #export=True,
   provider="CUDAExecutionProvider",
 )
 ```
 
@@ -145,9 +145,9 @@ Learn more details about using ORT with Optimum in the [Accelerated inference on
 - skipping unnecessary computation of padding tokens with nested tensors
 
 > [!WARNING]
-> Some BetterTransformer features are being upstreamed to Transformers with default support for native [torch.nn.scaled_dot_product_attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA). BetterTransformer has than the Transformers SDPA integration, but you can expect more and more architectures to natively support SDPA in Transformers.
+> Some BetterTransformer features are being upstreamed to Transformers with default support for native [torch.nn.functional.scaled_dot_product_attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA). BetterTransformer has a wider coverage than the Transformers SDPA integration, but you can expect more and more architectures to natively support SDPA in Transformers.
 
-BetterTransformer is available through Optimum with the [`~PreTrainedModel.to_bettertransformer`] method.
+BetterTransformer is available through Optimum with [`~PreTrainedModel.to_bettertransformer`].
 
 ```py
 from transformers import AutoModelForCausalLM
@@ -183,10 +183,10 @@ from transformers import AutoModelForCausalLM
 
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", attn_implementation="sdpa")
 ```
 
-SDPA selects the most performant implementation available, but you can also explicitly select an implementation with [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager. The example below shows how to enable the FlashAttention2 implementation with `enable_flash=True`.
+SDPA selects the most performant implementation available, but you can also explicitly select an implementation with [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager. The example below shows how to enable the FlashAttention2 implementation with `SDPBackend.FLASH_ATTENTION`.
 ```py
 import torch
+from torch.nn.attention import SDPBackend, sdpa_kernel
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
@@ -195,7 +196,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m
 input_text = "Hello, my llama is cute"
 inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
 
-with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
     outputs = model.generate(**inputs)
 
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@@ -238,10 +239,10 @@ Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [
 ```py
 from transformers import AutoModelForCausalLM
 
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="sdpa")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2")
 ```
 
-### Benchmark results
+### Benchmarks
 
 FlashAttention2 speeds up inference considerably especially for inputs with long sequences. However, since FlashAttention2 doesn't support computing attention scores with padding tokens, you must manually pad and unpad the attention scores for batched inference if a sequence contains padding tokens. The downside is batched generation is slower with padding tokens.
diff --git a/docs/source/en/tf_xla.md b/docs/source/en/tf_xla.md
index d6822ab5b9ec..c8fb13ff6aec 100644
--- a/docs/source/en/tf_xla.md
+++ b/docs/source/en/tf_xla.md
@@ -34,7 +34,7 @@ model.compile(jit_compile=True)
 
 XLA can be used to accelerate any arbitrary [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
 
-Models with a TensorFlow implementation like [GPT2](./model_doc/gpt2), [T5](./model_doc/t5), [OPT](./model_doc/opt), and [Whisper](./model_doc/whisper) are XLA compatible. The amount of speed up depends on a model, but in general, TensorFlow models in Transformers get a ~100x speed up.
+Models with a TensorFlow implementation like [GPT2](./model_doc/gpt2), [T5](./model_doc/t5), [OPT](./model_doc/opt), and [Whisper](./model_doc/whisper) are XLA compatible. The speed up depends on the model, but in general, TensorFlow models in Transformers get a ~100x speed up.
 
 ### Functions
 
@@ -65,7 +65,7 @@ my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
 
 ### Text generation
 
-You could also compile other model functions with XLA. For example, enable XLA for text generation by wrapping the [`~TFGenerationMixin.generate`] method with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
+You could also compile other model functions with XLA. For example, enable XLA for text generation by wrapping [`~TFGenerationMixin.generate`] with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
 
 ```py
 import tensorflow as tf
@@ -95,7 +95,7 @@ When executing an XLA-enabled function for the first time, it tries to infer the
 
 To ensure a function is only traced once, the inputs must have the same shape as when the graph was built. This usually isn't an issue for fixed input shapes like images, but it can be an issue for inputs with variable shapes like text.
 
-One way to handle this is to pad your text so it always has the same shape.
Configure the padding options such as [pad_to_multiple_of](https://hf.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.pad.pad_to_multiple_of) in the tokenizer. +One way to handle this is to pad your text so it always has the same shape. Configure padding options such as [pad_to_multiple_of](https://hf.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.pad.pad_to_multiple_of) in the tokenizer. ```py import tensorflow as tf diff --git a/docs/source/en/tools.md b/docs/source/en/tools.md index 23c3e2bf876e..fb62f68b8d39 100644 --- a/docs/source/en/tools.md +++ b/docs/source/en/tools.md @@ -14,6 +14,9 @@ rendered properly in your Markdown viewer. --> +> [!TIP] +> Agents and tools are being spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. These docs will be deprecated in the future! + # Tools A tool is a function an agent can use to complete a task. Depending on your task, a tool can perform a web search, answer questions about a document, transcribe speech to text, and much more. From 05455f4a22f325921272cff9aac8a9f1cda92f71 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 22 Jan 2025 16:36:48 -0800 Subject: [PATCH 098/116] more reviews --- docs/source/en/_toctree.yml | 10 +-- docs/source/en/accelerate.md | 4 +- docs/source/en/debugging.md | 24 +++---- docs/source/en/deepspeed.md | 72 +++++++++++--------- docs/source/en/fsdp.md | 10 +-- docs/source/en/gpu_selection.md | 94 +++++++++++++++++++++++++++ docs/source/en/hpo_train.md | 6 +- docs/source/en/optimizers.md | 4 +- docs/source/en/peft.md | 15 +++-- docs/source/en/perf_train_cpu.md | 12 +++- docs/source/en/perf_train_cpu_many.md | 4 +- docs/source/en/perf_train_gpu_one.md | 12 ++-- docs/source/en/perf_train_special.md | 2 +- docs/source/en/perf_train_tpu_tf.md | 26 ++++---- docs/source/en/trainer.md | 41 ++++++------ docs/source/en/training.md | 28 ++++---- 16 files changed, 240 insertions(+), 124 deletions(-) create mode 100644 docs/source/en/gpu_selection.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7034113389db..f2639153a7e2 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -110,8 +110,6 @@ title: Agents - local: tools title: Tools - - local: multilingual - title: Run inference with multilingual models - title: Training isExpanded: False sections: @@ -120,23 +118,25 @@ - local: trainer title: Trainer - local: training - title: Finetuning + title: Fine-tuning - local: optimizers title: Optimizers - local: hpo_train title: Hyperparameter search - title: Distributed training sections: + - local: gpu_selection + title: GPU selection - local: accelerate title: Accelerate - local: fsdp title: FullyShardedDataParallel - local: deepspeed title: DeepSpeed - - local: perf_train_cpu_many - title: Distributed CPUs - local: debugging title: Multi-GPU debugging + - local: perf_train_cpu_many + title: Distributed CPUs - local: perf_train_gpu_many title: Parallelism methods - title: Hardware diff --git a/docs/source/en/accelerate.md b/docs/source/en/accelerate.md index 0093e0fbca23..c0ad46f8ac91 100644 --- a/docs/source/en/accelerate.md +++ b/docs/source/en/accelerate.md @@ -30,7 +30,7 @@ Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/packa accelerate config ``` -Depending on your setup and the answers you provide, an example configuration file for distributing training with FSDP on one machine with two GPUs. 
+Depending on your setup and the answers you provide, an example configuration file for distributing training with FSDP on one machine with two GPUs may look like the following. ```yaml compute_environment: LOCAL_MACHINE @@ -107,7 +107,7 @@ accelerator = Accelerator() device = accelerator.device ``` -All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.AcceleratedOptimizer`] and [`~accelerate.AcceleratedScheduler`], and creates a new shardable dataloader. +All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.optimizer.AcceleratedOptimizer`] and [`~accelerate.scheduler.AcceleratedScheduler`], and creates a new shardable dataloader. ```py train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( diff --git a/docs/source/en/debugging.md b/docs/source/en/debugging.md index 07ea2ed59b06..09394d2229d1 100644 --- a/docs/source/en/debugging.md +++ b/docs/source/en/debugging.md @@ -16,9 +16,11 @@ rendered properly in your Markdown viewer. # Multi-GPU debugging -Distributed training can be tricky because you have to ensure you're using the correct CUDA version across your system, you may encounter inter-communication issues between GPUs, and there may be underflow or overflow problems in your model. This guide covers how to debug these issues, especially as it relates to DeepSpeed and PyTorch. +Distributed training can be tricky because you have to ensure you're using the correct CUDA version across your system. You may encounter inter-communication issues between GPUs, and there may be underflow or overflow problems in your model. -## DeepSpeed CUDA issues +This guide covers how to debug these issues, especially as it relates to DeepSpeed and PyTorch. + +## DeepSpeed CUDA DeepSpeed compiles CUDA C++ which can be a potential source of errors when building PyTorch extensions that require CUDA. These errors depend on how CUDA is installed on your system. This section focuses on PyTorch built with *CUDA 10.2* @@ -29,17 +31,17 @@ pip install deepspeed > [!TIP] > For any other installation issues, please [open an issue](https://github.com/microsoft/DeepSpeed/issues) with the DeepSpeed team. -### Non-identical CUDA toolkits +### Non-identical toolkits PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere. -The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly setup and added to your `PATH` environment variable, you can find the installation location with the following command. +The exact location can vary from system to system, but `usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command. 
```bash which nvcc ``` -### Multiple CUDA toolkits +### Multiple toolkits You may also have more than one CUDA toolkit installed on your system. @@ -65,9 +67,9 @@ export PATH=/usr/local/cuda-10.2/bin:$PATH export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH ``` -In addition, you should also check that the assigned directories you actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`), and while it is unlikely your system names them differently, you should check the actual names and change them accordingly. +In addition, you should also check that the assigned directories actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`), and while it is unlikely your system names them differently, you should check the actual names and change them accordingly. -### Older CUDA versions +### Older versions Sometimes, older CUDA versions may refuse to build with newer compilers. For example, if you have `gcc-9` but CUDA wants `gcc-7`. Usually, installing the latest CUDA toolkit enables support for the newer compiler. @@ -95,7 +97,7 @@ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \ > [!TIP] > Add the `DS_BUILD_AIO=1` parameter to the build command to use NVMe offload. Make sure you install the libaio-dev package across your system. -Next, specify your GPU's architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command. +Next, specify your GPUs architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check the PyTorch version that corresponds to your architecture, run the following command. ```bash python -c "import torch; print(torch.cuda.get_arch_list())" @@ -144,7 +146,7 @@ This command generates a binary wheel that'll look something like `dist/deepspee pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl ``` -## Communication issues +## Communication Distributed training involves communication between processes and or nodes and this can be a potential source of errors. @@ -193,7 +195,7 @@ debug_overflow = DebugUnderflowOverflow(model) The [`~debug_utils.DebugUnderflowOverflow`] module inserts hooks into the model to test the input and output variables and the corresponding model weights after each forward call. If `inf` or `nan` is detected in at least one element of the activations or weights, the module prints a report like the one shown below. -The example below is for fp16 mixed precision training with a [google/mt5-small](https://huggingface.co/google/mt5-small). +The example below is for fp16 mixed precision training with [google/mt5-small](https://huggingface.co/google/mt5-small). ```shell Detected inf/nan during batch_number=0 @@ -280,7 +282,7 @@ class T5DenseGatedGeluDense(nn.Module): return hidden_states ``` -One solution is to back a few steps before the values started growing too large and switch to fp32 so the numbers don't overflow when multiplied or summed. Another potential solution is to temporarily disable mixed precision training (`amp`). 
+One solution is to go back a few steps before the values started growing too large and switch to fp32 so the numbers don't overflow when multiplied or summed. Another potential solution is to temporarily disable mixed precision training (`amp`). ```py import torch diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index b27afa9bea7a..4a84f9338343 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # DeepSpeed -[DeepSpeed](https://www.deepspeed.ai/) is designed to optimize distributed training for large models with data, model, pipeline, and even a combination of all three [parallelism](./perf_train_gpu_many) strategies to provide better memory efficiency and faster training speeds. This is achieved by the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which consists of three stages. +[DeepSpeed](https://www.deepspeed.ai/) is designed to optimize distributed training for large models with data, model, pipeline, and even a combination of all three [parallelism](./perf_train_gpu_many) strategies to provide better memory efficiency and faster training speeds. This is achieved with the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which consists of three stages. | ZeRO stage | description | |---|---| @@ -24,9 +24,9 @@ rendered properly in your Markdown viewer. | 2 | partition optimizer and gradient states | | 3 | partition optimizer, gradient, and parameters | -Each stage progressively saves more memory, allowing really large models to fit and be trained on a single GPU. DeepSpeed is integrated with [`Trainer`] for all ZeRO stages and offloading optimizer memory and computations from the GPU to the CPU. Provide a config file or one of the example templates to [`Trainer`] to enable DeepSpeed features. +Each stage progressively saves more memory, allowing really large models to fit and train on a single GPU. All ZeRO stages, offloading optimizer memory and computations from the GPU to the CPU are integrated with [`Trainer`]. Provide a config file or one of the example templates to [`Trainer`] to enable DeepSpeed features. -This guide walks you through setting up a DeepSpeed config file, how to enable its features in [`Trainer`], and deploy training. +This guide walks you through setting up a DeepSpeed config file, how to enable its features in [`Trainer`], and deploy for training. Install DeepSpeed from either PyPI or Transformers. For more detailed installation instructions, refer to the DeepSpeed [installation](https://www.deepspeed.ai/tutorials/advanced-install/) or GitHUB [README](https://github.com/microsoft/deepspeed#installation). @@ -50,7 +50,7 @@ pip install transformers[deepspeed] > [!WARNING] > Refer to the [DeepSpeed CUDA installation](./debugging#deepspeed-cuda-issues) if you're having trouble with your installation. While DeepSpeed has a pip installable package, it is highly recommended to [install it from source](https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source) to ensure it matches your hardware and to support certain features which aren't available in the PyPI distribution. -DeepSpeed provides a tool for estimating the required CPU and GPU memory for the parameters, optimizer and gradient states. You'll also need some memory for the CUDA kernels and activations. +DeepSpeed provides a tool for estimating the required CPU and GPU memory for the parameters, optimizer and gradient states. 
You'll also need to reserve some memory for the CUDA kernels and activations.
 
 Run the command below to check the memory requirements for [bigscience/T0_3B](https://huggingface.co/docs/transformers/main/en/bigscience/T0_3B) on a single GPU.
@@ -75,11 +75,11 @@ SW: Model with 2783M total params, 65M largest layer params.
 > [!TIP]
 > If you have enough GPU memory, disable CPU and NVMe offload to speed everything up.
 
-## Choose a ZeRO stage
+## Choosing a ZeRO stage
 
 Consider the table below to help you choose the appropriate ZeRO stage for training because there is a trade-off between training speed and memory usage. The table orders the ZeRO stages from fastest to slowest and from least memory usage to most.
 
-| fastest | memory usage |
+| fastest | least memory usage |
 |---|---|
 | ZeRO-1 | ZeRO-3 + offload |
 | ZeRO-2 | ZeRO-3 |
@@ -91,7 +91,7 @@ Decide the type of performance you're optimizing for, speed or memory, and then
 
 ## Config file
 
-Enable DeepSpeed in [`Trainer`] with a config file containing all the parameters for how to configure and setup your training. When the training script is executed, DeepSpeed logs the configuration from [`Trainer`] to the console so you can see exactly what's being used.
+Once you've decided on a ZeRO stage, set up a config file to enable DeepSpeed with [`Trainer`]. The config file contains all the parameters for how to configure and set up your training. When the training script is executed, DeepSpeed logs the configuration from [`Trainer`] to the console so you can see exactly what's being used.
 
 > [!TIP]
 > Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. There are also practical examples of various DeepSpeed configuration examples in the [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) main [DeepSpeed](https://github.com/microsoft/DeepSpeed) repository. Run the command below to quickly find specific examples.
 
@@ -146,16 +146,16 @@ There are three types of config parameters.
 
 There are two ways to modify the config parameters.
 
 > [!TIP]
-> Some values, such as `scheduler.params.total_num_steps`, are calculated by the [`Trainer`] during training.
+> Some values, such as `scheduler.params.total_num_steps`, are calculated by [`Trainer`] during training.
 
 1. Create or load a DeepSpeed config to use as the main config.
 1. Create a [`TrainingArguments`] object based on the DeepSpeed config values.
 
 ### ZeRO stage
 
-Each ZeRO stage has its own config, as defined in `zero_optimization`.
+Each ZeRO stage config is defined in `zero_optimization`.
 
-For a more detailed explanation of each parameter, refer to the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. These parameters must be setup with DeepSpeed because [`Trainer`] doesn't provide equivalent command line arguments.
+For a more detailed explanation of each parameter, refer to the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. These parameters must be set up with DeepSpeed because [`Trainer`] doesn't provide equivalent command line arguments.
 
 > [!WARNING]
 > DeepSpeed doesn't validate parameter names and any typos will fallback on the parameters default setting. Observe the DeepSpeed engine startup log messages to see what values are being used.
 
@@ -179,7 +179,7 @@ ZeRO-1 shards the optimizer states across GPUs and you can expect a small speed
 ZeRO-2 shards the optimizer and gradient states across GPUs.
This stage is primarily used for training since its features are not relevant to inference. Some important parameters to configure for better performance include the following. * `offload_optimizer` should be enabled to reduce GPU memory usage. -* `overlap_comm` when set to `true` uses increased GPU memory usage in exchange for lower allreduce latency. This feature uses 4.5x the `allgather_bucket_size` and `reduce_bucket_size` values. In this example, they're set to `5e8` which means it requires 9GB of GPU memory. If your GPU memory is 8GB or less, you should reduce `overlap_comm` to lower the memory requirements and prevent an out-of-memory (OOM) error. +* `overlap_comm` when set to `true` uses more GPU memory in exchange for lower allreduce latency. This feature uses 4.5x the `allgather_bucket_size` and `reduce_bucket_size` values. In this example, they're set to `5e8` which means it requires 9GB of GPU memory. If your GPU memory is 8GB or less, you should reduce `overlap_comm` to lower the memory requirements and prevent an out-of-memory (OOM) error. * `allgather_bucket_size` and `reduce_bucket_size` trade-off available GPU memory for communication speed. The smaller their values, the slower communication is and the more GPU memory is available. You can balance, for example, whether a bigger batch size is more important than a slightly slower training time. * `round_robin_gradients` is available in DeepSpeed 0.4.4 for CPU offloading. It parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism). @@ -210,14 +210,14 @@ ZeRO-3 shards the optimizer and gradient states, and parameters across GPUs. Unl * `device: "cpu"` can help if you're running out of GPU memory and if you have free CPU memory available. This offloads model parameters to the CPU. * `pin_memory: true` can improve throughput, but less memory becomes available for other processes because the pinned memory is reserved for the specific process that requested it and it's typically accessed much faster than normal CPU memory. * `stage3_max_live_parameters` is the upper limit on how many full parameters to keep on the GPU at any given time. Reduce this value if you encounter an OOM error. -* `stage3_max_reuse_distance` is a value for determining when a parameter is used again in the future, and it helps decide whether to throw the parameter away or to keep it. If the parameter is going to be reused (if the value is less than `stage3_max_reuse_distance`), then it is kept to reduce communication overhead. This is helpful when activation checkpointing is enabled and you want to keep the parameter in the forward recompute until the backward pass. But reduce this value if you encounter an OOM error. +* `stage3_max_reuse_distance` is a value for determining when a parameter is used again in the future, and it helps decide whether to throw the parameter away or to keep it. If the parameter is going to be reused (if the value is less than `stage3_max_reuse_distance`), then it is kept to reduce communication overhead. This is helpful when activation checkpointing is enabled and you want to keep the parameter in the forward recompute until the backward pass. Reduce this value if you encounter an OOM error. * `stage3_gather_16bit_weights_on_model_save` consolidates fp16 weights when a model is saved. 
For large models and multiple GPUs, this is expensive in terms of memory and speed. You should enable it if you're planning on resuming training. * `sub_group_size` controls which parameters are updated during the optimizer step. Parameters are grouped into buckets of `sub_group_size` and each bucket is updated one at a time. When used with NVMe offload, `sub_group_size` determines when model states are moved in and out of CPU memory during the optimization step. This prevents running out of CPU memory for extremely large models. `sub_group_size` can be left to its default value if you aren't using NVMe offload, but you may want to change it if you: 1. Run into an OOM error during the optimization step. In this case, reduce `sub_group_size` to reduce memory usage of the temporary buffers. 2. The optimization step is taking a really long time. In this case, increase `sub_group_size` to improve bandwidth utilization as a result of increased data buffers. -* `reduce_bucket_size`, `stage3_prefetch_bucket_size`, and `stage3_param_persistence_threshold` are dependent on a models hidden size. It is recommended to set these values to `auto` and allow the [`Trainer`] to automatically assign the values. +* `reduce_bucket_size`, `stage3_prefetch_bucket_size`, and `stage3_param_persistence_threshold` are dependent on a models hidden size. It is recommended to set these values to `auto` and allow [`Trainer`] to automatically assign the values. ```yml { @@ -244,7 +244,7 @@ ZeRO-3 shards the optimizer and gradient states, and parameters across GPUs. Unl } ``` -#### Initialize large models +### Initialize large models With ZeRO-3, use the [deepspeed.zero.Init](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) context manager to initialize a model faster. @@ -278,7 +278,7 @@ If you encounter a model parameter weight where `tensor([1.])` or the parameter tensor([1.0], device="cuda:0", dtype=torch.float16, requires_grad=True) ``` -[!TIP] +> [!TIP] > For more information about initializing large models with ZeRO-3 and accessing the parameters, take a look at the [Constructing Massive Models](https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models) and [Gathering Parameters](https://deepspeed.readthedocs.io/en/latest/zero3.html#gathering-parameters) guides. @@ -374,10 +374,10 @@ DeepSpeed supports many training features that can be configured in the config f ### Gradient checkpointing -Gradient checkpointing saves memory by only storing some of the intermediate activations instead of storing *all* of them. It is useful for fitting larger models on the GPU without running out of memory or to increase the batch size for better performance. Training speed is slower though. +Gradient checkpointing saves memory by only storing *some* of the intermediate activations instead of storing *all* of them. It is useful for fitting larger models on the GPU without running out of memory or to increase the batch size for better performance. Training speed is slower though. -* For a Hugging Face model, set `model.gradient_checkpointing_enable()` or add `--gradient_checkpointing` in the [`TrainingArguments`]. -* For a non-Hugging Face model, use the DeepSpeed [Activation Checkpointing API](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html). 
Replacing Transformers modeling code and [torch.utils.checkpoint](https://pytorch.org/docs/stable/checkpoint.html) with the DeepSpeed API gives you more flexibility because you can offload the forward activations to the CPU memory instead of recalculating them. +* For a Transformers model, set `model.gradient_checkpointing_enable()` or add `--gradient_checkpointing` in the [`TrainingArguments`]. +* For a non-Transformers model, use the DeepSpeed [Activation Checkpointing API](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html). Replacing Transformers modeling code and [torch.utils.checkpoint](https://pytorch.org/docs/stable/checkpoint.html) with the DeepSpeed API gives you more flexibility because you can offload the forward activations to the CPU memory instead of recalculating them. ### Batch size @@ -396,7 +396,7 @@ A separate data type is used for communication collectives like reduction, gathe All gather and scatter operations are performed in the same data type the data is in. For example, if you're training in bf16, the data is also gathered in bf16 because gathering is a non-lossy operation. -Reduce operations are lossy, for example, when gradients are averaged across multiple GPUs. When the communication is done if fp16 or bf16, it's more likely to be lossy because adding multiple numbers in low precision isn't exact. This is especially the case with bf16 which has a lower precision than fp16. For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients. +Reduce operations are lossy, for example, when gradients are averaged across multiple GPUs. When the communication is done in fp16 or bf16, it's more likely to be lossy because adding multiple numbers in low precision isn't exact. This is especially the case with bf16 which has a lower precision than fp16. For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients. Choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it's downcasted to whichever half-precision data type you're training in. @@ -447,7 +447,7 @@ Train in fp32 if a model wasn't pretrained in mixed precision because it may cau } ``` -For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) mode is automatically enabled for some operations but the results are still in fp32. Configure it from the [`Trainer`] by setting `--tf32` to enable it, and `--tf32 0` or `--no_tf32` to disable it. +For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) mode is automatically enabled for some operations but the results are still in fp32. Configure it in [`Trainer`] by setting `--tf32` to enable it, and `--tf32 0` or `--no_tf32` to disable it. @@ -486,7 +486,7 @@ To configure Apex-like fp16 mixed precision, set up the config as shown below wi > [!TIP] > bf16 requires DeepSpeed 0.6.0. -bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because bf16s low precision can lead to lossy accumulation. 
+bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation. bf16 can be set up in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`. @@ -575,11 +575,25 @@ You can set the parameters to `"auto"` or manually input your own values. +### Universal checkpointing + +[Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) saves and loads model, optimizer and training scheduler states across different model architectures, parallelism techniques, and training configurations. By saving them in a Universal format, it enables easier model training continuation and fine-tuning. + +Resume training with a Universal checkpoint by setting `load_universal` to `true` in the config file. + +```yaml +{ + "checkpoint": { + "load_universal": true + } +} +``` + ## Deploy DeepSpeed can be deployed with its native launcher, [torchrun](https://pytorch.org/docs/stable/elastic/run.html) or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). -Add the `--deepspeed ds_config.json` argument to the [`Trainer`] command line. It is recommended to use DeepSpeeds [add_config_arguments](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any other command line arguments to your code. +Add the `--deepspeed ds_config.json` argument to [`Trainer`] in the command line. It is recommended to use DeepSpeeds [add_config_arguments](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any other command line arguments to your code. @@ -636,7 +650,7 @@ A multi-node setup consists of multiple nodes, where each node has one of more G You could also use the `--save_on_each_node` parameter in [`TrainingArguments`] to automatically add the above `checkpoint` to your config. -The examples below for the torchrun and DeepSpeed launcher shows how to deploy two nodes with eight GPUs each. Access the first node with `ssh hotname1` and the second node with `ssh hostname2`. Both nodes must be able to communicate with each other locally over ssh without a password. +The examples below for the torchrun and DeepSpeed launcher shows how to deploy two nodes with eight GPUs each. Access the first node with `ssh hostname1` and the second node with `ssh hostname2`. Both nodes must be able to communicate with each other locally over ssh without a password. @@ -808,7 +822,7 @@ DeepSpeed stores the main fp32 weights in custom checkpoint optimizer files (`gl ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3, set `"stage3_gather_16bit_weights_on_model_save": true` in the config file, because the weights are distributed across multiple GPUs. -If you don't, [`Trainer`] won't save the weights in fp16 and it won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it. +If you don't, [`Trainer`] won't save the weights in fp16 and won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it. 
```yaml { @@ -821,7 +835,7 @@ If you don't, [`Trainer`] won't save the weights in fp16 and it won't create a ` ### fp32 -fp32 weights shouldn't be saved during training because it can require a lot of memory, unless you have a lot of free CPU memory. It is usually best to save the fp32 weights offline after training is complete. +Unless you have a lot of free CPU memory, fp32 weights shouldn't be saved during training because it can require a lot of memory. It is usually best to save the fp32 weights offline after training is complete. @@ -896,7 +910,7 @@ model.load_state_dict(state_dict) DeepSpeed also works with Transformers without [`Trainer`]. The [`~integrations.HfDeepSpeedConfig`] is responsible for gathering ZeRO-3 parameters and partitioning a model across multiple GPUs when [`~PreTrainedModel.from_pretrained`] is called. -You must instantiate [`HfDeepSpeedConfig`] before loading a model to efficiently deploy ZeRO-3. +You must instantiate [`~integrations.HfDeepSpeedConfig`] before loading a model to efficiently deploy ZeRO-3. @@ -917,7 +931,7 @@ engine = deepspeed.initialize(model=model, config_params=ds_config, ...) -[`HfDeepSpeedConfig`] is not required for ZeRO-1 or ZeRO-2. +[`~integrations.HfDeepSpeedConfig`] is not required for ZeRO-1 or ZeRO-2. ```py from transformers.integrations import HfDeepSpeedConfig @@ -939,7 +953,7 @@ engine = deepspeed.initialize(model=model, config_params=ds_config, ...) ## Troubleshoot -One of the first things to check when you encounter an error is whether DeepSpeed is the cause because often it isn't. Retry your setup without DeepSpeed, and if the error persists, report the issue. If the issue is unrelated to the Transformers integration, please open the issue on the DeepSpeed [repository](https://github.com/microsoft/DeepSpeed). +One of the first things to check when you encounter an error is whether DeepSpeed is the cause (because often it isn't). Retry your setup without DeepSpeed, and if the error persists, report the issue. If the issue is unrelated to the Transformers integration, please open the issue on the DeepSpeed [repository](https://github.com/microsoft/DeepSpeed). For issues related to the Transformers integration, please provide the following information. @@ -960,7 +974,7 @@ The following sections provide a guide for resolving two of the most common issu ### Process killed at startup -When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than is available on your system. Or the process may have tried to allocate more CPU memory than allowed, leading the OS kernel to termine the process. +When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than is available on your system. Or the process may have tried to allocate more CPU memory than allowed, leading the OS kernel to terminate the process. In this case, check whether your config file has either `offload_optimizer`, `offlload_param`, or both configured to offload to the CPU. diff --git a/docs/source/en/fsdp.md b/docs/source/en/fsdp.md index 51817045f2d0..944c5a18e109 100644 --- a/docs/source/en/fsdp.md +++ b/docs/source/en/fsdp.md @@ -20,7 +20,7 @@ rendered properly in your Markdown viewer. Unlike [DistributedDataParallel (DDP)](./perf_train_gpu_many#distributeddataparallel), FSDP saves more memory because it doesn't replicate a model on each GPU. 
It shards the models parameters, gradients and optimizer states across GPUs. Each model shard processes a portion of the data and the results are synchronized to speed up training. -This guide covers how to setup training a model with FSDP using [Accelerate](https://hf.co/docs/accelerate/index), a library for managing distributed training. +This guide covers how to set up training a model with FSDP and [Accelerate](https://hf.co/docs/accelerate/index), a library for managing distributed training. ```bash pip install accelerate @@ -28,7 +28,7 @@ pip install accelerate ## Configuration options -Always start by running the [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config) command to help Accelerate setup the correct distributed training environment. +Always start by running the [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config) command to help Accelerate set up the correct distributed training environment. ```bash accelerate config @@ -66,7 +66,7 @@ Size-based wrapping is also available. If a layer exceeds a certain number of pa Intermediate checkpoints should be saved as a sharded state dict because saving the full state dict - even with CPU offloading - is time consuming and can cause `NCCL Timeout` errors due to indefinite hanging during broadcasting. -Specify `fsdp_state_dict_type: SHARDED_STATE_DICT` in the configuration file to save the sharded state dict. Now you can resume training from the sharded state dict with the [`~accelerate.Accelerator.load_state`] method. +Specify `fsdp_state_dict_type: SHARDED_STATE_DICT` in the configuration file to save the sharded state dict. Now you can resume training from the sharded state dict with [`~accelerate.Accelerator.load_state`]. ```py accelerator.load_state("directory/containing/checkpoints") @@ -93,7 +93,7 @@ xla_fsdp_grad_ckpt: True # enable gradient checkpointing ## Training -After running [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config), your configuration file should be ready. An example configuration file is show below that fully shards the parameter, gradient and optimizer states on two GPUs. Your file may look different depending on how you setup your configuration. +After running [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config), your configuration file should be ready. An example configuration file is shown below that fully shards the parameter, gradient and optimizer states on two GPUs. Your file may look different depending on how you set up your configuration. ```yaml compute_environment: LOCAL_MACHINE @@ -138,7 +138,7 @@ accelerate launch --fsdp="full shard" --fsdp_config="path/to/fsdp_config/" my-tr ## Resources -FSDP is a powerful tool for training large models with fewer GPUs compared to some other parallelism strategies. Refer to the following resources below to learn even more about FSDP. +FSDP is a powerful tool for training large models with fewer GPUs compared to other parallelism strategies. Refer to the following resources below to learn even more about FSDP. - Follow along with the more in-depth Accelerate guide for [FSDP](https://hf.co/docs/accelerate/usage_guides/fsdp). - Read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) blog post. 
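A note to round out the FSDP hunks above: the training script itself does not need any FSDP-specific code. A plain [`Trainer`] script like the hedged sketch below is sharded automatically when launched with `accelerate launch` and the configuration file shown earlier (the model, dataset, and file names here are illustrative assumptions, not taken from the guide).

```py
# train.py - minimal Trainer script to pair with an FSDP accelerate config
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

model_id = "bert-base-uncased"  # assumption: any sequence classification checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# small slice of a public dataset, padded to a fixed length so the default collator works
dataset = load_dataset("imdb", split="train[:1%]")
dataset = dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128),
    batched=True,
)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="fsdp-test", per_device_train_batch_size=8),
    train_dataset=dataset,
)
trainer.train()
```

Launched as `accelerate launch --config_file fsdp_config.yaml train.py` (the config filename is an assumption), Accelerate wraps the model in FSDP according to the yaml, so no code changes are required.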
diff --git a/docs/source/en/gpu_selection.md b/docs/source/en/gpu_selection.md
new file mode 100644
index 000000000000..749fcf3c2dda
--- /dev/null
+++ b/docs/source/en/gpu_selection.md
@@ -0,0 +1,94 @@
+
+
+# GPU selection
+
+During distributed training, you can specify the number of GPUs to use and in what order. This can be useful when you have GPUs with different computing power and you want to use the faster GPU first. Or you could only use a subset of the available GPUs. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
+
+This guide will show you how to select the number of GPUs to use and the order to use them in.
+
+## Number of GPUs
+
+For example, if there are 4 GPUs and you only want to use the first 2, run the command below.
+
+
+
+Use `--nproc_per_node` to select how many GPUs to use.
+
+```bash
+torchrun --nproc_per_node=2 trainer-program.py ...
+```
+
+
+
+Use `--num_processes` to select how many GPUs to use.
+
+```bash
+accelerate launch --num_processes 2 trainer-program.py ...
+```
+
+
+
+Use `--num_gpus` to select how many GPUs to use.
+
+```bash
+deepspeed --num_gpus 2 trainer-program.py ...
+```
+
+
+
+## Order of GPUs
+
+To select specific GPUs to use and their order, configure the `CUDA_VISIBLE_DEVICES` environment variable. It is easiest to set the environment variable in `~/.bashrc` or another startup config file. `CUDA_VISIBLE_DEVICES` is used to map which GPUs are used. For example, if there are 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
+```
+
+Only the 2 physical GPUs (0 and 2) are "visible" to PyTorch and these are mapped to `cuda:0` and `cuda:1` respectively. You can also reverse the order of the GPUs to use 2 first. The mapping becomes `cuda:1` for GPU 0 and `cuda:0` for GPU 2.
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
+```
+
+You can also set the `CUDA_VISIBLE_DEVICES` environment variable to an empty value to create an environment without GPUs.
+
+```bash
+CUDA_VISIBLE_DEVICES= python trainer-program.py ...
+```
+
+> [!WARNING]
+> As with any environment variable, it can be exported instead of being added to the command line. However, this is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong GPUs. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
+
+`CUDA_DEVICE_ORDER` is an alternative environment variable you can use to control how the GPUs are ordered. You can order according to the following.
+
+1. PCIe bus IDs that match the order of [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) and [`rocm-smi`](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/.doxygen/docBin/html/index.html) for NVIDIA and AMD GPUs respectively.
+
+```bash
+export CUDA_DEVICE_ORDER=PCI_BUS_ID
+```
+
+2. GPU compute ability.
+
+```bash
+export CUDA_DEVICE_ORDER=FASTEST_FIRST
+```
+
+The `CUDA_DEVICE_ORDER` is especially useful if your training setup consists of an older and newer GPU, where the older GPU appears first, but you cannot physically swap the cards to make the newer GPU appear first.
In this case, set `CUDA_DEVICE_ORDER=FASTEST_FIRST` to always use the newer and faster GPU first (`nvidia-smi` or `rocm-smi` still reports the GPUs in their PCIe order). Or you could also set `export CUDA_VISIBLE_DEVICES=1,0`. \ No newline at end of file diff --git a/docs/source/en/hpo_train.md b/docs/source/en/hpo_train.md index 60e5451b8e35..303ff6fb53b4 100644 --- a/docs/source/en/hpo_train.md +++ b/docs/source/en/hpo_train.md @@ -70,7 +70,7 @@ The following examples demonstrate how to perform a hyperparameter search for th -[Optuna](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html#sphx-glr-tutorial-10-key-features-002-configurations-py) optimizes categorical, integers, and floats. +[Optuna](https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html#sphx-glr-tutorial-10-key-features-002-configurations-py) optimizes categories, integers, and floats. ```py def optuna_hp_space(trial): @@ -91,7 +91,7 @@ best_trials = trainer.hyperparameter_search( -[Ray Tune](https://docs.ray.io/en/latest/tune/api/search_space.html) optimizes floats, integers, and categorical parameters but it also offers multiple sampling distributions for each parameter such as uniform and log-uniform. +[Ray Tune](https://docs.ray.io/en/latest/tune/api/search_space.html) optimizes floats, integers, and categorical parameters. It also offers multiple sampling distributions for each parameter such as uniform and log-uniform. ```py def ray_hp_space(trial): @@ -137,7 +137,7 @@ best_trials = trainer.hyperparameter_search( -[Weights & Biases](https://docs.wandb.ai/guides/sweeps/sweep-config-keys) also optimizes integers, floats, and categorical parameters but it also includes support for different search strategies and distribution options. +[Weights & Biases](https://docs.wandb.ai/guides/sweeps/sweep-config-keys) also optimizes integers, floats, and categorical parameters. It also includes support for different search strategies and distribution options. ```py def wandb_hp_space(trial): diff --git a/docs/source/en/optimizers.md b/docs/source/en/optimizers.md index f9de3d206e92..ae4637b2b6b0 100644 --- a/docs/source/en/optimizers.md +++ b/docs/source/en/optimizers.md @@ -16,9 +16,9 @@ rendered properly in your Markdown viewer. # Optimizers -Transformers offers two optimizers natively, AdamW and AdaFactor. But it also provides integrations for other more specialized optimizers. Install the library that offers the optimizer and drop it in to the `optim` parameter in [`TrainingArguments`]. +Transformers offers two native optimizers, AdamW and AdaFactor. It also provides integrations for more specialized optimizers. Install the library that offers the optimizer and drop it in the `optim` parameter in [`TrainingArguments`]. -This guide will show you how to use these optimizers with [`Trainer`] using the [`TrainingArguments`] shown below. +This guide will show you how to use these optimizers with [`Trainer`] using [`TrainingArguments`] shown below. ```py import torch diff --git a/docs/source/en/peft.md b/docs/source/en/peft.md index e69cca412528..4ee0e2681963 100644 --- a/docs/source/en/peft.md +++ b/docs/source/en/peft.md @@ -13,9 +13,9 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -[PEFT](https://huggingface.co/docs/peft/index), a library of parameter-efficient finetuning methods, enable training and storing large models often on consumer GPUs. 
These methods only finetune a small number of extra model parameters, also known as adapters, on top of the pretrained model. A significant amount of memory is saved because the GPU doesn't need to store the optimizer states and gradients for the pretrained base model. Adapters are very lightweight, making it convenient to share, store, and load them. +[PEFT](https://huggingface.co/docs/peft/index), a library of parameter-efficient fine-tuning methods, enables training and storing large models on consumer GPUs. These methods only fine-tune a small number of extra model parameters, also known as adapters, on top of the pretrained model. A significant amount of memory is saved because the GPU doesn't need to store the optimizer states and gradients for the pretrained base model. Adapters are very lightweight, making it convenient to share, store, and load them. -This guide provides a short overview of the PEFT library and how to use it for training with Transformers. For more details, refer to the PEFT [documentation](https://huggingface.co/docs/peft/index). +This guide provides a short introduction to the PEFT library and how to use it for training with Transformers. For more details, refer to the PEFT [documentation](https://huggingface.co/docs/peft/index). Install PEFT with the command below. @@ -37,9 +37,9 @@ pip install git+https://github.com/huggingface/peft.git > [!TIP] -> PEFT currently supports the LoRA, IA3, and AdaLoRA methods for Transformers. To use another PEFT method, such as prompt learniing or prompt tuning, you'll need to use the PEFT library directly. +> PEFT currently supports the LoRA, IA3, and AdaLoRA methods for Transformers. To use another PEFT method, such as prompt learning or prompt tuning, use the PEFT library directly. -[Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a very common PEFT method that decomposes the weight matrix into two smaller trainable matrices. Start by defining a [`~peft.LoraConfig`] object with the parameters shown below. +[Low-Rank Adaptation (LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) is a very common PEFT method that decomposes the weight matrix into two smaller trainable matrices. Start by defining a [LoraConfig](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig) object with the parameters shown below. ```py from peft import LoraConfig, TaskType, get_peft_model @@ -55,7 +55,7 @@ lora_config = LoraConfig( ) ``` -Add [`~peft.LoraConfig`] to the model with [`~integrations.PeftAdapterMixin.add_adapter`]. The model is ready to be passed to [`Trainer`] for training. +Add [LoraConfig](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig) to the model with [`~integrations.PeftAdapterMixin.add_adapter`]. The model is now ready to be passed to [`Trainer`] for training. ```py model.add_adapter(lora_config, adapter_name="lora_1") @@ -108,6 +108,9 @@ model = AutoModelForCausalLM.from_pretrained("google/gemma-7b") model.load_adapter("klcsp/gemma7b-lora-alpaca-11-v1") ``` + + + For very large models, it is helpful to load a quantized version of the model in 8 or 4-bit precision to save memory. Transformers supports quantization with its [bitsandbytes](https://huggingface.co/docs/bitsandbytes/index) integration. Specify in [`BitsAndBytesConfig`] whether you want to load a model in 8 or 4-bit precision. 
For multiple devices, add `device_map="auto"` to automatically distribute the model across your hardware. @@ -124,7 +127,7 @@ model = AutoModelForCausalLM.from_pretrained( ## Set adapter -[`~integrations.PeftAdapterMixin.add_adapter`] adds a new adapter to a model. To add a second adapter, the new adapter must be the same type as the first adapter. Use the [`~integrations.PeftAdapterMixin.add_adapter.adapter_name`] parameter to assign a name to the adapter. +[`~integrations.PeftAdapterMixin.add_adapter`] adds a new adapter to a model. To add a second adapter, the new adapter must be the same type as the first adapter. Use the `adapter_name` parameter to assign a name to the adapter. ```py model.add_adapter(lora_config, adapter_name="lora_2") diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index 2f87641008a7..1eab6afbde23 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -15,11 +15,17 @@ rendered properly in your Markdown viewer. # CPU -A modern CPU is capable of efficiently training large models by leveraging the underlying optimizations built into the hardware and training on fp16 or bf16 datatypes. +A modern CPU is capable of efficiently training large models by leveraging the underlying optimizations built into the hardware and training on fp16 or bf16 data types. This guide focuses on how to train large models on an Intel CPU using mixed precision and the [Intel Extension for PyTorch (IPEX)](https://intel.github.io/intel-extension-for-pytorch/index.html) library. -Install IPEX with the command below. You can find your PyTorch version by running `pip list | grep torch` in the command line. +You can Find your PyTorch version by running the command below. + +```bash +pip list | grep torch +``` + +Install IPEX with the PyTorch version from above. ```bash pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu @@ -30,7 +36,7 @@ pip install intel_extension_for_pytorch== -f https://developer.int IPEX provides additional performance optimizations for Intel CPUs. These include additional CPU instruction level architecture (ISA) support such as [Intel AVX512-VNNI](https://en.wikichip.org/wiki/x86/avx512_vnni) and [Intel AMX](https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/what-is-intel-amx.html). Both of these features are designed to accelerate matrix multiplication. Older AMD and Intel CPUs with only Intel AVX2, however, aren't guaranteed better performance with IPEX. -IPEX also supports [Auto Mixed Precision (AMP)](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html) training with the fp16 and bf16 datatypes. Reducing precision speeds up training and reduces memory usage because it requires less computation. The loss in accuracy from using full-precision is minimal. 3rd, 4th, and 5th generation Intel Xeon Scalable processors natively support bf16, and the 6th generation processor also natively supports fp16 in addition to bf16. +IPEX also supports [Auto Mixed Precision (AMP)](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html) training with the fp16 and bf16 data types. Reducing precision speeds up training and reduces memory usage because it requires less computation. The loss in accuracy from using full-precision is minimal. 
3rd, 4th, and 5th generation Intel Xeon Scalable processors natively support bf16, and the 6th generation processor also natively supports fp16 in addition to bf16. AMP is enabled for CPU backends training with PyTorch. diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md index 4f14329abfd1..bd332d15e1ba 100644 --- a/docs/source/en/perf_train_cpu_many.md +++ b/docs/source/en/perf_train_cpu_many.md @@ -15,13 +15,13 @@ rendered properly in your Markdown viewer. # Distributed CPUs -CPUs are commonly available and can be a cost-effective option for training when GPUs are unavailable. When training large models or if a single CPU is too slow, distributed training with CPUs can help speed up training. +CPUs are commonly available and can be a cost-effective training option when GPUs are unavailable. When training large models or if a single CPU is too slow, distributed training with CPUs can help speed up training. This guide demonstrates how to perform distributed training with multiple CPUs using a [DistributedDataParallel (DDP)](./perf_train_gpu_many#distributeddataparallel) strategy on bare metal with [`Trainer`] and a Kubernetes cluster. All examples shown in this guide depend on the [Intel oneAPI HPC Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/hpc-toolkit.html). There are two toolkits you'll need from Intel oneAPI. -1. [oneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html) is a toolkit that includes efficient implementations of collectives commonly used in deep learning such as all-gather, all-reduce, and reduce-scatter. To install from a prebuilt wheel, make sure you always use the latest release. Refer to the table [here](https://github.com/intel/torch-ccl#install-prebuilt-wheel) to check if a version of oneCCL is supported for a Python and PyTorch version. +1. [oneCCL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/oneccl.html) includes efficient implementations of collectives commonly used in deep learning such as all-gather, all-reduce, and reduce-scatter. To install from a prebuilt wheel, make sure you always use the latest release. Refer to the table [here](https://github.com/intel/torch-ccl#install-prebuilt-wheel) to check if a version of oneCCL is supported for a Python and PyTorch version. ```bash # installs oneCCL for PyTorch 2.4.0 diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md index 12e4c27d3679..2f6b2d4da9de 100644 --- a/docs/source/en/perf_train_gpu_one.md +++ b/docs/source/en/perf_train_gpu_one.md @@ -17,7 +17,7 @@ rendered properly in your Markdown viewer. GPUs are commonly used to train deep learning models due to their high memory bandwidth and parallel processing capabilities. Depending on your GPU and model size, it is possible to even train models with billions of parameters. The key is to find the right balance between GPU memory utilization (data throughput/training time) and training speed. -This guide will show you the features available in Transformers for efficiently training a model on a single GPU. In many cases, you'll want to use a combination of these features to optimize training. +This guide will show you the features available in Transformers and PyTorch for efficiently training a model on GPUs. In many cases, you'll want to use a combination of these features to optimize training. Refer to the table below to quickly help you identify the features relevant to your training scenario. 
@@ -39,7 +39,7 @@ Refer to the table below to quickly help you identify the features relevant to y

### Batch size

-Batch size is one of the most important hyperparameters for efficient GPU training because it affects memory usage and training speed. Larger batch sizes lead to faster training because it takes advantage of GPUs parallel processing power. It is recommended to use batch sizes that are powers of 2, such as 8, 64, 128, 256, 512, etc. The batch size depends on your GPU and the models data type.
+Batch size is one of the most important hyperparameters for efficient GPU training because it affects memory usage and training speed. Larger batch sizes lead to faster training because it takes advantage of a GPU's parallel processing power. It is recommended to use batch sizes that are powers of 2, such as 8, 64, 128, 256, 512, etc. The batch size depends on your GPU and the model's data type.

Configure [`~TrainingArguments.per_device_train_batch_size`] in [`TrainingArguments`].

@@ -54,7 +54,7 @@ args = TrainingArguments(

Refer to the NVIDIA [Performance](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) guide to learn more about how input features and output neuron counts and batch size affect performance. These are involved in the General Matrix Multiplications (GEMMs) performed by the GPU. Larger parameters are better for parallelization and efficiency.

-The [Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) section is also useful for selecting a batch size that maximizes the speed of tensor multiplication based on the data type and GPU. For example, multiples of 8 are recommended for fp16, unless it's an A100 GPU, in which case use multiples of 65.
+The [Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) section is also useful for selecting a batch size that maximizes the speed of tensor multiplication based on the data type and GPU. For example, multiples of 8 are recommended for fp16, unless it's an A100 GPU, in which case use multiples of 64.

Finally, consider [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) for smaller parameters. Tile quantization results when matrix dimensions aren't divisible by a GPUs thread block tile size, causing the GPU to underutilize its resources. Selecting the correct batch size multiplier, such that the matrix is divisible by the tile size, can significantly speed up training.

@@ -121,7 +121,7 @@ args = TrainingArguments(
)
```

-fp16 doesn't memory-optimized because the gradients that are computed in fp16 are converted back to fp32 during the optimization step. You may end up using more GPU memory, especially for small batch sizes, because there are now two versions (fp16 and fp32) of the model on the GPU.
+fp16 isn't memory-optimized because the gradients that are computed in fp16 are converted back to fp32 during the optimization step. You may end up using more GPU memory, especially for small batch sizes, because there are now two versions (fp16 and fp32) of the model on the GPU.
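If your hardware supports it, bf16 is a common alternative to the fp16 setting discussed above because it doesn't require loss scaling. This is a minimal sketch; the flag follows the same pattern as `fp16` in this guide and assumes an Ampere or newer GPU.

```py
from transformers import TrainingArguments

# minimal sketch: bf16 mixed precision instead of fp16
args = TrainingArguments(
    output_dir="./outputs",
    bf16=True,
)
```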
@@ -224,7 +224,7 @@ PyTorch provides several features for reducing memory requirements and increasin The [torch.cuda.empty_cache](https://pytorch.org/docs/stable/generated/torch.cuda.empty_cache.html#torch.cuda.empty_cache) function releases unused cached memory, which can help avoid out-of-memory (OOM) errors at the cost of ~10% slower training. -Configure [torch_empty_cache_steps()](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps) in [`TrainingArguments`] to enable torch.empty_cache after a certain number of training steps. +Use [torch_empty_cache_steps()](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.torch_empty_cache_steps) in [`TrainingArguments`] to enable it after a certain number of training steps. ```py from transformers import TrainingArguments @@ -234,7 +234,7 @@ args = TrainingArguments( gradient_accumulation_steps=16, gradient_checkpointing=True, bf16=True, - optim="adamw_bnb_8bit, + optim="adamw_bnb_8bit", dataloader_pin_memory=True, dataloader_num_workers=4, torch_empty_cache_steps=4, diff --git a/docs/source/en/perf_train_special.md b/docs/source/en/perf_train_special.md index 16611f16da4c..128f83c23ad7 100644 --- a/docs/source/en/perf_train_special.md +++ b/docs/source/en/perf_train_special.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # Apple Silicon -Apple Silicon (M series) features a unified memory architecture making it possible to efficiently train large models locally and improves performance by reducing latency associated with data retrieval. You can take advantage of Apple Silicon for training with PyTorch due to its integration with [Metal Performance Shaders (MPS)](https://pytorch.org/docs/stable/notes/mps.html). +Apple Silicon (M series) features a unified memory architecture, making it possible to efficiently train large models locally and improves performance by reducing latency associated with data retrieval. You can take advantage of Apple Silicon for training with PyTorch due to its integration with [Metal Performance Shaders (MPS)](https://pytorch.org/docs/stable/notes/mps.html). The `mps` backend requires macOS 12.3 or later. diff --git a/docs/source/en/perf_train_tpu_tf.md b/docs/source/en/perf_train_tpu_tf.md index caf95b69c939..286ff530a817 100644 --- a/docs/source/en/perf_train_tpu_tf.md +++ b/docs/source/en/perf_train_tpu_tf.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # TPU -TPU, Tensor Processing Unit, is a type of hardware designed to accelerate tensor computations for training and inference. TPUs are generally accessed through Google's cloud services, but smaller TPUs are also available for free from [Google Colab](https://colab.research.google.com/notebooks/tpu.ipynb) or [Kaggle](https://www.kaggle.com/docs/tpu). +TPU (Tensor Processing Unit) is a type of hardware designed to accelerate tensor computations for training and inference. TPUs are generally accessed through Google cloud services, but smaller TPUs are also available for free from [Google Colab](https://colab.research.google.com/notebooks/tpu.ipynb) or [Kaggle](https://www.kaggle.com/docs/tpu). This guide focuses on training a Keras model for sequence classification on a TPU from Google Colab. Make sure the TPU runtime is enabled by going to **Runtime > Change runtime type** and selecting a TPU. 
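As a quick check for the Apple Silicon section above, PyTorch exposes Metal through the `mps` device. This is a minimal sketch for verifying the backend is available (macOS 12.3 or later) before training.

```py
import torch

# fall back to CPU if the Metal Performance Shaders (MPS) backend isn't available
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(device)
```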
@@ -41,7 +41,7 @@ There are various distribution strategies for running your model on multiple TPU strategy = tf.distribute.TPUStrategy(resolver) ``` -Load and tokenize a dataset - this example uses [CoLA](https://huggingface.co/datasets/nyu-mll/glue/viewer/cola) from the GLUE benchmark - and pad all samples to the maximum length so it is easier to load as an array and to avoid XLA compilation issues. +Load and tokenize a dataset - this example uses [CoLA](https://huggingface.co/datasets/nyu-mll/glue/viewer/cola) from the GLUE benchmark - and pad all samples to the maximum length so it is easier to load as an array and to avoid [XLA compilation issues](#xla). ```py from transformers import AutoTokenizer @@ -90,7 +90,7 @@ model.fit(tf_dataset) ## Large datasets -The dataset created above pads every sample to the maximum length and loads the whole dataset into memory. This may not be possible if you're working with larger datasets. When training on large datasets, you may want to create a [tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) instead of stream the data. +The dataset created above pads every sample to the maximum length and loads the whole dataset into memory. This may not be possible if you're working with larger datasets. When training on large datasets, you may want to create a [tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) or stream the data. ### tf.TFRecord @@ -246,18 +246,18 @@ with strategy.scope(): model.fit(tf_dataset) ``` -### Stream dataset with prepare_tf_dataset +### Stream with prepare_tf_dataset -[`~TFPreTrainedModel.prepare_tf_dataset`] creates a `tf.data` pipeline that loads samples from [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). The pipeline uses [tf.numpy_function]() or [`~datasets.Dataset.from_generator`], which can't be compiled by TensorFlow, to access the underlying `tf.data.Dataset`. It also won't work on a Colab TPU or TPU Nodes because the pipeline stream data from a local disk. Refer to the table below to help you decide whether this approach is helpful for you. +[`~TFPreTrainedModel.prepare_tf_dataset`] creates a `tf.data` pipeline that loads samples from [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). The pipeline uses [tf.numpy_function]() or [`~datasets.Dataset.from_generator`], which can't be compiled by TensorFlow, to access the underlying `tf.data.Dataset`. It also won't work on a Colab TPU or TPU Nodes because the pipeline streams data from a local disk. Refer to the table below to help you decide whether this approach is helpful for you. -| pros | cons | | | | -|---|---|---|---|---| -| simple code | only works on TPU VM | | | | -| same approach on TPU/GPU | data must be available as a Hugging Face Dataset | | | | -| dataset doesn't have to fit in memory | data must fit on local storage | | | | -| supports variable padding | data loading may be a bottleneck on a big TPU pod slice | | | | +| pros | cons | +|---|---| +| simple code | only works on TPU VM | +| same approach on TPU/GPU | data must be available as a Hugging Face Dataset | +| dataset doesn't have to fit in memory | data must fit on local storage | +| supports variable padding | data loading may be a bottleneck on a big TPU pod slice | -[`~TFPreTrainedModel.prepare_tf_dataset`] only works on **TPU VM**. Add the tokenizer output as columns in the dataset since the dataset is stored on disk, which means it can handle data larger than the available memory. 
Use [`~TFPreTrainedModel.prepare_tf_dataset`] to stream data from the dataset by wrapping it with a `tf.data` pipeline. +[`~TFPreTrainedModel.prepare_tf_dataset`] only works on [TPU VM](#tpu-types). Add the tokenizer output as columns in the dataset since the dataset is stored on disk, which means it can handle data larger than the available memory. Use [`~TFPreTrainedModel.prepare_tf_dataset`] to stream data from the dataset by wrapping it with a `tf.data` pipeline. ```py def tokenize_function(examples): @@ -289,7 +289,7 @@ model.fit(tf_dataset) There are two types of TPUs, a TPU Node and a TPU VM. -A TPU Node indirectly accesses a remote TPU. It requires a separate VM to initialize your network and data pipeline and then forwards it to the remote node. Google Colab TPUs are an example of a TPU Node. You can't use local data because the TPU is remotely located, and data must be stored in Google Cloud Storage where the data pipeline can access it. +A TPU Node indirectly accesses a remote TPU. It requires a separate VM to initialize your network and data pipeline, and then forwards it to the remote node. Google Colab TPUs are an example of a TPU Node. You can't use local data because the TPU is remotely located, and data must be stored in Google Cloud Storage where the data pipeline can access it. TPU VM are connected directly to the machine the TPU is located on, and they are generally easier to work with, especially when it comes to your data pipeline. diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index ddcef7994713..0459916d7ad9 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -18,15 +18,12 @@ rendered properly in your Markdown viewer. [`Trainer`] is a complete training and evaluation loop for Transformers' PyTorch models. Plug a model, preprocessor, dataset, and training arguments into [`Trainer`] and let it handle the rest to start training faster. -This guide will show you how [`Trainer`] works and how to customize it for your use case with a callback. +[`Trainer`] is also powered by [Accelerate](https://hf.co/docs/accelerate/index), a library for handling large models for distributed training. -[`Trainer`] is powered by [Accelerate](https://hf.co/docs/accelerate/index), a library for handling large models for distributed training, so make sure it is installed. +This guide will show you how [`Trainer`] works and how to customize it for your use case with a callback. ```bash -!pip install accelerate - -# upgrade to the latest version -# !pip install accelerate --upgrade +!pip install accelerate --upgrade ``` [`Trainer`] contains all the necessary components of a training loop. @@ -38,12 +35,12 @@ This guide will show you how [`Trainer`] works and how to customize it for your Manually coding this training loop everytime can be inconvenient or a barrier if you're just getting started with machine learning. [`Trainer`] abstracts this process, allowing you to focus on the model, dataset, and training design choices. -Configure your training with hyperparameters and options from [`TrainingArguments`] which supports a ton of features such as distributed training, torch.compile, mixed precision training, and saving the model to the Hub. +Configure your training with hyperparameters and options from [`TrainingArguments`] which supports many features such as distributed training, torch.compile, mixed precision training, and saving the model to the Hub. 
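As a brief, hedged sketch of how the features mentioned above map to [`TrainingArguments`] flags (the full example follows below; the flag names assume a recent release):

```py
from transformers import TrainingArguments

# sketch: a few of the features mentioned above as TrainingArguments options
args = TrainingArguments(
    output_dir="./outputs",
    bf16=True,           # mixed precision training
    torch_compile=True,  # compile the model with torch.compile
    push_to_hub=True,    # save the model to the Hub
)
```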
> [!TIP] > The number of available parameters available in [`TrainingArguments`] may be intimidating at first. If there is a specific hyperparameter or feature you want to use, try searching for it directly. Otherwise, feel free to start with the default values and gradually customize them as you become more familiar with the training process. -The example below demonstrates an example instance of [`TrainingArguments`] that evaluates and saves the model at the end of each epoch. It also loads the best model found during training and pushes it to the Hub. +The example below demonstrates an example of [`TrainingArguments`] that evaluates and saves the model at the end of each epoch. It also loads the best model found during training and pushes it to the Hub. ```py from transformers import TrainingArguments @@ -62,10 +59,10 @@ training_args = TrainingArguments( ) ``` -Pass your model, dataset, preprocessor, and [`TrainingArguments`] to [`Trainer`] and call [`~Trainer.train`] to start training. +Pass your model, dataset, preprocessor, and [`TrainingArguments`] to [`Trainer`], and call [`~Trainer.train`] to start training. > [!TIP] -> Refer to the [Finetuning](./training) guide for a more complete overview of the training process. +> Refer to the [Fine-tuning](./training) guide for a more complete overview of the training process. ```py from transformers import Trainer @@ -85,7 +82,7 @@ trainer.train() ## Checkpoints -[`Trainer`] saves checkpoints (the optimizer state is not saved by default) to the directory set to `output_dir` in [`TrainingArguments`] to a subfolder named `checkpoint-000`. The number at the end is the training step at which the checkpoint was saved. +[`Trainer`] saves checkpoints (the optimizer state is not saved by default) to the directory in `output_dir` in [`TrainingArguments`] to a subfolder named `checkpoint-000`. The number at the end is the training step at which the checkpoint was saved. Saving checkpoints are useful for resuming training or recovering your training progress if you encounter an error. Set the `resume_from_checkpoint` parameter in [`~Trainer.train`] to resume training from the last checkpoint or a specific checkpoint. @@ -106,7 +103,7 @@ trainer.train(resume_from_checkpoint="your-model/checkpoint-1000") -Checkpoints can be saved to the Hub by setting `push_to_hub=True` in [`TrainingArguments`]. The default method (`"every_save"`) saves a checkpoint to the Hub is every time a model is saved, which is typically the final model at the end of training. Some other options for deciding how to save checkpoints to the Hub include: +Checkpoints can be saved to the Hub by setting `push_to_hub=True` in [`TrainingArguments`]. The default method (`"every_save"`) saves a checkpoint to the Hub every time a model is saved, which is typically the final model at the end of training. Some other options for deciding how to save checkpoints to the Hub include the following. - `hub_strategy="end"` only pushes a checkpoint when [`~Trainer.save_model`] is called - `hub_strategy="checkpoint"` pushes the latest checkpoint to a subfolder named *last-checkpoint* from which training can be resumed @@ -164,7 +161,7 @@ my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0 > [!TIP] -> [`Trainer`] sets the log level separately for each node in the [`~Trainer.__init__`] method, so you may want to consider setting this sooner if you're using other Transformers functionalities before creating the [`Trainer`] instance. 
+> The log level is separately set for each node in the [`~Trainer.__init__`] method. Consider setting this sooner if you're using other Transformers functionalities before creating the [`Trainer`] instance. ## Customize @@ -176,14 +173,14 @@ Tailor [`Trainer`] to your use case by subclassing or overriding its methods to | [`~Trainer.get_eval_dataloader`] | create an evaluation DataLoader | | [`~Trainer.get_test_dataloader`] | create a test DataLoader | | [`~Trainer.log`] | log information about the training process | -| [`~Trainer.create_optimizer_and_scheduler`] | create an optimizer and learning rate scheduler (can also be separately customized with [`~Trainer.create_optimizer`] and [`~Trainer.screate_scheduler`] if they weren't passed in `__init__` | +| [`~Trainer.create_optimizer_and_scheduler`] | create an optimizer and learning rate scheduler (can also be separately customized with [`~Trainer.create_optimizer`] and [`~Trainer.create_scheduler`] if they weren't passed in `__init__`) | | [`~Trainer.compute_loss`] | compute the loss of a batch of training inputs | | [`~Trainer.training_step`] | perform the training step | | [`~Trainer.prediction_step`] | perform the prediction and test step | | [`~Trainer.evaluate`] | evaluate the model and return the evaluation metric | | [`~Trainer.predict`] | make a prediction (with metrics if labels are available) on the test set | -For example, to use weighted loss, rewrite [`~Trainer.compute_loss`] inside your custom [`Trainer`]. +For example, to use weighted loss, rewrite [`~Trainer.compute_loss`] inside [`Trainer`]. ```py from torch import nn @@ -203,9 +200,9 @@ class CustomTrainer(Trainer): ### Callbacks -[Callbacks](./main_classes/callback) are another way to customize [`Trainer`], but they *don't change anything* inside the training loop. Instead, a callback inspects the training loop state and then executes some action (early stopping, logging, etc.) depending on the state. For example, you can't implement a custom loss function with a callback because that requires overriding [`~Trainer.compute_loss`]. +[Callbacks](./main_classes/callback) are another way to customize [`Trainer`], but they don't change anything *inside the training loop*. Instead, a callback inspects the training loop state and executes some action (early stopping, logging, etc.) depending on the state. For example, you can't implement a custom loss function with a callback because that requires overriding [`~Trainer.compute_loss`]. -To use a callback, create a class that inherits from [`TrainerCallback`] and implements the functionality you want. Then you can pass the callback to the `callback` parameter in [`Trainer`]. The example below implements an early stopping callback that stops training after 10 steps. +To use a callback, create a class that inherits from [`TrainerCallback`] and implements the functionality you want. Then pass the callback to the `callback` parameter in [`Trainer`]. The example below implements an early stopping callback that stops training after 10 steps. ```py from transformers import TrainerCallback, Trainer @@ -368,7 +365,7 @@ accelerate launch \ > [!TIP] > Refer to the [Launching your Accelerate scripts](https://hf.co/docs/accelerate/basic_tutorials/launch) tutorial to learn more about `accelerate_launch` and custom configurations. -## Optimization +## Optimizations [`Trainer`] supports various optimizations to improve *training* performance - reduce memory and increase training speed - and *model* performance. 
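Many of the optimizations covered below are enabled through a single [`TrainingArguments`] parameter. As a hedged sketch, the 8-bit AdamW optimizer shown earlier in this patch can be selected with the `optim` parameter (assuming bitsandbytes is installed):

```py
from transformers import TrainingArguments

# sketch: swap in a memory-efficient optimizer through the optim parameter
args = TrainingArguments(
    output_dir="./outputs",
    optim="adamw_bnb_8bit",
)
```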
@@ -382,11 +379,11 @@ Install the [GaLore](https://github.com/jiaweizzhao/GaLore) library, [TRL](https pip install galore-torch trl datasets ``` -Then pick a GaLore optimizer (`"galore_adamw"`, `"galore_adafactor"`, `"galore_adamw_8bit`") and pass it to the `optim` parameter in [`TrainingArguments`]. Use the `optim_target_modules` parameter to specify which modules to adapt (can be a list of strings, regex, or a full path). +Pick a GaLore optimizer (`"galore_adamw"`, `"galore_adafactor"`, `"galore_adamw_8bit`") and pass it to the `optim` parameter in [`TrainingArguments`]. Use the `optim_target_modules` parameter to specify which modules to adapt (can be a list of strings, regex, or a full path). -Extra parameters supported by GaLore, `rank`, `update_proj_gap`, and `scale` should be passed to the `optim_args` parameter in [`TrainingArguments`]. +Extra parameters supported by GaLore, `rank`, `update_proj_gap`, and `scale`, should be passed to the `optim_args` parameter in [`TrainingArguments`]. -The example below shows how to enable GaLore with [`~trl.SFTTrainer`] that targets the `attn` and `mlp` layers with regex. +The example below enables GaLore with [`~trl.SFTTrainer`] that targets the `attn` and `mlp` layers with regex. > [!TIP] > It can take some time before training starts (~3 minutes for a 2B model on a NVIDIA A100). @@ -462,7 +459,7 @@ Only linear layers that are considered GaLore layers can be trained with low-ran ### Liger -[Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of layers such as RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy and more that have been fused into a single Triton kernel for training LLMs. These kernels are also compatible with FlashAttention, FSDP, and DeepSpeed. As a result, Liger Kernel can increase multi-GPU training throughput and reduce memory usage. This is useful for multi-head training and supporting larger vocabulary sizes, larger batch sizes and longer context lengths. +[Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of layers such as RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more that have been fused into a single Triton kernel for training LLMs. These kernels are also compatible with FlashAttention, FSDP, and DeepSpeed. As a result, Liger Kernel can increase multi-GPU training throughput and reduce memory usage. This is useful for multi-head training and supporting larger vocabulary sizes, larger batch sizes, and longer context lengths. ```bash pip install liger-kernel diff --git a/docs/source/en/training.md b/docs/source/en/training.md index a666c4938307..d056076c5bfa 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -14,20 +14,20 @@ rendered properly in your Markdown viewer. --> -# Finetuning +# Fine-tuning [[open-in-colab]] -Finetuning adapts a pretrained model to a specific task with a smaller specialized dataset. This approach requires far less data and compute compared to training a model from scratch, which makes it a more accessible option for many users. +Fine-tuning adapts a pretrained model to a specific task with a smaller specialized dataset. This approach requires far less data and compute compared to training a model from scratch, which makes it a more accessible option for many users. -Transformers provides the [`Trainer`] API, which offers a comprehensive set of training features, for finetuning any of the models on the [Hub](https://hf.co/models). 
+Transformers provides the [`Trainer`] API, which offers a comprehensive set of training features, for fine-tuning any of the models on the [Hub](https://hf.co/models). > [!TIP] -> Learn how to finetune models for other tasks in our Task Recipes section! +> Learn how to fine-tune models for other tasks in our Task Recipes section in Resources! -This guide will show you how to finetune a model with [`Trainer`] to classify Yelp reviews. +This guide will show you how to fine-tune a model with [`Trainer`] to classify Yelp reviews. -Login to your Hugging Face account with your user token to ensure you can access gated models and share your models on the Hub. +Log in to your Hugging Face account with your user token to ensure you can access gated models and share your models on the Hub. ```py from huggingface_hub import login @@ -35,7 +35,7 @@ from huggingface_hub import login login() ``` -Start by loading the [Yelp Reviews](https://hf.co/datasets/yelp_review_full) dataset and [preprocess](./fast_tokenizers) (tokenize, pad, and truncate) it for training. Use [`~datasets.Dataset.map`] to preprocess the entire dataset in one step. +Start by loading the [Yelp Reviews](https://hf.co/datasets/yelp_review_full) dataset and [preprocess](./fast_tokenizers#preprocess) (tokenize, pad, and truncate) it for training. Use [`~datasets.Dataset.map`] to preprocess the entire dataset in one step. ```py from datasets import load_dataset @@ -51,7 +51,7 @@ dataset = dataset.map(tokenize, batched=True) ``` > [!TIP] -> Finetune on a smaller subset of the full dataset to reduce the time it takes, but the results won't be as good compared to finetuning on the full dataset. +> Fine-tune on a smaller subset of the full dataset to reduce the time it takes. The results won't be as good compared to fine-tuning on the full dataset, but it is useful to make sure everything works as expected first before committing to training on the full dataset. > ```py > small_train = dataset["train"].shuffle(seed=42).select(range(1000)) > small_eval = dataset["test"].shuffle(seed=42).select(range(1000)) @@ -61,9 +61,9 @@ dataset = dataset.map(tokenize, batched=True) -[`Trainer`] is an optimized training loop for Transformers models, making it easy to start training right away without manually writing your own training loop. Pick and choose from a wide range of training features in [`TrainingArguments`] such as gradient accumulation, mixed precision, and options for reporting and logging training metrics. +[Trainer](./trainer) is an optimized training loop for Transformers models, making it easy to start training right away without manually writing your own training code. Pick and choose from a wide range of training features in [`TrainingArguments`] such as gradient accumulation, mixed precision, and options for reporting and logging training metrics. -Load a model and provide the number of expected labels (find this information on the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields)): +Load a model and provide the number of expected labels (you can find this information on the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields)). ```py from transformers import AutoModelForSequenceClassification @@ -74,7 +74,7 @@ model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-bas ``` > [!TIP] -> The message above is a reminder that the models pretrained head is discarded and replaced with a randomly initialized classification head. 
The randomly initialized head needs to be finetuned on your specific task to output meanginful predictions.
+> The message above is a reminder that the model's pretrained head is discarded and replaced with a randomly initialized classification head. The randomly initialized head needs to be fine-tuned on your specific task to output meaningful predictions.

With the model loaded, set up your training hyperparameters in [`TrainingArguments`]. Hyperparameters are variables that control the training process - such as the learning rate, batch size, number of epochs - which in turn impacts model performance. Selecting the correct hyperparameters is important and you should experiment with them to find the best configuration for your task.

@@ -107,7 +107,7 @@ training_args = TrainingArguments(
)
```

-Create a [`Trainer`] instance and pass it the model, training arguments, training and test datasets, and evaluation function. Then call [`~Trainer.train`] to start training.
+Create a [`Trainer`] instance and pass it the model, training arguments, training and test datasets, and evaluation function. Call [`~Trainer.train`] to start training.

```py
trainer = Trainer(
@@ -120,7 +120,7 @@ trainer = Trainer(
trainer.train()
```

-Finally, call [`~Trainer.push_to_hub`] to upload your model and tokenizer to the Hub.
+Finally, use [`~Trainer.push_to_hub`] to upload your model and tokenizer to the Hub.

```py
trainer.push_to_hub()
@@ -128,7 +128,7 @@ trainer.push_to_hub()

## TensorFlow

-[`Trainer`] is incompatible with Transformers TensorFlow models. Instead, finetune these models with [Keras](https://keras.io/) since they're implemented as standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model).
+[`Trainer`] is incompatible with Transformers TensorFlow models. Instead, fine-tune these models with [Keras](https://keras.io/) since they're implemented as a standard [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model).

```py
from transformers import TFAutoModelForSequenceClassification

From bf27e12a3a70690be14d53fd92e7b85516719125 Mon Sep 17 00:00:00 2001
From: stevhliu
Date: Mon, 27 Jan 2025 10:21:40 -0800
Subject: [PATCH 099/116] modular transformers

---
 docs/source/en/_toctree.yml | 2 +-
 docs/source/en/conversations.md | 24 +-
 docs/source/en/installation.md | 37 +-
 docs/source/en/modular_transformers.md | 505 +++++++++++++++++++++++--
 4 files changed, 514 insertions(+), 54 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index f2639153a7e2..195022727efd 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -85,7 +85,7 @@
 - title: Chat
   sections:
   - local: conversations
-    title: Chat pipeline
+    title: Chat basics
   - local: chat_templating
     title: Templates
   - local: chat_templating_multimodal
diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md
index 627d72465a47..2e842265a215 100644
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@@ -14,7 +14,7 @@ rendered properly in your Markdown viewer.

-->

-# Chat pipeline
+# Chat basics

Chat models are conversational models you can send and receive messages from. There are many chat models available to choose from, but in general, larger models tend to be better though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters.
Mixture-of-expert (MoE) models have names like "8x7B" or "141B-A35B" which means it's a 56B and 141B parameter model. You can try quantizing larger models to reduce memory requirements, otherwise you'll need ~2 bytes of memory per parameter.

@@ -23,7 +23,27 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_
> [!TIP]
> Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)!

-This guide shows you how to build and format a conversation, and how to quickly start chatting with a model with [`TextGenerationPipeline`].
+This guide shows you how to quickly start chatting with Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
+
+## transformers-cli
+
+Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
+
+```bash
+transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+```
+
+
+ +
+ +For a full list of options, run the command below. + +```bash +transformers-cli chat -h +``` + +The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). ## TextGenerationPipeline diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 8a3d9c11e7c1..d7d9a98579c0 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -26,32 +26,35 @@ Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [Te A virtual environment helps manage different projects and avoids compatibility issues between dependencies. Take a look at the [Install packages in a virtual environment using pip and venv](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) guide if you're unfamiliar with Python virtual environments. -Create a virtual environment in your project directory. + + + +Create and activate a virtual environment in your project directory with venv or uv. ```bash python -m venv .env +source ./env/bin/activate ``` -Activate the virtual environment. + + - - +[uv](https://docs.astral.sh/uv/) is a fast Rust-based Python package and project manager. ```bash +uv venv .env source ./env/bin/activate ``` - + -```bash -.env/Scripts/activate -``` +## Python - - +You can install Transformers with pip or uv. -## pip + + [pip](https://pip.pypa.io/en/stable/) is a package installer for Python. Install Transformers with pip in your newly created virtual environment. @@ -59,6 +62,15 @@ source ./env/bin/activate pip install transformers ``` + + + +[uv](https://docs.astral.sh/uv/) is a fast Rust-based Python package and project manager. + +```bash +uv pip install transformers +``` + @@ -77,6 +89,7 @@ To install a CPU-only version of Transformers and a machine learning framework, ```bash pip install 'transformers[torch]' +uv pip install 'transformers[torch]' ``` @@ -93,6 +106,7 @@ Install TensorFlow 2.0. ```bash pip install 'transformers[tf-cpu]' +uv pip install 'transformers[tf-cpu]' ```
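Whichever backend you install, a quick way to verify the setup is to run a small pipeline. This is a minimal sketch; it downloads a small default model on first use.

```py
from transformers import pipeline

# smoke test: runs one prediction with the default sentiment-analysis model
print(pipeline("sentiment-analysis")("Transformers is installed and working!"))
```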
@@ -100,6 +114,7 @@ pip install 'transformers[tf-cpu]' ```bash pip install 'transformers[flax]' +uv pip install 'transformers[flax]' ``` diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index ca7bbf9376f3..77080042c593 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -11,7 +11,7 @@ The [`# Copied from`](./pr_checks#check-copies) statements prevents the code fro Modular Transformers addresses these issues by adding a *modular* file to a model folder. The modular file can import code from other models and inherit code from other classes unlike traditional modeling and processing files. > [!TIP] -> Modular Transformers isn't meant to replace the modeling code, and if your model isn't based on an existing model, you'll need to add a `modeling.py` file manually. +> Modular Transformers isn't meant to replace the modeling code, and if your model isn't based on an existing model, you'll need to add a `modeling.py` file manually. Likewise, if a configuration, tokenization or processing file can't easily inherit from a similar file, you can add that file directly. A modular file contains model, processor, and configuration class code that would otherwise be in separate files under the single model, single file policy. @@ -92,29 +92,348 @@ If you don't use the defined dependency, you'll receive the following error. ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used when you define `BertModel`, as it is one of it's direct dependencies. Make sure you use it in the `__init__` function. ``` -## Removing attributes and functions +## Implementing a modular file -Use `del` to remove attributes that aren't used in your model or if you don't want to include it in the unravelled `modeling.py` file. The example [`GemmaModel`] below removes the `embed_tokens` from the original [`LlamaModel`] it inherits from. +The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from. + +| Component | Model | +|---|---| +| Mixture of expert | SwitchTransformers or Mixtral | +| Interleaved (and/or partial) rotary embedding | GLM, Phi | +| State space models | Jamba, Bamba, Zamba, Mamba2 | +| Recurrent hidden states | Gemma2 | +| Sliding window attention/full attention patterns per layer | Gemma2, Cohere2 | +| QKV clipping | Olmo | +| QK normalization | Olmo2, Cohere | +| Fused QKV (not recommended) | Phi3 | + +This section will walk you through how to implement [Olmo2](./model_doc/olmo2) from [Olmo](./model_doc/olmo) with modular Transformers (you can refer to the original [modeling.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modular_olmo2.py) file). + +### Config + +The modular `Olmo2Config` is shown below. + +```py +from ..olmo.configuration_olmo import OlmoConfig + +class Olmo2Config(OlmoConfig): + r""" + This is the configuration class to store the configuration of a [Olmo2Model](/docs/transformers/main/en/model_doc/olmo2#transformers.Olmo2Model). 
+ """ + + def __init__( + self, + vocab_size=50304, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + use_cache=True, + pad_token_id=1, + bos_token_id=None, + eos_token_id=50279, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + rms_norm_eps=1e-5, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + hidden_act=hidden_act, + max_position_embeddings=max_position_embeddings, + initializer_range=initializer_range, + use_cache=use_cache, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + attention_bias=attention_bias, + attention_dropout=attention_dropout, + **kwargs, + ) + + self.rms_norm_eps = rms_norm_eps + del self.clip_qkv +``` + +There are three points where the `Olmo2Config` is different from the original `OlmoConfig`. + +1. The default value of most arguments have changed. +2. There is a new argument, `rms_norm_eps`. +3. The `clip_qkv` argument isn't used anymore. + +For the new default values and argument, overwrite the `__init__` function with the new default values and add `rms_norm_eps`. Assign `rms_norm_eps` to `self` in the body of `__init__`. For the `clip_qkv` argument, use `del self.clip_qkv` to remove the assignment of this attribute in the unraveled code (post-linter conversion). + +Notice how the `super().__init__(...)` is used. Typically, it calls the parent `__init__`. + +But in modular Transformers, if there is a call like `super().my_function(...)`, the linter takes the body of `my_function` in the parent and unravels it where the call to `super().my_function(...)` occurred. The `del self.clip_qkv` statement removes the reference to `self.clip_qkv` in the unraveled body. + +`del self.` and `super().my_function(..)` work together, and it should always be placed after `super().my_function(...)`. You can add whatever you want *before* calling `super()`, and it is placed before the parents body. + +### Norm + +```py +from ..llama.modeling_llama import LlamaRMSNorm + +class Olmo2RMSNorm(LlamaRMSNorm): + pass +``` + +Nothing needs to be modified in `LlamaRMSNorm`. The linter unravels the exact content of `LlamaRMSNorm` into `Olmo2RMSNorm`. References to Llama in the docstrings, type hints, and comments are also changed to Olmo2. + +### Attention + +The modular `Olmo2Attention` is shown below. + +```py +from ..llama.modeling_llama import eager_attention_forward +from ..olmo.modeling_olmo import OlmoAttention, apply_rotary_pos_emb + + +# Olmo2 attention is identical to OLMo attention except: +# - Norm is applied to attention queries and keys. +# - No qkv clipping. 
+class Olmo2Attention(OlmoAttention): + def __init__(self, config: Olmo2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx=layer_idx) + self.q_norm = Olmo2RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps) + self.k_norm = Olmo2RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states)) + key_states = self.k_norm(self.k_proj(hidden_states)) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(hidden_shape).transpose(1, 2) + key_states = key_states.view(hidden_shape).transpose(1, 2) + value_states = value_states.view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights +``` + +The `super().__init__(...)` copies the parent definition and adds 2 new layers from `Olmo2RMSNorm`. The forward pass needs to be overwritten to use these 2 new layers. A pass with the norm layers is added before projecting with `q_proj` and `k_proj`. To make it easier, the `eager_attention_forward` function is directly imported from Llama and the `apply_rotary_pos_emb` is imported from Olmo. + +The linter automatically adds these imported functions in the final `modeling_olmo2.py` file by copying their definitions from the source files. The `rotate_half` and `repeat_kv` functions are also added because they are used inside `apply_rotary_pos_emb` and `eager_attention_forward`. + +The `Attention` class had to be redefined because there weren't any existing models with an `Attention` layer that included a `RMSNorm` layer. + +### DecoderLayer + +The modular `DecoderLayer` is shown below. 
```py
from ..olmo.modeling_olmo import OlmoDecoderLayer

# The OLMo2 layers are identical to those of the OLMo model except:
# - RMSNorm is used instead of standard layer norm.
# - Norm is applied after attention/feedforward rather than before.
class Olmo2DecoderLayer(OlmoDecoderLayer):
    def __init__(self, config: Olmo2Config, layer_idx: int):
        super().__init__(config, layer_idx=layer_idx)
        self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.self_attn = Olmo2Attention(config=config, layer_idx=layer_idx)
        del self.input_layernorm

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs
```

The norm type is switched in `__init__` by overwriting `self.post_attention_layernorm` after the call to `super().__init__(...)`. Delete the `self.input_layernorm` attribute and replace it with `self.post_feedforward_layernorm` because it is applied after in Olmo2. The forward method is overwritten to reflect this change.

If you only switched `self.post_feedforward_layernorm` and `self.input_layernorm` from `LayerNorm` to `RMSNorm` without also changing the name and logic of `self.input_layernorm`, then you wouldn't have to rewrite the forward method.

### Model

The modular `Olmo2Model` class is shown below.

```py
from ..olmo.modeling_olmo import OlmoModel

# The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of
# standard layer norm for the output norm.
class Olmo2Model(OlmoModel):
    def __init__(self, config: Olmo2Config):
        super().__init__(config)
        self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layers = nn.ModuleList(
            [Olmo2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
```

You only need to change the *type* of the `self.norm` attribute to use `RMSNorm` instead of `LayerNorm`. This change doesn't affect the logic in the forward method (layer name and usage is identical to the parent class), so you don't need to overwrite it. The linter automatically unravels it.

### Model head

The modular causal modeling head is shown below.
```py
from ..olmo.modeling_olmo import OlmoForCausalLM

class Olmo2ForCausalLM(OlmoForCausalLM):
    pass
```

The logic is identical to `OlmoForCausalLM` which means you don't need to make any changes here.

### Other classes

The [modeling_olmo2.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py) generated by the linter also contains some classes (`Olmo2MLP`, `Olmo2RotaryEmbedding`, `Olmo2PreTrainedModel`) that weren't explicitly defined in `modular_olmo2.py`.

Classes that are a dependency of an inherited class but aren't explicitly defined are automatically added as a part of dependency tracing. This is similar to how some functions were added to the `Attention` class without directly importing them.

For example, `OlmoDecoderLayer` has an attribute defined as `self.mlp = OlmoMLP(config)`. This class was never explicitly redefined in `Olmo2MLP`, so the linter automatically created a `Olmo2MLP` class similar to `OlmoMLP`. It is identical to the code below if it was explicitly written in `modular_olmo2.py`.

```py
-class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel):
-    def __init__(self, config): | def __init__(self, config):
-        super().__init__(self, eos_token) | super().__init__(config)
-        del self.embed_tokens | self.padding_idx = config.pad_token_id
- | self.vocab_size = config.vocab_size
- |
- | self.layers = nn.ModuleList(
- | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
- | )
- | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
- | self.rotary_emb = LlamaRotaryEmbedding(config=config)
- | self.gradient_checkpointing = False
- |
- | # Initialize weights and apply final processing
- | self.post_init()
+from ..olmo.modeling_olmo import OlmoMLP
+
+class Olmo2MLP(OlmoMLP):
+    pass
```

However, it was necessary to rewrite `Olmo2RMSNorm` because the layer norm needed to be redefined in the `Attention` and `DecoderLayer` classes. Similarly, this is why you didn't need to create the `Olmo2PreTrainedModel` and `Olmo2RotaryEmbedding` classes.

Classes that aren't rewritten are copied from the file where the inherited module first uses them. This means if you wanted `Olmo2MLP` to inherit from `MistralMLP` instead, you would need to be more explicit as shown below.

```py
# switch to mistral definition
from ..mistral.modeling_mistral import MistralMLP

class Olmo2MLP(MistralMLP):
    pass
```

## Removing attributes

You can `del` to remove attributes defined in the parent after using `super().__init__()`. However, this doesn't work if the attribute is also used somewhere else as shown below. It only suppresses the assignment. The `self.attribute = config.attribute` line is removed, but the `if` statement remains and references the attribute.

```py
class DummyModel(nn.Module):

    def __init__(self, config: DummyConfig):
        super().__init__()
        self.attribute = config.attribute
        if self.attribute:
            # do more stuff with `self.attribute` here
            ...

class MyNewDummyModel(DummyModel):

    def __init__(self, config: MyNewDummyConfig):
        super().__init__(config)
        del self.attribute
```

## Explicit super() calls

If you still want to inherit from `DummyModel` but don't want to remove the `self.attribute`, be explicit about which class' `super()` you're calling.
The example below shows how to call the `super()` of `nn.Module` (unraveled code shown on the right) + +```py +class MyNewDummyModel(DummyModel, nn.Module): | class MyNewDummyModel(nn.Module): + | + def __init__(self, config: MyNewDummyConfig): | def __init__(self, config: MyNewDummyConfig): + nn.Module.__init__(config) | super().__init__() + self.foo = config.foo | self.foo = config.foo + ... | ... +``` + +## Deleting unused methods + +Remove an attribute by overwriting it with a `raise AttributeError("")` statement to mimic the behavior you want when you remove a parent function in Python. The example below removes the methods in the unraveled code. ```py class GemmaTokenizer(LlamaTokenizer): @@ -127,44 +446,150 @@ class GemmaTokenizer(LlamaTokenizer): raise AttributeError("Not needed for Gemma") ``` -## Define new functions +## Defining new functions + +By default, if you inherit from a class and override a method with one or more decorators in the parent method, the decorators are also added to the unraveled code *only if you don't add any yourself*. Otherwise, the redefined decorator is used. + +For example, if you had a parent class shown below and you overwrite it, the parent decorator is kept. + +```py +class DummyModel(nn.Module): + ... + + @decorator(...) + def forward(...) + # do stuff here +``` + +Modular code is shown on the left, and the unraveled code is shown on the right. -New functions can be defined in the modular file and used inside a class. The new function - and recursively, any other new function called in its body - is automatically copy-pasted in the file where it is used. +```py +class NewModel(DummyModel): | class NewModel(nn.Module): + ... | ... + | + def forward(...): | @decorator(...) + ... | def forward(...): + | ... +``` + +But if you add a new decorator, your new decorator is used instead. + +```py +class NewModel(DummyModel): | class NewModel(nn.Module): + ... | ... + | + @my_new_decorator(...) | @my_new_decorator(...) + def forward(...): | def forward(...): + ... | ... +``` + +## super_kwargs + +In scenarios where a forward method is really long and you want to switch decorators, you don't need to redefine everything and copy/paste the function. You can use `super().forward(...)` to unravel the parent body. When there are a lot of arguments in the function signature, use the special `**super_kwargs` syntax in the overwritten signature. + +This syntax indicates to the linter to unravel all the parent signature arguments here. An example signature in a [`AutoModelForCausalLM`] model is shown below, with lots of arguments. ```py -def my_new_function(*args, **kwargs): - # Do something here - pass +class LlamaForCausalLM(nn.Module): + ... + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + **kwargs: Unpack[KwargsForCausalLM], + ) -> Union[Tuple, CausalLMOutputWithPast]: + ... 
+``` + +Instead of rewriting and copying/pasting all of those arguments, use the `super().forward(**super_kwargs)` statement (modular code shown on the left, unraveled code on the right). -class DummyModel(LlamaModel): - def forward(*args, **kwargs): - # Call the function - example = my_new_function(*args, **kwargs) - # Continue here +```py +class NewModelForCausalLM(LlamaForCausalLM): | class LlamaForCausalLM(nn.Module): + ... | ... + | + @my_new_decorator | @my_new_decorator + def forward(self, **super_kwargs): | def forward( + super().forward(**super_kwargs) | self, + | input_ids: torch.LongTensor = None, + | attention_mask: Optional[torch.Tensor] = None, + | position_ids: Optional[torch.LongTensor] = None, + | past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = |None, + | inputs_embeds: Optional[torch.FloatTensor] = None, + | labels: Optional[torch.LongTensor] = None, + | use_cache: Optional[bool] = None, + | output_attentions: Optional[bool] = None, + | output_hidden_states: Optional[bool] = None, + | return_dict: Optional[bool] = None, + | cache_position: Optional[torch.LongTensor] = None, + | num_logits_to_keep: int = 0, + | **kwargs: Unpack[KwargsForCausalLM], + | ) -> Union[Tuple, CausalLMOutputWithPast]: + | ... ``` -## Calling super() +This makes it very easy to switch decorators and makes it explicit that the only change you want to apply is the decorator. + +`**super_kwargs` should not be used to avoid being explicit when redefining methods though. If you overwrite a method, you should explicitly write the signature as you normally would. The `**super_kwargs` syntax is a shortcut for switching decorators and a few other niche cases. -You don't have to unravel a call to `super()` or if you want to differentiate which `super().__init__()` call you're doing. +## Docstring variables -The example below shows how you only need to add `eos_token` to the `__init__` instead of calling `super().__init__(eos_token)`. +If an object defined in both the modular and modeling file from which it inherits, the modular definition has precedence unless for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable always uses the definition found in the source file instead of the modular file. + +This is very useful if you need the variable reference somewhere but you don't want to clutter the modular file with docstrings which are always the same. The example code below allows you to automatically use the same docstrings from [Mistral](./model_doc/mistral) in [Starcoder2](./model_doc/starcoder2). ```py -class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): - def __init__(self, eos_token=""): | def __init__(self): - eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) - PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +STARCODER2_INPUTS_DOCSTRING = None # will be automatically redefined + +class Starcoder2Model(MistralModel): + ... + + @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) + def forward(...) + ... ``` ## Special naming -Special naming for classes is also supported, which is useful for composite models. +The linter automatically renames everything when inheriting from a class. 
For consistency, you should always use the same class name prefix when inheriting from different classes from the same file. -The example below shows how you can use `GemmaVisionModel` even though it's not the same as the modular Gemma model. +The example below is not recommended. It breaks standards in the library, `MyModelIncredibleMLP` instead of `LlamaMLP`, and because the linter doesn't know how to rename potential higher-order dependencies (`MyModelIncredible` or just `MyModel`). ```py -class GemmaVisionModel(CLIPModel): +class MyModelIncredibleMLP(LlamaMLP): + ... + +class MyModelDecoderLayer(LlamaDecoderLayer): + ... +``` + +However, if there aren't any [implicit dependencies](#other-classes), then you can locally rename a single class. Make sure you still explicitly redefine every other mention of the class with the new name pattern though. For example, all mentions of `LlamaMLP` should be renamed to `MyModelIncredibleMLP` otherwise the linter may add a new and unwanted `MyModelMLP` class. + +The linter raises a warning if an ambiguous case is detected. It explains what is happening and which prefix is used by default for getting the dependencies. These warning and renaming pattern complications usually only come up when defining multimodal models. For example, adding `Text` to class names in a multimodal model to make it clear which modality it refers to. + +```py +We detected multiple prefix names when inheriting from transformers.models.llama.modeling_llama: ('Emu3Text', 'Emu3'). We will only use the most used 'Emu3' prefix when grabbing args and dependencies. Make sure to subclass the intermediate classes with the prefix you want (if different from 'Emu3') or use a single prefix in all the modular (best). +``` + +If there are automatic dependencies with a prefix, but you want another one, explicitly rename the classes locally with a `pass` class as shown in the following. + +```py +class Emu3TextMLP(LlamaMLP): pass ``` -When inheriting a Config class and adding or deleting some attributes, it may be tempting to only redefine the new attributes in the docstring, and hoping that modular will do the rest. And similarly when deleting an argument, do nothing and hope that modular will remove itself from the docstring. However, due to current limitations of our linter, this is not yet supported. Thus, if you are in this case, you need to directly put the whole docstring (as it should appear in the end, with the correct arguments and default values) directly in the modular file under the class definition. \ No newline at end of file +## Config docstrings + +When inheriting a `Config` class or adding and deleting attributes, you may want to only redefine the new attributes in the docstring. However, the linter doesn't support this yet. You need to directly add the while docstring directly in the modular file under the class definition. 
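
To make that last point concrete, below is a minimal sketch of a config override in a modular file. The `MyNewDummyConfig`/`DummyConfig` names follow the placeholder naming used in the earlier examples and are purely illustrative; the point is only that the docstring under the class is written out in full, exactly as it should appear after the linter runs.

```py
# Placeholder names, consistent with the Dummy/MyNewDummy examples above.
class MyNewDummyConfig(DummyConfig):
    r"""
    This is the configuration class to store the configuration of a [`MyNewDummyModel`].

    Args:
        foo (`int`, *optional*, defaults to 4):
            The new attribute, documented here in full even though it is the only change.
        hidden_size (`int`, *optional*, defaults to 768):
            An inherited attribute, re-documented as well because partial docstrings are not merged yet.
    """

    def __init__(self, foo=4, hidden_size=768, **kwargs):
        self.foo = foo  # the new attribute
        super().__init__(hidden_size=hidden_size, **kwargs)
```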
From 0632603b7c02c121e5dc65e0d1b193dd9ce8871d Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 27 Jan 2025 11:44:48 -0800 Subject: [PATCH 100/116] more review --- docs/source/en/perf_train_gpu_many.md | 12 ++++++++ docs/source/en/quantization/gptq.md | 41 ++++++++++++++++++++------- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md index 18aa41a43f63..d710508e753c 100644 --- a/docs/source/en/perf_train_gpu_many.md +++ b/docs/source/en/perf_train_gpu_many.md @@ -19,6 +19,18 @@ Multi-GPU setups are effective for accelerating training and fitting large model This guide will discuss the various parallelism methods, combining them, and choosing an appropriate strategy for your setup. For more details about distributed training, refer to the [Accelerate](https://hf.co/docs/accelerate/index) documentation. +## Scalability strategy + +Use the [Model Memory Calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) to calculate how much memory a model requires. Then refer to the table below to select a strategy based on your setup. + +| setup | scenario | strategy | +|---|---|---| +| single node/multi-GPU | fits on single GPU | DistributedDataParallel or ZeRO | +| | doesn't fit on single GPU | PipelineParallel, ZeRO or TensorParallel | +| | largest model layer doesn't fit | TensorParallel or ZeRO | +| multi-node/multi-GPU | fast inter-node connectivity (NVLink or NVSwitch) | ZeRO or 3D parallelism (PipelineParallel, TensorParallel, DataParallel) | +| | slow inter-node connectivity | ZeRO or 3D parallelism (PipelineParallel, TensorParallel, DataParallel) | + ## Data parallelism Data parallelism evenly distributes data across multiple GPUs. Each GPU holds a copy of the model and concurrently proccesses their portion of the data. At the end, the results from each GPU are synchronized and combined. diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md index 57e9c10a6a46..a9878bbc362e 100644 --- a/docs/source/en/quantization/gptq.md +++ b/docs/source/en/quantization/gptq.md @@ -16,26 +16,36 @@ rendered properly in your Markdown viewer. # GPTQ -[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory. Inference is also faster because a lower bitwidth takes less time to communicate. +The [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save memory usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory. Inference is also faster because a lower bitwidth takes less time to communicate. -Run the commands below to install AutoGPTQ. +> [!WARNING] +> AutoGPTQ is likely to be deprecated in the future due to lack of continued support for new models and features. 
See the [GPTQModel](#gptqmodel) section for more details. + +Install Accelerate, Transformers and Optimum first. ```bash pip install --upgrade accelerate optimum transformers ``` -Then install either GPTQModel or AutoGPTQ. +Then run the command below to install a GPTQ library. + + + ```bash pip install gptqmodel --no-build-isolation ``` -or + + ```bash pip install auto-gptq --no-build-isolation ``` + + + Create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calbrate the weights for quantization, and a tokenizer to prepare the dataset. ```py @@ -45,13 +55,12 @@ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer) ``` -> [!TIP] -> You can pass your own dataset as a list of strings, but it is highly recommended to use the same dataset from the GPTQ paper. -> -> ```py -> dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] -> gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer) -> ``` +You can pass your own dataset as a list of strings, but it is highly recommended to use the same dataset from the GPTQ paper. + +```py +dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] +gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer) +``` Load a model to quantize and pass [`GPTQConfig`] to [`~AutoModelForCausalLM.from_pretrained`]. Set `device_map="auto"` to automatically offload the model to a CPU to help fit the model in memory, and allow the model modules to be moved between the CPU and GPU for quantization. @@ -147,6 +156,16 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` +## GPTQModel + +It is recommended to use GPTQModel, originally a maintained fork of AutoGPTQ, because it has since diverged from AutoGTPQ with some significant features. GPTQModel has faster quantization, lower memory usage, and more accurate default quantization. + +GPTQModel provides asymmetric quantization which can potentially lower quantization errors compared to symmetric quantization. It is not backward compatible with AutoGPTQ, and not all kernels (Marlin) support asymmetric quantization. + +GPTQModel also has broader support for the latest LLM models, multimodal models (Qwen2-VL and Ovis1.6-VL), platforms (Linux, macOS, Windows 11), and hardware (AMD ROCm, Apple Silicon, Intel/AMD CPUs, and Intel Datacenter Max/Arc GPUs, etc.). + +The Marlin kernels are also updated for A100 GPUs and other kernels are updated to include auto-padding for legacy models and models with non-uniform in/out-features. + ## Resources Run the GPTQ quantization with PEFT [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) for a hands-on experience, and read [Making LLMs lighter with AutoGPTQ and transformers](https://huggingface.co/blog/gptq-integration) to learn more about the AutoGPTQ integration. 
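
For quick reference, the pieces above can be combined into a short end-to-end sketch. It assumes the `facebook/opt-125m` checkpoint used earlier in this guide, one of the two GPTQ backends installed, and an arbitrary local output directory.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Calibrate on the c4 dataset and quantize the weights to int4.
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # lets modules move between CPU and GPU during quantization
    quantization_config=gptq_config,
)

# The quantized weights can be saved and reloaded like any other checkpoint; the
# quantization settings are stored in the config, so no GPTQConfig is needed on reload.
quantized_model.save_pretrained("opt-125m-gptq")
tokenizer.save_pretrained("opt-125m-gptq")
reloaded = AutoModelForCausalLM.from_pretrained("opt-125m-gptq", device_map="auto")
```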
From 06896e336f86986ef2dac019664c97180d37fc44 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 27 Jan 2025 11:46:11 -0800 Subject: [PATCH 101/116] zamba2 --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 195022727efd..3fa6b89b4605 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -654,6 +654,8 @@ title: YOSO - local: model_doc/zamba title: Zamba + - local: model_doc/zamba2 + title: Zamba2 - title: Vision models sections: - local: model_doc/beit From 534ffb9a78c2ac74e37cabb878286e8240c1d360 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 27 Jan 2025 11:55:39 -0800 Subject: [PATCH 102/116] fix --- docs/source/en/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index d7d9a98579c0..46d4aa385bdb 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -47,7 +47,7 @@ source ./env/bin/activate ``` - + ## Python From dbcb72744634d4c1f9506eb63e770f7b494c08ff Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 27 Jan 2025 15:37:33 -0800 Subject: [PATCH 103/116] all frameworks --- docs/source/en/model_doc/albert.md | 6 ------ docs/source/en/model_doc/bart.md | 10 ++++------ docs/source/en/model_doc/barthez.md | 7 +++++++ docs/source/en/model_doc/bartpho.md | 7 +++++++ docs/source/en/model_doc/bert-japanese.md | 7 +++++++ docs/source/en/model_doc/bert.md | 10 ++++------ docs/source/en/model_doc/bertweet.md | 7 +++++++ docs/source/en/model_doc/blenderbot-small.md | 7 +++++++ docs/source/en/model_doc/blenderbot.md | 7 +++++++ docs/source/en/model_doc/bort.md | 7 +++++++ docs/source/en/model_doc/byt5.md | 7 +++++++ docs/source/en/model_doc/clip.md | 7 +++++++ docs/source/en/model_doc/cpm.md | 7 +++++++ docs/source/en/model_doc/dialogpt.md | 7 +++++++ docs/source/en/model_doc/distilbert.md | 13 ++++--------- docs/source/en/model_doc/electra.md | 10 ++++------ docs/source/en/model_doc/encoder-decoder.md | 7 +++++++ docs/source/en/model_doc/flan-t5.md | 7 +++++++ docs/source/en/model_doc/flan-ul2.md | 7 +++++++ docs/source/en/model_doc/gpt-sw3.md | 7 +++++++ docs/source/en/model_doc/gptj.md | 7 +++++++ docs/source/en/model_doc/herbert.md | 7 +++++++ docs/source/en/model_doc/madlad-400.md | 7 +++++++ docs/source/en/model_doc/marian.md | 10 ++++------ docs/source/en/model_doc/mbart.md | 10 ++++------ docs/source/en/model_doc/megatron_gpt2.md | 7 +++++++ docs/source/en/model_doc/mistral.md | 7 +++++++ docs/source/en/model_doc/mms.md | 7 +++++++ docs/source/en/model_doc/mt5.md | 10 ++++------ docs/source/en/model_doc/nougat.md | 7 +++++++ docs/source/en/model_doc/openai-gpt.md | 10 ++++------ docs/source/en/model_doc/opt.md | 7 +++++++ docs/source/en/model_doc/pegasus.md | 11 ++++------- docs/source/en/model_doc/phobert.md | 7 +++++++ docs/source/en/model_doc/regnet.md | 7 +++++++ docs/source/en/model_doc/resnet.md | 7 +++++++ docs/source/en/model_doc/roberta-prelayernorm.md | 7 +++++++ docs/source/en/model_doc/roberta.md | 14 ++++---------- docs/source/en/model_doc/roformer.md | 7 +++++++ docs/source/en/model_doc/t5.md | 13 ++++--------- docs/source/en/model_doc/t5v1.1.md | 7 +++++++ docs/source/en/model_doc/tapex.md | 7 +++++++ docs/source/en/model_doc/ul2.md | 7 +++++++ docs/source/en/model_doc/vision-encoder-decoder.md | 7 +++++++ .../en/model_doc/vision-text-dual-encoder.md | 7 +++++++ docs/source/en/model_doc/vit.md | 7 +++++++ 
docs/source/en/model_doc/wav2vec2.md | 7 +++++++ docs/source/en/model_doc/wav2vec2_phoneme.md | 7 +++++++ docs/source/en/model_doc/whisper.md | 7 +++++++ docs/source/en/model_doc/xglm.md | 7 +++++++ docs/source/en/model_doc/xlm-roberta.md | 10 ++++------ docs/source/en/model_doc/xlm-v.md | 7 +++++++ docs/source/en/model_doc/xls_r.md | 7 +++++++ docs/source/en/model_doc/xlsr_wav2vec2.md | 7 +++++++ 54 files changed, 335 insertions(+), 89 deletions(-) diff --git a/docs/source/en/model_doc/albert.md b/docs/source/en/model_doc/albert.md index 52826572aeda..9a8ebca15480 100644 --- a/docs/source/en/model_doc/albert.md +++ b/docs/source/en/model_doc/albert.md @@ -17,12 +17,6 @@ rendered properly in your Markdown viewer. # ALBERT
- -Models - - -Spaces - PyTorch TensorFlow Flax - -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/barthez.md b/docs/source/en/model_doc/barthez.md index 1b571e242f47..131b1dd8e185 100644 --- a/docs/source/en/model_doc/barthez.md +++ b/docs/source/en/model_doc/barthez.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # BARThez +
+PyTorch +TensorFlow +Flax +
+ ## Overview The BARThez model was proposed in [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct, diff --git a/docs/source/en/model_doc/bartpho.md b/docs/source/en/model_doc/bartpho.md index 8f0a5f8bfe24..b3749516323d 100644 --- a/docs/source/en/model_doc/bartpho.md +++ b/docs/source/en/model_doc/bartpho.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # BARTpho +
+PyTorch +TensorFlow +Flax +
+ ## Overview The BARTpho model was proposed in [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen. diff --git a/docs/source/en/model_doc/bert-japanese.md b/docs/source/en/model_doc/bert-japanese.md index d68bb221d577..33a720318b63 100644 --- a/docs/source/en/model_doc/bert-japanese.md +++ b/docs/source/en/model_doc/bert-japanese.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # BertJapanese +
+PyTorch +TensorFlow +Flax +
+ ## Overview The BERT models trained on Japanese text. diff --git a/docs/source/en/model_doc/bert.md b/docs/source/en/model_doc/bert.md index b6e99d1031e8..3379679f076e 100644 --- a/docs/source/en/model_doc/bert.md +++ b/docs/source/en/model_doc/bert.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # BERT
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/bertweet.md b/docs/source/en/model_doc/bertweet.md index c4c883b21ad7..be489643173f 100644 --- a/docs/source/en/model_doc/bertweet.md +++ b/docs/source/en/model_doc/bertweet.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # BERTweet +
+PyTorch +TensorFlow +Flax +
+ ## Overview The BERTweet model was proposed in [BERTweet: A pre-trained language model for English Tweets](https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf) by Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen. diff --git a/docs/source/en/model_doc/blenderbot-small.md b/docs/source/en/model_doc/blenderbot-small.md index d5f4a7d849b7..647a865de339 100644 --- a/docs/source/en/model_doc/blenderbot-small.md +++ b/docs/source/en/model_doc/blenderbot-small.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Blenderbot Small +
+PyTorch +TensorFlow +Flax +
+ Note that [`BlenderbotSmallModel`] and [`BlenderbotSmallForConditionalGeneration`] are only used in combination with the checkpoint [facebook/blenderbot-90M](https://huggingface.co/facebook/blenderbot-90M). Larger Blenderbot checkpoints should diff --git a/docs/source/en/model_doc/blenderbot.md b/docs/source/en/model_doc/blenderbot.md index 42e1710cb2d5..ec24d5ed7495 100644 --- a/docs/source/en/model_doc/blenderbot.md +++ b/docs/source/en/model_doc/blenderbot.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Blenderbot +
+PyTorch +TensorFlow +Flax +
+ ## Overview The Blender chatbot model was proposed in [Recipes for building an open-domain chatbot](https://arxiv.org/pdf/2004.13637.pdf) Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, diff --git a/docs/source/en/model_doc/bort.md b/docs/source/en/model_doc/bort.md index 1542d464d9fd..04cc2feb063b 100644 --- a/docs/source/en/model_doc/bort.md +++ b/docs/source/en/model_doc/bort.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # BORT +
+PyTorch +TensorFlow +Flax +
+ This model is in maintenance mode only, we do not accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/byt5.md b/docs/source/en/model_doc/byt5.md index dc2942e33bbe..7e95bae53e87 100644 --- a/docs/source/en/model_doc/byt5.md +++ b/docs/source/en/model_doc/byt5.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # ByT5 +
+PyTorch +TensorFlow +Flax +
+ ## Overview The ByT5 model was presented in [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index cd2d56229b4e..7fae22d81016 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # CLIP +
+PyTorch +TensorFlow +Flax +
+ ## Overview The CLIP model was proposed in [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, diff --git a/docs/source/en/model_doc/cpm.md b/docs/source/en/model_doc/cpm.md index 129c4ed3a377..8a1826a25c6d 100644 --- a/docs/source/en/model_doc/cpm.md +++ b/docs/source/en/model_doc/cpm.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # CPM +
+PyTorch +TensorFlow +Flax +
+ ## Overview The CPM model was proposed in [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, diff --git a/docs/source/en/model_doc/dialogpt.md b/docs/source/en/model_doc/dialogpt.md index 558b91d76d25..33d7e3b16d88 100644 --- a/docs/source/en/model_doc/dialogpt.md +++ b/docs/source/en/model_doc/dialogpt.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # DialoGPT +
+PyTorch +TensorFlow +Flax +
+ ## Overview DialoGPT was proposed in [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, diff --git a/docs/source/en/model_doc/distilbert.md b/docs/source/en/model_doc/distilbert.md index 10f7c2d757a2..9a1fadb6b9b9 100644 --- a/docs/source/en/model_doc/distilbert.md +++ b/docs/source/en/model_doc/distilbert.md @@ -17,15 +17,10 @@ rendered properly in your Markdown viewer. # DistilBERT
- -Models - - -Spaces - - -Paper page - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/electra.md b/docs/source/en/model_doc/electra.md index 700c49df7993..bee883d64153 100644 --- a/docs/source/en/model_doc/electra.md +++ b/docs/source/en/model_doc/electra.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # ELECTRA
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/encoder-decoder.md b/docs/source/en/model_doc/encoder-decoder.md index 4bd0e6f188fe..dc977b3e7485 100644 --- a/docs/source/en/model_doc/encoder-decoder.md +++ b/docs/source/en/model_doc/encoder-decoder.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Encoder Decoder Models +
+PyTorch +TensorFlow +Flax +
+ ## Overview The [`EncoderDecoderModel`] can be used to initialize a sequence-to-sequence model with any diff --git a/docs/source/en/model_doc/flan-t5.md b/docs/source/en/model_doc/flan-t5.md index c0fd6b0011cc..0e3b9ba0738f 100644 --- a/docs/source/en/model_doc/flan-t5.md +++ b/docs/source/en/model_doc/flan-t5.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # FLAN-T5 +
+PyTorch +TensorFlow +Flax +
+ ## Overview FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf) - it is an enhanced version of T5 that has been finetuned in a mixture of tasks. diff --git a/docs/source/en/model_doc/flan-ul2.md b/docs/source/en/model_doc/flan-ul2.md index 5487bb779760..3b946b909b09 100644 --- a/docs/source/en/model_doc/flan-ul2.md +++ b/docs/source/en/model_doc/flan-ul2.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # FLAN-UL2 +
+PyTorch +TensorFlow +Flax +
+ ## Overview Flan-UL2 is an encoder decoder model based on the T5 architecture. It uses the same configuration as the [UL2](ul2) model released earlier last year. diff --git a/docs/source/en/model_doc/gpt-sw3.md b/docs/source/en/model_doc/gpt-sw3.md index f69bd958e9c5..20daa3537af0 100644 --- a/docs/source/en/model_doc/gpt-sw3.md +++ b/docs/source/en/model_doc/gpt-sw3.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # GPT-Sw3 +
+PyTorch +TensorFlow +Flax +
+ ## Overview The GPT-Sw3 model was first proposed in diff --git a/docs/source/en/model_doc/gptj.md b/docs/source/en/model_doc/gptj.md index b515cf36dd40..9268adb2a3e5 100644 --- a/docs/source/en/model_doc/gptj.md +++ b/docs/source/en/model_doc/gptj.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # GPT-J +
+PyTorch +TensorFlow +Flax +
+ ## Overview The GPT-J model was released in the [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax) repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like diff --git a/docs/source/en/model_doc/herbert.md b/docs/source/en/model_doc/herbert.md index 0049d6bfcf3a..aa4f535ed274 100644 --- a/docs/source/en/model_doc/herbert.md +++ b/docs/source/en/model_doc/herbert.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # HerBERT +
+PyTorch +TensorFlow +Flax +
+ ## Overview The HerBERT model was proposed in [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, and diff --git a/docs/source/en/model_doc/madlad-400.md b/docs/source/en/model_doc/madlad-400.md index aeb41938499c..db6abc38eaf1 100644 --- a/docs/source/en/model_doc/madlad-400.md +++ b/docs/source/en/model_doc/madlad-400.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # MADLAD-400 +
+PyTorch +TensorFlow +Flax +
+ ## Overview MADLAD-400 models were released in the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](MADLAD-400: A Multilingual And Document-Level Large Audited Dataset). diff --git a/docs/source/en/model_doc/marian.md b/docs/source/en/model_doc/marian.md index d8ebec8ffb0a..80bb73d26df1 100644 --- a/docs/source/en/model_doc/marian.md +++ b/docs/source/en/model_doc/marian.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # MarianMT
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/mbart.md b/docs/source/en/model_doc/mbart.md index ca529e957e2d..b75d36ca50e6 100644 --- a/docs/source/en/model_doc/mbart.md +++ b/docs/source/en/model_doc/mbart.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # MBart and MBart-50
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
diff --git a/docs/source/en/model_doc/megatron_gpt2.md b/docs/source/en/model_doc/megatron_gpt2.md index 284fd372c0e0..7e0ee3cb9e7c 100644 --- a/docs/source/en/model_doc/megatron_gpt2.md +++ b/docs/source/en/model_doc/megatron_gpt2.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # MegatronGPT2 +
+PyTorch +TensorFlow +Flax +
+ ## Overview The MegatronGPT2 model was proposed in [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index cfa2af367813..c61fa0dd2628 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Mistral +
+PyTorch +TensorFlow +Flax +
+ ## Overview Mistral was introduced in the [this blogpost](https://mistral.ai/news/announcing-mistral-7b/) by Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. diff --git a/docs/source/en/model_doc/mms.md b/docs/source/en/model_doc/mms.md index 7102b8896647..480d5bc8ddb1 100644 --- a/docs/source/en/model_doc/mms.md +++ b/docs/source/en/model_doc/mms.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # MMS +
+PyTorch +TensorFlow +Flax +
+ ## Overview The MMS model was proposed in [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) diff --git a/docs/source/en/model_doc/mt5.md b/docs/source/en/model_doc/mt5.md index 7f053bb724a1..d4af9f538cb3 100644 --- a/docs/source/en/model_doc/mt5.md +++ b/docs/source/en/model_doc/mt5.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # mT5
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/nougat.md b/docs/source/en/model_doc/nougat.md index a39e74eb213a..06b12b5ee8e6 100644 --- a/docs/source/en/model_doc/nougat.md +++ b/docs/source/en/model_doc/nougat.md @@ -15,6 +15,13 @@ specific language governing permissions and limitations under the License. --> # Nougat +
+PyTorch +TensorFlow +Flax +
+ ## Overview The Nougat model was proposed in [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by diff --git a/docs/source/en/model_doc/openai-gpt.md b/docs/source/en/model_doc/openai-gpt.md index 09277858aa3b..054495676a83 100644 --- a/docs/source/en/model_doc/openai-gpt.md +++ b/docs/source/en/model_doc/openai-gpt.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # OpenAI GPT
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md index c82064bae894..b543f46f04fa 100644 --- a/docs/source/en/model_doc/opt.md +++ b/docs/source/en/model_doc/opt.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # OPT +
+PyTorch +TensorFlow +Flax +
+ ## Overview The OPT model was proposed in [Open Pre-trained Transformer Language Models](https://arxiv.org/pdf/2205.01068) by Meta AI. diff --git a/docs/source/en/model_doc/pegasus.md b/docs/source/en/model_doc/pegasus.md index 0622354e62de..46fca71ac0d9 100644 --- a/docs/source/en/model_doc/pegasus.md +++ b/docs/source/en/model_doc/pegasus.md @@ -17,15 +17,12 @@ rendered properly in your Markdown viewer. # Pegasus
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
- ## Overview The Pegasus model was proposed in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019. diff --git a/docs/source/en/model_doc/phobert.md b/docs/source/en/model_doc/phobert.md index adf5900ebe2a..c1c4b8742b4d 100644 --- a/docs/source/en/model_doc/phobert.md +++ b/docs/source/en/model_doc/phobert.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # PhoBERT +
+PyTorch +TensorFlow +Flax +
+ ## Overview The PhoBERT model was proposed in [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf) by Dat Quoc Nguyen, Anh Tuan Nguyen. diff --git a/docs/source/en/model_doc/regnet.md b/docs/source/en/model_doc/regnet.md index acd833c77c2d..f292fe0df24b 100644 --- a/docs/source/en/model_doc/regnet.md +++ b/docs/source/en/model_doc/regnet.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # RegNet +
+PyTorch +TensorFlow +Flax +
+ ## Overview The RegNet model was proposed in [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. diff --git a/docs/source/en/model_doc/resnet.md b/docs/source/en/model_doc/resnet.md index b959266512f5..d7400b46c838 100644 --- a/docs/source/en/model_doc/resnet.md +++ b/docs/source/en/model_doc/resnet.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # ResNet +
+PyTorch +TensorFlow +Flax +
+ ## Overview The ResNet model was proposed in [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. Our implementation follows the small changes made by [Nvidia](https://catalog.ngc.nvidia.com/orgs/nvidia/resources/resnet_50_v1_5_for_pytorch), we apply the `stride=2` for downsampling in bottleneck's `3x3` conv and not in the first `1x1`. This is generally known as "ResNet v1.5". diff --git a/docs/source/en/model_doc/roberta-prelayernorm.md b/docs/source/en/model_doc/roberta-prelayernorm.md index f748e273e8f8..7cef8526c251 100644 --- a/docs/source/en/model_doc/roberta-prelayernorm.md +++ b/docs/source/en/model_doc/roberta-prelayernorm.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # RoBERTa-PreLayerNorm +
+PyTorch +TensorFlow +Flax +
+ ## Overview The RoBERTa-PreLayerNorm model was proposed in [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli. diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index 2a1843d8885a..a67ea79ec74c 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -17,17 +17,11 @@ rendered properly in your Markdown viewer. # RoBERTa
- -Models - - -Spaces - - -Paper page - +PyTorch +TensorFlow +Flax
- ## Overview The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer diff --git a/docs/source/en/model_doc/roformer.md b/docs/source/en/model_doc/roformer.md index 5d8f146c43fd..83d01c2fc91d 100644 --- a/docs/source/en/model_doc/roformer.md +++ b/docs/source/en/model_doc/roformer.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # RoFormer +
+PyTorch +TensorFlow +Flax +
+ ## Overview The RoFormer model was proposed in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md index 86a645512c6c..87b9d2c6e849 100644 --- a/docs/source/en/model_doc/t5.md +++ b/docs/source/en/model_doc/t5.md @@ -17,15 +17,10 @@ rendered properly in your Markdown viewer. # T5
- -Models - - -Spaces - - -Paper page - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/t5v1.1.md b/docs/source/en/model_doc/t5v1.1.md index e18696f629df..5ae908bacdae 100644 --- a/docs/source/en/model_doc/t5v1.1.md +++ b/docs/source/en/model_doc/t5v1.1.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # T5v1.1 +
+PyTorch +TensorFlow +Flax +
+ ## Overview T5v1.1 was released in the [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) diff --git a/docs/source/en/model_doc/tapex.md b/docs/source/en/model_doc/tapex.md index 15ac2463fd85..d46d520c7d18 100644 --- a/docs/source/en/model_doc/tapex.md +++ b/docs/source/en/model_doc/tapex.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # TAPEX +
+PyTorch +TensorFlow +Flax +
+ This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/ul2.md b/docs/source/en/model_doc/ul2.md index f4d01c40b0c1..18743a28426e 100644 --- a/docs/source/en/model_doc/ul2.md +++ b/docs/source/en/model_doc/ul2.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # UL2 +
+PyTorch +TensorFlow +Flax +
+ ## Overview The T5 model was presented in [Unifying Language Learning Paradigms](https://arxiv.org/pdf/2205.05131v1.pdf) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler. diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md index 41159b7fc5f9..e0aa98cb3d0f 100644 --- a/docs/source/en/model_doc/vision-encoder-decoder.md +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Vision Encoder Decoder Models +
+PyTorch +TensorFlow +Flax +
+ ## Overview The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text model with any diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.md b/docs/source/en/model_doc/vision-text-dual-encoder.md index 7cb68a261875..bae26d05128a 100644 --- a/docs/source/en/model_doc/vision-text-dual-encoder.md +++ b/docs/source/en/model_doc/vision-text-dual-encoder.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # VisionTextDualEncoder +
+PyTorch +TensorFlow +Flax +
+ ## Overview The [`VisionTextDualEncoderModel`] can be used to initialize a vision-text dual encoder model with diff --git a/docs/source/en/model_doc/vit.md b/docs/source/en/model_doc/vit.md index 53a550895ce2..5d122e777115 100644 --- a/docs/source/en/model_doc/vit.md +++ b/docs/source/en/model_doc/vit.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Vision Transformer (ViT) +
+PyTorch +TensorFlow +Flax +
+ ## Overview The Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md index 5ef3fdbb1eaa..6987434a4e24 100644 --- a/docs/source/en/model_doc/wav2vec2.md +++ b/docs/source/en/model_doc/wav2vec2.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Wav2Vec2 +
+PyTorch +TensorFlow +Flax +
+ ## Overview The Wav2Vec2 model was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/en/model_doc/wav2vec2_phoneme.md b/docs/source/en/model_doc/wav2vec2_phoneme.md index 93e0656f493c..c5c1edd6aced 100644 --- a/docs/source/en/model_doc/wav2vec2_phoneme.md +++ b/docs/source/en/model_doc/wav2vec2_phoneme.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Wav2Vec2Phoneme +
+PyTorch +TensorFlow +Flax +
+ ## Overview The Wav2Vec2Phoneme model was proposed in [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index 58e641a5d0e0..2f3ed0647404 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # Whisper +
+PyTorch +TensorFlow +Flax +
+ ## Overview The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever. diff --git a/docs/source/en/model_doc/xglm.md b/docs/source/en/model_doc/xglm.md index 470e42c747be..4032de2cd784 100644 --- a/docs/source/en/model_doc/xglm.md +++ b/docs/source/en/model_doc/xglm.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # XGLM +
+PyTorch +TensorFlow +Flax +
+ ## Overview The XGLM model was proposed in [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md index 414afba11681..7b60b43404ee 100644 --- a/docs/source/en/model_doc/xlm-roberta.md +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -17,12 +17,10 @@ rendered properly in your Markdown viewer. # XLM-RoBERTa
- -Models - - -Spaces - +PyTorch +TensorFlow +Flax
## Overview diff --git a/docs/source/en/model_doc/xlm-v.md b/docs/source/en/model_doc/xlm-v.md index 049a1f35ad9a..69badfe2e698 100644 --- a/docs/source/en/model_doc/xlm-v.md +++ b/docs/source/en/model_doc/xlm-v.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # XLM-V +
+PyTorch +TensorFlow +Flax +
+ ## Overview XLM-V is multilingual language model with a one million token vocabulary trained on 2.5TB of data from Common Crawl (same as XLM-R). diff --git a/docs/source/en/model_doc/xls_r.md b/docs/source/en/model_doc/xls_r.md index 2226c813e72b..d24d88907ee7 100644 --- a/docs/source/en/model_doc/xls_r.md +++ b/docs/source/en/model_doc/xls_r.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # XLS-R +
+PyTorch +TensorFlow +Flax +
+ ## Overview The XLS-R model was proposed in [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman diff --git a/docs/source/en/model_doc/xlsr_wav2vec2.md b/docs/source/en/model_doc/xlsr_wav2vec2.md index 6369d068850a..f88b0dc9e14f 100644 --- a/docs/source/en/model_doc/xlsr_wav2vec2.md +++ b/docs/source/en/model_doc/xlsr_wav2vec2.md @@ -16,6 +16,13 @@ rendered properly in your Markdown viewer. # XLSR-Wav2Vec2 +
+PyTorch +TensorFlow +Flax +
+ ## Overview The XLSR-Wav2Vec2 model was proposed in [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael From a34b1aff72f8674dd08bb6ca1f97422dfecc06ee Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 27 Jan 2025 16:13:19 -0800 Subject: [PATCH 104/116] pytorch --- docs/source/en/model_doc/align.md | 2 ++ docs/source/en/model_doc/altclip.md | 2 ++ docs/source/en/model_doc/aria.md | 2 ++ .../source/en/model_doc/audio-spectrogram-transformer.md | 2 ++ docs/source/en/model_doc/autoformer.md | 2 ++ docs/source/en/model_doc/bamba.md | 1 + docs/source/en/model_doc/bark.md | 2 ++ docs/source/en/model_doc/bert-generation.md | 2 ++ docs/source/en/model_doc/bigbird_pegasus.md | 2 ++ docs/source/en/model_doc/biogpt.md | 2 ++ docs/source/en/model_doc/bit.md | 2 ++ docs/source/en/model_doc/blip-2.md | 2 ++ docs/source/en/model_doc/bridgetower.md | 2 ++ docs/source/en/model_doc/bros.md | 2 ++ docs/source/en/model_doc/canine.md | 2 ++ docs/source/en/model_doc/chameleon.md | 2 ++ docs/source/en/model_doc/chinese_clip.md | 2 ++ docs/source/en/model_doc/clap.md | 2 ++ docs/source/en/model_doc/clipseg.md | 2 ++ docs/source/en/model_doc/clvp.md | 2 ++ docs/source/en/model_doc/codegen.md | 2 ++ docs/source/en/model_doc/cohere.md | 2 ++ docs/source/en/model_doc/cohere2.md | 2 ++ docs/source/en/model_doc/colpali.md | 2 ++ docs/source/en/model_doc/conditional_detr.md | 2 ++ docs/source/en/model_doc/cpmant.md | 2 ++ docs/source/en/model_doc/dac.md | 2 ++ docs/source/en/model_doc/data2vec.md | 2 ++ docs/source/en/model_doc/dbrx.md | 2 ++ docs/source/en/model_doc/decision_transformer.md | 2 ++ docs/source/en/model_doc/deformable_detr.md | 2 ++ docs/source/en/model_doc/deplot.md | 2 ++ docs/source/en/model_doc/depth_anything.md | 2 ++ docs/source/en/model_doc/deta.md | 2 ++ docs/source/en/model_doc/detr.md | 2 ++ docs/source/en/model_doc/diffllama.md | 2 ++ docs/source/en/model_doc/dinat.md | 2 ++ docs/source/en/model_doc/dinov2_with_registers.md | 2 ++ docs/source/en/model_doc/dpt.md | 2 ++ docs/source/en/model_doc/efficientnet.md | 2 ++ docs/source/en/model_doc/emu3.md | 2 ++ docs/source/en/model_doc/encodec.md | 2 ++ docs/source/en/model_doc/ernie.md | 2 ++ docs/source/en/model_doc/ernie_m.md | 2 ++ docs/source/en/model_doc/falcon.md | 2 ++ docs/source/en/model_doc/falcon_mamba.md | 2 ++ docs/source/en/model_doc/fastspeech2_conformer.md | 2 ++ docs/source/en/model_doc/flava.md | 2 ++ docs/source/en/model_doc/fnet.md | 2 ++ docs/source/en/model_doc/focalnet.md | 2 ++ docs/source/en/model_doc/fuyu.md | 2 ++ docs/source/en/model_doc/gemma2.md | 2 ++ docs/source/en/model_doc/git.md | 2 ++ docs/source/en/model_doc/glm.md | 2 ++ docs/source/en/model_doc/glpn.md | 2 ++ docs/source/en/model_doc/gpt_bigcode.md | 2 ++ docs/source/en/model_doc/gpt_neox.md | 2 ++ docs/source/en/model_doc/gpt_neox_japanese.md | 2 ++ docs/source/en/model_doc/gptsan-japanese.md | 2 ++ docs/source/en/model_doc/granite.md | 2 ++ docs/source/en/model_doc/granitemoe.md | 2 ++ docs/source/en/model_doc/graphormer.md | 2 ++ docs/source/en/model_doc/grounding-dino.md | 2 ++ docs/source/en/model_doc/helium.md | 1 + docs/source/en/model_doc/hiera.md | 2 ++ docs/source/en/model_doc/ibert.md | 2 ++ docs/source/en/model_doc/idefics2.md | 2 ++ docs/source/en/model_doc/idefics3.md | 2 ++ docs/source/en/model_doc/ijepa.md | 2 ++ docs/source/en/model_doc/imagegpt.md | 2 ++ docs/source/en/model_doc/informer.md | 2 ++ 
docs/source/en/model_doc/instructblip.md | 2 ++ docs/source/en/model_doc/instructblipvideo.md | 2 +- docs/source/en/model_doc/jamba.md | 2 ++ docs/source/en/model_doc/jetmoe.md | 2 ++ docs/source/en/model_doc/jukebox.md | 2 ++ docs/source/en/model_doc/kosmos-2.md | 2 ++ docs/source/en/model_doc/layoutlmv2.md | 2 ++ docs/source/en/model_doc/layoutxlm.md | 2 ++ docs/source/en/model_doc/levit.md | 2 ++ docs/source/en/model_doc/lilt.md | 2 ++ docs/source/en/model_doc/llava.md | 2 ++ docs/source/en/model_doc/llava_next.md | 2 ++ docs/source/en/model_doc/llava_next_video.md | 2 ++ docs/source/en/model_doc/llava_onevision.md | 2 ++ docs/source/en/model_doc/luke.md | 2 ++ docs/source/en/model_doc/m2m_100.md | 2 ++ docs/source/en/model_doc/mamba.md | 2 ++ docs/source/en/model_doc/mamba2.md | 2 ++ docs/source/en/model_doc/markuplm.md | 2 ++ docs/source/en/model_doc/mask2former.md | 2 ++ docs/source/en/model_doc/maskformer.md | 2 ++ docs/source/en/model_doc/matcha.md | 2 ++ docs/source/en/model_doc/mctct.md | 2 ++ docs/source/en/model_doc/mega.md | 2 ++ docs/source/en/model_doc/megatron-bert.md | 2 ++ docs/source/en/model_doc/mgp-str.md | 2 ++ docs/source/en/model_doc/mimi.md | 2 ++ docs/source/en/model_doc/mixtral.md | 2 ++ docs/source/en/model_doc/mllama.md | 2 ++ docs/source/en/model_doc/mluke.md | 2 ++ docs/source/en/model_doc/mobilenet_v1.md | 2 ++ docs/source/en/model_doc/mobilenet_v2.md | 2 ++ docs/source/en/model_doc/mobilevitv2.md | 2 ++ docs/source/en/model_doc/modernbert.md | 9 +-------- docs/source/en/model_doc/moonshine.md | 2 ++ docs/source/en/model_doc/moshi.md | 2 ++ docs/source/en/model_doc/mpt.md | 2 ++ docs/source/en/model_doc/mra.md | 2 ++ docs/source/en/model_doc/musicgen.md | 2 ++ docs/source/en/model_doc/musicgen_melody.md | 2 ++ docs/source/en/model_doc/mvp.md | 2 ++ docs/source/en/model_doc/nat.md | 2 ++ docs/source/en/model_doc/nemotron.md | 2 +- docs/source/en/model_doc/nezha.md | 2 ++ docs/source/en/model_doc/nllb-moe.md | 1 + docs/source/en/model_doc/nllb.md | 2 ++ docs/source/en/model_doc/nystromformer.md | 2 ++ docs/source/en/model_doc/olmo.md | 2 ++ docs/source/en/model_doc/olmo2.md | 2 ++ docs/source/en/model_doc/olmoe.md | 2 ++ docs/source/en/model_doc/omdet-turbo.md | 2 ++ docs/source/en/model_doc/oneformer.md | 2 ++ docs/source/en/model_doc/open-llama.md | 2 ++ docs/source/en/model_doc/owlv2.md | 2 ++ docs/source/en/model_doc/owlvit.md | 2 ++ docs/source/en/model_doc/paligemma.md | 2 ++ docs/source/en/model_doc/patchtsmixer.md | 2 ++ docs/source/en/model_doc/patchtst.md | 2 ++ docs/source/en/model_doc/pegasus_x.md | 2 ++ docs/source/en/model_doc/perceiver.md | 2 ++ docs/source/en/model_doc/persimmon.md | 2 ++ docs/source/en/model_doc/phi.md | 2 ++ docs/source/en/model_doc/phi3.md | 2 ++ docs/source/en/model_doc/phimoe.md | 2 ++ docs/source/en/model_doc/pix2struct.md | 2 ++ docs/source/en/model_doc/pixtral.md | 2 ++ docs/source/en/model_doc/plbart.md | 2 ++ docs/source/en/model_doc/poolformer.md | 2 ++ docs/source/en/model_doc/pop2piano.md | 6 +----- docs/source/en/model_doc/prophetnet.md | 9 +-------- docs/source/en/model_doc/pvt.md | 2 ++ docs/source/en/model_doc/pvt_v2.md | 2 ++ docs/source/en/model_doc/qdqbert.md | 2 ++ docs/source/en/model_doc/qwen2.md | 2 ++ docs/source/en/model_doc/qwen2_5_vl.md | 2 ++ docs/source/en/model_doc/qwen2_audio.md | 2 ++ docs/source/en/model_doc/qwen2_moe.md | 2 ++ docs/source/en/model_doc/qwen2_vl.md | 2 ++ docs/source/en/model_doc/realm.md | 2 ++ docs/source/en/model_doc/recurrent_gemma.md | 2 ++ 
docs/source/en/model_doc/reformer.md | 9 +-------- docs/source/en/model_doc/retribert.md | 2 ++ docs/source/en/model_doc/roc_bert.md | 2 ++ docs/source/en/model_doc/rt_detr.md | 2 ++ docs/source/en/model_doc/rwkv.md | 2 ++ docs/source/en/model_doc/seamless_m4t.md | 2 ++ docs/source/en/model_doc/seamless_m4t_v2.md | 2 ++ docs/source/en/model_doc/seggpt.md | 2 ++ docs/source/en/model_doc/sew-d.md | 2 ++ docs/source/en/model_doc/sew.md | 2 ++ docs/source/en/model_doc/siglip.md | 2 ++ docs/source/en/model_doc/speecht5.md | 2 ++ docs/source/en/model_doc/splinter.md | 2 ++ docs/source/en/model_doc/squeezebert.md | 2 ++ docs/source/en/model_doc/stablelm.md | 2 ++ docs/source/en/model_doc/starcoder2.md | 2 ++ docs/source/en/model_doc/superglue.md | 2 ++ docs/source/en/model_doc/superpoint.md | 2 ++ docs/source/en/model_doc/swin2sr.md | 2 ++ docs/source/en/model_doc/swinv2.md | 2 ++ docs/source/en/model_doc/switch_transformers.md | 2 ++ docs/source/en/model_doc/table-transformer.md | 2 ++ docs/source/en/model_doc/textnet.md | 2 ++ docs/source/en/model_doc/time_series_transformer.md | 2 ++ docs/source/en/model_doc/timesformer.md | 2 ++ docs/source/en/model_doc/timm_wrapper.md | 2 ++ docs/source/en/model_doc/trajectory_transformer.md | 2 ++ docs/source/en/model_doc/trocr.md | 2 ++ docs/source/en/model_doc/tvlt.md | 2 ++ docs/source/en/model_doc/tvp.md | 2 ++ docs/source/en/model_doc/udop.md | 2 ++ docs/source/en/model_doc/umt5.md | 9 +-------- docs/source/en/model_doc/unispeech-sat.md | 2 ++ docs/source/en/model_doc/unispeech.md | 2 ++ docs/source/en/model_doc/univnet.md | 2 ++ docs/source/en/model_doc/upernet.md | 2 ++ docs/source/en/model_doc/van.md | 2 ++ docs/source/en/model_doc/video_llava.md | 2 ++ docs/source/en/model_doc/videomae.md | 2 ++ docs/source/en/model_doc/vilt.md | 2 ++ docs/source/en/model_doc/vipllava.md | 2 ++ docs/source/en/model_doc/visual_bert.md | 2 ++ docs/source/en/model_doc/vit_hybrid.md | 2 ++ docs/source/en/model_doc/vit_msn.md | 2 ++ docs/source/en/model_doc/vitdet.md | 2 ++ docs/source/en/model_doc/vitmatte.md | 2 ++ docs/source/en/model_doc/vitpose.md | 2 ++ docs/source/en/model_doc/vits.md | 2 ++ docs/source/en/model_doc/vivit.md | 2 ++ docs/source/en/model_doc/wav2vec2-bert.md | 2 ++ docs/source/en/model_doc/wav2vec2-conformer.md | 2 ++ docs/source/en/model_doc/wavlm.md | 2 ++ docs/source/en/model_doc/xclip.md | 2 ++ docs/source/en/model_doc/xlm-prophetnet.md | 2 ++ docs/source/en/model_doc/xlm-roberta-xl.md | 2 ++ docs/source/en/model_doc/yolos.md | 2 ++ docs/source/en/model_doc/yoso.md | 2 ++ docs/source/en/model_doc/zamba.md | 2 ++ docs/source/en/model_doc/zamba2.md | 2 ++ docs/source/en/model_doc/zoedepth.md | 2 ++ 211 files changed, 412 insertions(+), 39 deletions(-) diff --git a/docs/source/en/model_doc/align.md b/docs/source/en/model_doc/align.md index 0d34d95a7981..c657ba15d59f 100644 --- a/docs/source/en/model_doc/align.md +++ b/docs/source/en/model_doc/align.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ALIGN +PyTorch + ## Overview The ALIGN model was proposed in [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig. ALIGN is a multi-modal vision and language model. It can be used for image-text similarity and for zero-shot image classification. 
ALIGN features a dual-encoder architecture with [EfficientNet](efficientnet) as its vision encoder and [BERT](bert) as its text encoder, and learns to align visual and text representations with contrastive learning. Unlike previous work, ALIGN leverages a massive noisy dataset and shows that the scale of the corpus can be used to achieve SOTA representations with a simple recipe. diff --git a/docs/source/en/model_doc/altclip.md b/docs/source/en/model_doc/altclip.md index b1fc9b382694..5bab1564ed27 100644 --- a/docs/source/en/model_doc/altclip.md +++ b/docs/source/en/model_doc/altclip.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # AltCLIP +PyTorch + ## Overview The AltCLIP model was proposed in [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679v2) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu. AltCLIP diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md index 9ff7a6687aa9..dd592ded8d66 100644 --- a/docs/source/en/model_doc/aria.md +++ b/docs/source/en/model_doc/aria.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Aria +PyTorch + ## Overview The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team. diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.md b/docs/source/en/model_doc/audio-spectrogram-transformer.md index d83c3bbb6cf2..2a016dc25fae 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.md +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Audio Spectrogram Transformer +PyTorch + ## Overview The Audio Spectrogram Transformer model was proposed in [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass. diff --git a/docs/source/en/model_doc/autoformer.md b/docs/source/en/model_doc/autoformer.md index bb423e941c78..f706e851aeff 100644 --- a/docs/source/en/model_doc/autoformer.md +++ b/docs/source/en/model_doc/autoformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Autoformer +PyTorch + ## Overview The Autoformer model was proposed in [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long. diff --git a/docs/source/en/model_doc/bamba.md b/docs/source/en/model_doc/bamba.md index 4ea8475edb88..d6e1273cac11 100644 --- a/docs/source/en/model_doc/bamba.md +++ b/docs/source/en/model_doc/bamba.md @@ -16,6 +16,7 @@ rendered properly in your Markdown viewer. # Bamba +PyTorch ## Overview diff --git a/docs/source/en/model_doc/bark.md b/docs/source/en/model_doc/bark.md index 7c02e4be7011..0009f3e66d86 100644 --- a/docs/source/en/model_doc/bark.md +++ b/docs/source/en/model_doc/bark.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Bark +PyTorch + ## Overview Bark is a transformer-based text-to-speech model proposed by Suno AI in [suno-ai/bark](https://github.com/suno-ai/bark). diff --git a/docs/source/en/model_doc/bert-generation.md b/docs/source/en/model_doc/bert-generation.md index 40c2fbaa212e..8e2efcef6dd5 100644 --- a/docs/source/en/model_doc/bert-generation.md +++ b/docs/source/en/model_doc/bert-generation.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# BertGeneration +PyTorch + ## Overview The BertGeneration model is a BERT model that can be leveraged for sequence-to-sequence tasks using diff --git a/docs/source/en/model_doc/bigbird_pegasus.md b/docs/source/en/model_doc/bigbird_pegasus.md index 003e5643719b..9bf91e16d5fd 100644 --- a/docs/source/en/model_doc/bigbird_pegasus.md +++ b/docs/source/en/model_doc/bigbird_pegasus.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # BigBirdPegasus +PyTorch + ## Overview The BigBird model was proposed in [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md index 7d0943d5393d..0acecf1d8c9f 100644 --- a/docs/source/en/model_doc/biogpt.md +++ b/docs/source/en/model_doc/biogpt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # BioGPT +PyTorch + ## Overview The BioGPT model was proposed in [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. BioGPT is a domain-specific generative pre-trained Transformer language model for biomedical text generation and mining. BioGPT follows the Transformer language model backbone, and is pre-trained on 15M PubMed abstracts from scratch. diff --git a/docs/source/en/model_doc/bit.md b/docs/source/en/model_doc/bit.md index 7f8a8ea67c45..291a92a83e0f 100644 --- a/docs/source/en/model_doc/bit.md +++ b/docs/source/en/model_doc/bit.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Big Transfer (BiT) +PyTorch + ## Overview The BiT model was proposed in [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby. diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index 4125d372d55a..b8517c4d1524 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # BLIP-2 +PyTorch + ## Overview The BLIP-2 model was proposed in [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by diff --git a/docs/source/en/model_doc/bridgetower.md b/docs/source/en/model_doc/bridgetower.md index 013fea06c277..aae9bdc4c626 100644 --- a/docs/source/en/model_doc/bridgetower.md +++ b/docs/source/en/model_doc/bridgetower.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # BridgeTower +PyTorch + ## Overview The BridgeTower model was proposed in [BridgeTower: Building Bridges Between Encoders in Vision-Language Representative Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan. The goal of this model is to build a diff --git a/docs/source/en/model_doc/bros.md b/docs/source/en/model_doc/bros.md index 419e725e75e8..ac8056b0d738 100644 --- a/docs/source/en/model_doc/bros.md +++ b/docs/source/en/model_doc/bros.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. 
# BROS +PyTorch + ## Overview The BROS model was proposed in [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park. diff --git a/docs/source/en/model_doc/canine.md b/docs/source/en/model_doc/canine.md index 7729d8aa91d7..d6b448ceb09a 100644 --- a/docs/source/en/model_doc/canine.md +++ b/docs/source/en/model_doc/canine.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # CANINE +PyTorch + ## Overview The CANINE model was proposed in [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 6cbbdf398274..8cf8dbf7b793 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Chameleon +PyTorch + ## Overview The Chameleon model was proposed in [Chameleon: Mixed-Modal Early-Fusion Foundation Models diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md index b2d27a844e9e..b49889ad5cc7 100644 --- a/docs/source/en/model_doc/chinese_clip.md +++ b/docs/source/en/model_doc/chinese_clip.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Chinese-CLIP +PyTorch + ## Overview The Chinese-CLIP model was proposed in [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou. diff --git a/docs/source/en/model_doc/clap.md b/docs/source/en/model_doc/clap.md index 2bd2814e1b06..e4eeaa220ed3 100644 --- a/docs/source/en/model_doc/clap.md +++ b/docs/source/en/model_doc/clap.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # CLAP +PyTorch + ## Overview The CLAP model was proposed in [Large Scale Contrastive Language-Audio pretraining with diff --git a/docs/source/en/model_doc/clipseg.md b/docs/source/en/model_doc/clipseg.md index 005e6746d097..577849289e3f 100644 --- a/docs/source/en/model_doc/clipseg.md +++ b/docs/source/en/model_doc/clipseg.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # CLIPSeg +PyTorch + ## Overview The CLIPSeg model was proposed in [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke diff --git a/docs/source/en/model_doc/clvp.md b/docs/source/en/model_doc/clvp.md index a30269faf9ca..ae193b19f35f 100644 --- a/docs/source/en/model_doc/clvp.md +++ b/docs/source/en/model_doc/clvp.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # CLVP +PyTorch + ## Overview The CLVP (Contrastive Language-Voice Pretrained Transformer) model was proposed in [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md index bee8c8a07620..1c93e6ab5b2c 100644 --- a/docs/source/en/model_doc/codegen.md +++ b/docs/source/en/model_doc/codegen.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # CodeGen +PyTorch + ## Overview The CodeGen model was proposed in [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, and Caiming Xiong. 
diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index 4275f059c532..5049d90a7c61 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -1,5 +1,7 @@ # Cohere +PyTorch + ## Overview The Cohere Command-R model was proposed in the blogpost [Command-R: Retrieval Augmented Generation at Production Scale](https://txt.cohere.com/command-r/) by the Cohere Team. diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md index 33e67d48fb0e..19107e75835e 100644 --- a/docs/source/en/model_doc/cohere2.md +++ b/docs/source/en/model_doc/cohere2.md @@ -1,5 +1,7 @@ # Cohere +PyTorch + ## Overview [C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages. diff --git a/docs/source/en/model_doc/colpali.md b/docs/source/en/model_doc/colpali.md index 3f6b0cbc6613..baba60984315 100644 --- a/docs/source/en/model_doc/colpali.md +++ b/docs/source/en/model_doc/colpali.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ColPali +PyTorch + ## Overview The *ColPali* model was proposed in [ColPali: Efficient Document Retrieval with Vision Language Models](https://doi.org/10.48550/arXiv.2407.01449) by **Manuel Faysse***, **Hugues Sibille***, **Tony Wu***, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (* denotes equal contribution). Work lead by ILLUIN Technology. diff --git a/docs/source/en/model_doc/conditional_detr.md b/docs/source/en/model_doc/conditional_detr.md index 400c5c2c53b6..13cf4685142d 100644 --- a/docs/source/en/model_doc/conditional_detr.md +++ b/docs/source/en/model_doc/conditional_detr.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Conditional DETR +PyTorch + ## Overview The Conditional DETR model was proposed in [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. Conditional DETR presents a conditional cross-attention mechanism for fast DETR training. Conditional DETR converges 6.7× to 10× faster than DETR. diff --git a/docs/source/en/model_doc/cpmant.md b/docs/source/en/model_doc/cpmant.md index 4bcf774507fb..51f2f9e4d794 100644 --- a/docs/source/en/model_doc/cpmant.md +++ b/docs/source/en/model_doc/cpmant.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # CPMAnt +PyTorch + ## Overview CPM-Ant is an open-source Chinese pre-trained language model (PLM) with 10B parameters. It is also the first milestone of the live training process of CPM-Live. The training process is cost-effective and environment-friendly. CPM-Ant also achieves promising results with delta tuning on the CUGE benchmark. Besides the full model, we also provide various compressed versions to meet the requirements of different hardware configurations. 
[See more](https://github.com/OpenBMB/CPM-Live/tree/cpm-ant/cpm-live) diff --git a/docs/source/en/model_doc/dac.md b/docs/source/en/model_doc/dac.md index db54b387b1c3..ab78a31e8d1b 100644 --- a/docs/source/en/model_doc/dac.md +++ b/docs/source/en/model_doc/dac.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # DAC +PyTorch + ## Overview diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index cb1dc675caa5..f9ac15e80d70 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Data2Vec +PyTorch + ## Overview The Data2Vec model was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/pdf/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu and Michael Auli. diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index fb53742d0541..edf5239b292d 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # DBRX +PyTorch + ## Overview DBRX is a [transformer-based](https://www.isattentionallyouneed.com/) decoder-only large language model (LLM) that was trained using next-token prediction. diff --git a/docs/source/en/model_doc/decision_transformer.md b/docs/source/en/model_doc/decision_transformer.md index 07ef2ecbdc8e..3b5672f7bb89 100644 --- a/docs/source/en/model_doc/decision_transformer.md +++ b/docs/source/en/model_doc/decision_transformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Decision Transformer +PyTorch + ## Overview The Decision Transformer model was proposed in [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index 5ed99dfe81d1..c79cea426d6b 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Deformable DETR +PyTorch + ## Overview The Deformable DETR model was proposed in [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai. diff --git a/docs/source/en/model_doc/deplot.md b/docs/source/en/model_doc/deplot.md index a77bee39de76..be1cd3eec9d9 100644 --- a/docs/source/en/model_doc/deplot.md +++ b/docs/source/en/model_doc/deplot.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # DePlot +PyTorch + ## Overview DePlot was proposed in the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) from Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun. diff --git a/docs/source/en/model_doc/depth_anything.md b/docs/source/en/model_doc/depth_anything.md index 7cdf72de5c84..4504a8122269 100644 --- a/docs/source/en/model_doc/depth_anything.md +++ b/docs/source/en/model_doc/depth_anything.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# Depth Anything +PyTorch + ## Overview The Depth Anything model was proposed in [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao. Depth Anything is based on the [DPT](dpt) architecture, trained on ~62 million images, obtaining state-of-the-art results for both relative and absolute depth estimation. diff --git a/docs/source/en/model_doc/deta.md b/docs/source/en/model_doc/deta.md index 996142bc59d6..0ae9c0b50293 100644 --- a/docs/source/en/model_doc/deta.md +++ b/docs/source/en/model_doc/deta.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # DETA +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index 43c6e6d17e2f..28cafd48691f 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # DETR +PyTorch + ## Overview The DETR model was proposed in [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md index 80afcfe433e9..4cd8485c826a 100644 --- a/docs/source/en/model_doc/diffllama.md +++ b/docs/source/en/model_doc/diffllama.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # DiffLlama +PyTorch + ## Overview The DiffLlama model was proposed in [Differential Transformer](https://arxiv.org/abs/2410.05258) by Kazuma Matsumoto and . diff --git a/docs/source/en/model_doc/dinat.md b/docs/source/en/model_doc/dinat.md index 23dfa3b74fb0..eb636bef3692 100644 --- a/docs/source/en/model_doc/dinat.md +++ b/docs/source/en/model_doc/dinat.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Dilated Neighborhood Attention Transformer +PyTorch + ## Overview DiNAT was proposed in [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) diff --git a/docs/source/en/model_doc/dinov2_with_registers.md b/docs/source/en/model_doc/dinov2_with_registers.md index 360ebf9b8f8a..63f65dabaed2 100644 --- a/docs/source/en/model_doc/dinov2_with_registers.md +++ b/docs/source/en/model_doc/dinov2_with_registers.md @@ -9,6 +9,8 @@ specific language governing permissions and limitations under the License. # DINOv2 with Registers +PyTorch + ## Overview The DINOv2 with Registers model was proposed in [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) by Timothée Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski. diff --git a/docs/source/en/model_doc/dpt.md b/docs/source/en/model_doc/dpt.md index a02313a31235..d461724c8447 100644 --- a/docs/source/en/model_doc/dpt.md +++ b/docs/source/en/model_doc/dpt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # DPT +PyTorch + ## Overview The DPT model was proposed in [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun. diff --git a/docs/source/en/model_doc/efficientnet.md b/docs/source/en/model_doc/efficientnet.md index a69b255dba5e..03aa15663b59 100644 --- a/docs/source/en/model_doc/efficientnet.md +++ b/docs/source/en/model_doc/efficientnet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# EfficientNet +PyTorch + ## Overview The EfficientNet model was proposed in [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md index 619c9a3be51f..f8ca88e8a323 100644 --- a/docs/source/en/model_doc/emu3.md +++ b/docs/source/en/model_doc/emu3.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Emu3 +PyTorch + ## Overview The Emu3 model was proposed in [Emu3: Next-Token Prediction is All You Need](https://arxiv.org/abs/2409.18869) by Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, Yufeng Cui, Jinsheng Wang, Fan Zhang, Yueze Wang, Zhen Li, Qiying Yu, Yingli Zhao, Yulong Ao, Xuebin Min, Tao Li, Boya Wu, Bo Zhao, Bowen Zhang, Liangdong Wang, Guang Liu, Zheqi He, Xi Yang, Jingjing Liu, Yonghua Lin, Tiejun Huang, Zhongyuan Wang. diff --git a/docs/source/en/model_doc/encodec.md b/docs/source/en/model_doc/encodec.md index 856f8be2b80a..a92d85bd6d2e 100644 --- a/docs/source/en/model_doc/encodec.md +++ b/docs/source/en/model_doc/encodec.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # EnCodec +PyTorch + ## Overview The EnCodec neural codec model was proposed in [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi. diff --git a/docs/source/en/model_doc/ernie.md b/docs/source/en/model_doc/ernie.md index a5110b2d7b73..715e3b49b943 100644 --- a/docs/source/en/model_doc/ernie.md +++ b/docs/source/en/model_doc/ernie.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ERNIE +PyTorch + ## Overview ERNIE is a series of powerful models proposed by baidu, especially in Chinese tasks, including [ERNIE1.0](https://arxiv.org/abs/1904.09223), [ERNIE2.0](https://ojs.aaai.org/index.php/AAAI/article/view/6428), diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md index 85254693501c..8e85f7d14789 100644 --- a/docs/source/en/model_doc/ernie_m.md +++ b/docs/source/en/model_doc/ernie_m.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ErnieM +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/falcon.md b/docs/source/en/model_doc/falcon.md index 9bf6c32a4ec5..33d638ccc409 100644 --- a/docs/source/en/model_doc/falcon.md +++ b/docs/source/en/model_doc/falcon.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Falcon +PyTorch + ## Overview Falcon is a class of causal decoder-only models built by [TII](https://www.tii.ae/). The largest Falcon checkpoints diff --git a/docs/source/en/model_doc/falcon_mamba.md b/docs/source/en/model_doc/falcon_mamba.md index cbec6378cc14..b945f0c72fbb 100644 --- a/docs/source/en/model_doc/falcon_mamba.md +++ b/docs/source/en/model_doc/falcon_mamba.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # FalconMamba +PyTorch + ## Overview The FalconMamba model was proposed by TII UAE (Technology Innovation Institute) in their release. diff --git a/docs/source/en/model_doc/fastspeech2_conformer.md b/docs/source/en/model_doc/fastspeech2_conformer.md index 7d9250273331..35ca8ab73168 100644 --- a/docs/source/en/model_doc/fastspeech2_conformer.md +++ b/docs/source/en/model_doc/fastspeech2_conformer.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. 
# FastSpeech2Conformer +PyTorch + ## Overview The FastSpeech2Conformer model was proposed with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. diff --git a/docs/source/en/model_doc/flava.md b/docs/source/en/model_doc/flava.md index d9f9f1de5146..51a177fc96c8 100644 --- a/docs/source/en/model_doc/flava.md +++ b/docs/source/en/model_doc/flava.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # FLAVA +PyTorch + ## Overview The FLAVA model was proposed in [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela and is accepted at CVPR 2022. diff --git a/docs/source/en/model_doc/fnet.md b/docs/source/en/model_doc/fnet.md index 1bcae678e632..2b3884832583 100644 --- a/docs/source/en/model_doc/fnet.md +++ b/docs/source/en/model_doc/fnet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # FNet +PyTorch + ## Overview The FNet model was proposed in [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by diff --git a/docs/source/en/model_doc/focalnet.md b/docs/source/en/model_doc/focalnet.md index c4c97980f069..5875155ef089 100644 --- a/docs/source/en/model_doc/focalnet.md +++ b/docs/source/en/model_doc/focalnet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # FocalNet +PyTorch + ## Overview The FocalNet model was proposed in [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. diff --git a/docs/source/en/model_doc/fuyu.md b/docs/source/en/model_doc/fuyu.md index bd55737da58f..78684c8b02d8 100644 --- a/docs/source/en/model_doc/fuyu.md +++ b/docs/source/en/model_doc/fuyu.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Fuyu +PyTorch + ## Overview The Fuyu model was created by [ADEPT](https://www.adept.ai/blog/fuyu-8b), and authored by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index 431c4ecd25f2..12dd8c96bd39 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -17,6 +17,8 @@ rendered properly in your Markdown viewer. # Gemma2 +PyTorch + ## Overview The Gemma2 model was proposed in [Gemma2: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/google-gemma-2/) by Gemma2 Team, Google. diff --git a/docs/source/en/model_doc/git.md b/docs/source/en/model_doc/git.md index bffa98b89e3b..c24b7f433d5e 100644 --- a/docs/source/en/model_doc/git.md +++ b/docs/source/en/model_doc/git.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GIT +PyTorch + ## Overview The GIT model was proposed in [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index 1268b2e7cf9c..61e517bb4e06 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# GLM +PyTorch + ## Overview The GLM Model was proposed diff --git a/docs/source/en/model_doc/glpn.md b/docs/source/en/model_doc/glpn.md index b57d1a7ccdda..c3fbc1d9647b 100644 --- a/docs/source/en/model_doc/glpn.md +++ b/docs/source/en/model_doc/glpn.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GLPN +PyTorch + This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md index 1635a9f50dd0..8123e1fde087 100644 --- a/docs/source/en/model_doc/gpt_bigcode.md +++ b/docs/source/en/model_doc/gpt_bigcode.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GPTBigCode +PyTorch + ## Overview The GPTBigCode model was proposed in [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by BigCode. The listed authors are: Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md index 1319f2e93c14..155b7449f8e0 100644 --- a/docs/source/en/model_doc/gpt_neox.md +++ b/docs/source/en/model_doc/gpt_neox.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GPT-NeoX +PyTorch + ## Overview We introduce GPT-NeoX-20B, a 20 billion parameter autoregressive language model trained on the Pile, whose weights will diff --git a/docs/source/en/model_doc/gpt_neox_japanese.md b/docs/source/en/model_doc/gpt_neox_japanese.md index c69e643cae5b..e1600d9b1115 100644 --- a/docs/source/en/model_doc/gpt_neox_japanese.md +++ b/docs/source/en/model_doc/gpt_neox_japanese.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GPT-NeoX-Japanese +PyTorch + ## Overview We introduce GPT-NeoX-Japanese, which is an autoregressive language model for Japanese, trained on top of [https://github.com/EleutherAI/gpt-neox](https://github.com/EleutherAI/gpt-neox). diff --git a/docs/source/en/model_doc/gptsan-japanese.md b/docs/source/en/model_doc/gptsan-japanese.md index 108e59048d5d..83040362f82f 100644 --- a/docs/source/en/model_doc/gptsan-japanese.md +++ b/docs/source/en/model_doc/gptsan-japanese.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GPTSAN-japanese +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index 42b6da4e7478..57fc530282cc 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Granite +PyTorch + ## Overview The Granite model was proposed in [Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler](https://arxiv.org/abs/2408.13359) by Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. 
Cox and Rameswar Panda. diff --git a/docs/source/en/model_doc/granitemoe.md b/docs/source/en/model_doc/granitemoe.md index 176e833c24c6..bc1a6bb33ee9 100644 --- a/docs/source/en/model_doc/granitemoe.md +++ b/docs/source/en/model_doc/granitemoe.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # GraniteMoe +PyTorch + ## Overview The GraniteMoe model was proposed in [Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler](https://arxiv.org/abs/2408.13359) by Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox and Rameswar Panda. diff --git a/docs/source/en/model_doc/graphormer.md b/docs/source/en/model_doc/graphormer.md index d01bf04debf9..6a81a0f84e77 100644 --- a/docs/source/en/model_doc/graphormer.md +++ b/docs/source/en/model_doc/graphormer.md @@ -14,6 +14,8 @@ rendered properly in your Markdown viewer. # Graphormer +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index 1b9104eb963e..deb8b20029f1 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Grounding DINO +PyTorch + ## Overview The Grounding DINO model was proposed in [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. Grounding DINO extends a closed-set object detection model with a text encoder, enabling open-set object detection. The model achieves remarkable results, such as 52.5 AP on COCO zero-shot. diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md index b830c0a72be7..cf83f5a0352c 100644 --- a/docs/source/en/model_doc/helium.md +++ b/docs/source/en/model_doc/helium.md @@ -16,6 +16,7 @@ rendered properly in your Markdown viewer. # Helium +PyTorch ## Overview diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index c63c892c7c7d..e255185cb6bb 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Hiera +PyTorch + ## Overview Hiera was proposed in [Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles](https://arxiv.org/abs/2306.00989) by Chaitanya Ryali, Yuan-Ting Hu, Daniel Bolya, Chen Wei, Haoqi Fan, Po-Yao Huang, Vaibhav Aggarwal, Arkabandhu Chowdhury, Omid Poursaeed, Judy Hoffman, Jitendra Malik, Yanghao Li, Christoph Feichtenhofer diff --git a/docs/source/en/model_doc/ibert.md b/docs/source/en/model_doc/ibert.md index 9ea623951aec..c887ba3e0a5c 100644 --- a/docs/source/en/model_doc/ibert.md +++ b/docs/source/en/model_doc/ibert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # I-BERT +PyTorch + ## Overview The I-BERT model was proposed in [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index b9b51082f29e..1baec145fa33 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# Idefics2 +PyTorch + ## Overview The Idefics2 model was proposed in [What matters when building vision-language models?](https://arxiv.org/abs/2405.02246) by Léo Tronchon, Hugo Laurencon, Victor Sanh. The accompanying blog post can be found [here](https://huggingface.co/blog/idefics2). diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index cf7c043e9289..f8c0cf2dbaf7 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Idefics3 +PyTorch + ## Overview The Idefics3 model was proposed in [Building and better understanding vision-language models: insights and future directions](https://huggingface.co/papers/2408.12637) by Hugo Laurençon, Andrés Marafioti, Victor Sanh, and Léo Tronchon. diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index cb2afd25e20b..a72b164ddcef 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # I-JEPA +PyTorch + ## Overview The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. diff --git a/docs/source/en/model_doc/imagegpt.md b/docs/source/en/model_doc/imagegpt.md index 53a7ba3b34b7..51e6bd1c6722 100644 --- a/docs/source/en/model_doc/imagegpt.md +++ b/docs/source/en/model_doc/imagegpt.md @@ -15,6 +15,8 @@ specific language governing permissions and limitations under the License. --> # ImageGPT +PyTorch + ## Overview The ImageGPT model was proposed in [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt) by Mark diff --git a/docs/source/en/model_doc/informer.md b/docs/source/en/model_doc/informer.md index f866afbfcb8a..fc6e5e8cb828 100644 --- a/docs/source/en/model_doc/informer.md +++ b/docs/source/en/model_doc/informer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Informer +PyTorch + ## Overview The Informer model was proposed in [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index 904a96bc786f..d2bf881fa148 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # InstructBLIP +PyTorch + ## Overview The InstructBLIP model was proposed in [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index 8b2207ce1765..05baf90fe639 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. 
# InstructBlipVideo -## Overview +PyTorch ## Overview diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md index c3f66c1825f3..4f4f283459ff 100644 --- a/docs/source/en/model_doc/jamba.md +++ b/docs/source/en/model_doc/jamba.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Jamba +PyTorch + ## Overview Jamba is a state-of-the-art, hybrid SSM-Transformer LLM. It is the first production-scale Mamba implementation, which opens up interesting research and application opportunities. While this initial experimentation shows encouraging gains, we expect these to be further enhanced with future optimizations and explorations. diff --git a/docs/source/en/model_doc/jetmoe.md b/docs/source/en/model_doc/jetmoe.md index 87f99c6f9988..53ef534add02 100644 --- a/docs/source/en/model_doc/jetmoe.md +++ b/docs/source/en/model_doc/jetmoe.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # JetMoe +PyTorch + ## Overview **JetMoe-8B** is an 8B Mixture-of-Experts (MoE) language model developed by [Yikang Shen](https://scholar.google.com.hk/citations?user=qff5rRYAAAAJ) and [MyShell](https://myshell.ai/). diff --git a/docs/source/en/model_doc/jukebox.md b/docs/source/en/model_doc/jukebox.md index 12f273b71e97..3971390b1cdf 100644 --- a/docs/source/en/model_doc/jukebox.md +++ b/docs/source/en/model_doc/jukebox.md @@ -15,6 +15,8 @@ rendered properly in your Markdown viewer. --> # Jukebox +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/kosmos-2.md b/docs/source/en/model_doc/kosmos-2.md index f799751cce84..5f419545de27 100644 --- a/docs/source/en/model_doc/kosmos-2.md +++ b/docs/source/en/model_doc/kosmos-2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # KOSMOS-2 +PyTorch + ## Overview The KOSMOS-2 model was proposed in [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei. diff --git a/docs/source/en/model_doc/layoutlmv2.md b/docs/source/en/model_doc/layoutlmv2.md index 0769322e9ad5..1579a5bbba4d 100644 --- a/docs/source/en/model_doc/layoutlmv2.md +++ b/docs/source/en/model_doc/layoutlmv2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LayoutLMV2 +PyTorch + ## Overview The LayoutLMV2 model was proposed in [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, diff --git a/docs/source/en/model_doc/layoutxlm.md b/docs/source/en/model_doc/layoutxlm.md index f6b2cbef9d6f..ae1842c2b15d 100644 --- a/docs/source/en/model_doc/layoutxlm.md +++ b/docs/source/en/model_doc/layoutxlm.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LayoutXLM +PyTorch + ## Overview LayoutXLM was proposed in [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha diff --git a/docs/source/en/model_doc/levit.md b/docs/source/en/model_doc/levit.md index 15dc2f4e1373..13c6d19fafd4 100644 --- a/docs/source/en/model_doc/levit.md +++ b/docs/source/en/model_doc/levit.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# LeViT +PyTorch + ## Overview The LeViT model was proposed in [LeViT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. LeViT improves the [Vision Transformer (ViT)](vit) in performance and efficiency by a few architectural differences such as activation maps with decreasing resolutions in Transformers and the introduction of an attention bias to integrate positional information. diff --git a/docs/source/en/model_doc/lilt.md b/docs/source/en/model_doc/lilt.md index 2514a6ebd852..c55f72f68289 100644 --- a/docs/source/en/model_doc/lilt.md +++ b/docs/source/en/model_doc/lilt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LiLT +PyTorch + ## Overview The LiLT model was proposed in [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index d89ec57be1e7..3fd5b583f947 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LLaVa +PyTorch + ## Overview LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. It is an auto-regressive language model, based on the transformer architecture. In other words, it is an multi-modal version of LLMs fine-tuned for chat / instructions. diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index e62b9ba68c1e..aa7e8fb39ebe 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LLaVA-NeXT +PyTorch + ## Overview The LLaVA-NeXT model was proposed in [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee. LLaVa-NeXT (also called LLaVa-1.6) improves upon [LLaVa](llava) by increasing the input image resolution and training on an improved visual instruction tuning dataset to improve OCR and common sense reasoning. diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index ecd7b83a8b58..497abf2f5f6d 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LLaVa-NeXT-Video +PyTorch + ## Overview The LLaVa-NeXT-Video model was proposed in [LLaVA-NeXT: A Strong Zero-shot Video Understanding Model diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index 785e6af74a4d..512d666b063f 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # LLaVA-OneVision +PyTorch + ## Overview The LLaVA-OneVision model was proposed in [LLaVA-OneVision: Easy Visual Task Transfer](https://arxiv.org/abs/2408.03326) by + ## Overview The LUKE model was proposed in [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda and Yuji Matsumoto. 
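LUKE's entity-aware self-attention is driven by character-level entity spans passed to its tokenizer; a minimal sketch of that interface, assuming the `studio-ousia/luke-base` checkpoint:

```python
import torch
from transformers import LukeModel, LukeTokenizer

# Assumed checkpoint; the large variant follows the same API.
checkpoint = "studio-ousia/luke-base"

tokenizer = LukeTokenizer.from_pretrained(checkpoint)
model = LukeModel.from_pretrained(checkpoint)

text = "Beyoncé lives in Los Angeles."
entity_spans = [(0, 7)]  # character span covering "Beyoncé"

inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)         # contextualized word tokens
print(outputs.entity_last_hidden_state.shape)  # contextualized entity tokens
```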
diff --git a/docs/source/en/model_doc/m2m_100.md b/docs/source/en/model_doc/m2m_100.md index d64545fafb06..8b2da0940d35 100644 --- a/docs/source/en/model_doc/m2m_100.md +++ b/docs/source/en/model_doc/m2m_100.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # M2M100 +PyTorch + ## Overview The M2M100 model was proposed in [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, diff --git a/docs/source/en/model_doc/mamba.md b/docs/source/en/model_doc/mamba.md index 317948331eb1..8bb51af54328 100644 --- a/docs/source/en/model_doc/mamba.md +++ b/docs/source/en/model_doc/mamba.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Mamba +PyTorch + ## Overview The Mamba model was proposed in [Mamba: Linear-Time Sequence Modeling with Selective State Spaces](https://arxiv.org/abs/2312.00752) by Albert Gu and Tri Dao. diff --git a/docs/source/en/model_doc/mamba2.md b/docs/source/en/model_doc/mamba2.md index 5ed27881cf18..5421458186f5 100644 --- a/docs/source/en/model_doc/mamba2.md +++ b/docs/source/en/model_doc/mamba2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Mamba 2 +PyTorch + ## Overview The Mamba2 model was proposed in [Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality](https://arxiv.org/abs/2405.21060) by Tri Dao and Albert Gu. It is a State Space Model similar to Mamba 1, with better performances in a simplified architecture. diff --git a/docs/source/en/model_doc/markuplm.md b/docs/source/en/model_doc/markuplm.md index e52ff3157eac..b1291fd1a23f 100644 --- a/docs/source/en/model_doc/markuplm.md +++ b/docs/source/en/model_doc/markuplm.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MarkupLM +PyTorch + ## Overview The MarkupLM model was proposed in [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document diff --git a/docs/source/en/model_doc/mask2former.md b/docs/source/en/model_doc/mask2former.md index 4faeed50311f..cdffde742f85 100644 --- a/docs/source/en/model_doc/mask2former.md +++ b/docs/source/en/model_doc/mask2former.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Mask2Former +PyTorch + ## Overview The Mask2Former model was proposed in [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. Mask2Former is a unified framework for panoptic, instance and semantic segmentation and features significant performance and efficiency improvements over [MaskFormer](maskformer). diff --git a/docs/source/en/model_doc/maskformer.md b/docs/source/en/model_doc/maskformer.md index a0199f380ce6..1d67bafbdc77 100644 --- a/docs/source/en/model_doc/maskformer.md +++ b/docs/source/en/model_doc/maskformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MaskFormer +PyTorch + This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight diff --git a/docs/source/en/model_doc/matcha.md b/docs/source/en/model_doc/matcha.md index d26b88b16fae..a33ef62e5c45 100644 --- a/docs/source/en/model_doc/matcha.md +++ b/docs/source/en/model_doc/matcha.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# MatCha +PyTorch + ## Overview MatCha has been proposed in the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662), from Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos. diff --git a/docs/source/en/model_doc/mctct.md b/docs/source/en/model_doc/mctct.md index 7cf1a68f12e4..628aa872b05e 100644 --- a/docs/source/en/model_doc/mctct.md +++ b/docs/source/en/model_doc/mctct.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # M-CTC-T +PyTorch + This model is in maintenance mode only, so we won't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/mega.md b/docs/source/en/model_doc/mega.md index 5545f5e19c47..9845a27ec986 100644 --- a/docs/source/en/model_doc/mega.md +++ b/docs/source/en/model_doc/mega.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MEGA +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/megatron-bert.md b/docs/source/en/model_doc/megatron-bert.md index 67000c8b843f..23e86a39d2ae 100644 --- a/docs/source/en/model_doc/megatron-bert.md +++ b/docs/source/en/model_doc/megatron-bert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MegatronBERT +PyTorch + ## Overview The MegatronBERT model was proposed in [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model diff --git a/docs/source/en/model_doc/mgp-str.md b/docs/source/en/model_doc/mgp-str.md index d4152e92b2ec..cd89d5a6f42b 100644 --- a/docs/source/en/model_doc/mgp-str.md +++ b/docs/source/en/model_doc/mgp-str.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MGP-STR +PyTorch + ## Overview The MGP-STR model was proposed in [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. MGP-STR is a conceptually **simple** yet **powerful** vision Scene Text Recognition (STR) model, which is built upon the [Vision Transformer (ViT)](vit). To integrate linguistic knowledge, Multi-Granularity Prediction (MGP) strategy is proposed to inject information from the language modality into the model in an implicit way. diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md index ad15a002da91..88a9fcf0256f 100644 --- a/docs/source/en/model_doc/mimi.md +++ b/docs/source/en/model_doc/mimi.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Mimi +PyTorch + ## Overview The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team, that combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps. In other words, it can be used to map audio waveforms into “audio tokens”, known as “codebooks”. diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index b5451702e44a..42ffcef39d0f 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# Mixtral +PyTorch + ## Overview Mixtral-8x7B was introduced in the [Mixtral of Experts blogpost](https://mistral.ai/news/mixtral-of-experts/) by Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index 64da42b38b0f..ae7bdb64540e 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Mllama +PyTorch + ## Overview The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a collection of pretrained and instruction-tuned image reasoning generative models in 11B and 90B sizes (text \+ images in / text out). The Llama 3.2-Vision instruction-tuned models are optimized for visual recognition, image reasoning, captioning, and answering general questions about an image. diff --git a/docs/source/en/model_doc/mluke.md b/docs/source/en/model_doc/mluke.md index 719af76ad446..6ec7b2090fbc 100644 --- a/docs/source/en/model_doc/mluke.md +++ b/docs/source/en/model_doc/mluke.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # mLUKE +PyTorch + ## Overview The mLUKE model was proposed in [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. It's a multilingual extension diff --git a/docs/source/en/model_doc/mobilenet_v1.md b/docs/source/en/model_doc/mobilenet_v1.md index 9f68035c63c2..ea880f0e53d4 100644 --- a/docs/source/en/model_doc/mobilenet_v1.md +++ b/docs/source/en/model_doc/mobilenet_v1.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MobileNet V1 +PyTorch + ## Overview The MobileNet model was proposed in [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam. diff --git a/docs/source/en/model_doc/mobilenet_v2.md b/docs/source/en/model_doc/mobilenet_v2.md index ff22231ae0c1..39bd0e0b7ebf 100644 --- a/docs/source/en/model_doc/mobilenet_v2.md +++ b/docs/source/en/model_doc/mobilenet_v2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MobileNet V2 +PyTorch + ## Overview The MobileNet model was proposed in [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. diff --git a/docs/source/en/model_doc/mobilevitv2.md b/docs/source/en/model_doc/mobilevitv2.md index c3a650fc7042..041e703acb29 100644 --- a/docs/source/en/model_doc/mobilevitv2.md +++ b/docs/source/en/model_doc/mobilevitv2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MobileViTV2 +PyTorch + ## Overview The MobileViTV2 model was proposed in [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari. 
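The MobileNet and MobileViT pages above are image classifiers with PyTorch weights, and the simplest way to exercise them is the image-classification pipeline. A sketch assuming the `google/mobilenet_v2_1.0_224` checkpoint (MobileViTV2 checkpoints plug into the same pipeline):

```python
from transformers import pipeline

# Assumed checkpoint; swap in a MobileViTV2 checkpoint to compare the two families.
classifier = pipeline("image-classification", model="google/mobilenet_v2_1.0_224")

# The pipeline accepts a local path, a PIL image, or a URL.
predictions = classifier("http://images.cocodataset.org/val2017/000000039769.jpg")
for pred in predictions:
    print(f"{pred['label']}: {pred['score']:.3f}")
```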
diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md index e90f34a903e4..533b2fe8d460 100644 --- a/docs/source/en/model_doc/modernbert.md +++ b/docs/source/en/model_doc/modernbert.md @@ -16,14 +16,7 @@ rendered properly in your Markdown viewer. # ModernBERT -
- -Models - - -Paper page - -
+PyTorch ## Overview diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md index 571e3febdb4f..68720a0a60b4 100644 --- a/docs/source/en/model_doc/moonshine.md +++ b/docs/source/en/model_doc/moonshine.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Moonshine +PyTorch + ## Overview The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md index 2e2c5655de45..5035a416144a 100644 --- a/docs/source/en/model_doc/moshi.md +++ b/docs/source/en/model_doc/moshi.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Moshi +PyTorch + ## Overview The Moshi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. diff --git a/docs/source/en/model_doc/mpt.md b/docs/source/en/model_doc/mpt.md index 113b42573f4d..b43b6a0d4166 100644 --- a/docs/source/en/model_doc/mpt.md +++ b/docs/source/en/model_doc/mpt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MPT +PyTorch + ## Overview The MPT model was proposed by the [MosaicML](https://www.mosaicml.com/) team and released with multiple sizes and finetuned variants. The MPT models are a series of open source and commercially usable LLMs pre-trained on 1T tokens. diff --git a/docs/source/en/model_doc/mra.md b/docs/source/en/model_doc/mra.md index cc4c0d9cc9c8..37bf13baacb8 100644 --- a/docs/source/en/model_doc/mra.md +++ b/docs/source/en/model_doc/mra.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MRA +PyTorch + ## Overview The MRA model was proposed in [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, and Vikas Singh. diff --git a/docs/source/en/model_doc/musicgen.md b/docs/source/en/model_doc/musicgen.md index 7c105e1f39f7..e2617e68c8f5 100644 --- a/docs/source/en/model_doc/musicgen.md +++ b/docs/source/en/model_doc/musicgen.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MusicGen +PyTorch + ## Overview The MusicGen model was proposed in the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index 7b67713c42b7..04ebea60cc2d 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MusicGen Melody +PyTorch + ## Overview The MusicGen Melody model was proposed in [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez. diff --git a/docs/source/en/model_doc/mvp.md b/docs/source/en/model_doc/mvp.md index 0d98e04cf091..d8d78476805d 100644 --- a/docs/source/en/model_doc/mvp.md +++ b/docs/source/en/model_doc/mvp.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # MVP +PyTorch + ## Overview The MVP model was proposed in [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen. 
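MVP is a supervised pre-trained encoder-decoder for natural language generation, so it runs through the standard seq2seq classes; a minimal summarization-style sketch, assuming the `RUCAIBox/mvp` checkpoint and its `Summarize:` prompt convention:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Assumed checkpoint; task-specific MVP variants load with the same classes.
checkpoint = "RUCAIBox/mvp"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

text = "Summarize: Transformers provides thousands of pretrained models for text, vision and audio tasks."
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```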
diff --git a/docs/source/en/model_doc/nat.md b/docs/source/en/model_doc/nat.md index 02c2e466cc4a..d00e6efcebab 100644 --- a/docs/source/en/model_doc/nat.md +++ b/docs/source/en/model_doc/nat.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Neighborhood Attention Transformer +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/nemotron.md b/docs/source/en/model_doc/nemotron.md index 1979847c43cf..0de5ea28f49c 100644 --- a/docs/source/en/model_doc/nemotron.md +++ b/docs/source/en/model_doc/nemotron.md @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. # Nemotron -## Nemotron +PyTorch ### License diff --git a/docs/source/en/model_doc/nezha.md b/docs/source/en/model_doc/nezha.md index 976722592cad..82dc8aecb8f4 100644 --- a/docs/source/en/model_doc/nezha.md +++ b/docs/source/en/model_doc/nezha.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Nezha +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/nllb-moe.md b/docs/source/en/model_doc/nllb-moe.md index 5c283fb3f0e1..3e23ef5bebf1 100644 --- a/docs/source/en/model_doc/nllb-moe.md +++ b/docs/source/en/model_doc/nllb-moe.md @@ -16,6 +16,7 @@ rendered properly in your Markdown viewer. # NLLB-MOE +PyTorch ## Overview diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index abdff7445aa3..4b08fb4fa06e 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # NLLB +PyTorch + ## Updated tokenizer behavior **DISCLAIMER:** The default behaviour for the tokenizer was fixed and thus changed in April 2023. diff --git a/docs/source/en/model_doc/nystromformer.md b/docs/source/en/model_doc/nystromformer.md index 185c4e1f011a..26aa2d85f457 100644 --- a/docs/source/en/model_doc/nystromformer.md +++ b/docs/source/en/model_doc/nystromformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Nyströmformer +PyTorch + ## Overview The Nyströmformer model was proposed in [*Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention*](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn diff --git a/docs/source/en/model_doc/olmo.md b/docs/source/en/model_doc/olmo.md index 6db7d8ad5c5e..3c6943e4ff27 100644 --- a/docs/source/en/model_doc/olmo.md +++ b/docs/source/en/model_doc/olmo.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # OLMo +PyTorch + ## Overview The OLMo model was proposed in [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838) by Dirk Groeneveld, Iz Beltagy, Pete Walsh, Akshita Bhagia, Rodney Kinney, Oyvind Tafjord, Ananya Harsh Jha, Hamish Ivison, Ian Magnusson, Yizhong Wang, Shane Arora, David Atkinson, Russell Authur, Khyathi Raghavi Chandu, Arman Cohan, Jennifer Dumas, Yanai Elazar, Yuling Gu, Jack Hessel, Tushar Khot, William Merrill, Jacob Morrison, Niklas Muennighoff, Aakanksha Naik, Crystal Nam, Matthew E. Peters, Valentina Pyatkin, Abhilasha Ravichander, Dustin Schwenk, Saurabh Shah, Will Smith, Emma Strubell, Nishant Subramani, Mitchell Wortsman, Pradeep Dasigi, Nathan Lambert, Kyle Richardson, Luke Zettlemoyer, Jesse Dodge, Kyle Lo, Luca Soldaini, Noah A. Smith, Hannaneh Hajishirzi. 
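OLMo is a plain decoder-only language model, so once its weights are in Transformers format it loads through the causal-LM auto classes; a small sketch, assuming the converted `allenai/OLMo-1B-hf` checkpoint:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed checkpoint name for the Transformers-format OLMo weights.
checkpoint = "allenai/OLMo-1B-hf"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("Language modeling is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=30, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```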
diff --git a/docs/source/en/model_doc/olmo2.md b/docs/source/en/model_doc/olmo2.md index 8ca3326660b3..b8a0780d055e 100644 --- a/docs/source/en/model_doc/olmo2.md +++ b/docs/source/en/model_doc/olmo2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # OLMo2 +PyTorch + ## Overview The OLMo2 model is the successor of the OLMo model, which was proposed in diff --git a/docs/source/en/model_doc/olmoe.md b/docs/source/en/model_doc/olmoe.md index 5ebcf3f943b3..0b3e71f446dc 100644 --- a/docs/source/en/model_doc/olmoe.md +++ b/docs/source/en/model_doc/olmoe.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # OLMoE +PyTorch + ## Overview The OLMoE model was proposed in [OLMoE: Open Mixture-of-Experts Language Models](https://arxiv.org/abs/2409.02060) by Niklas Muennighoff, Luca Soldaini, Dirk Groeneveld, Kyle Lo, Jacob Morrison, Sewon Min, Weijia Shi, Pete Walsh, Oyvind Tafjord, Nathan Lambert, Yuling Gu, Shane Arora, Akshita Bhagia, Dustin Schwenk, David Wadden, Alexander Wettig, Binyuan Hui, Tim Dettmers, Douwe Kiela, Ali Farhadi, Noah A. Smith, Pang Wei Koh, Amanpreet Singh, Hannaneh Hajishirzi. diff --git a/docs/source/en/model_doc/omdet-turbo.md b/docs/source/en/model_doc/omdet-turbo.md index 91419919b6e0..b0d4be3d9198 100644 --- a/docs/source/en/model_doc/omdet-turbo.md +++ b/docs/source/en/model_doc/omdet-turbo.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # OmDet-Turbo +PyTorch + ## Overview The OmDet-Turbo model was proposed in [Real-time Transformer-based Open-Vocabulary Detection with Efficient Fusion Head](https://arxiv.org/abs/2403.06892) by Tiancheng Zhao, Peng Liu, Xuan He, Lu Zhang, Kyusong Lee. OmDet-Turbo incorporates components from RT-DETR and introduces a swift multimodal fusion module to achieve real-time open-vocabulary object detection capabilities while maintaining high accuracy. The base model achieves performance of up to 100.2 FPS and 53.4 AP on COCO zero-shot. diff --git a/docs/source/en/model_doc/oneformer.md b/docs/source/en/model_doc/oneformer.md index 0132a600ccc5..3cd513499250 100644 --- a/docs/source/en/model_doc/oneformer.md +++ b/docs/source/en/model_doc/oneformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # OneFormer +PyTorch + ## Overview The OneFormer model was proposed in [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi. OneFormer is a universal image segmentation framework that can be trained on a single panoptic dataset to perform semantic, instance, and panoptic segmentation tasks. OneFormer uses a task token to condition the model on the task in focus, making the architecture task-guided for training, and task-dynamic for inference. diff --git a/docs/source/en/model_doc/open-llama.md b/docs/source/en/model_doc/open-llama.md index 01170e7e3be6..e4f76966dbfe 100644 --- a/docs/source/en/model_doc/open-llama.md +++ b/docs/source/en/model_doc/open-llama.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Open-Llama +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 696a1b03776a..9d2d22336859 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# OWLv2 +PyTorch + ## Overview OWLv2 was proposed in [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby. OWLv2 scales up [OWL-ViT](owlvit) using self-training, which uses an existing detector to generate pseudo-box annotations on image-text pairs. This results in large gains over the previous state-of-the-art for zero-shot object detection. diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index 519648bbd8dc..945409eb3caa 100644 --- a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # OWL-ViT +PyTorch + ## Overview The OWL-ViT (short for Vision Transformer for Open-World Localization) was proposed in [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby. OWL-ViT is an open-vocabulary object detection network trained on a variety of (image, text) pairs. It can be used to query an image with one or multiple text queries to search for and detect target objects described in text. diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md index 8b88db39bd71..c6f97d98d91e 100644 --- a/docs/source/en/model_doc/paligemma.md +++ b/docs/source/en/model_doc/paligemma.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PaliGemma +PyTorch + ## Overview The PaliGemma model was proposed in [PaliGemma – Google's Cutting-Edge Open Vision Language Model](https://huggingface.co/blog/paligemma) by Google. It is a 3B vision-language model composed by a [SigLIP](siglip) vision encoder and a [Gemma](gemma) language decoder linked by a multimodal linear projection. It cuts an image into a fixed number of VIT tokens and prepends it to an optional prompt. One particularity is that the model uses full block attention on all the image tokens plus the input text tokens. It comes in 3 resolutions, 224x224, 448x448 and 896x896 with 3 base models, with 55 fine-tuned versions for different tasks, and 2 mix models. diff --git a/docs/source/en/model_doc/patchtsmixer.md b/docs/source/en/model_doc/patchtsmixer.md index a67138e533b7..0f2934edaeea 100644 --- a/docs/source/en/model_doc/patchtsmixer.md +++ b/docs/source/en/model_doc/patchtsmixer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PatchTSMixer +PyTorch + ## Overview The PatchTSMixer model was proposed in [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong and Jayant Kalagnanam. diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 544e4cb378c6..2e48e63bc2d8 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PatchTST +PyTorch + ## Overview The PatchTST model was proposed in [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong and Jayant Kalagnanam. 
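The OWL-ViT and OWLv2 pages above describe querying an image with free-form text to detect the objects it names; the zero-shot object detection pipeline wraps that workflow. A sketch assuming the `google/owlvit-base-patch32` checkpoint:

```python
from transformers import pipeline

# Assumed checkpoint; an OWLv2 checkpoint such as google/owlv2-base-patch16-ensemble plugs in the same way.
detector = pipeline("zero-shot-object-detection", model="google/owlvit-base-patch32")

results = detector(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["cat", "remote control", "couch"],
)
for result in results:
    print(result["label"], round(result["score"], 3), result["box"])
```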
diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md index d64d8ba95416..719359723ce7 100644 --- a/docs/source/en/model_doc/pegasus_x.md +++ b/docs/source/en/model_doc/pegasus_x.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PEGASUS-X +PyTorch + ## Overview The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao and Peter J. Liu. diff --git a/docs/source/en/model_doc/perceiver.md b/docs/source/en/model_doc/perceiver.md index ee678c22f6f8..49a775462f65 100644 --- a/docs/source/en/model_doc/perceiver.md +++ b/docs/source/en/model_doc/perceiver.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Perceiver +PyTorch + ## Overview The Perceiver IO model was proposed in [Perceiver IO: A General Architecture for Structured Inputs & diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md index 7a105ac5543d..714b4e804378 100644 --- a/docs/source/en/model_doc/persimmon.md +++ b/docs/source/en/model_doc/persimmon.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Persimmon +PyTorch + ## Overview The Persimmon model was created by [ADEPT](https://www.adept.ai/blog/persimmon-8b), and authored by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani. diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md index ef163213bf14..0d81e1d9773f 100644 --- a/docs/source/en/model_doc/phi.md +++ b/docs/source/en/model_doc/phi.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Phi +PyTorch + ## Overview The Phi-1 model was proposed in [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li. diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index fe68a6ae76b2..46864adbccd1 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Phi-3 +PyTorch + ## Overview The Phi-3 model was proposed in [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Microsoft. diff --git a/docs/source/en/model_doc/phimoe.md b/docs/source/en/model_doc/phimoe.md index d9c9ae4a1831..63479190723d 100644 --- a/docs/source/en/model_doc/phimoe.md +++ b/docs/source/en/model_doc/phimoe.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PhiMoE +PyTorch + ## Overview The PhiMoE model was proposed in [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Microsoft. diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md index 0c9baa18e02f..e93bd2444848 100644 --- a/docs/source/en/model_doc/pix2struct.md +++ b/docs/source/en/model_doc/pix2struct.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# Pix2Struct +PyTorch + ## Overview The Pix2Struct model was proposed in [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 6e7652bfdea3..4f7f40b90f94 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Pixtral +PyTorch + ## Overview The Pixtral model was released by the Mistral AI team in a [blog post](https://mistral.ai/news/pixtral-12b/). Pixtral is a multimodal version of [Mistral](mistral), incorporating a 400 million parameter vision encoder trained from scratch. diff --git a/docs/source/en/model_doc/plbart.md b/docs/source/en/model_doc/plbart.md index 61af52e54d0d..7d3e1d770a82 100644 --- a/docs/source/en/model_doc/plbart.md +++ b/docs/source/en/model_doc/plbart.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PLBart +PyTorch + ## Overview The PLBART model was proposed in [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang. diff --git a/docs/source/en/model_doc/poolformer.md b/docs/source/en/model_doc/poolformer.md index 823c4412485c..f46501289fde 100644 --- a/docs/source/en/model_doc/poolformer.md +++ b/docs/source/en/model_doc/poolformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # PoolFormer +PyTorch + ## Overview The PoolFormer model was proposed in [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Sea AI Labs. Instead of designing complicated token mixer to achieve SOTA performance, the target of this work is to demonstrate the competence of transformer models largely stem from the general architecture MetaFormer. diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md index 8e7c1fbd3435..81e52e87af72 100644 --- a/docs/source/en/model_doc/pop2piano.md +++ b/docs/source/en/model_doc/pop2piano.md @@ -12,11 +12,7 @@ specific language governing permissions and limitations under the License. # Pop2Piano -
-["Spaces" badge markup]
+PyTorch ## Overview diff --git a/docs/source/en/model_doc/prophetnet.md b/docs/source/en/model_doc/prophetnet.md index 764c3acb0674..8c1dcf793f8b 100644 --- a/docs/source/en/model_doc/prophetnet.md +++ b/docs/source/en/model_doc/prophetnet.md @@ -16,14 +16,7 @@ rendered properly in your Markdown viewer. # ProphetNet -
-["Models" and "Spaces" badge markup]
+PyTorch ## Overview diff --git a/docs/source/en/model_doc/pvt.md b/docs/source/en/model_doc/pvt.md index 3e88a24999f7..5a3a29770020 100644 --- a/docs/source/en/model_doc/pvt.md +++ b/docs/source/en/model_doc/pvt.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Pyramid Vision Transformer (PVT) +PyTorch + ## Overview The PVT model was proposed in diff --git a/docs/source/en/model_doc/pvt_v2.md b/docs/source/en/model_doc/pvt_v2.md index 4b580491ea1e..588a3835e0c5 100644 --- a/docs/source/en/model_doc/pvt_v2.md +++ b/docs/source/en/model_doc/pvt_v2.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Pyramid Vision Transformer V2 (PVTv2) +PyTorch + ## Overview The PVTv2 model was proposed in diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md index ca718f34af4a..02f098b915eb 100644 --- a/docs/source/en/model_doc/qdqbert.md +++ b/docs/source/en/model_doc/qdqbert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # QDQBERT +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 78138413c7fb..1d39d4e11440 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Qwen2 +PyTorch + ## Overview Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc. diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md index f08343506b6a..3554976af10d 100644 --- a/docs/source/en/model_doc/qwen2_5_vl.md +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Qwen2.5-VL +PyTorch + ## Overview The [Qwen2.5-VL](https://qwenlm.github.io/blog/qwen2_5-vl/) model is an update to [Qwen2-VL](https://arxiv.org/abs/2409.12191) from Qwen team, Alibaba Group. diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index 2ef947ce430d..24b855c9038a 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Qwen2Audio +PyTorch + ## Overview The Qwen2-Audio is the new model series of large audio-language models from the Qwen team. Qwen2-Audio is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. We introduce two distinct audio interaction modes: diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index 3a7391ca194f..9c2168457a1d 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Qwen2MoE +PyTorch + ## Overview Qwen2MoE is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc. diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index b0275ce94af5..d28a4c25e19e 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# Qwen2-VL +PyTorch + ## Overview The [Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) model is a major update to [Qwen-VL](https://arxiv.org/pdf/2308.12966) from the Qwen team at Alibaba Research. diff --git a/docs/source/en/model_doc/realm.md b/docs/source/en/model_doc/realm.md index 558e83c08b06..5156011e87d2 100644 --- a/docs/source/en/model_doc/realm.md +++ b/docs/source/en/model_doc/realm.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # REALM +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/recurrent_gemma.md b/docs/source/en/model_doc/recurrent_gemma.md index ceee799159fc..7886f63ed326 100644 --- a/docs/source/en/model_doc/recurrent_gemma.md +++ b/docs/source/en/model_doc/recurrent_gemma.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # RecurrentGemma +PyTorch + ## Overview The Recurrent Gemma model was proposed in [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams of Google. diff --git a/docs/source/en/model_doc/reformer.md b/docs/source/en/model_doc/reformer.md index c78b1bbb8333..f99a806780c7 100644 --- a/docs/source/en/model_doc/reformer.md +++ b/docs/source/en/model_doc/reformer.md @@ -16,14 +16,7 @@ rendered properly in your Markdown viewer. # Reformer -
-["Models" and "Spaces" badge markup]
+PyTorch ## Overview diff --git a/docs/source/en/model_doc/retribert.md b/docs/source/en/model_doc/retribert.md index ab29ac966fe1..7d0c33d71722 100644 --- a/docs/source/en/model_doc/retribert.md +++ b/docs/source/en/model_doc/retribert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # RetriBERT +PyTorch + This model is in maintenance mode only, so we won't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/roc_bert.md b/docs/source/en/model_doc/roc_bert.md index 30fadd5c2c10..e1321dd2c7f1 100644 --- a/docs/source/en/model_doc/roc_bert.md +++ b/docs/source/en/model_doc/roc_bert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # RoCBert +PyTorch + ## Overview The RoCBert model was proposed in [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou. diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 6a1545e12329..03e87ef4515a 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # RT-DETR +PyTorch + ## Overview diff --git a/docs/source/en/model_doc/rwkv.md b/docs/source/en/model_doc/rwkv.md index 1acb17306021..ee367411fcf8 100644 --- a/docs/source/en/model_doc/rwkv.md +++ b/docs/source/en/model_doc/rwkv.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # RWKV +PyTorch + ## Overview The RWKV model was proposed in [this repo](https://github.com/BlinkDL/RWKV-LM) diff --git a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md index 486e58691f6d..f18617e83d6f 100644 --- a/docs/source/en/model_doc/seamless_m4t.md +++ b/docs/source/en/model_doc/seamless_m4t.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # SeamlessM4T +PyTorch + ## Overview The SeamlessM4T model was proposed in [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team from Meta AI. diff --git a/docs/source/en/model_doc/seamless_m4t_v2.md b/docs/source/en/model_doc/seamless_m4t_v2.md index c6a2ec4b51c2..3be24330ecf6 100644 --- a/docs/source/en/model_doc/seamless_m4t_v2.md +++ b/docs/source/en/model_doc/seamless_m4t_v2.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # SeamlessM4T-v2 +PyTorch + ## Overview The SeamlessM4T-v2 model was proposed in [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team from Meta AI. diff --git a/docs/source/en/model_doc/seggpt.md b/docs/source/en/model_doc/seggpt.md index b53f5d6ca150..464b8a3de415 100644 --- a/docs/source/en/model_doc/seggpt.md +++ b/docs/source/en/model_doc/seggpt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SegGPT +PyTorch + ## Overview The SegGPT model was proposed in [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang. SegGPT employs a decoder-only Transformer that can generate a segmentation mask given an input image, a prompt image and its corresponding prompt mask. 
The model achieves remarkable one-shot results with 56.1 mIoU on COCO-20 and 85.6 mIoU on FSS-1000. diff --git a/docs/source/en/model_doc/sew-d.md b/docs/source/en/model_doc/sew-d.md index 013e404bd045..87af1709be74 100644 --- a/docs/source/en/model_doc/sew-d.md +++ b/docs/source/en/model_doc/sew-d.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SEW-D +PyTorch + ## Overview SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in [Performance-Efficiency Trade-offs diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md index ee8a36a4dcb2..895457d75006 100644 --- a/docs/source/en/model_doc/sew.md +++ b/docs/source/en/model_doc/sew.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SEW +PyTorch + ## Overview SEW (Squeezed and Efficient Wav2Vec) was proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 4beac361de53..a0b418833c54 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SigLIP +PyTorch + ## Overview The SigLIP model was proposed in [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer. SigLIP proposes to replace the loss function used in [CLIP](clip) by a simple pairwise sigmoid loss. This results in better performance in terms of zero-shot classification accuracy on ImageNet. diff --git a/docs/source/en/model_doc/speecht5.md b/docs/source/en/model_doc/speecht5.md index 4d5e2098a542..bea4fe555d78 100644 --- a/docs/source/en/model_doc/speecht5.md +++ b/docs/source/en/model_doc/speecht5.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SpeechT5 +PyTorch + ## Overview The SpeechT5 model was proposed in [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei. diff --git a/docs/source/en/model_doc/splinter.md b/docs/source/en/model_doc/splinter.md index a46c55966c0e..6eb41f1928c0 100644 --- a/docs/source/en/model_doc/splinter.md +++ b/docs/source/en/model_doc/splinter.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Splinter +PyTorch + ## Overview The Splinter model was proposed in [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter diff --git a/docs/source/en/model_doc/squeezebert.md b/docs/source/en/model_doc/squeezebert.md index e2bb378fe5bb..be911724b039 100644 --- a/docs/source/en/model_doc/squeezebert.md +++ b/docs/source/en/model_doc/squeezebert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SqueezeBERT +PyTorch + ## Overview The SqueezeBERT model was proposed in [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index 09c0e5855c3a..a6ae8fb2da5f 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# StableLM +PyTorch + ## Overview `StableLM 3B 4E1T` was proposed in [`StableLM 3B 4E1T`: Technical Report](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Stability AI and is the first model in a series of multi-epoch pre-trained language models. diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 1d107b385556..5cdc8a9ba71c 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Starcoder2 +PyTorch + ## Overview StarCoder2 is a family of open LLMs for code and comes in 3 different sizes with 3B, 7B and 15B parameters. The flagship StarCoder2-15B model is trained on over 4 trillion tokens and 600+ programming languages from The Stack v2. All models use Grouped Query Attention, a context window of 16,384 tokens with a sliding window attention of 4,096 tokens, and were trained using the Fill-in-the-Middle objective. The models have been released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. diff --git a/docs/source/en/model_doc/superglue.md b/docs/source/en/model_doc/superglue.md index 08a4575dddc2..df700a93cc2d 100644 --- a/docs/source/en/model_doc/superglue.md +++ b/docs/source/en/model_doc/superglue.md @@ -15,6 +15,8 @@ rendered properly in your Markdown viewer. # SuperGlue +PyTorch + ## Overview The SuperGlue model was proposed in [SuperGlue: Learning Feature Matching with Graph Neural Networks](https://arxiv.org/abs/1911.11763) by Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index 59e451adceb8..c1d48daa27a4 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -15,6 +15,8 @@ rendered properly in your Markdown viewer. # SuperPoint +PyTorch + ## Overview The SuperPoint model was proposed diff --git a/docs/source/en/model_doc/swin2sr.md b/docs/source/en/model_doc/swin2sr.md index 18d6635feffc..dfccd94f31eb 100644 --- a/docs/source/en/model_doc/swin2sr.md +++ b/docs/source/en/model_doc/swin2sr.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# Swin2SR +PyTorch + ## Overview The Swin2SR model was proposed in [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte. diff --git a/docs/source/en/model_doc/swinv2.md b/docs/source/en/model_doc/swinv2.md index 25233dca3395..6075db8b2c39 100644 --- a/docs/source/en/model_doc/swinv2.md +++ b/docs/source/en/model_doc/swinv2.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Swin Transformer V2 +PyTorch + ## Overview The Swin Transformer V2 model was proposed in [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. diff --git a/docs/source/en/model_doc/switch_transformers.md b/docs/source/en/model_doc/switch_transformers.md index ca6748167f5e..911cbe987c37 100644 --- a/docs/source/en/model_doc/switch_transformers.md +++ b/docs/source/en/model_doc/switch_transformers.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # SwitchTransformers +PyTorch + ## Overview The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer. diff --git a/docs/source/en/model_doc/table-transformer.md b/docs/source/en/model_doc/table-transformer.md index 850e7f50aa61..b4b5b577b4fc 100644 --- a/docs/source/en/model_doc/table-transformer.md +++ b/docs/source/en/model_doc/table-transformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Table Transformer +PyTorch + ## Overview The Table Transformer model was proposed in [PubTables-1M: Towards comprehensive table extraction from unstructured documents](https://arxiv.org/abs/2110.00061) by diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md index d6b431e648f2..7c9373a34fee 100644 --- a/docs/source/en/model_doc/textnet.md +++ b/docs/source/en/model_doc/textnet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # TextNet +PyTorch + ## Overview The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. TextNet is a vision backbone useful for text detection tasks. It is the result of neural architecture search (NAS) on backbones with reward function as text detection task (to provide powerful features for text detection). diff --git a/docs/source/en/model_doc/time_series_transformer.md b/docs/source/en/model_doc/time_series_transformer.md index c5bfcfc15ea2..e54b57df74e4 100644 --- a/docs/source/en/model_doc/time_series_transformer.md +++ b/docs/source/en/model_doc/time_series_transformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Time Series Transformer +PyTorch + ## Overview The Time Series Transformer model is a vanilla encoder-decoder Transformer for time series forecasting. diff --git a/docs/source/en/model_doc/timesformer.md b/docs/source/en/model_doc/timesformer.md index fe75bee5b289..bd9d03bf1867 100644 --- a/docs/source/en/model_doc/timesformer.md +++ b/docs/source/en/model_doc/timesformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# TimeSformer +PyTorch + ## Overview The TimeSformer model was proposed in [TimeSformer: Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Facebook Research. diff --git a/docs/source/en/model_doc/timm_wrapper.md b/docs/source/en/model_doc/timm_wrapper.md index 467f2addf963..578f01eb5ead 100644 --- a/docs/source/en/model_doc/timm_wrapper.md +++ b/docs/source/en/model_doc/timm_wrapper.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # TimmWrapper +PyTorch + ## Overview Helper class to enable loading timm models to be used with the transformers library and its autoclasses. diff --git a/docs/source/en/model_doc/trajectory_transformer.md b/docs/source/en/model_doc/trajectory_transformer.md index 45616255871a..164f3810886c 100644 --- a/docs/source/en/model_doc/trajectory_transformer.md +++ b/docs/source/en/model_doc/trajectory_transformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Trajectory Transformer +PyTorch + This model is in maintenance mode only, so we won't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/trocr.md b/docs/source/en/model_doc/trocr.md index c471a13bbd23..80df93c02a5b 100644 --- a/docs/source/en/model_doc/trocr.md +++ b/docs/source/en/model_doc/trocr.md @@ -15,6 +15,8 @@ specific language governing permissions and limitations under the License. --> # TrOCR +PyTorch + ## Overview The TrOCR model was proposed in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md index 0a0f50e47315..54242f67b8fe 100644 --- a/docs/source/en/model_doc/tvlt.md +++ b/docs/source/en/model_doc/tvlt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # TVLT +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/tvp.md b/docs/source/en/model_doc/tvp.md index 22b400a06c73..7753abf1aaa6 100644 --- a/docs/source/en/model_doc/tvp.md +++ b/docs/source/en/model_doc/tvp.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # TVP +PyTorch + ## Overview The text-visual prompting (TVP) framework was proposed in the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding. diff --git a/docs/source/en/model_doc/udop.md b/docs/source/en/model_doc/udop.md index 614bd2ff4fd7..fc1edb1d8508 100644 --- a/docs/source/en/model_doc/udop.md +++ b/docs/source/en/model_doc/udop.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # UDOP +PyTorch + ## Overview The UDOP model was proposed in [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal. diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md index b9f86a0304e8..f753c9624fad 100644 --- a/docs/source/en/model_doc/umt5.md +++ b/docs/source/en/model_doc/umt5.md @@ -16,14 +16,7 @@ rendered properly in your Markdown viewer. # UMT5 -
-["Models" and "Spaces" badge markup]
+PyTorch ## Overview diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md index 3f0bbcc79323..f29eacfcc14b 100644 --- a/docs/source/en/model_doc/unispeech-sat.md +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # UniSpeech-SAT +PyTorch + ## Overview The UniSpeech-SAT model was proposed in [UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md index 2b2b13bed52c..02688cf44793 100644 --- a/docs/source/en/model_doc/unispeech.md +++ b/docs/source/en/model_doc/unispeech.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # UniSpeech +PyTorch + ## Overview The UniSpeech model was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael diff --git a/docs/source/en/model_doc/univnet.md b/docs/source/en/model_doc/univnet.md index 45bd94732773..aefd2f157578 100644 --- a/docs/source/en/model_doc/univnet.md +++ b/docs/source/en/model_doc/univnet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # UnivNet +PyTorch + ## Overview The UnivNet model was proposed in [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kin, and Juntae Kim. diff --git a/docs/source/en/model_doc/upernet.md b/docs/source/en/model_doc/upernet.md index 418c3ef1786b..ea08ccf109a6 100644 --- a/docs/source/en/model_doc/upernet.md +++ b/docs/source/en/model_doc/upernet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # UPerNet +PyTorch + ## Overview The UPerNet model was proposed in [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md index 2fb8475ce72f..3bdf4dedbb87 100644 --- a/docs/source/en/model_doc/van.md +++ b/docs/source/en/model_doc/van.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # VAN +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index a3ba1258ecfa..3985f76e3ab3 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Video-LLaVA +PyTorch + ## Overview Video-LLaVa is an open-source multimodal LLM trained by fine-tuning LlamA/Vicuna on multimodal instruction-following data generated by Llava1.5 and VideChat. It is an auto-regressive language model, based on the transformer architecture. Video-LLaVa unifies visual representations to the language feature space, and enables an LLM to perform visual reasoning capabilities on both images and videos simultaneously. diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md index a78561118570..9035be3be948 100644 --- a/docs/source/en/model_doc/videomae.md +++ b/docs/source/en/model_doc/videomae.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. 
# VideoMAE +PyTorch + ## Overview The VideoMAE model was proposed in [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. diff --git a/docs/source/en/model_doc/vilt.md b/docs/source/en/model_doc/vilt.md index 2b0ac022da4b..1a6573a1bc0a 100644 --- a/docs/source/en/model_doc/vilt.md +++ b/docs/source/en/model_doc/vilt.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ViLT +PyTorch + ## Overview The ViLT model was proposed in [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index cb625e371161..69a1c5f5cab1 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # VipLlava +PyTorch + ## Overview The VipLlava model was proposed in [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee. diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md index 95e5ae4e84a2..704ff1733b36 100644 --- a/docs/source/en/model_doc/visual_bert.md +++ b/docs/source/en/model_doc/visual_bert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # VisualBERT +PyTorch + ## Overview The VisualBERT model was proposed in [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md index 5cde5e529807..0471988398ac 100644 --- a/docs/source/en/model_doc/vit_hybrid.md +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Hybrid Vision Transformer (ViT Hybrid) +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md index e1210ce7f9dd..bc5a61e9846d 100644 --- a/docs/source/en/model_doc/vit_msn.md +++ b/docs/source/en/model_doc/vit_msn.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ViTMSN +PyTorch + ## Overview The ViTMSN model was proposed in [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md index 81bf787d6cda..20b12ae4efaf 100644 --- a/docs/source/en/model_doc/vitdet.md +++ b/docs/source/en/model_doc/vitdet.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # ViTDet +PyTorch + ## Overview The ViTDet model was proposed in [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He. diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md index 5a6d501030fc..97dd3c1747cd 100644 --- a/docs/source/en/model_doc/vitmatte.md +++ b/docs/source/en/model_doc/vitmatte.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. 
# ViTMatte +PyTorch + ## Overview The ViTMatte model was proposed in [Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang. diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 4fbead04ea80..30e2018b0950 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # ViTPose +PyTorch + ## Overview The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](vit) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. The model was further improved in [ViTPose++: Vision Transformer for Generic Body Pose Estimation](https://arxiv.org/abs/2212.04246) where the authors employ diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md index 42997cae1e74..9d5baab837f6 100644 --- a/docs/source/en/model_doc/vits.md +++ b/docs/source/en/model_doc/vits.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # VITS +PyTorch + ## Overview The VITS model was proposed in [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son. diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md index c3e3df14ab98..5150dc922020 100644 --- a/docs/source/en/model_doc/vivit.md +++ b/docs/source/en/model_doc/vivit.md @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Video Vision Transformer (ViViT) +PyTorch + ## Overview The Vivit model was proposed in [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid. diff --git a/docs/source/en/model_doc/wav2vec2-bert.md b/docs/source/en/model_doc/wav2vec2-bert.md index 6514133330a9..1aec6f1bb2d2 100644 --- a/docs/source/en/model_doc/wav2vec2-bert.md +++ b/docs/source/en/model_doc/wav2vec2-bert.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Wav2Vec2-BERT +PyTorch + ## Overview The Wav2Vec2-BERT model was proposed in [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team from Meta AI. diff --git a/docs/source/en/model_doc/wav2vec2-conformer.md b/docs/source/en/model_doc/wav2vec2-conformer.md index 0b30cf5fa431..e5450c41f911 100644 --- a/docs/source/en/model_doc/wav2vec2-conformer.md +++ b/docs/source/en/model_doc/wav2vec2-conformer.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # Wav2Vec2-Conformer +PyTorch + ## Overview The Wav2Vec2-Conformer was added to an updated version of [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino. 
diff --git a/docs/source/en/model_doc/wavlm.md b/docs/source/en/model_doc/wavlm.md index a42fbff13958..363e734c9b6c 100644 --- a/docs/source/en/model_doc/wavlm.md +++ b/docs/source/en/model_doc/wavlm.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # WavLM +PyTorch + ## Overview The WavLM model was proposed in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, diff --git a/docs/source/en/model_doc/xclip.md b/docs/source/en/model_doc/xclip.md index 8c22747387c0..2f5d7a392376 100644 --- a/docs/source/en/model_doc/xclip.md +++ b/docs/source/en/model_doc/xclip.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # X-CLIP +PyTorch + ## Overview The X-CLIP model was proposed in [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling. diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md index b350cb554b03..0af6259924bf 100644 --- a/docs/source/en/model_doc/xlm-prophetnet.md +++ b/docs/source/en/model_doc/xlm-prophetnet.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # XLM-ProphetNet +PyTorch + This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md index f9cb78c0bf4e..eef6825faaba 100644 --- a/docs/source/en/model_doc/xlm-roberta-xl.md +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # XLM-RoBERTa-XL +PyTorch + ## Overview The XLM-RoBERTa-XL model was proposed in [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau. diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md index ebe249517fdf..994ec1384bbb 100644 --- a/docs/source/en/model_doc/yolos.md +++ b/docs/source/en/model_doc/yolos.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # YOLOS +PyTorch + ## Overview The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu. diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md index a3dfa3fed855..d3289dfa20e3 100644 --- a/docs/source/en/model_doc/yoso.md +++ b/docs/source/en/model_doc/yoso.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # YOSO +PyTorch + ## Overview The YOSO model was proposed in [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) diff --git a/docs/source/en/model_doc/zamba.md b/docs/source/en/model_doc/zamba.md index 450b68c77d6d..ad5648001339 100644 --- a/docs/source/en/model_doc/zamba.md +++ b/docs/source/en/model_doc/zamba.md @@ -15,6 +15,8 @@ rendered properly in your Markdown viewer. --> # Zamba +PyTorch + Zamba is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. 
This model was contributed by [pglo](https://huggingface.co/pglo). diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md index c3e67291039c..88e543b14ba2 100644 --- a/docs/source/en/model_doc/zamba2.md +++ b/docs/source/en/model_doc/zamba2.md @@ -15,6 +15,8 @@ rendered properly in your Markdown viewer. --> # Zamba2 +PyTorch + Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. This model was contributed by [pglo](https://huggingface.co/pglo). diff --git a/docs/source/en/model_doc/zoedepth.md b/docs/source/en/model_doc/zoedepth.md index ecd068511e96..573d93119e32 100644 --- a/docs/source/en/model_doc/zoedepth.md +++ b/docs/source/en/model_doc/zoedepth.md @@ -16,6 +16,8 @@ rendered properly in your Markdown viewer. # ZoeDepth +PyTorch + ## Overview The ZoeDepth model was proposed in [ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth](https://arxiv.org/abs/2302.12288) by Shariq Farooq Bhat, Reiner Birkl, Diana Wofk, Peter Wonka, Matthias Müller. ZoeDepth extends the [DPT](dpt) framework for metric (also called absolute) depth estimation. ZoeDepth is pre-trained on 12 datasets using relative depth and fine-tuned on two domains (NYU and KITTI) using metric depth. A lightweight head is used with a novel bin adjustment design called metric bins module for each domain. During inference, each input image is automatically routed to the appropriate head using a latent classifier. From 711e2112e1d3863e2ea81df151bf2106af9e1212 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 28 Jan 2025 10:02:02 -0800 Subject: [PATCH 105/116] supported model frameworks --- docs/source/en/model_doc/align.md | 2 ++ docs/source/en/model_doc/altclip.md | 2 ++ docs/source/en/model_doc/aria.md | 2 ++ .../source/en/model_doc/audio-spectrogram-transformer.md | 2 ++ docs/source/en/model_doc/autoformer.md | 2 ++ docs/source/en/model_doc/bamba.md | 2 ++ docs/source/en/model_doc/bark.md | 2 ++ docs/source/en/model_doc/beit.md | 6 ++++++ docs/source/en/model_doc/bert-generation.md | 2 ++ docs/source/en/model_doc/big_bird.md | 6 ++++++ docs/source/en/model_doc/bigbird_pegasus.md | 2 ++ docs/source/en/model_doc/biogpt.md | 2 ++ docs/source/en/model_doc/bit.md | 2 ++ docs/source/en/model_doc/blip-2.md | 2 ++ docs/source/en/model_doc/blip.md | 5 +++++ docs/source/en/model_doc/bloom.md | 6 ++++++ docs/source/en/model_doc/bridgetower.md | 2 ++ docs/source/en/model_doc/bros.md | 2 ++ docs/source/en/model_doc/camembert.md | 5 +++++ docs/source/en/model_doc/canine.md | 2 ++ docs/source/en/model_doc/chameleon.md | 2 ++ docs/source/en/model_doc/chinese_clip.md | 2 ++ docs/source/en/model_doc/clap.md | 2 ++ docs/source/en/model_doc/clipseg.md | 2 ++ docs/source/en/model_doc/clvp.md | 2 ++ docs/source/en/model_doc/code_llama.md | 6 ++++++ docs/source/en/model_doc/codegen.md | 2 ++ docs/source/en/model_doc/cohere.md | 2 ++ docs/source/en/model_doc/cohere2.md | 2 ++ docs/source/en/model_doc/colpali.md | 2 ++ docs/source/en/model_doc/conditional_detr.md | 2 ++ docs/source/en/model_doc/convbert.md | 8 ++------ docs/source/en/model_doc/convnext.md | 5 +++++ docs/source/en/model_doc/convnextv2.md | 5 +++++ docs/source/en/model_doc/cpmant.md | 2 ++ docs/source/en/model_doc/ctrl.md | 8 ++------ docs/source/en/model_doc/cvt.md | 5 +++++ docs/source/en/model_doc/dac.md | 2 ++ docs/source/en/model_doc/data2vec.md | 2 ++ 
docs/source/en/model_doc/dbrx.md | 2 ++ docs/source/en/model_doc/deberta-v2.md | 5 +++++ docs/source/en/model_doc/deberta.md | 5 +++++ docs/source/en/model_doc/decision_transformer.md | 2 ++ docs/source/en/model_doc/deformable_detr.md | 2 ++ docs/source/en/model_doc/deit.md | 5 +++++ docs/source/en/model_doc/deplot.md | 2 ++ docs/source/en/model_doc/depth_anything.md | 2 ++ docs/source/en/model_doc/deta.md | 2 ++ docs/source/en/model_doc/detr.md | 2 ++ docs/source/en/model_doc/diffllama.md | 2 ++ docs/source/en/model_doc/dinat.md | 2 ++ docs/source/en/model_doc/dinov2.md | 6 ++++++ docs/source/en/model_doc/dinov2_with_registers.md | 2 ++ docs/source/en/model_doc/dit.md | 6 ++++++ docs/source/en/model_doc/dpr.md | 8 ++------ docs/source/en/model_doc/dpt.md | 2 ++ docs/source/en/model_doc/efficientformer.md | 5 +++++ docs/source/en/model_doc/efficientnet.md | 2 ++ docs/source/en/model_doc/emu3.md | 2 ++ docs/source/en/model_doc/encodec.md | 2 ++ docs/source/en/model_doc/ernie.md | 2 ++ docs/source/en/model_doc/ernie_m.md | 2 ++ docs/source/en/model_doc/esm.md | 5 +++++ docs/source/en/model_doc/falcon.md | 2 ++ docs/source/en/model_doc/falcon3.md | 6 ++++++ docs/source/en/model_doc/falcon_mamba.md | 2 ++ docs/source/en/model_doc/fastspeech2_conformer.md | 2 ++ docs/source/en/model_doc/flaubert.md | 8 ++------ docs/source/en/model_doc/flava.md | 2 ++ docs/source/en/model_doc/fnet.md | 2 ++ docs/source/en/model_doc/focalnet.md | 2 ++ docs/source/en/model_doc/funnel.md | 9 ++------- docs/source/en/model_doc/fuyu.md | 2 ++ docs/source/en/model_doc/gemma.md | 6 ++++++ docs/source/en/model_doc/gemma2.md | 2 ++ docs/source/en/model_doc/git.md | 2 ++ docs/source/en/model_doc/glm.md | 2 ++ docs/source/en/model_doc/glpn.md | 2 ++ docs/source/en/model_doc/gpt_bigcode.md | 2 ++ docs/source/en/model_doc/gpt_neo.md | 6 ++++++ docs/source/en/model_doc/gpt_neox.md | 2 ++ docs/source/en/model_doc/gpt_neox_japanese.md | 2 ++ docs/source/en/model_doc/gptsan-japanese.md | 2 ++ docs/source/en/model_doc/granite.md | 2 ++ docs/source/en/model_doc/granitemoe.md | 2 ++ docs/source/en/model_doc/graphormer.md | 2 ++ docs/source/en/model_doc/grounding-dino.md | 2 ++ docs/source/en/model_doc/groupvit.md | 5 +++++ docs/source/en/model_doc/helium.md | 2 ++ docs/source/en/model_doc/hiera.md | 2 ++ docs/source/en/model_doc/hubert.md | 5 +++++ docs/source/en/model_doc/ibert.md | 2 ++ docs/source/en/model_doc/idefics.md | 5 +++++ docs/source/en/model_doc/idefics2.md | 2 ++ docs/source/en/model_doc/idefics3.md | 2 ++ docs/source/en/model_doc/ijepa.md | 2 ++ docs/source/en/model_doc/imagegpt.md | 2 ++ docs/source/en/model_doc/informer.md | 2 ++ docs/source/en/model_doc/instructblip.md | 2 ++ docs/source/en/model_doc/instructblipvideo.md | 2 ++ docs/source/en/model_doc/jamba.md | 2 ++ docs/source/en/model_doc/jetmoe.md | 2 ++ docs/source/en/model_doc/jukebox.md | 2 ++ docs/source/en/model_doc/kosmos-2.md | 2 ++ docs/source/en/model_doc/layoutlm.md | 5 +++++ docs/source/en/model_doc/layoutlmv2.md | 2 ++ docs/source/en/model_doc/layoutxlm.md | 2 ++ docs/source/en/model_doc/led.md | 5 +++++ docs/source/en/model_doc/levit.md | 2 ++ docs/source/en/model_doc/lilt.md | 2 ++ docs/source/en/model_doc/llama.md | 6 ++++++ docs/source/en/model_doc/llama2.md | 6 ++++++ docs/source/en/model_doc/llama3.md | 6 ++++++ docs/source/en/model_doc/llava.md | 2 ++ docs/source/en/model_doc/llava_next.md | 2 ++ docs/source/en/model_doc/llava_next_video.md | 2 ++ docs/source/en/model_doc/llava_onevision.md | 2 ++ docs/source/en/model_doc/longformer.md | 8 
++------ docs/source/en/model_doc/longt5.md | 6 ++++++ docs/source/en/model_doc/luke.md | 2 ++ docs/source/en/model_doc/lxmert.md | 5 +++++ docs/source/en/model_doc/m2m_100.md | 2 ++ docs/source/en/model_doc/mamba.md | 2 ++ docs/source/en/model_doc/mamba2.md | 2 ++ docs/source/en/model_doc/markuplm.md | 2 ++ docs/source/en/model_doc/mask2former.md | 2 ++ docs/source/en/model_doc/maskformer.md | 2 ++ docs/source/en/model_doc/matcha.md | 2 ++ docs/source/en/model_doc/mctct.md | 2 ++ docs/source/en/model_doc/mega.md | 2 ++ docs/source/en/model_doc/megatron-bert.md | 2 ++ docs/source/en/model_doc/mgp-str.md | 2 ++ docs/source/en/model_doc/mimi.md | 2 ++ docs/source/en/model_doc/mixtral.md | 2 ++ docs/source/en/model_doc/mllama.md | 2 ++ docs/source/en/model_doc/mluke.md | 2 ++ docs/source/en/model_doc/mobilebert.md | 5 +++++ docs/source/en/model_doc/mobilenet_v1.md | 2 ++ docs/source/en/model_doc/mobilenet_v2.md | 2 ++ docs/source/en/model_doc/mobilevit.md | 5 +++++ docs/source/en/model_doc/mobilevitv2.md | 2 ++ docs/source/en/model_doc/modernbert.md | 2 ++ docs/source/en/model_doc/moonshine.md | 2 ++ docs/source/en/model_doc/moshi.md | 2 ++ docs/source/en/model_doc/mpnet.md | 5 +++++ docs/source/en/model_doc/mpt.md | 2 ++ docs/source/en/model_doc/mra.md | 2 ++ docs/source/en/model_doc/musicgen.md | 2 ++ docs/source/en/model_doc/musicgen_melody.md | 2 ++ docs/source/en/model_doc/mvp.md | 2 ++ docs/source/en/model_doc/nat.md | 2 ++ docs/source/en/model_doc/nemotron.md | 2 ++ docs/source/en/model_doc/nezha.md | 2 ++ docs/source/en/model_doc/nllb-moe.md | 2 ++ docs/source/en/model_doc/nllb.md | 2 ++ docs/source/en/model_doc/nystromformer.md | 2 ++ docs/source/en/model_doc/olmo.md | 2 ++ docs/source/en/model_doc/olmo2.md | 2 ++ docs/source/en/model_doc/olmoe.md | 2 ++ docs/source/en/model_doc/omdet-turbo.md | 2 ++ docs/source/en/model_doc/oneformer.md | 2 ++ docs/source/en/model_doc/open-llama.md | 2 ++ docs/source/en/model_doc/owlv2.md | 2 ++ docs/source/en/model_doc/owlvit.md | 2 ++ docs/source/en/model_doc/paligemma.md | 2 ++ docs/source/en/model_doc/patchtsmixer.md | 2 ++ docs/source/en/model_doc/patchtst.md | 2 ++ docs/source/en/model_doc/pegasus_x.md | 2 ++ docs/source/en/model_doc/perceiver.md | 2 ++ docs/source/en/model_doc/persimmon.md | 2 ++ docs/source/en/model_doc/phi.md | 2 ++ docs/source/en/model_doc/phi3.md | 2 ++ docs/source/en/model_doc/phimoe.md | 2 ++ docs/source/en/model_doc/pix2struct.md | 2 ++ docs/source/en/model_doc/pixtral.md | 2 ++ docs/source/en/model_doc/plbart.md | 2 ++ docs/source/en/model_doc/poolformer.md | 2 ++ docs/source/en/model_doc/pop2piano.md | 2 ++ docs/source/en/model_doc/prophetnet.md | 2 ++ docs/source/en/model_doc/pvt.md | 2 ++ docs/source/en/model_doc/pvt_v2.md | 2 ++ docs/source/en/model_doc/qdqbert.md | 2 ++ docs/source/en/model_doc/qwen2.md | 2 ++ docs/source/en/model_doc/qwen2_5_vl.md | 2 ++ docs/source/en/model_doc/qwen2_audio.md | 2 ++ docs/source/en/model_doc/qwen2_moe.md | 2 ++ docs/source/en/model_doc/qwen2_vl.md | 2 ++ docs/source/en/model_doc/rag.md | 5 ++--- docs/source/en/model_doc/realm.md | 2 ++ docs/source/en/model_doc/recurrent_gemma.md | 2 ++ docs/source/en/model_doc/reformer.md | 2 ++ docs/source/en/model_doc/rembert.md | 5 +++++ docs/source/en/model_doc/retribert.md | 2 ++ docs/source/en/model_doc/roc_bert.md | 2 ++ docs/source/en/model_doc/rt_detr.md | 2 ++ docs/source/en/model_doc/rwkv.md | 2 ++ docs/source/en/model_doc/sam.md | 5 +++++ docs/source/en/model_doc/seamless_m4t.md | 2 ++ docs/source/en/model_doc/seamless_m4t_v2.md | 2 
++ docs/source/en/model_doc/segformer.md | 5 +++++ docs/source/en/model_doc/seggpt.md | 2 ++ docs/source/en/model_doc/sew-d.md | 2 ++ docs/source/en/model_doc/sew.md | 2 ++ docs/source/en/model_doc/siglip.md | 2 ++ docs/source/en/model_doc/speech-encoder-decoder.md | 6 ++++++ docs/source/en/model_doc/speech_to_text.md | 5 +++++ docs/source/en/model_doc/speecht5.md | 2 ++ docs/source/en/model_doc/splinter.md | 2 ++ docs/source/en/model_doc/squeezebert.md | 2 ++ docs/source/en/model_doc/stablelm.md | 2 ++ docs/source/en/model_doc/starcoder2.md | 2 ++ docs/source/en/model_doc/superglue.md | 2 ++ docs/source/en/model_doc/superpoint.md | 2 ++ docs/source/en/model_doc/swiftformer.md | 5 +++++ docs/source/en/model_doc/swin.md | 5 +++++ docs/source/en/model_doc/swin2sr.md | 2 ++ docs/source/en/model_doc/swinv2.md | 2 ++ docs/source/en/model_doc/switch_transformers.md | 2 ++ docs/source/en/model_doc/table-transformer.md | 2 ++ docs/source/en/model_doc/tapas.md | 5 +++++ docs/source/en/model_doc/textnet.md | 2 ++ docs/source/en/model_doc/time_series_transformer.md | 2 ++ docs/source/en/model_doc/timesformer.md | 2 ++ docs/source/en/model_doc/timm_wrapper.md | 2 ++ docs/source/en/model_doc/trajectory_transformer.md | 2 ++ docs/source/en/model_doc/transfo-xl.md | 5 +++++ docs/source/en/model_doc/trocr.md | 2 ++ docs/source/en/model_doc/tvlt.md | 2 ++ docs/source/en/model_doc/tvp.md | 2 ++ docs/source/en/model_doc/udop.md | 2 ++ docs/source/en/model_doc/umt5.md | 2 ++ docs/source/en/model_doc/unispeech-sat.md | 2 ++ docs/source/en/model_doc/unispeech.md | 2 ++ docs/source/en/model_doc/univnet.md | 2 ++ docs/source/en/model_doc/upernet.md | 2 ++ docs/source/en/model_doc/van.md | 2 ++ docs/source/en/model_doc/video_llava.md | 2 ++ docs/source/en/model_doc/videomae.md | 2 ++ docs/source/en/model_doc/vilt.md | 2 ++ docs/source/en/model_doc/vipllava.md | 2 ++ docs/source/en/model_doc/visual_bert.md | 2 ++ docs/source/en/model_doc/vit_hybrid.md | 2 ++ docs/source/en/model_doc/vit_mae.md | 5 +++++ docs/source/en/model_doc/vit_msn.md | 2 ++ docs/source/en/model_doc/vitdet.md | 2 ++ docs/source/en/model_doc/vitmatte.md | 2 ++ docs/source/en/model_doc/vitpose.md | 2 ++ docs/source/en/model_doc/vits.md | 2 ++ docs/source/en/model_doc/vivit.md | 2 ++ docs/source/en/model_doc/wav2vec2-bert.md | 2 ++ docs/source/en/model_doc/wav2vec2-conformer.md | 2 ++ docs/source/en/model_doc/wavlm.md | 2 ++ docs/source/en/model_doc/xclip.md | 2 ++ docs/source/en/model_doc/xlm-prophetnet.md | 2 ++ docs/source/en/model_doc/xlm-roberta-xl.md | 2 ++ docs/source/en/model_doc/xlm.md | 8 ++------ docs/source/en/model_doc/xlnet.md | 8 ++------ docs/source/en/model_doc/xmod.md | 4 ++++ docs/source/en/model_doc/yolos.md | 2 ++ docs/source/en/model_doc/yoso.md | 2 ++ docs/source/en/model_doc/zamba.md | 2 ++ docs/source/en/model_doc/zamba2.md | 2 ++ docs/source/en/model_doc/zoedepth.md | 2 ++ 263 files changed, 668 insertions(+), 52 deletions(-) diff --git a/docs/source/en/model_doc/align.md b/docs/source/en/model_doc/align.md index c657ba15d59f..b2920bdc2bac 100644 --- a/docs/source/en/model_doc/align.md +++ b/docs/source/en/model_doc/align.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ALIGN +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/altclip.md b/docs/source/en/model_doc/altclip.md index 5bab1564ed27..0dfbf797a033 100644 --- a/docs/source/en/model_doc/altclip.md +++ b/docs/source/en/model_doc/altclip.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # AltCLIP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md index dd592ded8d66..b73a72947898 100644 --- a/docs/source/en/model_doc/aria.md +++ b/docs/source/en/model_doc/aria.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Aria +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.md b/docs/source/en/model_doc/audio-spectrogram-transformer.md index 2a016dc25fae..4203761958d4 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.md +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Audio Spectrogram Transformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/autoformer.md b/docs/source/en/model_doc/autoformer.md index f706e851aeff..2c5e27153e03 100644 --- a/docs/source/en/model_doc/autoformer.md +++ b/docs/source/en/model_doc/autoformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Autoformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/bamba.md b/docs/source/en/model_doc/bamba.md index d6e1273cac11..5c6092aa1d58 100644 --- a/docs/source/en/model_doc/bamba.md +++ b/docs/source/en/model_doc/bamba.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Bamba +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/bark.md b/docs/source/en/model_doc/bark.md index 0009f3e66d86..feba11707f48 100644 --- a/docs/source/en/model_doc/bark.md +++ b/docs/source/en/model_doc/bark.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # Bark +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/beit.md b/docs/source/en/model_doc/beit.md index 25b0eafb26a0..1faea35dcea5 100644 --- a/docs/source/en/model_doc/beit.md +++ b/docs/source/en/model_doc/beit.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # BEiT +
+PyTorch +Flax +
+ ## Overview The BEiT model was proposed in [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by diff --git a/docs/source/en/model_doc/bert-generation.md b/docs/source/en/model_doc/bert-generation.md index 8e2efcef6dd5..0c42adbeb564 100644 --- a/docs/source/en/model_doc/bert-generation.md +++ b/docs/source/en/model_doc/bert-generation.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # BertGeneration +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/big_bird.md b/docs/source/en/model_doc/big_bird.md index 3d1ef91d5606..32ca5a2062a2 100644 --- a/docs/source/en/model_doc/big_bird.md +++ b/docs/source/en/model_doc/big_bird.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # BigBird +
+PyTorch +Flax +
+ ## Overview The BigBird model was proposed in [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by diff --git a/docs/source/en/model_doc/bigbird_pegasus.md b/docs/source/en/model_doc/bigbird_pegasus.md index 9bf91e16d5fd..499d40b3149b 100644 --- a/docs/source/en/model_doc/bigbird_pegasus.md +++ b/docs/source/en/model_doc/bigbird_pegasus.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # BigBirdPegasus +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md index 0acecf1d8c9f..19dbaa56023a 100644 --- a/docs/source/en/model_doc/biogpt.md +++ b/docs/source/en/model_doc/biogpt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # BioGPT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/bit.md b/docs/source/en/model_doc/bit.md index 291a92a83e0f..550c07662dd7 100644 --- a/docs/source/en/model_doc/bit.md +++ b/docs/source/en/model_doc/bit.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Big Transfer (BiT) +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index b8517c4d1524..94331d9a5f6e 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # BLIP-2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/blip.md b/docs/source/en/model_doc/blip.md index 0545400b8355..1acf172f26b8 100644 --- a/docs/source/en/model_doc/blip.md +++ b/docs/source/en/model_doc/blip.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # BLIP +
+PyTorch +TensorFlow +
+ ## Overview The BLIP model was proposed in [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi. diff --git a/docs/source/en/model_doc/bloom.md b/docs/source/en/model_doc/bloom.md index a1d39d13ad00..9de987059574 100644 --- a/docs/source/en/model_doc/bloom.md +++ b/docs/source/en/model_doc/bloom.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # BLOOM +
+PyTorch +Flax +
+ ## Overview The BLOOM model has been proposed with its various versions through the [BigScience Workshop](https://bigscience.huggingface.co/). BigScience is inspired by other open science initiatives where researchers have pooled their time and resources to collectively achieve a higher impact. diff --git a/docs/source/en/model_doc/bridgetower.md b/docs/source/en/model_doc/bridgetower.md index aae9bdc4c626..2aee4cdebe5d 100644 --- a/docs/source/en/model_doc/bridgetower.md +++ b/docs/source/en/model_doc/bridgetower.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # BridgeTower +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/bros.md b/docs/source/en/model_doc/bros.md index ac8056b0d738..baa658e598fb 100644 --- a/docs/source/en/model_doc/bros.md +++ b/docs/source/en/model_doc/bros.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # BROS +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/camembert.md b/docs/source/en/model_doc/camembert.md index fd872282d588..288cbc49794f 100644 --- a/docs/source/en/model_doc/camembert.md +++ b/docs/source/en/model_doc/camembert.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # CamemBERT +
+PyTorch +TensorFlow +
+ ## Overview The CamemBERT model was proposed in [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by diff --git a/docs/source/en/model_doc/canine.md b/docs/source/en/model_doc/canine.md index d6b448ceb09a..cd1cce34c79c 100644 --- a/docs/source/en/model_doc/canine.md +++ b/docs/source/en/model_doc/canine.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # CANINE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 8cf8dbf7b793..0e3b12b53b61 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Chameleon +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md index b49889ad5cc7..c73fee0422f0 100644 --- a/docs/source/en/model_doc/chinese_clip.md +++ b/docs/source/en/model_doc/chinese_clip.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Chinese-CLIP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/clap.md b/docs/source/en/model_doc/clap.md index e4eeaa220ed3..e060662c01a9 100644 --- a/docs/source/en/model_doc/clap.md +++ b/docs/source/en/model_doc/clap.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # CLAP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/clipseg.md b/docs/source/en/model_doc/clipseg.md index 577849289e3f..f594dbc3e0f3 100644 --- a/docs/source/en/model_doc/clipseg.md +++ b/docs/source/en/model_doc/clipseg.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # CLIPSeg +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/clvp.md b/docs/source/en/model_doc/clvp.md index ae193b19f35f..cfa4f97b8286 100644 --- a/docs/source/en/model_doc/clvp.md +++ b/docs/source/en/model_doc/clvp.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # CLVP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md index 6eb687a728a0..ff3e66769c9c 100644 --- a/docs/source/en/model_doc/code_llama.md +++ b/docs/source/en/model_doc/code_llama.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # CodeLlama +
+PyTorch +Flax +
+ ## Overview The Code Llama model was proposed in [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md index 1c93e6ab5b2c..465c8e5445b8 100644 --- a/docs/source/en/model_doc/codegen.md +++ b/docs/source/en/model_doc/codegen.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # CodeGen +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index 5049d90a7c61..760cfd00a00f 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -1,6 +1,8 @@ # Cohere +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md index 19107e75835e..6b6d11ecd204 100644 --- a/docs/source/en/model_doc/cohere2.md +++ b/docs/source/en/model_doc/cohere2.md @@ -1,6 +1,8 @@ # Cohere +
PyTorch +
## Overview [C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages. diff --git a/docs/source/en/model_doc/colpali.md b/docs/source/en/model_doc/colpali.md index baba60984315..07c4b45f140a 100644 --- a/docs/source/en/model_doc/colpali.md +++ b/docs/source/en/model_doc/colpali.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ColPali +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/conditional_detr.md b/docs/source/en/model_doc/conditional_detr.md index 13cf4685142d..6a03d14d969c 100644 --- a/docs/source/en/model_doc/conditional_detr.md +++ b/docs/source/en/model_doc/conditional_detr.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Conditional DETR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/convbert.md b/docs/source/en/model_doc/convbert.md index 17b5d7920c6c..e52bbd5c4772 100644 --- a/docs/source/en/model_doc/convbert.md +++ b/docs/source/en/model_doc/convbert.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # ConvBERT
- -Models - - -Spaces - +PyTorch +TensorFlow
## Overview diff --git a/docs/source/en/model_doc/convnext.md b/docs/source/en/model_doc/convnext.md index f3d10d77b1d2..576e95ee043d 100644 --- a/docs/source/en/model_doc/convnext.md +++ b/docs/source/en/model_doc/convnext.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # ConvNeXT +
+PyTorch +TensorFlow +
+ ## Overview The ConvNeXT model was proposed in [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. diff --git a/docs/source/en/model_doc/convnextv2.md b/docs/source/en/model_doc/convnextv2.md index 8cd142c2765f..87a261b8dede 100644 --- a/docs/source/en/model_doc/convnextv2.md +++ b/docs/source/en/model_doc/convnextv2.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # ConvNeXt V2 +
+PyTorch +TensorFlow +
+ ## Overview The ConvNeXt V2 model was proposed in [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie. diff --git a/docs/source/en/model_doc/cpmant.md b/docs/source/en/model_doc/cpmant.md index 51f2f9e4d794..f8e2b3b515ec 100644 --- a/docs/source/en/model_doc/cpmant.md +++ b/docs/source/en/model_doc/cpmant.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # CPMAnt +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/ctrl.md b/docs/source/en/model_doc/ctrl.md index be9fa85c7073..0253d4e007e0 100644 --- a/docs/source/en/model_doc/ctrl.md +++ b/docs/source/en/model_doc/ctrl.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # CTRL
- -Models - - -Spaces - +PyTorch +TensorFlow
## Overview diff --git a/docs/source/en/model_doc/cvt.md b/docs/source/en/model_doc/cvt.md index 503f97795c0e..fec632ed84d1 100644 --- a/docs/source/en/model_doc/cvt.md +++ b/docs/source/en/model_doc/cvt.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # Convolutional Vision Transformer (CvT) +
+PyTorch +TensorFlow +
+ ## Overview The CvT model was proposed in [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan and Lei Zhang. The Convolutional vision Transformer (CvT) improves the [Vision Transformer (ViT)](vit) in performance and efficiency by introducing convolutions into ViT to yield the best of both designs. diff --git a/docs/source/en/model_doc/dac.md b/docs/source/en/model_doc/dac.md index ab78a31e8d1b..3ee4d92b58e0 100644 --- a/docs/source/en/model_doc/dac.md +++ b/docs/source/en/model_doc/dac.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # DAC +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index f9ac15e80d70..31efc35e5acf 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Data2Vec +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index edf5239b292d..c2b084dab512 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # DBRX +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/deberta-v2.md b/docs/source/en/model_doc/deberta-v2.md index e3bd91e8e4fa..2e48a3e9a7fc 100644 --- a/docs/source/en/model_doc/deberta-v2.md +++ b/docs/source/en/model_doc/deberta-v2.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # DeBERTa-v2 +
+PyTorch +TensorFlow +
+ ## Overview The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's diff --git a/docs/source/en/model_doc/deberta.md b/docs/source/en/model_doc/deberta.md index 342a3bc47960..39afe83f5fe3 100644 --- a/docs/source/en/model_doc/deberta.md +++ b/docs/source/en/model_doc/deberta.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # DeBERTa +
+PyTorch +TensorFlow +
+ ## Overview The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's diff --git a/docs/source/en/model_doc/decision_transformer.md b/docs/source/en/model_doc/decision_transformer.md index 3b5672f7bb89..fb932ce3ec7a 100644 --- a/docs/source/en/model_doc/decision_transformer.md +++ b/docs/source/en/model_doc/decision_transformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Decision Transformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index c79cea426d6b..5b83f23cf5b3 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Deformable DETR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/deit.md b/docs/source/en/model_doc/deit.md index a24632d5f867..058362b1abea 100644 --- a/docs/source/en/model_doc/deit.md +++ b/docs/source/en/model_doc/deit.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # DeiT +
+PyTorch +TensorFlow +
+ ## Overview The DeiT model was proposed in [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre diff --git a/docs/source/en/model_doc/deplot.md b/docs/source/en/model_doc/deplot.md index be1cd3eec9d9..d3c0de7b7f84 100644 --- a/docs/source/en/model_doc/deplot.md +++ b/docs/source/en/model_doc/deplot.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # DePlot +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/depth_anything.md b/docs/source/en/model_doc/depth_anything.md index 4504a8122269..07bed7088037 100644 --- a/docs/source/en/model_doc/depth_anything.md +++ b/docs/source/en/model_doc/depth_anything.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Depth Anything +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/deta.md b/docs/source/en/model_doc/deta.md index 0ae9c0b50293..e3859341a71a 100644 --- a/docs/source/en/model_doc/deta.md +++ b/docs/source/en/model_doc/deta.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # DETA +
PyTorch +
diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md index 28cafd48691f..4614d549a180 100644 --- a/docs/source/en/model_doc/detr.md +++ b/docs/source/en/model_doc/detr.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # DETR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md index 4cd8485c826a..5b1b22ee872a 100644 --- a/docs/source/en/model_doc/diffllama.md +++ b/docs/source/en/model_doc/diffllama.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # DiffLlama +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/dinat.md b/docs/source/en/model_doc/dinat.md index eb636bef3692..cd1d67073be6 100644 --- a/docs/source/en/model_doc/dinat.md +++ b/docs/source/en/model_doc/dinat.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Dilated Neighborhood Attention Transformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/dinov2.md b/docs/source/en/model_doc/dinov2.md index 19674907f0c2..b78113e87b37 100644 --- a/docs/source/en/model_doc/dinov2.md +++ b/docs/source/en/model_doc/dinov2.md @@ -12,6 +12,12 @@ specific language governing permissions and limitations under the License. # DINOv2 +
+PyTorch +Flax +
+ ## Overview The DINOv2 model was proposed in [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by diff --git a/docs/source/en/model_doc/dinov2_with_registers.md b/docs/source/en/model_doc/dinov2_with_registers.md index 63f65dabaed2..ade263d91409 100644 --- a/docs/source/en/model_doc/dinov2_with_registers.md +++ b/docs/source/en/model_doc/dinov2_with_registers.md @@ -9,7 +9,9 @@ specific language governing permissions and limitations under the License. # DINOv2 with Registers +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/dit.md b/docs/source/en/model_doc/dit.md index 7f6691a15bc4..8848948375e8 100644 --- a/docs/source/en/model_doc/dit.md +++ b/docs/source/en/model_doc/dit.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # DiT +
+PyTorch +Flax +
+ ## Overview DiT was proposed in [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei. diff --git a/docs/source/en/model_doc/dpr.md b/docs/source/en/model_doc/dpr.md index 8b9f352b637b..def36f38c741 100644 --- a/docs/source/en/model_doc/dpr.md +++ b/docs/source/en/model_doc/dpr.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # DPR
- -Models - - -Spaces - +PyTorch +TensorFlow
## Overview diff --git a/docs/source/en/model_doc/dpt.md b/docs/source/en/model_doc/dpt.md index d461724c8447..7010d03cdc68 100644 --- a/docs/source/en/model_doc/dpt.md +++ b/docs/source/en/model_doc/dpt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # DPT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/efficientformer.md b/docs/source/en/model_doc/efficientformer.md index 24b20793b03c..f05ccacc3dbf 100644 --- a/docs/source/en/model_doc/efficientformer.md +++ b/docs/source/en/model_doc/efficientformer.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # EfficientFormer +
+PyTorch +TensorFlow +
+ This model is in maintenance mode only, we don't accept any new PRs changing its code. diff --git a/docs/source/en/model_doc/efficientnet.md b/docs/source/en/model_doc/efficientnet.md index 03aa15663b59..a34378fa4709 100644 --- a/docs/source/en/model_doc/efficientnet.md +++ b/docs/source/en/model_doc/efficientnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # EfficientNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md index f8ca88e8a323..ad9f0719ed54 100644 --- a/docs/source/en/model_doc/emu3.md +++ b/docs/source/en/model_doc/emu3.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Emu3 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/encodec.md b/docs/source/en/model_doc/encodec.md index a92d85bd6d2e..893954d5cf86 100644 --- a/docs/source/en/model_doc/encodec.md +++ b/docs/source/en/model_doc/encodec.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # EnCodec +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/ernie.md b/docs/source/en/model_doc/ernie.md index 715e3b49b943..82f2a0d5ba81 100644 --- a/docs/source/en/model_doc/ernie.md +++ b/docs/source/en/model_doc/ernie.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ERNIE +
PyTorch +
## Overview ERNIE is a series of powerful models proposed by baidu, especially in Chinese tasks, diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md index 8e85f7d14789..3ce3b40c4463 100644 --- a/docs/source/en/model_doc/ernie_m.md +++ b/docs/source/en/model_doc/ernie_m.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ErnieM +
PyTorch +
diff --git a/docs/source/en/model_doc/esm.md b/docs/source/en/model_doc/esm.md index 46bab860ff4d..6061d8eea987 100644 --- a/docs/source/en/model_doc/esm.md +++ b/docs/source/en/model_doc/esm.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # ESM +
+PyTorch +TensorFlow +
+ ## Overview This page provides code and pre-trained weights for Transformer protein language models from Meta AI's Fundamental diff --git a/docs/source/en/model_doc/falcon.md b/docs/source/en/model_doc/falcon.md index 33d638ccc409..4fe9cd81b9f3 100644 --- a/docs/source/en/model_doc/falcon.md +++ b/docs/source/en/model_doc/falcon.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Falcon +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/falcon3.md b/docs/source/en/model_doc/falcon3.md index 813533dd7f4d..276548be77ad 100644 --- a/docs/source/en/model_doc/falcon3.md +++ b/docs/source/en/model_doc/falcon3.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # Falcon3 +
+PyTorch +Flax +
+ ## Overview Falcon3 represents a natural evolution from previous releases, emphasizing expanding the models' science, math, and code capabilities. This iteration includes five base models: Falcon3-1B-Base, Falcon3-3B-Base, Falcon3-Mamba-7B-Base, Falcon3-7B-Base, and Falcon3-10B-Base. In developing these models, we incorporated several key innovations aimed at improving the models' performances while reducing training costs: diff --git a/docs/source/en/model_doc/falcon_mamba.md b/docs/source/en/model_doc/falcon_mamba.md index b945f0c72fbb..fb6debfef921 100644 --- a/docs/source/en/model_doc/falcon_mamba.md +++ b/docs/source/en/model_doc/falcon_mamba.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # FalconMamba +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/fastspeech2_conformer.md b/docs/source/en/model_doc/fastspeech2_conformer.md index 35ca8ab73168..aeb055ceae40 100644 --- a/docs/source/en/model_doc/fastspeech2_conformer.md +++ b/docs/source/en/model_doc/fastspeech2_conformer.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # FastSpeech2Conformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/flaubert.md b/docs/source/en/model_doc/flaubert.md index 04bcc2638ac9..59ab44ebff03 100644 --- a/docs/source/en/model_doc/flaubert.md +++ b/docs/source/en/model_doc/flaubert.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # FlauBERT
- -Models - - -Spaces - +PyTorch +TensorFlow
## Overview diff --git a/docs/source/en/model_doc/flava.md b/docs/source/en/model_doc/flava.md index 51a177fc96c8..b32f93fc8bcb 100644 --- a/docs/source/en/model_doc/flava.md +++ b/docs/source/en/model_doc/flava.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # FLAVA +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/fnet.md b/docs/source/en/model_doc/fnet.md index 2b3884832583..fcf75e21caed 100644 --- a/docs/source/en/model_doc/fnet.md +++ b/docs/source/en/model_doc/fnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # FNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/focalnet.md b/docs/source/en/model_doc/focalnet.md index 5875155ef089..5312cae4ff67 100644 --- a/docs/source/en/model_doc/focalnet.md +++ b/docs/source/en/model_doc/focalnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # FocalNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/funnel.md b/docs/source/en/model_doc/funnel.md index d6929691f400..96050a153df2 100644 --- a/docs/source/en/model_doc/funnel.md +++ b/docs/source/en/model_doc/funnel.md @@ -17,15 +17,10 @@ rendered properly in your Markdown viewer. # Funnel Transformer
- -Models - - -Spaces - +PyTorch +TensorFlow
- ## Overview The Funnel Transformer model was proposed in the paper [Funnel-Transformer: Filtering out Sequential Redundancy for diff --git a/docs/source/en/model_doc/fuyu.md b/docs/source/en/model_doc/fuyu.md index 78684c8b02d8..c0ea89ad19fb 100644 --- a/docs/source/en/model_doc/fuyu.md +++ b/docs/source/en/model_doc/fuyu.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Fuyu +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md index abd077af8da1..7d39e6e3c4a2 100644 --- a/docs/source/en/model_doc/gemma.md +++ b/docs/source/en/model_doc/gemma.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # Gemma +
+PyTorch +Flax +
+ ## Overview The Gemma model was proposed in [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by Gemma Team, Google. diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index 12dd8c96bd39..d80690c6e395 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -17,7 +17,9 @@ rendered properly in your Markdown viewer. # Gemma2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/git.md b/docs/source/en/model_doc/git.md index c24b7f433d5e..825b73c5c59b 100644 --- a/docs/source/en/model_doc/git.md +++ b/docs/source/en/model_doc/git.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GIT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index 61e517bb4e06..bc592346c0c3 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GLM +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/glpn.md b/docs/source/en/model_doc/glpn.md index c3fbc1d9647b..95ecc36bf5b7 100644 --- a/docs/source/en/model_doc/glpn.md +++ b/docs/source/en/model_doc/glpn.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GLPN +
PyTorch +
diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md index 8123e1fde087..2ebbaa512511 100644 --- a/docs/source/en/model_doc/gpt_bigcode.md +++ b/docs/source/en/model_doc/gpt_bigcode.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GPTBigCode +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/gpt_neo.md b/docs/source/en/model_doc/gpt_neo.md index 3c7858c99820..de1f80c08268 100644 --- a/docs/source/en/model_doc/gpt_neo.md +++ b/docs/source/en/model_doc/gpt_neo.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # GPT Neo +
+PyTorch +Flax +
+ ## Overview The GPTNeo model was released in the [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) repository by Sid diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md index 155b7449f8e0..41c8eee47340 100644 --- a/docs/source/en/model_doc/gpt_neox.md +++ b/docs/source/en/model_doc/gpt_neox.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GPT-NeoX +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/gpt_neox_japanese.md b/docs/source/en/model_doc/gpt_neox_japanese.md index e1600d9b1115..9a5f7335564d 100644 --- a/docs/source/en/model_doc/gpt_neox_japanese.md +++ b/docs/source/en/model_doc/gpt_neox_japanese.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GPT-NeoX-Japanese +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/gptsan-japanese.md b/docs/source/en/model_doc/gptsan-japanese.md index 83040362f82f..929e7330ceea 100644 --- a/docs/source/en/model_doc/gptsan-japanese.md +++ b/docs/source/en/model_doc/gptsan-japanese.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GPTSAN-japanese +
PyTorch +
diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index 57fc530282cc..875177278a57 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Granite +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/granitemoe.md b/docs/source/en/model_doc/granitemoe.md index bc1a6bb33ee9..d9e5fa7f61cb 100644 --- a/docs/source/en/model_doc/granitemoe.md +++ b/docs/source/en/model_doc/granitemoe.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # GraniteMoe +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/graphormer.md b/docs/source/en/model_doc/graphormer.md index 6a81a0f84e77..0d88134d4b7e 100644 --- a/docs/source/en/model_doc/graphormer.md +++ b/docs/source/en/model_doc/graphormer.md @@ -14,7 +14,9 @@ rendered properly in your Markdown viewer. # Graphormer +
PyTorch +
diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index deb8b20029f1..75f8a2fa32f7 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Grounding DINO +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/groupvit.md b/docs/source/en/model_doc/groupvit.md index 8728cf0da21b..c77a51d8b1b7 100644 --- a/docs/source/en/model_doc/groupvit.md +++ b/docs/source/en/model_doc/groupvit.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # GroupViT +
+PyTorch +TensorFlow +
+ ## Overview The GroupViT model was proposed in [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md index cf83f5a0352c..d0f676c8470a 100644 --- a/docs/source/en/model_doc/helium.md +++ b/docs/source/en/model_doc/helium.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Helium +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md index e255185cb6bb..a82eec950a51 100644 --- a/docs/source/en/model_doc/hiera.md +++ b/docs/source/en/model_doc/hiera.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Hiera +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/hubert.md b/docs/source/en/model_doc/hubert.md index 93e40d4f4ee8..9447e8785f02 100644 --- a/docs/source/en/model_doc/hubert.md +++ b/docs/source/en/model_doc/hubert.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # Hubert +
+PyTorch +TensorFlow +
+ ## Overview Hubert was proposed in [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan diff --git a/docs/source/en/model_doc/ibert.md b/docs/source/en/model_doc/ibert.md index c887ba3e0a5c..8c43eeddaf55 100644 --- a/docs/source/en/model_doc/ibert.md +++ b/docs/source/en/model_doc/ibert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # I-BERT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/idefics.md b/docs/source/en/model_doc/idefics.md index ab66bd555a71..35d1c2a56afc 100644 --- a/docs/source/en/model_doc/idefics.md +++ b/docs/source/en/model_doc/idefics.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # IDEFICS +
+PyTorch +TensorFlow +
+ ## Overview The IDEFICS model was proposed in [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 1baec145fa33..815de863b741 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Idefics2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index f8c0cf2dbaf7..1f4ae33a8f61 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Idefics3 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index a72b164ddcef..ecb90c67cb3b 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # I-JEPA +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/imagegpt.md b/docs/source/en/model_doc/imagegpt.md index 51e6bd1c6722..7fbec62d30bb 100644 --- a/docs/source/en/model_doc/imagegpt.md +++ b/docs/source/en/model_doc/imagegpt.md @@ -15,7 +15,9 @@ specific language governing permissions and limitations under the License. --> # ImageGPT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/informer.md b/docs/source/en/model_doc/informer.md index fc6e5e8cb828..1dfc397db777 100644 --- a/docs/source/en/model_doc/informer.md +++ b/docs/source/en/model_doc/informer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Informer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index d2bf881fa148..4f2feb015f1f 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # InstructBLIP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index 05baf90fe639..c26562a85308 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # InstructBlipVideo +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md index 4f4f283459ff..6edb9e0c7c32 100644 --- a/docs/source/en/model_doc/jamba.md +++ b/docs/source/en/model_doc/jamba.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Jamba +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/jetmoe.md b/docs/source/en/model_doc/jetmoe.md index 53ef534add02..67387614b47d 100644 --- a/docs/source/en/model_doc/jetmoe.md +++ b/docs/source/en/model_doc/jetmoe.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # JetMoe +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/jukebox.md b/docs/source/en/model_doc/jukebox.md index 3971390b1cdf..144134d9b070 100644 --- a/docs/source/en/model_doc/jukebox.md +++ b/docs/source/en/model_doc/jukebox.md @@ -15,7 +15,9 @@ rendered properly in your Markdown viewer. --> # Jukebox +
PyTorch +
diff --git a/docs/source/en/model_doc/kosmos-2.md b/docs/source/en/model_doc/kosmos-2.md index 5f419545de27..88a3b6bd99e1 100644 --- a/docs/source/en/model_doc/kosmos-2.md +++ b/docs/source/en/model_doc/kosmos-2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # KOSMOS-2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/layoutlm.md b/docs/source/en/model_doc/layoutlm.md index 34b429fb7376..51cc52b7f452 100644 --- a/docs/source/en/model_doc/layoutlm.md +++ b/docs/source/en/model_doc/layoutlm.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # LayoutLM +
+PyTorch +TensorFlow +
+ ## Overview diff --git a/docs/source/en/model_doc/layoutlmv2.md b/docs/source/en/model_doc/layoutlmv2.md index 1579a5bbba4d..7fc5ae36197b 100644 --- a/docs/source/en/model_doc/layoutlmv2.md +++ b/docs/source/en/model_doc/layoutlmv2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LayoutLMV2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/layoutxlm.md b/docs/source/en/model_doc/layoutxlm.md index ae1842c2b15d..96e0a4d4bf51 100644 --- a/docs/source/en/model_doc/layoutxlm.md +++ b/docs/source/en/model_doc/layoutxlm.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LayoutXLM +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/led.md b/docs/source/en/model_doc/led.md index 9a39b0b28ede..729d5666d8a3 100644 --- a/docs/source/en/model_doc/led.md +++ b/docs/source/en/model_doc/led.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # LED +
+PyTorch +TensorFlow +
+ ## Overview The LED model was proposed in [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz diff --git a/docs/source/en/model_doc/levit.md b/docs/source/en/model_doc/levit.md index 13c6d19fafd4..af42c1533e53 100644 --- a/docs/source/en/model_doc/levit.md +++ b/docs/source/en/model_doc/levit.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LeViT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/lilt.md b/docs/source/en/model_doc/lilt.md index c55f72f68289..2474d854e030 100644 --- a/docs/source/en/model_doc/lilt.md +++ b/docs/source/en/model_doc/lilt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LiLT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index 2f0eb63da00a..27927b591185 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # LLaMA +
+PyTorch +Flax +
+ ## Overview The LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. It is a collection of foundation language models ranging from 7B to 65B parameters. diff --git a/docs/source/en/model_doc/llama2.md b/docs/source/en/model_doc/llama2.md index b4cd6b9ca110..4e5f572c4b20 100644 --- a/docs/source/en/model_doc/llama2.md +++ b/docs/source/en/model_doc/llama2.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # Llama2 +
+PyTorch +Flax +
+ ## Overview The Llama2 model was proposed in [LLaMA: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. It is a collection of foundation language models ranging from 7B to 70B parameters, with checkpoints finetuned for chat application! diff --git a/docs/source/en/model_doc/llama3.md b/docs/source/en/model_doc/llama3.md index 9c77db44fcf3..0bb5e8160c90 100644 --- a/docs/source/en/model_doc/llama3.md +++ b/docs/source/en/model_doc/llama3.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # Llama3 +
+PyTorch +Flax +
+ ```py3 import transformers import torch diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index 3fd5b583f947..bdfd07944218 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LLaVa +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index aa7e8fb39ebe..51a386815ef0 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LLaVA-NeXT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index 497abf2f5f6d..52181f992c51 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LLaVa-NeXT-Video +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index 512d666b063f..b1e487cc299e 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LLaVA-OneVision +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/longformer.md b/docs/source/en/model_doc/longformer.md index 20ba7a922515..d173a7eb32ec 100644 --- a/docs/source/en/model_doc/longformer.md +++ b/docs/source/en/model_doc/longformer.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # Longformer
- -Models - - -Spaces - +PyTorch +TensorFlow
## Overview diff --git a/docs/source/en/model_doc/longt5.md b/docs/source/en/model_doc/longt5.md index 40faa6d8c237..85a869f3c594 100644 --- a/docs/source/en/model_doc/longt5.md +++ b/docs/source/en/model_doc/longt5.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # LongT5 +
+PyTorch +Flax +
+ ## Overview The LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) diff --git a/docs/source/en/model_doc/luke.md b/docs/source/en/model_doc/luke.md index 5630b1785b91..be4d5946dfcf 100644 --- a/docs/source/en/model_doc/luke.md +++ b/docs/source/en/model_doc/luke.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # LUKE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/lxmert.md b/docs/source/en/model_doc/lxmert.md index 435994196b43..a0f686efc35d 100644 --- a/docs/source/en/model_doc/lxmert.md +++ b/docs/source/en/model_doc/lxmert.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # LXMERT +
+PyTorch +TensorFlow +
+ ## Overview The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan & Mohit Bansal. It is a series of bidirectional transformer encoders diff --git a/docs/source/en/model_doc/m2m_100.md b/docs/source/en/model_doc/m2m_100.md index 8b2da0940d35..77f1d22c7f54 100644 --- a/docs/source/en/model_doc/m2m_100.md +++ b/docs/source/en/model_doc/m2m_100.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # M2M100 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mamba.md b/docs/source/en/model_doc/mamba.md index 8bb51af54328..d5c0612b1ebe 100644 --- a/docs/source/en/model_doc/mamba.md +++ b/docs/source/en/model_doc/mamba.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Mamba +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mamba2.md b/docs/source/en/model_doc/mamba2.md index 5421458186f5..8d88d6c02652 100644 --- a/docs/source/en/model_doc/mamba2.md +++ b/docs/source/en/model_doc/mamba2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Mamba 2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/markuplm.md b/docs/source/en/model_doc/markuplm.md index b1291fd1a23f..72948da2c5af 100644 --- a/docs/source/en/model_doc/markuplm.md +++ b/docs/source/en/model_doc/markuplm.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MarkupLM +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mask2former.md b/docs/source/en/model_doc/mask2former.md index cdffde742f85..37a2603c6880 100644 --- a/docs/source/en/model_doc/mask2former.md +++ b/docs/source/en/model_doc/mask2former.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Mask2Former +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/maskformer.md b/docs/source/en/model_doc/maskformer.md index 1d67bafbdc77..0adbbf2285f9 100644 --- a/docs/source/en/model_doc/maskformer.md +++ b/docs/source/en/model_doc/maskformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MaskFormer +
PyTorch +
diff --git a/docs/source/en/model_doc/matcha.md b/docs/source/en/model_doc/matcha.md index a33ef62e5c45..f3c618953b9b 100644 --- a/docs/source/en/model_doc/matcha.md +++ b/docs/source/en/model_doc/matcha.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MatCha +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mctct.md b/docs/source/en/model_doc/mctct.md index 628aa872b05e..a755f5a027d2 100644 --- a/docs/source/en/model_doc/mctct.md +++ b/docs/source/en/model_doc/mctct.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # M-CTC-T +
PyTorch +
diff --git a/docs/source/en/model_doc/mega.md b/docs/source/en/model_doc/mega.md index 9845a27ec986..4e8ccd4b29f3 100644 --- a/docs/source/en/model_doc/mega.md +++ b/docs/source/en/model_doc/mega.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MEGA +
PyTorch +
diff --git a/docs/source/en/model_doc/megatron-bert.md b/docs/source/en/model_doc/megatron-bert.md index 23e86a39d2ae..b032655f7547 100644 --- a/docs/source/en/model_doc/megatron-bert.md +++ b/docs/source/en/model_doc/megatron-bert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MegatronBERT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mgp-str.md b/docs/source/en/model_doc/mgp-str.md index cd89d5a6f42b..168e5bd1043d 100644 --- a/docs/source/en/model_doc/mgp-str.md +++ b/docs/source/en/model_doc/mgp-str.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MGP-STR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md index 88a9fcf0256f..d8852672685f 100644 --- a/docs/source/en/model_doc/mimi.md +++ b/docs/source/en/model_doc/mimi.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Mimi +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 42ffcef39d0f..74b49cf81dd7 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Mixtral +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index ae7bdb64540e..77f5e211f170 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Mllama +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mluke.md b/docs/source/en/model_doc/mluke.md index 6ec7b2090fbc..aae607def6f1 100644 --- a/docs/source/en/model_doc/mluke.md +++ b/docs/source/en/model_doc/mluke.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # mLUKE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mobilebert.md b/docs/source/en/model_doc/mobilebert.md index 5c9a230d0d5c..11a2b21b6130 100644 --- a/docs/source/en/model_doc/mobilebert.md +++ b/docs/source/en/model_doc/mobilebert.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # MobileBERT +
+PyTorch +TensorFlow +
+ ## Overview The MobileBERT model was proposed in [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny diff --git a/docs/source/en/model_doc/mobilenet_v1.md b/docs/source/en/model_doc/mobilenet_v1.md index ea880f0e53d4..7d94777d6f83 100644 --- a/docs/source/en/model_doc/mobilenet_v1.md +++ b/docs/source/en/model_doc/mobilenet_v1.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MobileNet V1 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mobilenet_v2.md b/docs/source/en/model_doc/mobilenet_v2.md index 39bd0e0b7ebf..b78a8eb72f63 100644 --- a/docs/source/en/model_doc/mobilenet_v2.md +++ b/docs/source/en/model_doc/mobilenet_v2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MobileNet V2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mobilevit.md b/docs/source/en/model_doc/mobilevit.md index e724ffa380e2..c9054b59cbc9 100644 --- a/docs/source/en/model_doc/mobilevit.md +++ b/docs/source/en/model_doc/mobilevit.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # MobileViT +
+PyTorch +TensorFlow +
+ ## Overview The MobileViT model was proposed in [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari. MobileViT introduces a new layer that replaces local processing in convolutions with global processing using transformers. diff --git a/docs/source/en/model_doc/mobilevitv2.md b/docs/source/en/model_doc/mobilevitv2.md index 041e703acb29..b6549666850a 100644 --- a/docs/source/en/model_doc/mobilevitv2.md +++ b/docs/source/en/model_doc/mobilevitv2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MobileViTV2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md index 533b2fe8d460..7432d27d7fc0 100644 --- a/docs/source/en/model_doc/modernbert.md +++ b/docs/source/en/model_doc/modernbert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ModernBERT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md index 68720a0a60b4..e019c89ce765 100644 --- a/docs/source/en/model_doc/moonshine.md +++ b/docs/source/en/model_doc/moonshine.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Moonshine +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md index 5035a416144a..e9dfd9b84d82 100644 --- a/docs/source/en/model_doc/moshi.md +++ b/docs/source/en/model_doc/moshi.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Moshi +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mpnet.md b/docs/source/en/model_doc/mpnet.md index c571da47b004..cf84e2b41075 100644 --- a/docs/source/en/model_doc/mpnet.md +++ b/docs/source/en/model_doc/mpnet.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # MPNet +
+PyTorch +TensorFlow +
+ ## Overview The MPNet model was proposed in [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. diff --git a/docs/source/en/model_doc/mpt.md b/docs/source/en/model_doc/mpt.md index b43b6a0d4166..a4dbc5ea6a8d 100644 --- a/docs/source/en/model_doc/mpt.md +++ b/docs/source/en/model_doc/mpt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MPT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mra.md b/docs/source/en/model_doc/mra.md index 37bf13baacb8..a5490d5d379c 100644 --- a/docs/source/en/model_doc/mra.md +++ b/docs/source/en/model_doc/mra.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MRA +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/musicgen.md b/docs/source/en/model_doc/musicgen.md index e2617e68c8f5..063f4ac97b39 100644 --- a/docs/source/en/model_doc/musicgen.md +++ b/docs/source/en/model_doc/musicgen.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MusicGen +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index 04ebea60cc2d..af0f21559a1a 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MusicGen Melody +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/mvp.md b/docs/source/en/model_doc/mvp.md index d8d78476805d..d73297716792 100644 --- a/docs/source/en/model_doc/mvp.md +++ b/docs/source/en/model_doc/mvp.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # MVP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/nat.md b/docs/source/en/model_doc/nat.md index d00e6efcebab..c7725ed7a563 100644 --- a/docs/source/en/model_doc/nat.md +++ b/docs/source/en/model_doc/nat.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Neighborhood Attention Transformer +
PyTorch +
diff --git a/docs/source/en/model_doc/nemotron.md b/docs/source/en/model_doc/nemotron.md index 0de5ea28f49c..d8837568deee 100644 --- a/docs/source/en/model_doc/nemotron.md +++ b/docs/source/en/model_doc/nemotron.md @@ -14,7 +14,9 @@ specific language governing permissions and limitations under the License. # Nemotron +
PyTorch +
### License diff --git a/docs/source/en/model_doc/nezha.md b/docs/source/en/model_doc/nezha.md index 82dc8aecb8f4..dc815e0ecc48 100644 --- a/docs/source/en/model_doc/nezha.md +++ b/docs/source/en/model_doc/nezha.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Nezha +
PyTorch +
diff --git a/docs/source/en/model_doc/nllb-moe.md b/docs/source/en/model_doc/nllb-moe.md index 3e23ef5bebf1..65a4812ed6ab 100644 --- a/docs/source/en/model_doc/nllb-moe.md +++ b/docs/source/en/model_doc/nllb-moe.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # NLLB-MOE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index 4b08fb4fa06e..0e1bb34577fa 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # NLLB +
PyTorch +
## Updated tokenizer behavior diff --git a/docs/source/en/model_doc/nystromformer.md b/docs/source/en/model_doc/nystromformer.md index 26aa2d85f457..b4c017b35fff 100644 --- a/docs/source/en/model_doc/nystromformer.md +++ b/docs/source/en/model_doc/nystromformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Nyströmformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/olmo.md b/docs/source/en/model_doc/olmo.md index 3c6943e4ff27..9788a98c7721 100644 --- a/docs/source/en/model_doc/olmo.md +++ b/docs/source/en/model_doc/olmo.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OLMo +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/olmo2.md b/docs/source/en/model_doc/olmo2.md index b8a0780d055e..dc989341cf3a 100644 --- a/docs/source/en/model_doc/olmo2.md +++ b/docs/source/en/model_doc/olmo2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OLMo2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/olmoe.md b/docs/source/en/model_doc/olmoe.md index 0b3e71f446dc..71502aea3dd6 100644 --- a/docs/source/en/model_doc/olmoe.md +++ b/docs/source/en/model_doc/olmoe.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OLMoE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/omdet-turbo.md b/docs/source/en/model_doc/omdet-turbo.md index b0d4be3d9198..d73fef2d8b5d 100644 --- a/docs/source/en/model_doc/omdet-turbo.md +++ b/docs/source/en/model_doc/omdet-turbo.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OmDet-Turbo +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/oneformer.md b/docs/source/en/model_doc/oneformer.md index 3cd513499250..f1c1de791238 100644 --- a/docs/source/en/model_doc/oneformer.md +++ b/docs/source/en/model_doc/oneformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OneFormer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/open-llama.md b/docs/source/en/model_doc/open-llama.md index e4f76966dbfe..3b4856cd4fb6 100644 --- a/docs/source/en/model_doc/open-llama.md +++ b/docs/source/en/model_doc/open-llama.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Open-Llama +
PyTorch +
diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 9d2d22336859..f01a5c59063b 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OWLv2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index 945409eb3caa..5be8ffc8f58c 100644 --- a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # OWL-ViT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md index c6f97d98d91e..b6805be90531 100644 --- a/docs/source/en/model_doc/paligemma.md +++ b/docs/source/en/model_doc/paligemma.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PaliGemma +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/patchtsmixer.md b/docs/source/en/model_doc/patchtsmixer.md index 0f2934edaeea..dd678dd40101 100644 --- a/docs/source/en/model_doc/patchtsmixer.md +++ b/docs/source/en/model_doc/patchtsmixer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PatchTSMixer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md index 2e48e63bc2d8..c55ba3334299 100644 --- a/docs/source/en/model_doc/patchtst.md +++ b/docs/source/en/model_doc/patchtst.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PatchTST +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/pegasus_x.md b/docs/source/en/model_doc/pegasus_x.md index 719359723ce7..3f982263cdb1 100644 --- a/docs/source/en/model_doc/pegasus_x.md +++ b/docs/source/en/model_doc/pegasus_x.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PEGASUS-X +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/perceiver.md b/docs/source/en/model_doc/perceiver.md index 49a775462f65..700f49d42d93 100644 --- a/docs/source/en/model_doc/perceiver.md +++ b/docs/source/en/model_doc/perceiver.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Perceiver +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md index 714b4e804378..bf721f19a107 100644 --- a/docs/source/en/model_doc/persimmon.md +++ b/docs/source/en/model_doc/persimmon.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Persimmon +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md index 0d81e1d9773f..81873459e0a8 100644 --- a/docs/source/en/model_doc/phi.md +++ b/docs/source/en/model_doc/phi.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Phi +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index 46864adbccd1..93c3073f28fb 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Phi-3 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/phimoe.md b/docs/source/en/model_doc/phimoe.md index 63479190723d..1dc6e22d6a44 100644 --- a/docs/source/en/model_doc/phimoe.md +++ b/docs/source/en/model_doc/phimoe.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PhiMoE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md index e93bd2444848..e912cc96cdcc 100644 --- a/docs/source/en/model_doc/pix2struct.md +++ b/docs/source/en/model_doc/pix2struct.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Pix2Struct +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 4f7f40b90f94..02fe2081bba0 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Pixtral +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/plbart.md b/docs/source/en/model_doc/plbart.md index 7d3e1d770a82..bac567615d42 100644 --- a/docs/source/en/model_doc/plbart.md +++ b/docs/source/en/model_doc/plbart.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PLBart +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/poolformer.md b/docs/source/en/model_doc/poolformer.md index f46501289fde..bce183706a83 100644 --- a/docs/source/en/model_doc/poolformer.md +++ b/docs/source/en/model_doc/poolformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # PoolFormer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/pop2piano.md b/docs/source/en/model_doc/pop2piano.md index 81e52e87af72..a9554b4924a9 100644 --- a/docs/source/en/model_doc/pop2piano.md +++ b/docs/source/en/model_doc/pop2piano.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # Pop2Piano +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/prophetnet.md b/docs/source/en/model_doc/prophetnet.md index 8c1dcf793f8b..b768fef72a04 100644 --- a/docs/source/en/model_doc/prophetnet.md +++ b/docs/source/en/model_doc/prophetnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ProphetNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/pvt.md b/docs/source/en/model_doc/pvt.md index 5a3a29770020..d4c80445bf61 100644 --- a/docs/source/en/model_doc/pvt.md +++ b/docs/source/en/model_doc/pvt.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # Pyramid Vision Transformer (PVT) +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/pvt_v2.md b/docs/source/en/model_doc/pvt_v2.md index 588a3835e0c5..deac614d38bf 100644 --- a/docs/source/en/model_doc/pvt_v2.md +++ b/docs/source/en/model_doc/pvt_v2.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # Pyramid Vision Transformer V2 (PVTv2) +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md index 02f098b915eb..76555909c76d 100644 --- a/docs/source/en/model_doc/qdqbert.md +++ b/docs/source/en/model_doc/qdqbert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # QDQBERT +
PyTorch +
diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 1d39d4e11440..a774f885e3a9 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Qwen2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md index 3554976af10d..631da5d88224 100644 --- a/docs/source/en/model_doc/qwen2_5_vl.md +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Qwen2.5-VL +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index 24b855c9038a..da055d1015ab 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Qwen2Audio +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index 9c2168457a1d..c56ec74ab3aa 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Qwen2MoE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index d28a4c25e19e..516271bdaf59 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Qwen2-VL +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/rag.md b/docs/source/en/model_doc/rag.md index 1891efe74263..0f59592633f7 100644 --- a/docs/source/en/model_doc/rag.md +++ b/docs/source/en/model_doc/rag.md @@ -17,9 +17,8 @@ rendered properly in your Markdown viewer. # RAG
-
-Models
-
+PyTorch
+TensorFlow
## Overview diff --git a/docs/source/en/model_doc/realm.md b/docs/source/en/model_doc/realm.md index 5156011e87d2..b5b9102c2c64 100644 --- a/docs/source/en/model_doc/realm.md +++ b/docs/source/en/model_doc/realm.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # REALM +
PyTorch +
diff --git a/docs/source/en/model_doc/recurrent_gemma.md b/docs/source/en/model_doc/recurrent_gemma.md index 7886f63ed326..b543b35a75f0 100644 --- a/docs/source/en/model_doc/recurrent_gemma.md +++ b/docs/source/en/model_doc/recurrent_gemma.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # RecurrentGemma +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/reformer.md b/docs/source/en/model_doc/reformer.md index f99a806780c7..7e403599fdb0 100644 --- a/docs/source/en/model_doc/reformer.md +++ b/docs/source/en/model_doc/reformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Reformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/rembert.md b/docs/source/en/model_doc/rembert.md index b755d3423060..319e44cf0987 100644 --- a/docs/source/en/model_doc/rembert.md +++ b/docs/source/en/model_doc/rembert.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # RemBERT +
+PyTorch
+TensorFlow
+
+ ## Overview The RemBERT model was proposed in [Rethinking Embedding Coupling in Pre-trained Language Models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder. diff --git a/docs/source/en/model_doc/retribert.md b/docs/source/en/model_doc/retribert.md index 7d0c33d71722..795f81caaa72 100644 --- a/docs/source/en/model_doc/retribert.md +++ b/docs/source/en/model_doc/retribert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # RetriBERT +
PyTorch +
diff --git a/docs/source/en/model_doc/roc_bert.md b/docs/source/en/model_doc/roc_bert.md index e1321dd2c7f1..f3797663ff70 100644 --- a/docs/source/en/model_doc/roc_bert.md +++ b/docs/source/en/model_doc/roc_bert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # RoCBert +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 03e87ef4515a..c80e83e7b883 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # RT-DETR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/rwkv.md b/docs/source/en/model_doc/rwkv.md index ee367411fcf8..8b54c25204bb 100644 --- a/docs/source/en/model_doc/rwkv.md +++ b/docs/source/en/model_doc/rwkv.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # RWKV +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/sam.md b/docs/source/en/model_doc/sam.md index f45b08c2c235..cd9e3f5c3c45 100644 --- a/docs/source/en/model_doc/sam.md +++ b/docs/source/en/model_doc/sam.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # SAM +
+PyTorch
+TensorFlow
+
+ ## Overview SAM (Segment Anything Model) was proposed in [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. diff --git a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md index f18617e83d6f..100198e50170 100644 --- a/docs/source/en/model_doc/seamless_m4t.md +++ b/docs/source/en/model_doc/seamless_m4t.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # SeamlessM4T +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/seamless_m4t_v2.md b/docs/source/en/model_doc/seamless_m4t_v2.md index 3be24330ecf6..7b68d08b5f95 100644 --- a/docs/source/en/model_doc/seamless_m4t_v2.md +++ b/docs/source/en/model_doc/seamless_m4t_v2.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # SeamlessM4T-v2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/segformer.md b/docs/source/en/model_doc/segformer.md index 1dc38ef45b8e..093a141eaf83 100644 --- a/docs/source/en/model_doc/segformer.md +++ b/docs/source/en/model_doc/segformer.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # SegFormer +
+PyTorch
+TensorFlow
+
+ ## Overview The SegFormer model was proposed in [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping diff --git a/docs/source/en/model_doc/seggpt.md b/docs/source/en/model_doc/seggpt.md index 464b8a3de415..1eb82b84774c 100644 --- a/docs/source/en/model_doc/seggpt.md +++ b/docs/source/en/model_doc/seggpt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SegGPT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/sew-d.md b/docs/source/en/model_doc/sew-d.md index 87af1709be74..3626d953d97d 100644 --- a/docs/source/en/model_doc/sew-d.md +++ b/docs/source/en/model_doc/sew-d.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SEW-D +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md index 895457d75006..b0ba0deb6c2c 100644 --- a/docs/source/en/model_doc/sew.md +++ b/docs/source/en/model_doc/sew.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SEW +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index a0b418833c54..55126f1dc104 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SigLIP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/speech-encoder-decoder.md b/docs/source/en/model_doc/speech-encoder-decoder.md index 7e2bcef98abc..bc04cf3a2b3d 100644 --- a/docs/source/en/model_doc/speech-encoder-decoder.md +++ b/docs/source/en/model_doc/speech-encoder-decoder.md @@ -16,6 +16,12 @@ rendered properly in your Markdown viewer. # Speech Encoder Decoder Models +
+PyTorch
+Flax
+
+ The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model with any pretrained speech autoencoding model as the encoder (*e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert)) and any pretrained autoregressive model as the decoder. diff --git a/docs/source/en/model_doc/speech_to_text.md b/docs/source/en/model_doc/speech_to_text.md index 23512b323af6..8b375374ea54 100644 --- a/docs/source/en/model_doc/speech_to_text.md +++ b/docs/source/en/model_doc/speech_to_text.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # Speech2Text +
+PyTorch
+TensorFlow
+
+ ## Overview The Speech2Text model was proposed in [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a diff --git a/docs/source/en/model_doc/speecht5.md b/docs/source/en/model_doc/speecht5.md index bea4fe555d78..acbadb137f46 100644 --- a/docs/source/en/model_doc/speecht5.md +++ b/docs/source/en/model_doc/speecht5.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SpeechT5 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/splinter.md b/docs/source/en/model_doc/splinter.md index 6eb41f1928c0..0d526beff968 100644 --- a/docs/source/en/model_doc/splinter.md +++ b/docs/source/en/model_doc/splinter.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Splinter +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/squeezebert.md b/docs/source/en/model_doc/squeezebert.md index be911724b039..56046e22b799 100644 --- a/docs/source/en/model_doc/squeezebert.md +++ b/docs/source/en/model_doc/squeezebert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SqueezeBERT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index a6ae8fb2da5f..c05b76e82a8d 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # StableLM +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 5cdc8a9ba71c..305250b58e0e 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Starcoder2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/superglue.md b/docs/source/en/model_doc/superglue.md index df700a93cc2d..38ef55ab793f 100644 --- a/docs/source/en/model_doc/superglue.md +++ b/docs/source/en/model_doc/superglue.md @@ -15,7 +15,9 @@ rendered properly in your Markdown viewer. # SuperGlue +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index c1d48daa27a4..06ae5cb08127 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -15,7 +15,9 @@ rendered properly in your Markdown viewer. # SuperPoint +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/swiftformer.md b/docs/source/en/model_doc/swiftformer.md index 319c79fce4fb..48580a60f580 100644 --- a/docs/source/en/model_doc/swiftformer.md +++ b/docs/source/en/model_doc/swiftformer.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # SwiftFormer +
+PyTorch
+TensorFlow
+
+ ## Overview The SwiftFormer model was proposed in [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. diff --git a/docs/source/en/model_doc/swin.md b/docs/source/en/model_doc/swin.md index e23c882a3f09..4e2adf5ca820 100644 --- a/docs/source/en/model_doc/swin.md +++ b/docs/source/en/model_doc/swin.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # Swin Transformer +
+PyTorch
+TensorFlow
+
+ ## Overview The Swin Transformer was proposed in [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) diff --git a/docs/source/en/model_doc/swin2sr.md b/docs/source/en/model_doc/swin2sr.md index dfccd94f31eb..136f1a1c1e17 100644 --- a/docs/source/en/model_doc/swin2sr.md +++ b/docs/source/en/model_doc/swin2sr.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Swin2SR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/swinv2.md b/docs/source/en/model_doc/swinv2.md index 6075db8b2c39..a709af9712e3 100644 --- a/docs/source/en/model_doc/swinv2.md +++ b/docs/source/en/model_doc/swinv2.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Swin Transformer V2 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/switch_transformers.md b/docs/source/en/model_doc/switch_transformers.md index 911cbe987c37..433b84dd8622 100644 --- a/docs/source/en/model_doc/switch_transformers.md +++ b/docs/source/en/model_doc/switch_transformers.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # SwitchTransformers +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/table-transformer.md b/docs/source/en/model_doc/table-transformer.md index b4b5b577b4fc..fea4dabf3f38 100644 --- a/docs/source/en/model_doc/table-transformer.md +++ b/docs/source/en/model_doc/table-transformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Table Transformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/tapas.md b/docs/source/en/model_doc/tapas.md index 79bbe3e819cf..21eb697ee34d 100644 --- a/docs/source/en/model_doc/tapas.md +++ b/docs/source/en/model_doc/tapas.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # TAPAS +
+PyTorch
+TensorFlow
+
+ ## Overview The TAPAS model was proposed in [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://www.aclweb.org/anthology/2020.acl-main.398) diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md index 7c9373a34fee..72f29b4463ed 100644 --- a/docs/source/en/model_doc/textnet.md +++ b/docs/source/en/model_doc/textnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # TextNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/time_series_transformer.md b/docs/source/en/model_doc/time_series_transformer.md index e54b57df74e4..a91633b6b029 100644 --- a/docs/source/en/model_doc/time_series_transformer.md +++ b/docs/source/en/model_doc/time_series_transformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Time Series Transformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/timesformer.md b/docs/source/en/model_doc/timesformer.md index bd9d03bf1867..c01f64efa71c 100644 --- a/docs/source/en/model_doc/timesformer.md +++ b/docs/source/en/model_doc/timesformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # TimeSformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/timm_wrapper.md b/docs/source/en/model_doc/timm_wrapper.md index 578f01eb5ead..8095a91054a5 100644 --- a/docs/source/en/model_doc/timm_wrapper.md +++ b/docs/source/en/model_doc/timm_wrapper.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # TimmWrapper +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/trajectory_transformer.md b/docs/source/en/model_doc/trajectory_transformer.md index 164f3810886c..0c8fc29e01fa 100644 --- a/docs/source/en/model_doc/trajectory_transformer.md +++ b/docs/source/en/model_doc/trajectory_transformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Trajectory Transformer +
PyTorch +
diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md index c80d9352b5ae..4d4f68ab07c9 100644 --- a/docs/source/en/model_doc/transfo-xl.md +++ b/docs/source/en/model_doc/transfo-xl.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # Transformer XL +
+PyTorch
+TensorFlow
+
+ This model is in maintenance mode only, so we won't accept any new PRs changing its code. This model was deprecated due to security issues linked to `pickle.load`. diff --git a/docs/source/en/model_doc/trocr.md b/docs/source/en/model_doc/trocr.md index 80df93c02a5b..0d0fb6ca24ab 100644 --- a/docs/source/en/model_doc/trocr.md +++ b/docs/source/en/model_doc/trocr.md @@ -15,7 +15,9 @@ specific language governing permissions and limitations under the License. --> # TrOCR +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md index 54242f67b8fe..f1a97dfcd813 100644 --- a/docs/source/en/model_doc/tvlt.md +++ b/docs/source/en/model_doc/tvlt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # TVLT +
PyTorch +
diff --git a/docs/source/en/model_doc/tvp.md b/docs/source/en/model_doc/tvp.md index 7753abf1aaa6..33b31c8602ca 100644 --- a/docs/source/en/model_doc/tvp.md +++ b/docs/source/en/model_doc/tvp.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # TVP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/udop.md b/docs/source/en/model_doc/udop.md index fc1edb1d8508..b63bc11a53ee 100644 --- a/docs/source/en/model_doc/udop.md +++ b/docs/source/en/model_doc/udop.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # UDOP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md index f753c9624fad..736574373c50 100644 --- a/docs/source/en/model_doc/umt5.md +++ b/docs/source/en/model_doc/umt5.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # UMT5 +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md index f29eacfcc14b..9190cb4f694b 100644 --- a/docs/source/en/model_doc/unispeech-sat.md +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # UniSpeech-SAT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md index 02688cf44793..8abb443f8fd4 100644 --- a/docs/source/en/model_doc/unispeech.md +++ b/docs/source/en/model_doc/unispeech.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # UniSpeech +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/univnet.md b/docs/source/en/model_doc/univnet.md index aefd2f157578..367147115278 100644 --- a/docs/source/en/model_doc/univnet.md +++ b/docs/source/en/model_doc/univnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # UnivNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/upernet.md b/docs/source/en/model_doc/upernet.md index ea08ccf109a6..a2c96582f24a 100644 --- a/docs/source/en/model_doc/upernet.md +++ b/docs/source/en/model_doc/upernet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # UPerNet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/van.md b/docs/source/en/model_doc/van.md index 3bdf4dedbb87..1df6a4640bbb 100644 --- a/docs/source/en/model_doc/van.md +++ b/docs/source/en/model_doc/van.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # VAN +
PyTorch +
diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 3985f76e3ab3..553f6b6741e1 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Video-LLaVA +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md index 9035be3be948..f4c9d8b38705 100644 --- a/docs/source/en/model_doc/videomae.md +++ b/docs/source/en/model_doc/videomae.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # VideoMAE +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vilt.md b/docs/source/en/model_doc/vilt.md index 1a6573a1bc0a..107271e2c96e 100644 --- a/docs/source/en/model_doc/vilt.md +++ b/docs/source/en/model_doc/vilt.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ViLT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index 69a1c5f5cab1..a9517245e0e0 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # VipLlava +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/visual_bert.md b/docs/source/en/model_doc/visual_bert.md index 704ff1733b36..55c526d067ac 100644 --- a/docs/source/en/model_doc/visual_bert.md +++ b/docs/source/en/model_doc/visual_bert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # VisualBERT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md index 0471988398ac..9a7e04e6b7ae 100644 --- a/docs/source/en/model_doc/vit_hybrid.md +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Hybrid Vision Transformer (ViT Hybrid) +
PyTorch +
diff --git a/docs/source/en/model_doc/vit_mae.md b/docs/source/en/model_doc/vit_mae.md index 8d0a40c8a3e1..c3ed0d36c664 100644 --- a/docs/source/en/model_doc/vit_mae.md +++ b/docs/source/en/model_doc/vit_mae.md @@ -16,6 +16,11 @@ rendered properly in your Markdown viewer. # ViTMAE +
+PyTorch
+TensorFlow
+
+ ## Overview The ViTMAE model was proposed in [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v2) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md index bc5a61e9846d..6d34b8513674 100644 --- a/docs/source/en/model_doc/vit_msn.md +++ b/docs/source/en/model_doc/vit_msn.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ViTMSN +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vitdet.md b/docs/source/en/model_doc/vitdet.md index 20b12ae4efaf..d569e71d904e 100644 --- a/docs/source/en/model_doc/vitdet.md +++ b/docs/source/en/model_doc/vitdet.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # ViTDet +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vitmatte.md b/docs/source/en/model_doc/vitmatte.md index 97dd3c1747cd..105d529c2d44 100644 --- a/docs/source/en/model_doc/vitmatte.md +++ b/docs/source/en/model_doc/vitmatte.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # ViTMatte +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md index 30e2018b0950..02471ad39e22 100644 --- a/docs/source/en/model_doc/vitpose.md +++ b/docs/source/en/model_doc/vitpose.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # ViTPose +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vits.md b/docs/source/en/model_doc/vits.md index 9d5baab837f6..225d0f639003 100644 --- a/docs/source/en/model_doc/vits.md +++ b/docs/source/en/model_doc/vits.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # VITS +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md index 5150dc922020..aaa86a850c4d 100644 --- a/docs/source/en/model_doc/vivit.md +++ b/docs/source/en/model_doc/vivit.md @@ -12,7 +12,9 @@ specific language governing permissions and limitations under the License. # Video Vision Transformer (ViViT) +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/wav2vec2-bert.md b/docs/source/en/model_doc/wav2vec2-bert.md index 1aec6f1bb2d2..c2cf46497706 100644 --- a/docs/source/en/model_doc/wav2vec2-bert.md +++ b/docs/source/en/model_doc/wav2vec2-bert.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Wav2Vec2-BERT +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/wav2vec2-conformer.md b/docs/source/en/model_doc/wav2vec2-conformer.md index e5450c41f911..f84e6b371116 100644 --- a/docs/source/en/model_doc/wav2vec2-conformer.md +++ b/docs/source/en/model_doc/wav2vec2-conformer.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # Wav2Vec2-Conformer +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/wavlm.md b/docs/source/en/model_doc/wavlm.md index 363e734c9b6c..54947e2f1579 100644 --- a/docs/source/en/model_doc/wavlm.md +++ b/docs/source/en/model_doc/wavlm.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # WavLM +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/xclip.md b/docs/source/en/model_doc/xclip.md index 2f5d7a392376..62f0c3aa2e4e 100644 --- a/docs/source/en/model_doc/xclip.md +++ b/docs/source/en/model_doc/xclip.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # X-CLIP +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/xlm-prophetnet.md b/docs/source/en/model_doc/xlm-prophetnet.md index 0af6259924bf..046904d885a4 100644 --- a/docs/source/en/model_doc/xlm-prophetnet.md +++ b/docs/source/en/model_doc/xlm-prophetnet.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # XLM-ProphetNet +
PyTorch +
diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md index eef6825faaba..b291105865ca 100644 --- a/docs/source/en/model_doc/xlm-roberta-xl.md +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # XLM-RoBERTa-XL +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/xlm.md b/docs/source/en/model_doc/xlm.md index 0ee11c6addc5..61effea7cca7 100644 --- a/docs/source/en/model_doc/xlm.md +++ b/docs/source/en/model_doc/xlm.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # XLM
-
-Models
-
-
-Spaces
-
+PyTorch
+TensorFlow
## Overview diff --git a/docs/source/en/model_doc/xlnet.md b/docs/source/en/model_doc/xlnet.md index 90b454e8af3c..0b90de75ccff 100644 --- a/docs/source/en/model_doc/xlnet.md +++ b/docs/source/en/model_doc/xlnet.md @@ -17,12 +17,8 @@ rendered properly in your Markdown viewer. # XLNet
-
-Models
-
-
-Spaces
-
+PyTorch
+TensorFlow
## Overview diff --git a/docs/source/en/model_doc/xmod.md b/docs/source/en/model_doc/xmod.md index 47797fa64902..e07601074c2b 100644 --- a/docs/source/en/model_doc/xmod.md +++ b/docs/source/en/model_doc/xmod.md @@ -16,6 +16,10 @@ rendered properly in your Markdown viewer. # X-MOD +
+PyTorch
+
+ ## Overview The X-MOD model was proposed in [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, and Mikel Artetxe. diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md index 994ec1384bbb..c92c63372338 100644 --- a/docs/source/en/model_doc/yolos.md +++ b/docs/source/en/model_doc/yolos.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # YOLOS +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/yoso.md b/docs/source/en/model_doc/yoso.md index d3289dfa20e3..c9fbb11b1e49 100644 --- a/docs/source/en/model_doc/yoso.md +++ b/docs/source/en/model_doc/yoso.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # YOSO +
PyTorch +
## Overview diff --git a/docs/source/en/model_doc/zamba.md b/docs/source/en/model_doc/zamba.md index ad5648001339..a6a7ee38cf60 100644 --- a/docs/source/en/model_doc/zamba.md +++ b/docs/source/en/model_doc/zamba.md @@ -15,7 +15,9 @@ rendered properly in your Markdown viewer. --> # Zamba +
PyTorch +
Zamba is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md index 88e543b14ba2..0d7e6782b67f 100644 --- a/docs/source/en/model_doc/zamba2.md +++ b/docs/source/en/model_doc/zamba2.md @@ -15,7 +15,9 @@ rendered properly in your Markdown viewer. --> # Zamba2 +
PyTorch +
Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. diff --git a/docs/source/en/model_doc/zoedepth.md b/docs/source/en/model_doc/zoedepth.md index 573d93119e32..fefadfba6aa4 100644 --- a/docs/source/en/model_doc/zoedepth.md +++ b/docs/source/en/model_doc/zoedepth.md @@ -16,7 +16,9 @@ rendered properly in your Markdown viewer. # ZoeDepth +
PyTorch +
## Overview From 1389d2fde8605766257a3ecef787c404b626e545 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 28 Jan 2025 10:19:14 -0800 Subject: [PATCH 106/116] flashattention --- docs/source/en/model_doc/aria.md | 1 + docs/source/en/model_doc/bamba.md | 1 + docs/source/en/model_doc/bark.md | 1 + docs/source/en/model_doc/bart.md | 1 + docs/source/en/model_doc/chameleon.md | 1 + docs/source/en/model_doc/clip.md | 1 + docs/source/en/model_doc/cohere.md | 1 + docs/source/en/model_doc/cohere2.md | 1 + docs/source/en/model_doc/data2vec.md | 1 + docs/source/en/model_doc/dbrx.md | 1 + docs/source/en/model_doc/diffllama.md | 1 + docs/source/en/model_doc/distilbert.md | 1 + docs/source/en/model_doc/emu3.md | 1 + docs/source/en/model_doc/falcon.md | 1 + docs/source/en/model_doc/gemma.md | 1 + docs/source/en/model_doc/gemma2.md | 1 + docs/source/en/model_doc/glm.md | 1 + docs/source/en/model_doc/gpt_bigcode.md | 1 + docs/source/en/model_doc/gpt_neo.md | 1 + docs/source/en/model_doc/gpt_neox_japanese.md | 1 + docs/source/en/model_doc/gptj.md | 1 + docs/source/en/model_doc/granite.md | 1 + docs/source/en/model_doc/granitemoe.md | 1 + docs/source/en/model_doc/helium.md | 1 + docs/source/en/model_doc/hubert.md | 1 + docs/source/en/model_doc/idefics2.md | 1 + docs/source/en/model_doc/idefics3.md | 1 + docs/source/en/model_doc/jamba.md | 1 + docs/source/en/model_doc/jetmoe.md | 1 + docs/source/en/model_doc/llama.md | 1 + docs/source/en/model_doc/llava.md | 1 + docs/source/en/model_doc/llava_next.md | 1 + docs/source/en/model_doc/llava_next_video.md | 1 + docs/source/en/model_doc/llava_onevision.md | 1 + docs/source/en/model_doc/m2m_100.md | 1 + docs/source/en/model_doc/mbart.md | 1 + docs/source/en/model_doc/mimi.md | 1 + docs/source/en/model_doc/mistral.md | 1 + docs/source/en/model_doc/mixtral.md | 1 + docs/source/en/model_doc/modernbert.md | 1 + docs/source/en/model_doc/moonshine.md | 1 + docs/source/en/model_doc/moshi.md | 1 + docs/source/en/model_doc/musicgen.md | 1 + docs/source/en/model_doc/musicgen_melody.md | 1 + docs/source/en/model_doc/nemotron.md | 1 + docs/source/en/model_doc/nllb.md | 1 + docs/source/en/model_doc/olmo.md | 1 + docs/source/en/model_doc/olmo2.md | 1 + docs/source/en/model_doc/olmoe.md | 1 + docs/source/en/model_doc/openai-gpt.md | 1 + docs/source/en/model_doc/opt.md | 1 + docs/source/en/model_doc/paligemma.md | 1 + docs/source/en/model_doc/phi.md | 1 + docs/source/en/model_doc/phi3.md | 1 + docs/source/en/model_doc/phimoe.md | 1 + docs/source/en/model_doc/qwen2.md | 1 + docs/source/en/model_doc/qwen2_5_vl.md | 1 + docs/source/en/model_doc/qwen2_audio.md | 1 + docs/source/en/model_doc/qwen2_moe.md | 1 + docs/source/en/model_doc/qwen2_vl.md | 1 + docs/source/en/model_doc/rag.md | 1 + docs/source/en/model_doc/sew.md | 1 + docs/source/en/model_doc/siglip.md | 1 + docs/source/en/model_doc/speech-encoder-decoder.md | 1 + docs/source/en/model_doc/stablelm.md | 1 + docs/source/en/model_doc/starcoder2.md | 1 + docs/source/en/model_doc/unispeech-sat.md | 1 + docs/source/en/model_doc/unispeech.md | 1 + docs/source/en/model_doc/video_llava.md | 1 + docs/source/en/model_doc/vipllava.md | 1 + docs/source/en/model_doc/vision-encoder-decoder.md | 1 + docs/source/en/model_doc/vision-text-dual-encoder.md | 1 + docs/source/en/model_doc/wav2vec2.md | 1 + docs/source/en/model_doc/whisper.md | 1 + docs/source/en/model_doc/zamba2.md | 1 + 75 files changed, 75 insertions(+) diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md index b73a72947898..9d42709041ee 100644 --- 
a/docs/source/en/model_doc/aria.md +++ b/docs/source/en/model_doc/aria.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/bamba.md b/docs/source/en/model_doc/bamba.md index 5c6092aa1d58..a5073bd5b749 100644 --- a/docs/source/en/model_doc/bamba.md +++ b/docs/source/en/model_doc/bamba.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/bark.md b/docs/source/en/model_doc/bark.md index feba11707f48..912f552fa7c0 100644 --- a/docs/source/en/model_doc/bark.md +++ b/docs/source/en/model_doc/bark.md @@ -14,6 +14,7 @@ specific language governing permissions and limitations under the License.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/bart.md b/docs/source/en/model_doc/bart.md index a3a4e4bfe647..5ebc0a230b69 100644 --- a/docs/source/en/model_doc/bart.md +++ b/docs/source/en/model_doc/bart.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index 0e3b12b53b61..f3c54dbe1f39 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index 7fae22d81016..e9b354dfb916 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index 760cfd00a00f..bacc5f665bd5 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -2,6 +2,7 @@
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md index 6b6d11ecd204..83da4df2154a 100644 --- a/docs/source/en/model_doc/cohere2.md +++ b/docs/source/en/model_doc/cohere2.md @@ -2,6 +2,7 @@
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index 31efc35e5acf..2c354bf1aee0 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index c2b084dab512..32fbbb315a25 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -14,6 +14,7 @@ specific language governing permissions and limitations under the License.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md index 5b1b22ee872a..f19028bb02c9 100644 --- a/docs/source/en/model_doc/diffllama.md +++ b/docs/source/en/model_doc/diffllama.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/distilbert.md b/docs/source/en/model_doc/distilbert.md index 9a1fadb6b9b9..931cbbc0b404 100644 --- a/docs/source/en/model_doc/distilbert.md +++ b/docs/source/en/model_doc/distilbert.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md index ad9f0719ed54..af195836d20a 100644 --- a/docs/source/en/model_doc/emu3.md +++ b/docs/source/en/model_doc/emu3.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/falcon.md b/docs/source/en/model_doc/falcon.md index 4fe9cd81b9f3..e34216ec057b 100644 --- a/docs/source/en/model_doc/falcon.md +++ b/docs/source/en/model_doc/falcon.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md index 7d39e6e3c4a2..f2dd3f897940 100644 --- a/docs/source/en/model_doc/gemma.md +++ b/docs/source/en/model_doc/gemma.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch Flax +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index d80690c6e395..7ad268535c15 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index bc592346c0c3..0ea6fde66a42 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md index 2ebbaa512511..f8919f503945 100644 --- a/docs/source/en/model_doc/gpt_bigcode.md +++ b/docs/source/en/model_doc/gpt_bigcode.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/gpt_neo.md b/docs/source/en/model_doc/gpt_neo.md index de1f80c08268..f90e0d18498f 100644 --- a/docs/source/en/model_doc/gpt_neo.md +++ b/docs/source/en/model_doc/gpt_neo.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch Flax +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/gpt_neox_japanese.md b/docs/source/en/model_doc/gpt_neox_japanese.md index 9a5f7335564d..cedfafa133e4 100644 --- a/docs/source/en/model_doc/gpt_neox_japanese.md +++ b/docs/source/en/model_doc/gpt_neox_japanese.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/gptj.md b/docs/source/en/model_doc/gptj.md index 9268adb2a3e5..8e852d931aae 100644 --- a/docs/source/en/model_doc/gptj.md +++ b/docs/source/en/model_doc/gptj.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index 875177278a57..66551025aa04 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/granitemoe.md b/docs/source/en/model_doc/granitemoe.md index d9e5fa7f61cb..9cdea05d8fb2 100644 --- a/docs/source/en/model_doc/granitemoe.md +++ b/docs/source/en/model_doc/granitemoe.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md index d0f676c8470a..a2ae1424774d 100644 --- a/docs/source/en/model_doc/helium.md +++ b/docs/source/en/model_doc/helium.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/hubert.md b/docs/source/en/model_doc/hubert.md index 9447e8785f02..53d729b920f5 100644 --- a/docs/source/en/model_doc/hubert.md +++ b/docs/source/en/model_doc/hubert.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch
TensorFlow
+FlashAttention
## Overview diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 815de863b741..8f3351d8a49d 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index 1f4ae33a8f61..b4975d73471d 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md index 6edb9e0c7c32..6ed97fc29612 100644 --- a/docs/source/en/model_doc/jamba.md +++ b/docs/source/en/model_doc/jamba.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/jetmoe.md b/docs/source/en/model_doc/jetmoe.md index 67387614b47d..91b2f551f264 100644 --- a/docs/source/en/model_doc/jetmoe.md +++ b/docs/source/en/model_doc/jetmoe.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index 27927b591185..9d40b5b3010c 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index bdfd07944218..aecc9ea6dec6 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 51a386815ef0..b88e20a1c7b1 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index 52181f992c51..76a8409455a9 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index b1e487cc299e..60fa88d716a3 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/m2m_100.md b/docs/source/en/model_doc/m2m_100.md index 77f1d22c7f54..840f21a037e0 100644 --- a/docs/source/en/model_doc/m2m_100.md +++ b/docs/source/en/model_doc/m2m_100.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/mbart.md b/docs/source/en/model_doc/mbart.md index b75d36ca50e6..742747715620 100644 --- a/docs/source/en/model_doc/mbart.md +++ b/docs/source/en/model_doc/mbart.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md index d8852672685f..61cbe8fbe626 100644 --- a/docs/source/en/model_doc/mimi.md +++ b/docs/source/en/model_doc/mimi.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index c61fa0dd2628..02335f2fb133 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 74b49cf81dd7..4b28dd898cd6 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md index 7432d27d7fc0..7bf71ef47557 100644 --- a/docs/source/en/model_doc/modernbert.md +++ b/docs/source/en/model_doc/modernbert.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md index e019c89ce765..2c8c6feae5bb 100644 --- a/docs/source/en/model_doc/moonshine.md +++ b/docs/source/en/model_doc/moonshine.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md index e9dfd9b84d82..ca06e05d844e 100644 --- a/docs/source/en/model_doc/moshi.md +++ b/docs/source/en/model_doc/moshi.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/musicgen.md b/docs/source/en/model_doc/musicgen.md index 063f4ac97b39..72f581881077 100644 --- a/docs/source/en/model_doc/musicgen.md +++ b/docs/source/en/model_doc/musicgen.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index af0f21559a1a..cf7ba13f0ad1 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/nemotron.md b/docs/source/en/model_doc/nemotron.md index d8837568deee..6af9fe135b88 100644 --- a/docs/source/en/model_doc/nemotron.md +++ b/docs/source/en/model_doc/nemotron.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License.
PyTorch +FlashAttention
### License diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index 0e1bb34577fa..ec8358b77ba8 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Updated tokenizer behavior diff --git a/docs/source/en/model_doc/olmo.md b/docs/source/en/model_doc/olmo.md index 9788a98c7721..c173b83189a9 100644 --- a/docs/source/en/model_doc/olmo.md +++ b/docs/source/en/model_doc/olmo.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/olmo2.md b/docs/source/en/model_doc/olmo2.md index dc989341cf3a..bdecd17aae62 100644 --- a/docs/source/en/model_doc/olmo2.md +++ b/docs/source/en/model_doc/olmo2.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/olmoe.md b/docs/source/en/model_doc/olmoe.md index 71502aea3dd6..b9606f4b9c2c 100644 --- a/docs/source/en/model_doc/olmoe.md +++ b/docs/source/en/model_doc/olmoe.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/openai-gpt.md b/docs/source/en/model_doc/openai-gpt.md index 054495676a83..d64622de7d5c 100644 --- a/docs/source/en/model_doc/openai-gpt.md +++ b/docs/source/en/model_doc/openai-gpt.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md index b543f46f04fa..3ac3cdacfe2a 100644 --- a/docs/source/en/model_doc/opt.md +++ b/docs/source/en/model_doc/opt.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md index b6805be90531..1dc3d33a9692 100644 --- a/docs/source/en/model_doc/paligemma.md +++ b/docs/source/en/model_doc/paligemma.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md index 81873459e0a8..6bf4aa6bd60f 100644 --- a/docs/source/en/model_doc/phi.md +++ b/docs/source/en/model_doc/phi.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index 93c3073f28fb..a2b52342c005 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/phimoe.md b/docs/source/en/model_doc/phimoe.md index 1dc6e22d6a44..cc19f3ac2d57 100644 --- a/docs/source/en/model_doc/phimoe.md +++ b/docs/source/en/model_doc/phimoe.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index a774f885e3a9..0ac94085893c 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md index 631da5d88224..5ad1ff8761bf 100644 --- a/docs/source/en/model_doc/qwen2_5_vl.md +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index da055d1015ab..48d516cc226f 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index c56ec74ab3aa..077fc0561184 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/qwen2_vl.md b/docs/source/en/model_doc/qwen2_vl.md index 516271bdaf59..37c7ad31b311 100644 --- a/docs/source/en/model_doc/qwen2_vl.md +++ b/docs/source/en/model_doc/qwen2_vl.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/rag.md b/docs/source/en/model_doc/rag.md index 0f59592633f7..8b65da43a22e 100644 --- a/docs/source/en/model_doc/rag.md +++ b/docs/source/en/model_doc/rag.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch TensorFlow +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md index b0ba0deb6c2c..7911b09c5f60 100644 --- a/docs/source/en/model_doc/sew.md +++ b/docs/source/en/model_doc/sew.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 55126f1dc104..5495b7f6545c 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/speech-encoder-decoder.md b/docs/source/en/model_doc/speech-encoder-decoder.md index bc04cf3a2b3d..199163d6f795 100644 --- a/docs/source/en/model_doc/speech-encoder-decoder.md +++ b/docs/source/en/model_doc/speech-encoder-decoder.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch Flax +FlashAttention The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index c05b76e82a8d..31064fca29e8 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 305250b58e0e..dbd365b0f15b 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md index 9190cb4f694b..4e4bef5d2093 100644 --- a/docs/source/en/model_doc/unispeech-sat.md +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md index 8abb443f8fd4..dd1c55f9e8c3 100644 --- a/docs/source/en/model_doc/unispeech.md +++ b/docs/source/en/model_doc/unispeech.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 553f6b6741e1..73b171a2ea4a 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index a9517245e0e0..76838e762919 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
## Overview diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md index e0aa98cb3d0f..d95b70f6c68e 100644 --- a/docs/source/en/model_doc/vision-encoder-decoder.md +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.md b/docs/source/en/model_doc/vision-text-dual-encoder.md index bae26d05128a..2cda544b862b 100644 --- a/docs/source/en/model_doc/vision-text-dual-encoder.md +++ b/docs/source/en/model_doc/vision-text-dual-encoder.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md index 6987434a4e24..30a707a06c96 100644 --- a/docs/source/en/model_doc/wav2vec2.md +++ b/docs/source/en/model_doc/wav2vec2.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index 2f3ed0647404..d11a8af98e55 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +FlashAttention ## Overview diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md index 0d7e6782b67f..b7495745bd35 100644 --- a/docs/source/en/model_doc/zamba2.md +++ b/docs/source/en/model_doc/zamba2.md @@ -17,6 +17,7 @@ rendered properly in your Markdown viewer.
PyTorch +FlashAttention
Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. From 167ef027b86b1f8726487dd351c0cc2e25b9fd0c Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 28 Jan 2025 12:07:19 -0800 Subject: [PATCH 107/116] rm check_table --- .circleci/config.yml | 3 +- Makefile | 2 - utils/check_table.py | 302 ------------------------------------------- 3 files changed, 1 insertion(+), 306 deletions(-) delete mode 100644 utils/check_table.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 7e497d755a14..922a51f798ce 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -170,8 +170,7 @@ jobs: - store_artifacts: path: ~/transformers/installed.txt - run: python utils/check_copies.py - - run: python utils/check_modular_conversion.py --num_workers 4 - - run: python utils/check_table.py + - run: python utils/check_modular_conversion.py - run: python utils/check_dummies.py - run: python utils/check_repo.py - run: python utils/check_inits.py diff --git a/Makefile b/Makefile index 710c555b74f6..40d4e6abf424 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,6 @@ autogenerate_code: deps_table_update repo-consistency: python utils/check_copies.py python utils/check_modular_conversion.py - python utils/check_table.py python utils/check_dummies.py python utils/check_repo.py python utils/check_inits.py @@ -82,7 +81,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency fix-copies: python utils/check_copies.py --fix_and_overwrite python utils/check_modular_conversion.py --fix_and_overwrite - python utils/check_table.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite python utils/check_doctest_list.py --fix_and_overwrite python utils/check_docstrings.py --fix_and_overwrite diff --git a/utils/check_table.py b/utils/check_table.py deleted file mode 100644 index 9ce7deaf6e32..000000000000 --- a/utils/check_table.py +++ /dev/null @@ -1,302 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utility that checks the big table in the file docs/source/en/index.md and potentially updates it. - -Use from the root of the repo with: - -```bash -python utils/check_table.py -``` - -for a check that will error in case of inconsistencies (used by `make repo-consistency`). - -To auto-fix issues run: - -```bash -python utils/check_table.py --fix_and_overwrite -``` - -which is used by `make fix-copies`. -""" - -import argparse -import collections -import os -import re -from typing import List - -from transformers.utils import direct_transformers_import - - -# All paths are set with the intent you should run this script from the root of the repo with the command -# python utils/check_table.py -TRANSFORMERS_PATH = "src/transformers" -PATH_TO_DOCS = "docs/source/en" -REPO_PATH = "." 
- - -def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> str: - """ - Find the text in filename between two prompts. - - Args: - filename (`str`): The file to search into. - start_prompt (`str`): A string to look for at the start of the content searched. - end_prompt (`str`): A string that will mark the end of the content to look for. - - Returns: - `str`: The content between the prompts. - """ - with open(filename, "r", encoding="utf-8", newline="\n") as f: - lines = f.readlines() - - # Find the start prompt. - start_index = 0 - while not lines[start_index].startswith(start_prompt): - start_index += 1 - start_index += 1 - - # Now go until the end prompt. - end_index = start_index - while not lines[end_index].startswith(end_prompt): - end_index += 1 - end_index -= 1 - - while len(lines[start_index]) <= 1: - start_index += 1 - while len(lines[end_index]) <= 1: - end_index -= 1 - end_index += 1 - return "".join(lines[start_index:end_index]), start_index, end_index, lines - - -# Regexes that match TF/Flax/PT model names. Add here suffixes that are used to identify models, separated by | -_re_tf_models = re.compile(r"TF(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)") -_re_flax_models = re.compile(r"Flax(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration)") -# Will match any TF or Flax model too so need to be in an else branch after the two previous regexes. -_re_pt_models = re.compile(r"(.*)(?:Model|Encoder|Decoder|ForConditionalGeneration|ForRetrieval)") - - -# This is to make sure the transformers module imported is the one in the repo. -transformers_module = direct_transformers_import(TRANSFORMERS_PATH) - - -def camel_case_split(identifier: str) -> List[str]: - """ - Split a camel-cased name into words. - - Args: - identifier (`str`): The camel-cased name to parse. - - Returns: - `List[str]`: The list of words in the identifier (as seprated by capital letters). - - Example: - - ```py - >>> camel_case_split("CamelCasedClass") - ["Camel", "Cased", "Class"] - ``` - """ - # Regex thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python - matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier) - return [m.group(0) for m in matches] - - -def _center_text(text: str, width: int) -> str: - """ - Utility that will add spaces on the left and right of a text to make it centered for a given width. - - Args: - text (`str`): The text to center. - width (`int`): The desired length of the result. - - Returns: - `str`: A text of length `width` with the original `text` in the middle. 
- """ - text_length = 2 if text == "✅" or text == "❌" else len(text) - left_indent = (width - text_length) // 2 - right_indent = width - text_length - left_indent - return " " * left_indent + text + " " * right_indent - - -SPECIAL_MODEL_NAME_LINK_MAPPING = { - "Data2VecAudio": "[Data2VecAudio](model_doc/data2vec)", - "Data2VecText": "[Data2VecText](model_doc/data2vec)", - "Data2VecVision": "[Data2VecVision](model_doc/data2vec)", - "DonutSwin": "[DonutSwin](model_doc/donut)", -} - -MODEL_NAMES_WITH_SAME_CONFIG = { - "BARThez": "BART", - "BARTpho": "BART", - "BertJapanese": "BERT", - "BERTweet": "BERT", - "BORT": "BERT", - "ByT5": "T5", - "CPM": "OpenAI GPT-2", - "DePlot": "Pix2Struct", - "DialoGPT": "OpenAI GPT-2", - "DiT": "BEiT", - "FLAN-T5": "T5", - "FLAN-UL2": "T5", - "HerBERT": "BERT", - "LayoutXLM": "LayoutLMv2", - "Llama2": "LLaMA", - "Llama3": "LLaMA", - "Falcon3": "LLaMA", - "MADLAD-400": "T5", - "MatCha": "Pix2Struct", - "mBART-50": "mBART", - "Megatron-GPT2": "OpenAI GPT-2", - "mLUKE": "LUKE", - "MMS": "Wav2Vec2", - "NLLB": "M2M100", - "PhoBERT": "BERT", - "T5v1.1": "T5", - "TAPEX": "BART", - "UL2": "T5", - "Wav2Vec2Phoneme": "Wav2Vec2", - "XLM-V": "XLM-RoBERTa", - "XLS-R": "Wav2Vec2", - "XLSR-Wav2Vec2": "Wav2Vec2", -} -MODEL_NAMES_TO_IGNORE = [ - "ChineseCLIPVisionModel", - "CLIPTextModel", - "CLIPVisionModel", - "Qwen2AudioEncoder", - "SiglipVisionModel", -] - - -def get_model_table_from_auto_modules() -> str: - """ - Generates an up-to-date model table from the content of the auto modules. - """ - # Dictionary model names to config. - config_maping_names = transformers_module.models.auto.configuration_auto.CONFIG_MAPPING_NAMES - model_name_to_config = { - name: config_maping_names[code] - for code, name in transformers_module.MODEL_NAMES_MAPPING.items() - if code in config_maping_names - } - model_name_to_prefix = {name: config.replace("Config", "") for name, config in model_name_to_config.items()} - - # Dictionaries flagging if each model prefix has a backend in PT/TF/Flax. - pt_models = collections.defaultdict(bool) - tf_models = collections.defaultdict(bool) - flax_models = collections.defaultdict(bool) - - # Let's lookup through all transformers object (once). - for attr_name in dir(transformers_module): - lookup_dict = None - if _re_tf_models.match(attr_name) is not None: - lookup_dict = tf_models - attr_name = _re_tf_models.match(attr_name).groups()[0] - elif _re_flax_models.match(attr_name) is not None: - lookup_dict = flax_models - attr_name = _re_flax_models.match(attr_name).groups()[0] - elif _re_pt_models.match(attr_name) is not None: - lookup_dict = pt_models - attr_name = _re_pt_models.match(attr_name).groups()[0] - - if lookup_dict is not None: - while len(attr_name) > 0: - if attr_name in model_name_to_prefix.values(): - lookup_dict[attr_name] = True - break - # Try again after removing the last word in the name - attr_name = "".join(camel_case_split(attr_name)[:-1]) - - # Let's build that table! 
- model_names = list(model_name_to_config.keys()) + list(MODEL_NAMES_WITH_SAME_CONFIG.keys()) - - # model name to doc link mapping - model_names_mapping = transformers_module.models.auto.configuration_auto.MODEL_NAMES_MAPPING - model_name_to_link_mapping = {value: f"[{value}](model_doc/{key})" for key, value in model_names_mapping.items()} - # update mapping with special model names - model_name_to_link_mapping = { - k: SPECIAL_MODEL_NAME_LINK_MAPPING[k] if k in SPECIAL_MODEL_NAME_LINK_MAPPING else v - for k, v in model_name_to_link_mapping.items() - } - - # MaskFormerSwin and TimmBackbone are backbones and so not meant to be loaded and used on their own. Instead, they define architectures which can be loaded using the AutoBackbone API. - names_to_exclude = ["MaskFormerSwin", "TimmBackbone", "Speech2Text2"] - model_names = [name for name in model_names if name not in names_to_exclude] - model_names.sort(key=str.lower) - - columns = ["Model", "PyTorch support", "TensorFlow support", "Flax Support"] - # We'll need widths to properly display everything in the center (+2 is to leave one extra space on each side). - - widths = [len(c) + 2 for c in columns] - widths[0] = max([len(doc_link) for doc_link in model_name_to_link_mapping.values()]) + 2 - - # Build the table per se - table = "|" + "|".join([_center_text(c, w) for c, w in zip(columns, widths)]) + "|\n" - # Use ":-----:" format to center-aligned table cell texts - table += "|" + "|".join([":" + "-" * (w - 2) + ":" for w in widths]) + "|\n" - - check = {True: "✅", False: "❌"} - - for name in model_names: - if name in MODEL_NAMES_TO_IGNORE: - continue - if name in MODEL_NAMES_WITH_SAME_CONFIG.keys(): - prefix = model_name_to_prefix[MODEL_NAMES_WITH_SAME_CONFIG[name]] - else: - prefix = model_name_to_prefix[name] - line = [ - model_name_to_link_mapping[name], - check[pt_models[prefix]], - check[tf_models[prefix]], - check[flax_models[prefix]], - ] - table += "|" + "|".join([_center_text(l, w) for l, w in zip(line, widths)]) + "|\n" - return table - - -def check_model_table(overwrite=False): - """ - Check the model table in the index.md is consistent with the state of the lib and potentially fix it. - - Args: - overwrite (`bool`, *optional*, defaults to `False`): - Whether or not to overwrite the table when it's not up to date. - """ - current_table, start_index, end_index, lines = _find_text_in_file( - filename=os.path.join(PATH_TO_DOCS, "index.md"), - start_prompt="", - ) - new_table = get_model_table_from_auto_modules() - - if current_table != new_table: - if overwrite: - with open(os.path.join(PATH_TO_DOCS, "index.md"), "w", encoding="utf-8", newline="\n") as f: - f.writelines(lines[:start_index] + [new_table] + lines[end_index:]) - else: - raise ValueError( - "The model table in the `index.md` has not been updated. Run `make fix-copies` to fix this." 
- ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.") - args = parser.parse_args() - - check_model_table(args.fix_and_overwrite) From a5f89c647c4f4f74eddc1afc60d4ba280626846f Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 28 Jan 2025 12:24:52 -0800 Subject: [PATCH 108/116] not-doctested.txt --- utils/not_doctested.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index 656e9a6aaf8f..5c56ed144cae 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -5,7 +5,6 @@ docs/source/en/add_new_pipeline.md docs/source/en/agents.md docs/source/en/agents.md docs/source/en/attention.md -docs/source/en/benchmarks.md docs/source/en/community.md docs/source/en/contributing.md docs/source/en/custom_models.md @@ -271,7 +270,6 @@ docs/source/en/model_doc/yoso.md docs/source/en/model_memory_anatomy.md docs/source/en/model_sharing.md docs/source/en/model_summary.md -docs/source/en/multilingual.md docs/source/en/notebooks.md docs/source/en/pad_truncation.md docs/source/en/peft.md @@ -290,7 +288,6 @@ docs/source/en/philosophy.md docs/source/en/pipeline_webserver.md docs/source/en/pr_checks.md docs/source/en/run_scripts.md -docs/source/en/sagemaker.md docs/source/en/serialization.md docs/source/en/tasks/asr.md docs/source/en/tasks/audio_classification.md From 84da7a533d08c9e87b12ba9c6dcddb1d001d920c Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 28 Jan 2025 13:21:36 -0800 Subject: [PATCH 109/116] rm check_support_list.py --- .circleci/config.yml | 1 - Makefile | 1 - docs/source/en/model_doc/albert.md | 1 + docs/source/en/model_doc/aria.md | 1 + .../audio-spectrogram-transformer.md | 1 + docs/source/en/model_doc/bamba.md | 1 + docs/source/en/model_doc/bart.md | 1 + docs/source/en/model_doc/beit.md | 1 + docs/source/en/model_doc/bert.md | 1 + docs/source/en/model_doc/biogpt.md | 1 + docs/source/en/model_doc/camembert.md | 1 + docs/source/en/model_doc/chameleon.md | 1 + docs/source/en/model_doc/clip.md | 1 + docs/source/en/model_doc/cohere.md | 1 + docs/source/en/model_doc/cohere2.md | 1 + docs/source/en/model_doc/data2vec.md | 1 + docs/source/en/model_doc/dbrx.md | 1 + docs/source/en/model_doc/deit.md | 1 + docs/source/en/model_doc/diffllama.md | 1 + docs/source/en/model_doc/dinov2.md | 1 + .../en/model_doc/dinov2_with_registers.md | 1 + docs/source/en/model_doc/distilbert.md | 1 + docs/source/en/model_doc/dpr.md | 1 + docs/source/en/model_doc/emu3.md | 1 + docs/source/en/model_doc/encoder-decoder.md | 1 + docs/source/en/model_doc/falcon.md | 1 + docs/source/en/model_doc/gemma.md | 1 + docs/source/en/model_doc/gemma2.md | 1 + docs/source/en/model_doc/glm.md | 1 + docs/source/en/model_doc/gpt_bigcode.md | 1 + docs/source/en/model_doc/gpt_neox.md | 1 + docs/source/en/model_doc/granite.md | 1 + docs/source/en/model_doc/granitemoe.md | 1 + docs/source/en/model_doc/helium.md | 1 + docs/source/en/model_doc/hubert.md | 1 + docs/source/en/model_doc/idefics.md | 1 + docs/source/en/model_doc/idefics2.md | 1 + docs/source/en/model_doc/idefics3.md | 1 + docs/source/en/model_doc/ijepa.md | 1 + docs/source/en/model_doc/jamba.md | 1 + docs/source/en/model_doc/jetmoe.md | 1 + docs/source/en/model_doc/llama.md | 1 + docs/source/en/model_doc/llava.md | 1 + docs/source/en/model_doc/llava_next.md | 1 + docs/source/en/model_doc/llava_next_video.md | 1 + docs/source/en/model_doc/llava_onevision.md | 1 + 
docs/source/en/model_doc/m2m_100.md | 1 + docs/source/en/model_doc/mbart.md | 1 + docs/source/en/model_doc/mimi.md | 1 + docs/source/en/model_doc/mistral.md | 1 + docs/source/en/model_doc/mixtral.md | 1 + docs/source/en/model_doc/modernbert.md | 1 + docs/source/en/model_doc/moonshine.md | 1 + docs/source/en/model_doc/moshi.md | 1 + docs/source/en/model_doc/musicgen.md | 1 + docs/source/en/model_doc/musicgen_melody.md | 1 + docs/source/en/model_doc/nemotron.md | 1 + docs/source/en/model_doc/nllb.md | 1 + docs/source/en/model_doc/olmo.md | 1 + docs/source/en/model_doc/olmo2.md | 1 + docs/source/en/model_doc/olmoe.md | 1 + docs/source/en/model_doc/openai-gpt.md | 1 + docs/source/en/model_doc/opt.md | 1 + docs/source/en/model_doc/paligemma.md | 1 + docs/source/en/model_doc/phi.md | 1 + docs/source/en/model_doc/phi3.md | 1 + docs/source/en/model_doc/phimoe.md | 1 + docs/source/en/model_doc/qwen2.md | 1 + docs/source/en/model_doc/qwen2_5_vl.md | 1 + docs/source/en/model_doc/qwen2_audio.md | 1 + docs/source/en/model_doc/qwen2_moe.md | 1 + docs/source/en/model_doc/roberta.md | 1 + docs/source/en/model_doc/sew.md | 1 + docs/source/en/model_doc/siglip.md | 1 + .../en/model_doc/speech-encoder-decoder.md | 1 + docs/source/en/model_doc/stablelm.md | 1 + docs/source/en/model_doc/starcoder2.md | 1 + docs/source/en/model_doc/unispeech-sat.md | 1 + docs/source/en/model_doc/unispeech.md | 1 + docs/source/en/model_doc/video_llava.md | 1 + docs/source/en/model_doc/videomae.md | 1 + docs/source/en/model_doc/vipllava.md | 1 + .../en/model_doc/vision-encoder-decoder.md | 1 + .../en/model_doc/vision-text-dual-encoder.md | 1 + docs/source/en/model_doc/vit.md | 1 + docs/source/en/model_doc/vit_hybrid.md | 1 + docs/source/en/model_doc/vit_mae.md | 1 + docs/source/en/model_doc/vit_msn.md | 1 + docs/source/en/model_doc/vivit.md | 1 + docs/source/en/model_doc/wav2vec2.md | 1 + docs/source/en/model_doc/whisper.md | 1 + docs/source/en/model_doc/xlm-roberta-xl.md | 1 + docs/source/en/model_doc/xlm-roberta.md | 1 + docs/source/en/model_doc/yolos.md | 1 + docs/source/en/model_doc/zamba2.md | 1 + utils/check_support_list.py | 97 ------------------- 96 files changed, 93 insertions(+), 99 deletions(-) delete mode 100644 utils/check_support_list.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 922a51f798ce..2aef0775fd7b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -180,7 +180,6 @@ jobs: - run: make deps_table_check_updated - run: python utils/update_metadata.py --check-only - run: python utils/check_docstrings.py - - run: python utils/check_support_list.py workflows: version: 2 diff --git a/Makefile b/Makefile index 40d4e6abf424..21152e985082 100644 --- a/Makefile +++ b/Makefile @@ -45,7 +45,6 @@ repo-consistency: python utils/check_doctest_list.py python utils/update_metadata.py --check-only python utils/check_docstrings.py - python utils/check_support_list.py # this target runs checks on all files diff --git a/docs/source/en/model_doc/albert.md b/docs/source/en/model_doc/albert.md index 9a8ebca15480..21cd57675e53 100644 --- a/docs/source/en/model_doc/albert.md +++ b/docs/source/en/model_doc/albert.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md index 9d42709041ee..7b58f59cab7e 100644 --- a/docs/source/en/model_doc/aria.md +++ b/docs/source/en/model_doc/aria.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.md b/docs/source/en/model_doc/audio-spectrogram-transformer.md index 4203761958d4..4cc07aea758c 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.md +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/bamba.md b/docs/source/en/model_doc/bamba.md index a5073bd5b749..c6e1bcec56a2 100644 --- a/docs/source/en/model_doc/bamba.md +++ b/docs/source/en/model_doc/bamba.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/bart.md b/docs/source/en/model_doc/bart.md index 5ebc0a230b69..aaccd78047db 100644 --- a/docs/source/en/model_doc/bart.md +++ b/docs/source/en/model_doc/bart.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/beit.md b/docs/source/en/model_doc/beit.md index 1faea35dcea5..24dfabf682b6 100644 --- a/docs/source/en/model_doc/beit.md +++ b/docs/source/en/model_doc/beit.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/bert.md b/docs/source/en/model_doc/bert.md index 3379679f076e..883fba9a076d 100644 --- a/docs/source/en/model_doc/bert.md +++ b/docs/source/en/model_doc/bert.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md index 19dbaa56023a..ab8aea6c29e8 100644 --- a/docs/source/en/model_doc/biogpt.md +++ b/docs/source/en/model_doc/biogpt.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/camembert.md b/docs/source/en/model_doc/camembert.md index 288cbc49794f..9066ee360c6c 100644 --- a/docs/source/en/model_doc/camembert.md +++ b/docs/source/en/model_doc/camembert.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch TensorFlow +SDPA
## Overview diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md index f3c54dbe1f39..3810b3590a00 100644 --- a/docs/source/en/model_doc/chameleon.md +++ b/docs/source/en/model_doc/chameleon.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index e9b354dfb916..2e1c5168ce71 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index bacc5f665bd5..2ab75e9d1c8b 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -3,6 +3,7 @@
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md index 83da4df2154a..3b0b6e1740a9 100644 --- a/docs/source/en/model_doc/cohere2.md +++ b/docs/source/en/model_doc/cohere2.md @@ -3,6 +3,7 @@
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index 2c354bf1aee0..62ddbd8ff184 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 32fbbb315a25..11463e93d160 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -15,6 +15,7 @@ specific language governing permissions and limitations under the License.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/deit.md b/docs/source/en/model_doc/deit.md index 058362b1abea..0750d4000a44 100644 --- a/docs/source/en/model_doc/deit.md +++ b/docs/source/en/model_doc/deit.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch TensorFlow +SDPA
## Overview diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md index f19028bb02c9..c4a170c26572 100644 --- a/docs/source/en/model_doc/diffllama.md +++ b/docs/source/en/model_doc/diffllama.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/dinov2.md b/docs/source/en/model_doc/dinov2.md index b78113e87b37..5c130dabda90 100644 --- a/docs/source/en/model_doc/dinov2.md +++ b/docs/source/en/model_doc/dinov2.md @@ -16,6 +16,7 @@ specific language governing permissions and limitations under the License. PyTorch Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/dinov2_with_registers.md b/docs/source/en/model_doc/dinov2_with_registers.md index ade263d91409..7151dc45356a 100644 --- a/docs/source/en/model_doc/dinov2_with_registers.md +++ b/docs/source/en/model_doc/dinov2_with_registers.md @@ -11,6 +11,7 @@ specific language governing permissions and limitations under the License.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/distilbert.md b/docs/source/en/model_doc/distilbert.md index 931cbbc0b404..66be95fa0406 100644 --- a/docs/source/en/model_doc/distilbert.md +++ b/docs/source/en/model_doc/distilbert.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/dpr.md b/docs/source/en/model_doc/dpr.md index def36f38c741..0f6b19c90014 100644 --- a/docs/source/en/model_doc/dpr.md +++ b/docs/source/en/model_doc/dpr.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch TensorFlow +SDPA
## Overview diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md index af195836d20a..4ac7d0b0c4f1 100644 --- a/docs/source/en/model_doc/emu3.md +++ b/docs/source/en/model_doc/emu3.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/encoder-decoder.md b/docs/source/en/model_doc/encoder-decoder.md index dc977b3e7485..d0a676fb33a6 100644 --- a/docs/source/en/model_doc/encoder-decoder.md +++ b/docs/source/en/model_doc/encoder-decoder.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/falcon.md b/docs/source/en/model_doc/falcon.md index e34216ec057b..1197d208a2ab 100644 --- a/docs/source/en/model_doc/falcon.md +++ b/docs/source/en/model_doc/falcon.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md index f2dd3f897940..144bcf33886b 100644 --- a/docs/source/en/model_doc/gemma.md +++ b/docs/source/en/model_doc/gemma.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/gemma2.md b/docs/source/en/model_doc/gemma2.md index 7ad268535c15..9cf8ff7af102 100644 --- a/docs/source/en/model_doc/gemma2.md +++ b/docs/source/en/model_doc/gemma2.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/glm.md b/docs/source/en/model_doc/glm.md index 0ea6fde66a42..cfcd549d1493 100644 --- a/docs/source/en/model_doc/glm.md +++ b/docs/source/en/model_doc/glm.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md index f8919f503945..648fa6cb8d60 100644 --- a/docs/source/en/model_doc/gpt_bigcode.md +++ b/docs/source/en/model_doc/gpt_bigcode.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md index 41c8eee47340..35f12bdb2128 100644 --- a/docs/source/en/model_doc/gpt_neox.md +++ b/docs/source/en/model_doc/gpt_neox.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index 66551025aa04..0326bc5ad24a 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/granitemoe.md b/docs/source/en/model_doc/granitemoe.md index 9cdea05d8fb2..56ba5d936c9d 100644 --- a/docs/source/en/model_doc/granitemoe.md +++ b/docs/source/en/model_doc/granitemoe.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md index a2ae1424774d..a9296eb110d5 100644 --- a/docs/source/en/model_doc/helium.md +++ b/docs/source/en/model_doc/helium.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/hubert.md b/docs/source/en/model_doc/hubert.md index 53d729b920f5..432e127c7863 100644 --- a/docs/source/en/model_doc/hubert.md +++ b/docs/source/en/model_doc/hubert.md @@ -20,6 +20,7 @@ rendered properly in your Markdown viewer. PyTorch TensorFlow FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/idefics.md b/docs/source/en/model_doc/idefics.md index 35d1c2a56afc..2b8e471213d7 100644 --- a/docs/source/en/model_doc/idefics.md +++ b/docs/source/en/model_doc/idefics.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch TensorFlow +SDPA
## Overview diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 8f3351d8a49d..8de2c92d5609 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index b4975d73471d..deab4423f80c 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index ecb90c67cb3b..a92fdc83e8ac 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/jamba.md b/docs/source/en/model_doc/jamba.md index 6ed97fc29612..c8d66b163b5a 100644 --- a/docs/source/en/model_doc/jamba.md +++ b/docs/source/en/model_doc/jamba.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/jetmoe.md b/docs/source/en/model_doc/jetmoe.md index 91b2f551f264..aba6577f70cd 100644 --- a/docs/source/en/model_doc/jetmoe.md +++ b/docs/source/en/model_doc/jetmoe.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index 9d40b5b3010c..c127e0d73c00 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index aecc9ea6dec6..79033ec5a189 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b88e20a1c7b1..7d85ab8b6967 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index 76a8409455a9..b338bbfba128 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/llava_onevision.md b/docs/source/en/model_doc/llava_onevision.md index 60fa88d716a3..77fe807d46d0 100644 --- a/docs/source/en/model_doc/llava_onevision.md +++ b/docs/source/en/model_doc/llava_onevision.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/m2m_100.md b/docs/source/en/model_doc/m2m_100.md index 840f21a037e0..f4f2955bb046 100644 --- a/docs/source/en/model_doc/m2m_100.md +++ b/docs/source/en/model_doc/m2m_100.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/mbart.md b/docs/source/en/model_doc/mbart.md index 742747715620..62356ad26402 100644 --- a/docs/source/en/model_doc/mbart.md +++ b/docs/source/en/model_doc/mbart.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md index 61cbe8fbe626..6e68394fcaea 100644 --- a/docs/source/en/model_doc/mimi.md +++ b/docs/source/en/model_doc/mimi.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 02335f2fb133..097d8888f9a5 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 4b28dd898cd6..38c0c98ed0b9 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md index 7bf71ef47557..f7ceaae18797 100644 --- a/docs/source/en/model_doc/modernbert.md +++ b/docs/source/en/model_doc/modernbert.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md index 2c8c6feae5bb..2a4599e3d7e0 100644 --- a/docs/source/en/model_doc/moonshine.md +++ b/docs/source/en/model_doc/moonshine.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/moshi.md b/docs/source/en/model_doc/moshi.md index ca06e05d844e..9302a9461959 100644 --- a/docs/source/en/model_doc/moshi.md +++ b/docs/source/en/model_doc/moshi.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/musicgen.md b/docs/source/en/model_doc/musicgen.md index 72f581881077..6d709a963c04 100644 --- a/docs/source/en/model_doc/musicgen.md +++ b/docs/source/en/model_doc/musicgen.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index cf7ba13f0ad1..b1f16c4574ef 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/nemotron.md b/docs/source/en/model_doc/nemotron.md index 6af9fe135b88..13b1b9be2fbc 100644 --- a/docs/source/en/model_doc/nemotron.md +++ b/docs/source/en/model_doc/nemotron.md @@ -17,6 +17,7 @@ specific language governing permissions and limitations under the License.
PyTorch FlashAttention +SDPA
### License diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index ec8358b77ba8..4ba273777920 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Updated tokenizer behavior diff --git a/docs/source/en/model_doc/olmo.md b/docs/source/en/model_doc/olmo.md index c173b83189a9..8d722185c31f 100644 --- a/docs/source/en/model_doc/olmo.md +++ b/docs/source/en/model_doc/olmo.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/olmo2.md b/docs/source/en/model_doc/olmo2.md index bdecd17aae62..24030b855244 100644 --- a/docs/source/en/model_doc/olmo2.md +++ b/docs/source/en/model_doc/olmo2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/olmoe.md b/docs/source/en/model_doc/olmoe.md index b9606f4b9c2c..6496e44c1bd5 100644 --- a/docs/source/en/model_doc/olmoe.md +++ b/docs/source/en/model_doc/olmoe.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/openai-gpt.md b/docs/source/en/model_doc/openai-gpt.md index d64622de7d5c..68cda34db5ab 100644 --- a/docs/source/en/model_doc/openai-gpt.md +++ b/docs/source/en/model_doc/openai-gpt.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md index 3ac3cdacfe2a..f6165e495393 100644 --- a/docs/source/en/model_doc/opt.md +++ b/docs/source/en/model_doc/opt.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md index 1dc3d33a9692..3662f0fcf47b 100644 --- a/docs/source/en/model_doc/paligemma.md +++ b/docs/source/en/model_doc/paligemma.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md index 6bf4aa6bd60f..097d7fdd39ee 100644 --- a/docs/source/en/model_doc/phi.md +++ b/docs/source/en/model_doc/phi.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index a2b52342c005..82973d39c07b 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/phimoe.md b/docs/source/en/model_doc/phimoe.md index cc19f3ac2d57..6728248f2e0a 100644 --- a/docs/source/en/model_doc/phimoe.md +++ b/docs/source/en/model_doc/phimoe.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 0ac94085893c..dc6201d0de5e 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/qwen2_5_vl.md b/docs/source/en/model_doc/qwen2_5_vl.md index 5ad1ff8761bf..b2c138999e6f 100644 --- a/docs/source/en/model_doc/qwen2_5_vl.md +++ b/docs/source/en/model_doc/qwen2_5_vl.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index 48d516cc226f..8f7bd7c3b69c 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index 077fc0561184..eaaa66aedf7a 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index a67ea79ec74c..10a46d6f57eb 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/sew.md b/docs/source/en/model_doc/sew.md index 7911b09c5f60..cfc92db0eaa1 100644 --- a/docs/source/en/model_doc/sew.md +++ b/docs/source/en/model_doc/sew.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 5495b7f6545c..478e8a19a8c3 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/speech-encoder-decoder.md b/docs/source/en/model_doc/speech-encoder-decoder.md index 199163d6f795..8893adfdd4a0 100644 --- a/docs/source/en/model_doc/speech-encoder-decoder.md +++ b/docs/source/en/model_doc/speech-encoder-decoder.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA The [`SpeechEncoderDecoderModel`] can be used to initialize a speech-to-text model diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index 31064fca29e8..b996b7fcf9e8 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index dbd365b0f15b..c6b146bf30ed 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/unispeech-sat.md b/docs/source/en/model_doc/unispeech-sat.md index 4e4bef5d2093..ae4eed71874c 100644 --- a/docs/source/en/model_doc/unispeech-sat.md +++ b/docs/source/en/model_doc/unispeech-sat.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/unispeech.md b/docs/source/en/model_doc/unispeech.md index dd1c55f9e8c3..43b0c3bb117e 100644 --- a/docs/source/en/model_doc/unispeech.md +++ b/docs/source/en/model_doc/unispeech.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 73b171a2ea4a..f407b4dc5ebd 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/videomae.md b/docs/source/en/model_doc/videomae.md index f4c9d8b38705..f115d81694bd 100644 --- a/docs/source/en/model_doc/videomae.md +++ b/docs/source/en/model_doc/videomae.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index 76838e762919..9438893dfb15 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
## Overview diff --git a/docs/source/en/model_doc/vision-encoder-decoder.md b/docs/source/en/model_doc/vision-encoder-decoder.md index d95b70f6c68e..05340858612f 100644 --- a/docs/source/en/model_doc/vision-encoder-decoder.md +++ b/docs/source/en/model_doc/vision-encoder-decoder.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/vision-text-dual-encoder.md b/docs/source/en/model_doc/vision-text-dual-encoder.md index 2cda544b862b..b9d6db38d588 100644 --- a/docs/source/en/model_doc/vision-text-dual-encoder.md +++ b/docs/source/en/model_doc/vision-text-dual-encoder.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/vit.md b/docs/source/en/model_doc/vit.md index 5d122e777115..49c5c0e278ba 100644 --- a/docs/source/en/model_doc/vit.md +++ b/docs/source/en/model_doc/vit.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/vit_hybrid.md b/docs/source/en/model_doc/vit_hybrid.md index 9a7e04e6b7ae..a79fadd2550c 100644 --- a/docs/source/en/model_doc/vit_hybrid.md +++ b/docs/source/en/model_doc/vit_hybrid.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
diff --git a/docs/source/en/model_doc/vit_mae.md b/docs/source/en/model_doc/vit_mae.md index c3ed0d36c664..6ab509617209 100644 --- a/docs/source/en/model_doc/vit_mae.md +++ b/docs/source/en/model_doc/vit_mae.md @@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
PyTorch TensorFlow +SDPA
## Overview diff --git a/docs/source/en/model_doc/vit_msn.md b/docs/source/en/model_doc/vit_msn.md index 6d34b8513674..53cef4501106 100644 --- a/docs/source/en/model_doc/vit_msn.md +++ b/docs/source/en/model_doc/vit_msn.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/vivit.md b/docs/source/en/model_doc/vivit.md index aaa86a850c4d..9c4b8f5f7163 100644 --- a/docs/source/en/model_doc/vivit.md +++ b/docs/source/en/model_doc/vivit.md @@ -14,6 +14,7 @@ specific language governing permissions and limitations under the License.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md index 30a707a06c96..0dac6234914b 100644 --- a/docs/source/en/model_doc/wav2vec2.md +++ b/docs/source/en/model_doc/wav2vec2.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index d11a8af98e55..aa9007a8603f 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -22,6 +22,7 @@ rendered properly in your Markdown viewer. Flax FlashAttention +SDPA ## Overview diff --git a/docs/source/en/model_doc/xlm-roberta-xl.md b/docs/source/en/model_doc/xlm-roberta-xl.md index b291105865ca..355869ad6e02 100644 --- a/docs/source/en/model_doc/xlm-roberta-xl.md +++ b/docs/source/en/model_doc/xlm-roberta-xl.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md index 7b60b43404ee..2bc890257a69 100644 --- a/docs/source/en/model_doc/xlm-roberta.md +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -21,6 +21,7 @@ rendered properly in your Markdown viewer. TensorFlow Flax +SDPA ## Overview diff --git a/docs/source/en/model_doc/yolos.md b/docs/source/en/model_doc/yolos.md index c92c63372338..a988d0d507e1 100644 --- a/docs/source/en/model_doc/yolos.md +++ b/docs/source/en/model_doc/yolos.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch +SDPA
## Overview diff --git a/docs/source/en/model_doc/zamba2.md b/docs/source/en/model_doc/zamba2.md index b7495745bd35..447fa27b6962 100644 --- a/docs/source/en/model_doc/zamba2.md +++ b/docs/source/en/model_doc/zamba2.md @@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
PyTorch FlashAttention +SDPA
Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights. diff --git a/utils/check_support_list.py b/utils/check_support_list.py deleted file mode 100644 index 55d93611f4ce..000000000000 --- a/utils/check_support_list.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Utility that checks the supports of 3rd party libraries are listed in the documentation file. Currently, this includes: -- flash attention support -- SDPA support - -Use from the root of the repo with (as used in `make repo-consistency`): - -```bash -python utils/check_support_list.py -``` - -It has no auto-fix mode. -""" - -import os -from glob import glob - - -# All paths are set with the intent you should run this script from the root of the repo with the command -# python utils/check_doctest_list.py -REPO_PATH = "." - - -def check_flash_support_list(): - with open(os.path.join(REPO_PATH, "docs/source/en/perf_infer_gpu_one.md"), "r") as f: - doctext = f.read() - - doctext = doctext.split("FlashAttention-2 is currently supported for the following architectures:")[1] - doctext = doctext.split("You can request to add FlashAttention-2 support")[0] - - patterns = glob(os.path.join(REPO_PATH, "src/transformers/models/**/modeling_*.py")) - patterns_tf = glob(os.path.join(REPO_PATH, "src/transformers/models/**/modeling_tf_*.py")) - patterns_flax = glob(os.path.join(REPO_PATH, "src/transformers/models/**/modeling_flax_*.py")) - patterns = list(set(patterns) - set(patterns_tf) - set(patterns_flax)) - archs_supporting_fa2 = [] - for filename in patterns: - with open(filename, "r") as f: - text = f.read() - - if "_supports_flash_attn_2 = True" in text: - model_name = os.path.basename(filename).replace(".py", "").replace("modeling_", "") - archs_supporting_fa2.append(model_name) - - for arch in archs_supporting_fa2: - if arch not in doctext: - raise ValueError( - f"{arch} should be in listed in the flash attention documentation but is not. Please update the documentation." 
- ) - - -def check_sdpa_support_list(): - with open(os.path.join(REPO_PATH, "docs/source/en/perf_infer_gpu_one.md"), "r") as f: - doctext = f.read() - - doctext = doctext.split( - "For now, Transformers supports SDPA inference and training for the following architectures:" - )[1] - doctext = doctext.split("Note that FlashAttention can only be used for models using the")[0] - doctext = doctext.lower() - - patterns = glob(os.path.join(REPO_PATH, "src/transformers/models/**/modeling_*.py")) - patterns_tf = glob(os.path.join(REPO_PATH, "src/transformers/models/**/modeling_tf_*.py")) - patterns_flax = glob(os.path.join(REPO_PATH, "src/transformers/models/**/modeling_flax_*.py")) - patterns = list(set(patterns) - set(patterns_tf) - set(patterns_flax)) - archs_supporting_sdpa = [] - for filename in patterns: - with open(filename, "r") as f: - text = f.read() - - if "_supports_sdpa = True" in text: - model_name = os.path.basename(filename).replace(".py", "").replace("modeling_", "") - archs_supporting_sdpa.append(model_name) - - for arch in archs_supporting_sdpa: - if not any(term in doctext for term in [arch, arch.replace("_", "-"), arch.replace("_", " ")]): - raise ValueError( - f"{arch} should be in listed in the SDPA documentation but is not. Please update the documentation." - ) - - -if __name__ == "__main__": - check_flash_support_list() - check_sdpa_support_list() From ce5770dcea11bc95b77b8201f7d638dc81889841 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Fri, 31 Jan 2025 17:38:49 -0800 Subject: [PATCH 110/116] feedback --- docs/source/en/_toctree.yml | 18 ++++---- docs/source/en/add_new_model.md | 21 ++++----- docs/source/en/cache_explanation.md | 4 +- docs/source/en/chat_extras.md | 6 +-- docs/source/en/executorch.md | 59 +++++++++++++++++++++++++ docs/source/en/fast_tokenizers.md | 38 ++++++++++++++++ docs/source/en/llm_tutorial.md | 11 +++++ docs/source/en/models.md | 2 +- docs/source/en/quantization/overview.md | 22 ++++++--- docs/source/en/quantization/torchao.md | 16 +++---- docs/source/en/tiktoken.md | 59 ------------------------- 11 files changed, 156 insertions(+), 100 deletions(-) create mode 100644 docs/source/en/executorch.md delete mode 100644 docs/source/en/tiktoken.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 3fa6b89b4605..205dfd38949f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -2,17 +2,17 @@ sections: - local: index title: Transformers - - local: quicktour - title: Quickstart - local: installation title: Installation + - local: quicktour + title: Quickstart - title: Base classes isExpanded: False sections: - title: Models sections: - local: models - title: Loading + title: Loading models - local: custom_models title: Customizing models - local: how_to_hack_models @@ -20,7 +20,7 @@ - local: model_sharing title: Sharing - local: add_new_model - title: Adding a new model + title: Adding a new model to Transformers - local: modular_transformers title: Modular Transformers - local: task_summary @@ -35,8 +35,6 @@ sections: - local: fast_tokenizers title: Tokenizers - - local: tiktoken - title: tiktoken - local: image_processors title: Image processors - local: backbones @@ -82,7 +80,7 @@ title: Getting the most out of LLMs - local: perplexity title: Perplexity of fixed-length models - - title: Chat + - title: Chat with models sections: - local: conversations title: Chat basics @@ -98,14 +96,14 @@ sections: - local: perf_torch_compile title: torch.compile - - local: tf_xla - title: XLA - local: perf_infer_gpu_one 
title: GPU - local: perf_infer_gpu_multi title: Distributed GPU inference - local: perf_infer_cpu title: CPU + - local: tf_xla + title: XLA - local: agents title: Agents - local: tools @@ -199,6 +197,8 @@ title: ONNX - local: tflite title: LiteRT + - local: executorch + title: ExecuTorch - local: torchscript title: TorchScript - title: Resources diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 702f04db64f4..252f06880598 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -13,7 +13,10 @@ rendered properly in your Markdown viewer. --> -# Adding a new model +# Adding a new model to Transformers + +> [!TIP] +> Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers! Many of the models in Transformers are contributed by developers and researchers. As an open-source first project, we're invested in empowering the community to actively and independently add more models. @@ -40,7 +43,7 @@ Some of these design choices are: - composition > over-abstraction - duplicate code isn't always bad if it greatly improves readability and accessibility -- model files are self-contained and all the necessary model code is found in the `modeling.py` file +- model files are self-contained and all the necessary model code is found in the `modeling_mymodel.py` file These design choices are important *for everyone* interacting with the model. It is easier to read, understand, and modify. @@ -73,15 +76,6 @@ A model is saved to a `model.safetensors` file and a configuration is saved to a Transformers prefers a clean and readable code over a more abstracted code style. Some of the code style choices include: -- The forward pass is written in the `modeling.py` file, completely independent of other models in the library. To reuse a block from another model, copy the code and paste it with a `# Copied from` comment above it. For example, the `RobertaSelfAttention` class is copied from the `BertSelfAttention` class. - - ```py - # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta - class RobertaSelfAttention(nn.Module): - ``` - - Refer to the [Check copies](./pr_checks#check-copies) docs for more information about the `# Copied from` comment. - - The code should be accessible to non-English users. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One letter variables names are highly discouraged unless it's an index in a for loop. - Explicit code is preferred - even if it's longer - over shorter code. @@ -108,7 +102,7 @@ Now is a good time to get familiar with BrandNewBert. It is helpful to read a mo In addition to learning more about your model, use the tips below to help you add a model faster. > [!TIP] -> Each contributor has a unique style and workflow for porting models to Transformers. It may be helpful to take a look at how [GPT2](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) and [WMT19](https://huggingface.co/blog/porting-fsmt) were ported. +> Each contributor has a unique style and workflow for adding models to Transformers. Take a look at how [Gemma](https://github.com/huggingface/transformers/pull/29167) was added for an example. - Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. 
[Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this.
- This is more of an engineering than a science challenge. Focus on the more practical (setting up an efficient debugging environment for example) instead of the theoretical aspects of the model.
@@ -522,6 +516,9 @@ All features unique to BrandNewBert should be tested in a separate test under `B
 
 ## Implement tokenizer
 
+> [!TIP]
+> We recommend adding a fast tokenizer ([`PreTrainedTokenizerFast`]) to give users the best performance. Feel free to tag [@ArthurZucker](https://github.com/ArthurZucker) or [@itazap](https://github.com/itazap) in your PR for help on how to add [`PreTrainedTokenizerFast`].
+
 With the model out of the way, time to focus on the tokenizer. The tokenizer should be identical or very similar to an existing tokenizer in Transformers.
 
 Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to the code below.
diff --git a/docs/source/en/cache_explanation.md b/docs/source/en/cache_explanation.md
index 9ac1f711deff..b13601459d89 100644
--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@@ -24,14 +24,14 @@ To predict the 1000th token, the model requires information from the previous 99
 
 To predict the 1001st token, you need the same information from the previous 999 tokens in addition to any information from the 1000th token. This is a lot of matrix multiplications a model has to compute over and over for each token!
 
-A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived from the self-attention layers of previously processed tokens. The stored kv pairs are retrieved from the cache and reused for subsequent tokens, avoiding the need to recompute.
+A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived from the attention layers of previously processed tokens. The stored kv pairs are retrieved from the cache and reused for subsequent tokens, avoiding the need to recompute.
 
 > [!WARNING]
 > Caching should only be used for **inference**. It may cause unexpected errors if it's enabled during training.
 
 ## Cache class
 
-When you use Transformers' [`Cache`] class, the attention module performs several critical steps to integrate past and present information.
+When you use Transformers' [`Cache`] class, the self-attention module performs several critical steps to integrate past and present information.
 
 1. The attention module concatenates current kv pairs with past kv pairs stored in the cache. This creates attention weights with the shape `(new_tokens_length, past_kv_length + new_tokens_length)`. The current and past kv pairs are essentially combined to compute the attention scores, ensuring a model is aware of previous context and the current input.
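To make the caching flow above concrete, the following is a minimal sketch of reusing a cache object during generation. The checkpoint name, prompt, and token count are illustrative assumptions rather than part of the documented example.

```py
# Minimal sketch: reusing a DynamicCache across decoding steps (checkpoint and values are illustrative)
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

inputs = tokenizer("The key-value cache stores", return_tensors="pt")

# the cache accumulates kv pairs from every self-attention layer as tokens are generated
past_key_values = DynamicCache()
outputs = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Because the cache already holds the kv pairs for the processed tokens, each subsequent decoding step only computes attention for the newly generated token.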
diff --git a/docs/source/en/chat_extras.md b/docs/source/en/chat_extras.md index 13f89eb39fc8..11c195741c0a 100644 --- a/docs/source/en/chat_extras.md +++ b/docs/source/en/chat_extras.md @@ -84,7 +84,7 @@ Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_tem ```py inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") -inputs = {k: v.to(model.device) for k, v in inputs.items()} +inputs = {k: v for k, v in inputs.items()} outputs = model.generate(**inputs, max_new_tokens=128) print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):])) ``` @@ -114,7 +114,7 @@ Allow the assistant to read the function outputs and chat with the user. ```py inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") -inputs = {k: v.to(model.device) for k, v in inputs.items()} +inputs = {k: v for k, v in inputs.items()} out = model.generate(**inputs, max_new_tokens=128) print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) ``` @@ -136,7 +136,7 @@ messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": ```py inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") -inputs = {k: v.to(model.device) for k, v in inputs.items()} +inputs = {k: v for k, v in inputs.items()} out = model.generate(**inputs, max_new_tokens=128) print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) ``` diff --git a/docs/source/en/executorch.md b/docs/source/en/executorch.md new file mode 100644 index 000000000000..3e9c097c9e6c --- /dev/null +++ b/docs/source/en/executorch.md @@ -0,0 +1,59 @@ + + +# ExecuTorch + +[ExecuTorch](https://pytorch.org/executorch/stable/index.html) is a platform that enables PyTorch training and inference programs to be run on mobile and edge devices. It is powered by [torch.compile](https://pytorch.org/docs/stable/torch.compiler.html) and [torch.export](https://pytorch.org/docs/main/export.html) for performance and deployment. + +You can use ExecuTorch with Transformers with [torch.export](https://pytorch.org/docs/main/export.html). The [`~transformers.convert_and_export_with_cache`] method converts a [`PreTrainedModel`] into an exportable module. Under the hood, it uses [torch.export](https://pytorch.org/docs/main/export.html) to export the model, ensuring compatibility with ExecuTorch. + +```py +import torch +from transformers import LlamaForCausalLM, AutoTokenizer, GenerationConfig +from transformers.integrations.executorch import( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache +) + +generation_config = GenerationConfig( + use_cache=True, + cache_implementation="static", + cache_config={ + "batch_size": 1, + "max_cache_len": 20, + } +) + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", pad_token="", padding_side="right") +model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config) + +exported_program = convert_and_export_with_cache(model) +``` + +The exported PyTorch model is now ready to be used with ExecuTorch. Wrap the model with [`~transformers.TorchExportableModuleWithStaticCache`] to generate text. 
+ +```py +prompts = ["Simply put, the theory of relativity states that "] +prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) +prompt_token_ids = prompt_tokens["input_ids"] + +generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=20, +) +generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text) +['Simply put, the theory of relativity states that 1) the speed of light is the'] +``` diff --git a/docs/source/en/fast_tokenizers.md b/docs/source/en/fast_tokenizers.md index fdd05e1e85dc..921c0ba7b6f5 100644 --- a/docs/source/en/fast_tokenizers.md +++ b/docs/source/en/fast_tokenizers.md @@ -187,6 +187,41 @@ from transformers import PreTrainedTokenizerFast fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") ``` +## tiktoken + +[tiktoken](https://github.com/openai/tiktoken) is a [byte-pair encoding (BPE)](./tokenizer_summary#byte-pair-encoding-bpe) tokenizer by OpenAI. It includes several tokenization schemes or encodings for how text should be tokenized. + +There are currently two models trained and released with tiktoken, GPT2 and Llama3. Transformers supports models with a [tokenizer.model](https://hf.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) tiktoken file. The tiktoken file is automatically converted into Transformers Rust-based [`PreTrainedTokenizerFast`]. + +Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify where the `tokenizer.model` tiktoken file is located. + +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original") +``` + +### Create a tiktoken tokenizer + +The tiktoken `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json` (the appropriate format for [`PreTrainedTokenizerFast`]). + +Generate the tiktoken `tokenizer.model` file with the [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) function, and convert it to `tokenizer.json` with [convert_tiktoken_to_fast](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/integrations/tiktoken.py#L8). + +```py +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# Load your custom encoding or the one provided by OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +The resulting `tokenizer.json` file is saved to the specified directory and loaded with [`~PreTrainedTokenizerFast.from_pretrained`]. + +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` + ## Preprocess @@ -244,6 +279,9 @@ print(decoded_string)
+> [!TIP] +> Visualize how different tokenizers work in the [Tokenizer Playground](https://xenova-the-tokenizer-playground.static.hf.space). + ### Special tokens Special tokens provide the model with some additional information about the text. diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index 55ca7f425c9d..2533192ccb44 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -276,3 +276,14 @@ print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_token + +## Resources + +Take a look below for some more specific and specialized text generation libraries. + +- [Optimum](https://github.com/huggingface/optimum): an extension of Transformers focused on optimizing training and inference on specific hardware devices +- [Outlines](https://github.com/dottxt-ai/outlines): a library for constrained text generation (generate JSON files for example). +- [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python). +- [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs. +- [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation. +- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation. \ No newline at end of file diff --git a/docs/source/en/models.md b/docs/source/en/models.md index 5263c6c11885..aef309960101 100644 --- a/docs/source/en/models.md +++ b/docs/source/en/models.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. --> -# Loading +# Loading models Transformers provides many pretrained models that are ready to use with a single line of code. It requires a model class and the [`~PreTrainedModel.from_pretrained`] method. diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index eb5dcc8a848d..fd20e0b52219 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -22,12 +22,22 @@ Transformers supports many quantization methods, each with their pros and cons, Use the Space below to help you pick a quantization method depending on your hardware and number of bits to quantize to. - +| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library | +|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------| +| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 | 🟢 | 🟡 | 🔴 | 🟡 | 🔴 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | +| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | +| [GPTQModel](./gptq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | +| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | +| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | +| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | ## Resources diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index cb1291ce0df5..0ad4633c91cb 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -22,6 +22,8 @@ pip install --upgrade torch torchao transformers torchao supports int8 weight quantization and int8 dynamic quantization of weights. Create a [`TorchAoConfig`] and specify the quantization type and `group_size` of the weights to quantize. +Set the `cache_implementation` to `"static"` to automatically [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) the forward method. + ```py import torch from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer @@ -33,16 +35,14 @@ quantized_model = AutoModelForCausalLM.from_pretrained( device_map="auto", quantization_config=quantization_config ) -``` - -## torch.compile -Wrap the quantized model with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster generation. - -```py -import torchao +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B") +input_text = "What are we having for dinner?" +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") -quantized_model = torch.compile(quantized_model, mode="max-autotune") +# auto-compile the quantized model with `cache_implementation="static"` to get speed up +output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") +print(tokenizer.decode(output[0], skip_special_tokens=True)) ``` ## Serialization diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md deleted file mode 100644 index 4cddd02a8804..000000000000 --- a/docs/source/en/tiktoken.md +++ /dev/null @@ -1,59 +0,0 @@ - - -# tiktoken - -[tiktoken](https://github.com/openai/tiktoken) is a [byte-pair encoding (BPE)](./tokenizer_summary#byte-pair-encoding-bpe) tokenizer by OpenAI. It includes several tokenization schemes or encodings for how text should be tokenized. - -There are currently two models trained and released with tiktoken, GPT2 and Llama3. Transformers supports models with a [tokenizer.model](https://hf.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) tiktoken file. The tiktoken file is automatically converted into Transformers Rust-based [`PreTrainedTokenizerFast`]. 
- -Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify where the `tokenizer.model` tiktoken file is located. - -```py -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original") -``` - -## Create a tiktoken tokenizer - -The tiktoken `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json` (the appropriate format for [`PreTrainedTokenizerFast`]). - -Generate the tiktoken `tokenizer.model` file with the [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) function, and convert it to `tokenizer.json` with [convert_tiktoken_to_fast](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/integrations/tiktoken.py#L8). - -```py -from transformers.integrations.tiktoken import convert_tiktoken_to_fast -from tiktoken import get_encoding - -# Load your custom encoding or the one provided by OpenAI -encoding = get_encoding("gpt2") -convert_tiktoken_to_fast(encoding, "config/save/dir") -``` - -The resulting `tokenizer.json` file is saved to the specified directory and loaded with [`~PreTrainedTokenizerFast.from_pretrained`]. - -```py -tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") -``` - -Visualize how the tiktoken tokenizer works by selecting Llama3 in the Tokenizer Playground below. - - From 34dc6844bc41634ac9f2aed2057abdaef01c81e7 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Mon, 3 Feb 2025 10:19:12 -0800 Subject: [PATCH 111/116] updates/feedback --- docs/source/en/add_new_model.md | 58 ++++++++++++++-------------- docs/source/en/model_doc/got_ocr2.md | 4 ++ 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index 252f06880598..aca5c8c55f61 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -30,7 +30,7 @@ When you add a model to Transformers, you'll learn: It is a challenging but rewarding process. -This guide will walk you through adding an example BrandNewBert PyTorch model to Transformers. Before you begin, it is a good idea to familiarize yourself with the library. +This guide will walk you through adding an example BrandNewLlama PyTorch model to Transformers. Before you begin, it is a good idea to familiarize yourself with the library. ## Transformers overview @@ -53,16 +53,16 @@ This section describes how the model and configuration classes interact and the All Transformers' models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the models blueprint. -There is never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewBert, inherits from `BrandNewBertPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods. +There is never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewBert, inherits from `BrandNewLlamaPreTrainedModel` and [`PreTrainedModel`]. 
It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods. Other important functions like the forward method are defined in the `modeling.py` file. Specific model heads (for example, sequence classification or language modeling) should call the base model in the forward pass rather than inherting from it to keep abstraction low. -New models require a configuration, for example `BrandNewBertConfig`, that is stored as an attribute of [`PreTrainedModel`]. +New models require a configuration, for example `BrandNewLlamaConfig`, that is stored as an attribute of [`PreTrainedModel`]. ```py -model = BrandNewBertModel.from_pretrained("username/brand_new_bert") +model = BrandNewLlamaModel.from_pretrained("username/brand_new_llama") model.config ``` @@ -102,7 +102,7 @@ Now is a good time to get familiar with BrandNewBert. It is helpful to read a mo In addition to learning more about your model, use the tips below to help you add a model faster. > [!TIP] -> Each contributor has a unique style and workflow for adding models to Transformers. Take a look at how [Gemma](https://github.com/huggingface/transformers/pull/29167) was added for an example. +> Each contributor has a unique style and workflow for adding models to Transformers. For an example, take a look at how [Gemma](https://github.com/huggingface/transformers/pull/29167) was added. - Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. [Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this. - This is more of an engineering than a science challenge. Focus on the more practical (setting up an efficient debugging environment for example) instead of the theorertical aspects of the model. @@ -135,7 +135,7 @@ pip install -e ".[quality]" Return to the parent directory and clone and install the original BrandNewBert repository. ```bash -git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +git clone https://github.com/org_that_created_brand_new_llama_org/brand_new_llama.git cd brand_new_bert pip install -e . ``` @@ -166,7 +166,7 @@ transformers-cli add-new-model-like ## Create a pull request -Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request **[WIP] Add BrandNewBert** so it's clear that this is a work in progress. +Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request **[WIP] Add BrandNewLlama** so it's clear that this is a work in progress. Create a branch with a descriptive name from your main branch. @@ -218,9 +218,9 @@ The last point is especially important because you'll need a thorough understand A good first step is to load a *small* pretrained checkpoint and try to reproduce a single forward pass with an example integer vector of inputs. For example, in pseudocode, this could look like the following. 
```py -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/") input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids -original_output = model.predict(input_ids) +original_output = model.generate(input_ids) ``` ### Debugging @@ -294,10 +294,10 @@ Once you're able to run the original checkpoint, you're ready to start adapting The `transformers-cli add-new-model-like` command should have generated a model and configuration file. -- `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` -- `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py` +- `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py` +- `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py` -The automatically generated code in the `modeling.py` file has the same architecture as BERT if you answered it's an encoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on. +The automatically generated code in the `modeling.py` file has the same architecture as Llama if you answered it's a decoder-only model or it will have the same architecture as BART if you answered it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on. ### Model initialization @@ -308,7 +308,7 @@ from transformers import BrandNewBert, BrandNewBertConfig model = BrandNewBert(BrandNewBertConfig()) ``` -Random initialization occurs in the `_init_weights` method of `BrandNewBertPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables. +Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables. ```py def _init_weights(self, module): @@ -356,7 +356,7 @@ The original checkpoint must be converted to a Transformers compatible checkpoin Make sure **all** required weights are initialized and print out all the checkpoint weights that weren't used for initialization to make sure the model has been converted correctly. -You may encounter wrong shape statements or name assignments during the conversion. This is most likely because of incorrect parameters in `BrandNewBertConfig`, the wrong architecture, a bug in the `init` method of your implementation, or you need to transpose one of the checkpoint weights. +You may encounter wrong shape statements or name assignments during the conversion. This is most likely because of incorrect parameters in `BrandNewLlamaConfig`, the wrong architecture, a bug in the `init` method of your implementation, or you need to transpose one of the checkpoint weights. Keep iterating on the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file. 
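As a rough illustration of the conversion step described above, a conversion script might look like the sketch below. The original checkpoint path, the parameter-name mapping, and the configuration values are hypothetical and will differ for a real model.

```py
# Hypothetical conversion sketch: the checkpoint path and key mapping are made up for illustration
import torch
from transformers import BrandNewLlamaConfig, BrandNewLlamaModel

original_state_dict = torch.load("/path/to/original/checkpoint.bin", map_location="cpu")

# map the original parameter names to the Transformers naming scheme
converted_state_dict = {}
for old_name, tensor in original_state_dict.items():
    new_name = old_name.replace("transformer.blocks", "model.layers")
    converted_state_dict[new_name] = tensor

config = BrandNewLlamaConfig()  # fill in with the original model's hyperparameters
model = BrandNewLlamaModel(config)

# strict=True surfaces any missing or unexpected keys so nothing is silently dropped
model.load_state_dict(converted_state_dict, strict=True)

# writes model.safetensors and config.json to the folder
model.save_pretrained("/path/to/converted/checkpoint/folder")
```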
@@ -440,16 +440,16 @@ assert ( logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") ``` -When the shape or name don't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because the `BrandNewBert` parameters don't exactly match the original models parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first. +When the shape or name don't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because the `BrandNewLlama` parameters don't exactly match the original models parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first. ### Implement the forward pass The forward pass should be implemented next if the model loads correctly. It takes some inputs and returns the model output. ```py -model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") +model = BrandNewLlamaModel.from_pretrained("/path/to/converted/checkpoint/folder") input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] -output = model(input_ids).last_hidden_states +output = model.generate(input_ids).last_hidden_states ``` Don't be discouraged if your forward pass isn't identical with the output from the original model or if it returns an error. Check that the forward pass doesn't throw any errors. This is often because the dimensions are wrong or because the wrong data type is used ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)). @@ -487,29 +487,29 @@ While the model works, you still need to add tests to ensure it is compatible wi [Cookiecutter](https://cookiecutter.readthedocs.io/en/stable/) should have added a test file for your model. Run the test file below to make sure all common tests pass. ```bash -pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py +pytest tests/models/brand_new_llama/test_modeling_brand_new_llama.py ``` -The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewBertModelIntegrationTests`, was added by Cookiecutter and should be filled out. To ensure it passes, run the following command. +The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewLlamaModelIntegrationTests`, was added by Cookiecutter and should be filled out. To ensure it passes, run the following command. ```bash -RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests ``` ```bash -SET RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests +SET RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests ``` -All features unique to BrandNewBert should be tested in a separate test under `BrandNewBertModelTester/BrandNewBertModelTest`. 
This test is often overlooked, but it is extremely important because: +All features unique to BrandNewBert should be tested in a separate test under `BrandNewLlamaModelTester/BrandNewLlamaModelTest`. This test is often overlooked, but it is extremely important because: - it helps transfer knowledge you acquired during the process to the community by showing how the models novel features work - future contributors can quickly test changes to the model by running these special tests @@ -525,17 +525,17 @@ Find and load the original tokenizer file into your implementation. Create a scr ```py input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/") input_ids = model.tokenize(input_str) ``` You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look similar to the following. ```py -from transformers import BrandNewBertTokenizer +from transformers import BrandNewLlamaTokenizer input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." -tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") +tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/") input_ids = tokenizer(input_str).input_ids ``` @@ -543,7 +543,7 @@ When both implementations have the same `input_ids`, add a tokenizer test file. ## Integration tests -Now that you have a model and tokenizer, add end-to-end integration tests for the model and tokenizer to `tests/models/brand_new_bert/test_modeling_brand-new_bert.py`. +Now that you have a model and tokenizer, add end-to-end integration tests for the model and tokenizer to `tests/models/brand_new_llama/test_modeling_brand_new_llama.py`. The test should provide a meaningful text-to-text example to show the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair. @@ -553,11 +553,11 @@ Finally, try to make sure your tests can run on a GPU by adding `.to(self.device ## Add documentation -Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. Cookiecutter added a template file, `docs/source/model_doc/brand_new_bert.md`, that you can fill out with information about your model. +Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. Cookiecutter added a template file, `docs/source/model_doc/brand_new_llama.md`, that you can fill out with information about your model. This is generally a user's first interaction with a model, so the documentation should be clear and concise. It is often very useful to add examples of how the model should be used. -Make sure docstrings are added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and includes all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings. 
+Make sure docstrings are added to `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py` and includes all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings. ## Refactor @@ -589,7 +589,7 @@ You should also consult with the Transformers team to decide on an appropriate n Use the [`~PreTrainedModel.push_to_hub`] method to upload the model. ```py -brand_new_bert.push_to_hub("brand_new_bert") +brand_new_bert.push_to_hub("brand_new_llama") ``` Refer to the [Sharing](./model_sharing) guide for more information about uploading models to the Hub. diff --git a/docs/source/en/model_doc/got_ocr2.md b/docs/source/en/model_doc/got_ocr2.md index a560f78269cc..a08761f452dd 100644 --- a/docs/source/en/model_doc/got_ocr2.md +++ b/docs/source/en/model_doc/got_ocr2.md @@ -16,6 +16,10 @@ rendered properly in your Markdown viewer. # GOT-OCR2 +
+PyTorch +
+ ## Overview The GOT-OCR2 model was proposed in [General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model](https://arxiv.org/abs/2409.01704) by Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang. From e8ca4009bb25f2e603a44b3879af3cbdbe3b76bb Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 11 Feb 2025 09:52:42 -0800 Subject: [PATCH 112/116] review --- docs/source/en/add_new_model.md | 24 ++--- docs/source/en/add_new_pipeline.md | 6 +- docs/source/en/chat_extras.md | 2 +- docs/source/en/chat_templating_multimodal.md | 4 +- docs/source/en/generation_features.md | 2 +- docs/source/en/generation_strategies.md | 12 +-- docs/source/en/how_to_hack_models.md | 6 +- docs/source/en/installation.md | 4 +- docs/source/en/llm_tutorial.md | 10 +-- docs/source/en/llm_tutorial_optimization.md | 1 + docs/source/en/model_doc/dab-detr.md | 4 + docs/source/en/model_doc/depth_pro.md | 4 + docs/source/en/model_doc/rt_detr_v2.md | 4 + docs/source/en/models.md | 4 +- docs/source/en/quicktour.md | 2 +- docs/source/en/torchscript.md | 2 +- docs/source/en/trainer.md | 15 ++++ docs/source/en/training.md | 93 -------------------- 18 files changed, 67 insertions(+), 132 deletions(-) diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md index aca5c8c55f61..bfab511972e7 100644 --- a/docs/source/en/add_new_model.md +++ b/docs/source/en/add_new_model.md @@ -53,7 +53,7 @@ This section describes how the model and configuration classes interact and the All Transformers' models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the models blueprint. -There is never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewBert, inherits from `BrandNewLlamaPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods. +There is never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewLlama, inherits from `BrandNewLlamaPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods. Other important functions like the forward method are defined in the `modeling.py` file. @@ -91,13 +91,13 @@ Open a [New model addition](https://github.com/huggingface/transformers/issues/n > [!TIP] > Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub to view and add any existing model requests. -Now is a good time to get familiar with BrandNewBert. It is helpful to read a models research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading. +Now is a good time to get familiar with BrandNewLlama. It is helpful to read a models research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading. -- What type of model is BrandNewBert? 
Is it a encoder, decoder, or encoder-decoder model? -- What tasks can BrandNewBert be used for? -- What makes BrandNewBert different from other models? -- What models in Transformers are most similar to BrandNewBert? -- What tokenizer does BrandNewBert use? +- What type of model is BrandNewLlama? Is it a encoder, decoder, or encoder-decoder model? +- What tasks can BrandNewLlama be used for? +- What makes BrandNewLlama different from other models? +- What models in Transformers are most similar to BrandNewLlama? +- What tokenizer does BrandNewLlama use? In addition to learning more about your model, use the tips below to help you add a model faster. @@ -132,7 +132,7 @@ Due to the number of optional dependencies as Transformers grows, this command m pip install -e ".[quality]" ``` -Return to the parent directory and clone and install the original BrandNewBert repository. +Return to the parent directory and clone and install the original BrandNewLlama repository. ```bash git clone https://github.com/org_that_created_brand_new_llama_org/brand_new_llama.git @@ -140,7 +140,7 @@ cd brand_new_bert pip install -e . ``` -Return to your clone of Transformers to begin porting BrandNewBert. +Return to your clone of Transformers to begin porting BrandNewLlama. ```bash cd transformers @@ -304,8 +304,8 @@ The automatically generated code in the `modeling.py` file has the same architec At this point, your code doesn't have to be clean or even fully correct, It is more efficient to quickly create a first draft and then iteratively improve on it. The most important thing is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the `__init__` method works. ```py -from transformers import BrandNewBert, BrandNewBertConfig -model = BrandNewBert(BrandNewBertConfig()) +from transformers import BrandNewLlama, BrandNewLlamaConfig +model = BrandNewLlama(BrandNewLlamaConfig()) ``` Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables. @@ -509,7 +509,7 @@ SET RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_l -All features unique to BrandNewBert should be tested in a separate test under `BrandNewLlamaModelTester/BrandNewLlamaModelTest`. This test is often overlooked, but it is extremely important because: +All features unique to BrandNewLlama should be tested in a separate test under `BrandNewLlamaModelTester/BrandNewLlamaModelTest`. This test is often overlooked, but it is extremely important because: - it helps transfer knowledge you acquired during the process to the community by showing how the models novel features work - future contributors can quickly test changes to the model by running these special tests diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index c0029146b207..60ef43dab585 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -52,7 +52,7 @@ def preprocess(self, inputs, maybe_arg=2): return {"model_input": model_input} ``` -1. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`. +2. `_forward` shouldn't be called directly. 
`forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`. ```py def _forward(self, model_inputs): @@ -60,7 +60,7 @@ def _forward(self, model_inputs): return outputs ``` -1. `postprocess` generates the final output from the models output in `_forward`. +3. `postprocess` generates the final output from the models output in `_forward`. ```py def postprocess(self, model_outputs, top_k=5): @@ -68,7 +68,7 @@ def postprocess(self, model_outputs, top_k=5): return best_class ``` -1. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural. +4. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural. For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`. diff --git a/docs/source/en/chat_extras.md b/docs/source/en/chat_extras.md index 11c195741c0a..026268500b13 100644 --- a/docs/source/en/chat_extras.md +++ b/docs/source/en/chat_extras.md @@ -240,7 +240,7 @@ model_input = tokenizer.apply_chat_template( ) ``` -## Retrieval-augmented generation (RAG) +## RAG Retrieval-augmented generation (RAG) models enhance a models existing knowledge by allowing it to search documents for additional information before returning a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `content` keys. diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md index 509cf452819b..4ac936b2a856 100644 --- a/docs/source/en/chat_templating_multimodal.md +++ b/docs/source/en/chat_templating_multimodal.md @@ -50,7 +50,7 @@ messages = [ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory. > [!TIP] -> [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible. +> The [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible. ```python import torch @@ -158,7 +158,7 @@ These inputs are now ready to be used in [`~GenerationMixin.generate`]. 
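Assuming `model` and `processor` were loaded as in the surrounding guide, a short sketch of passing the processed inputs to [`~GenerationMixin.generate`] and decoding only the newly generated tokens could look like this.

```py
# Sketch: generate a response from the processed chat inputs (token budget is illustrative)
generated_ids = model.generate(**inputs, max_new_tokens=100)

# trim the prompt tokens so only the model's reply is decoded
response = processor.batch_decode(
    generated_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(response)
```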
## Template configuration -You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details. +You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details. For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json). diff --git a/docs/source/en/generation_features.md b/docs/source/en/generation_features.md index 110d4f76b2bb..56163ba01d92 100644 --- a/docs/source/en/generation_features.md +++ b/docs/source/en/generation_features.md @@ -55,7 +55,7 @@ Watermarking is supported for any generative model in Transformers and doesn't r Create a [`WatermarkingConfig`] with the bias value to add to the logits and watermarking algorithm. The example below uses the `"selfhash"` algorithm, where the green token selection only depends on the current token. Pass the [`WatermarkingConfig`] to [`~GenerationMixin.generate`]. > [!TIP] -> [`WatermarkDetector`] detects the proportion of green tokens in generated text, which is why it is recommended to strip the prompt text, if it is much longer than the generated text. Padding can also have an effect on [`WatermarkDetector`]. +> The [`WatermarkDetector`] class detects the proportion of green tokens in generated text, which is why it is recommended to strip the prompt text, if it is much longer than the generated text. Padding can also have an effect on [`WatermarkDetector`]. ```py from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 6899cbf61645..706443906ad5 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -64,7 +64,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True) Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability. -> [TIP] +> [!TIP] > Check out the [beam search visualizer](https://huggingface.co/spaces/m-ric/beam_search_visualizer) to see how beam search works. Enable beam search with the `num_beams` parameter (should be greater than 1 otherwise it's equivalent to greedy search). @@ -144,7 +144,7 @@ outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams= ## Speculative decoding -[Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Instead, speculative decoding adds a second smaller model to generate candidate tokens. 
The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more. +[Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more. Currently, only greedy search and multinomial sampling are supported with speculative decoding. Batched inputs aren't supported either. @@ -281,10 +281,10 @@ Enable DoLa with the following parameters. It can also be a list of integers that represent the layer indices between 0 and the total number of layers. Layer 0 is the word embedding, 1 is the first transformer layer, and so on. Refer to the table below for the range of layer indices depending on the number of model layers. -| layers | low | high | -|---|---|---| -| > 40 | (0, 20, 2) | (N - 20, N, 2) | -| <= 40 | range(0, N // 2, 2) | range(N // 2, N, 2) | + | layers | low | high | + |---|---|---| + | > 40 | (0, 20, 2) | (N - 20, N, 2) | + | <= 40 | range(0, N // 2, 2) | range(N // 2, N, 2) | - `repetition_penalty` reduces repetition and it is recommended to set it to 1.2. diff --git a/docs/source/en/how_to_hack_models.md b/docs/source/en/how_to_hack_models.md index 550ac85d4f50..c57e141a1033 100644 --- a/docs/source/en/how_to_hack_models.md +++ b/docs/source/en/how_to_hack_models.md @@ -112,7 +112,7 @@ model = SamModel.from_pretrained("facebook/sam-vit-base") With separate `q`, `k`, and `v` projections, apply LoRA to `q` and `v`. -Create a [`~peft.LoraConfig`] and specify the rank `r`, `lora_alpha`, `lora_dropout`, `task_type`, and most importantly, the modules to target. +Create a [LoraConfig](https://huggingface.co/docs/peft/package_reference/config#peft.PeftConfig) and specify the rank `r`, `lora_alpha`, `lora_dropout`, `task_type`, and most importantly, the modules to target. ```py from peft import LoraConfig, get_peft_model @@ -127,13 +127,13 @@ config = LoraConfig( ) ``` -Pass the model and [`~peft.LoraConfig`] to [`~peft.get_peft_model`] to apply LoRA to the model. +Pass the model and [LoraConfig](https://huggingface.co/docs/peft/package_reference/config#peft.PeftConfig) to [get_peft_model](https://huggingface.co/docs/peft/package_reference/peft_model#peft.get_peft_model) to apply LoRA to the model. ```py model = get_peft_model(model, config) ``` -Call [`~peft.PeftModel.print_trainable_parameters`] to view the number of parameters you're training as a result versus the total number of parameters. +Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftMixedModel.print_trainable_parameters) to view the number of parameters you're training as a result versus the total number of parameters. 
```py model.print_trainable_parameters() diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 46d4aa385bdb..2c7ddabb8783 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -29,7 +29,7 @@ A virtual environment helps manage different projects and avoids compatibility i -Create and activate a virtual environment in your project directory with venv or uv. +Create and activate a virtual environment in your project directory with [venv](https://docs.python.org/3/library/venv.html). ```bash python -m venv .env @@ -74,7 +74,7 @@ uv pip install transformers -For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and TensorFlow(https://www.tensorflow.org/install/pip). +For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and [TensorFlow](https://www.tensorflow.org/install/pip). Run the command below to check if your system detects an NVIDIA GPU. diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index 2533192ccb44..e5c254debf21 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -80,7 +80,7 @@ GenerationConfig { } ``` -You can customize [`~GenerationMixin.generate`] by overriding the parameters and values in [`GenerationConfig`]. Some of the most commonly adjusted parameters are [`~GenerationConfig.max_new_tokens`], [`~GenerationConfig.num_beams`], [`~GenerationConfig.do_sample`], and [`~GenerationConfig.num_return_sequences`]. +You can customize [`~GenerationMixin.generate`] by overriding the parameters and values in [`GenerationConfig`]. Some of the most commonly adjusted parameters are [max_new_tokens](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.max_new_tokens), [num_beams](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.num_beams), [do_sample](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.do_sample), and [num_return_sequences](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.num_return_sequences). ```py # enable beam search sampling strategy @@ -138,7 +138,7 @@ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) The section below covers some common issues you may encounter during text generation and how to solve them. -## Wrong output length +### Output length [`~GenerationMixin.generate`] returns up to 20 tokens by default unless otherwise specified in a models [`GenerationConfig`]. It is highly recommended to manually set the number of generated tokens with the [`max_new_tokens`] parameter to control the output length. [Decoder-only](https://hf.co/learn/nlp-course/chapter1/6?fw=pt) models returns the initial prompt along with the generated tokens. @@ -167,7 +167,7 @@ tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -## Wrong decoding strategy +### Decoding strategy The default decoding strategy in [`~GenerationMixin.generate`] is *greedy search*, which selects the next most likely token, unless otherwise specified in a models [`GenerationConfig`]. While this decoding strategy works well for input-grounded tasks (transcription, translation), it is not optimal for more creative use cases (story writing, chat applications). 
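For the creative use cases mentioned above, a sampling-based strategy is usually a better fit than greedy search. The sketch below assumes `model`, `tokenizer`, and `model_inputs` are set up as earlier in the tutorial, and the sampling values are illustrative.

```py
# Sketch: switch from greedy search to multinomial sampling (values are illustrative)
outputs = model.generate(
    **model_inputs,
    do_sample=True,    # sample from the distribution instead of picking the most likely token
    temperature=0.7,   # soften the distribution for more varied text
    top_p=0.9,         # nucleus sampling
    max_new_tokens=50,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```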
@@ -196,7 +196,7 @@ tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -## Wrong padding side +### Padding side Inputs need to be padded if they don't have the same length. But LLMs aren't trained to continue generation from padding tokens, which means the [`~PreTrainedTokenizer.padding_side`] parameter needs to be set to the left of the input. @@ -229,7 +229,7 @@ tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -## Wrong prompt format +### Prompt format Some models and tasks expect a certain input prompt format, and if the format is incorrect, the model returns a suboptimal output. You can learn more about prompting in the [prompt engineering](./tasks/prompting) guide. diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 1fd458d430f6..ef972789553a 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -8,6 +8,7 @@ specific language governing permissions and limitations under the License. ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> + # Optimizing LLMs for Speed and Memory [[open-in-colab]] diff --git a/docs/source/en/model_doc/dab-detr.md b/docs/source/en/model_doc/dab-detr.md index 6071ee6ca460..d19b45b486b0 100644 --- a/docs/source/en/model_doc/dab-detr.md +++ b/docs/source/en/model_doc/dab-detr.md @@ -16,6 +16,10 @@ rendered properly in your Markdown viewer. # DAB-DETR +
+PyTorch +
+ ## Overview The DAB-DETR model was proposed in [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https://arxiv.org/abs/2201.12329) by Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, Lei Zhang. diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 2447b7d93dd5..91cbb61907e6 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -16,6 +16,10 @@ rendered properly in your Markdown viewer. # DepthPro +
+PyTorch +
+ ## Overview The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. diff --git a/docs/source/en/model_doc/rt_detr_v2.md b/docs/source/en/model_doc/rt_detr_v2.md index 0c125af3d2a1..e5212d945ce7 100644 --- a/docs/source/en/model_doc/rt_detr_v2.md +++ b/docs/source/en/model_doc/rt_detr_v2.md @@ -16,6 +16,10 @@ rendered properly in your Markdown viewer. # RT-DETRv2 +
+PyTorch +
+ ## Overview The RT-DETRv2 model was proposed in [RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140) by Wenyu Lv, Yian Zhao, Qinyao Chang, Kui Huang, Guanzhong Wang, Yi Liu. diff --git a/docs/source/en/models.md b/docs/source/en/models.md index aef309960101..cc897dcc958f 100644 --- a/docs/source/en/models.md +++ b/docs/source/en/models.md @@ -21,7 +21,7 @@ Transformers provides many pretrained models that are ready to use with a single Call [`~PreTrainedModel.from_pretrained`] to download and load a models weights and configuration stored on the Hugging Face [Hub](https://hf.co/models). > [!TIP] -> [`~PreTrainedModel.from_pretrained`] loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load. +> The [`~PreTrainedModel.from_pretrained`] method loads weights stored in the [safetensors](https://hf.co/docs/safetensors/index) file format if they're available. Traditionally, PyTorch model weights are serialized with the [pickle](https://docs.python.org/3/library/pickle.html) utility which is known to be unsecure. Safetensor files are more secure and faster to load. ```py from transformers import AutoModelForCausalLM @@ -232,7 +232,7 @@ index["weight_map"] -[`~PreTrainedModel.from_pretrained`] method is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature. +[`~PreTrainedModel.from_pretrained`] is supercharged with Accelerate's [Big Model Inference](https://hf.co/docs/accelerate/usage_guides/big_modeling) feature. Big Model Inference creates a *model skeleton* on the PyTorch [meta](https://pytorch.org/docs/main/meta.html) device. The meta device doesn't store any real data, only the metadata. diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 81fcacf20756..4b6b6869bfa4 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -83,7 +83,7 @@ Use [`~PreTrainedModel.from_pretrained`] to load the weights and configuration f When you load a model, configure the following parameters to ensure the model is optimally loaded. - `device_map="auto"` automatically allocates the model weights to your fastest device first, which is typically the GPU. -- `torch_dtype="auto"` directly initializes the model weights in the data type they're stored in, which can help avoid loading the weights twice (PyTorch loads weights in torch.float32 by default). +- `torch_dtype="auto"` directly initializes the model weights in the data type they're stored in, which can help avoid loading the weights twice (PyTorch loads weights in `torch.float32` by default). ```py from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/docs/source/en/torchscript.md b/docs/source/en/torchscript.md index ae3c10f77da8..75d66e454837 100644 --- a/docs/source/en/torchscript.md +++ b/docs/source/en/torchscript.md @@ -54,7 +54,7 @@ The trace is created based on the provided inputs dimensions and it can only han Try to create a trace with a dummy input size at least as large as the largest expected input during inference. Padding can help fill missing values for larger inputs. It may be slower though since a larger input size requires more calculations. 
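
A minimal sketch of the tracing advice above, assuming a BERT-style encoder — the checkpoint name and `max_length` are placeholder choices, and the `torchscript=True` flag also ties into the tied-weights behavior discussed next:

```py
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# placeholder checkpoint; torchscript=True prepares the model for TorchScript export
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
model.eval()

# dummy input padded to the largest sequence length expected at inference time
dummy = tokenizer("placeholder text", padding="max_length", max_length=128, return_tensors="pt")

# trace with positional inputs and save the exported module
traced_model = torch.jit.trace(model, (dummy["input_ids"], dummy["attention_mask"]))
torch.jit.save(traced_model, "traced_model.pt")
```

The saved module can be reloaded with `torch.jit.load("traced_model.pt")` and called with tensors of the same shape that was used for tracing.
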
Be mindful of the total number of operations performed on each input and track the model performance when exporting models with variable sequence lengths. -## torchscript parameter +## Tied weights Weights between the `Embedding` and `Decoding` layers are tied in Transformers and TorchScript can't export models with tied weights. Instantiating a model with `torchscript=True`, separates the `Embedding` and `Decoding` layers and they aren't trained any further because it would throw the two layers out of sync which can lead to unexpected results. diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 0459916d7ad9..2cd0a2e99cf9 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -369,6 +369,21 @@ accelerate launch \ [`Trainer`] supports various optimizations to improve *training* performance - reduce memory and increase training speed - and *model* performance. +### torch.compile + +[torch.compile](./perf_torch_compile) can significantly speed up training and reduce computational overhead. Configure your torch.compile settings in [`TrainingArguments`]. Set `torch.compile` to `True`, and select a backend and compile mode. + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + torch.compile=True, + torch.compile_backend="inductor", + torch_compile_mode="default", + ..., +) +``` + ### GaLore [Gradient Low-Rank Projection (GaLore)](https://hf.co/papers/2403.03507) significantly reduces memory usage when training large language models (LLMs). One of GaLores key benefits is *full-parameter* learning, unlike low-rank adaptation methods like [LoRA](https://hf.co/papers/2106.09685), which produces better model performance. diff --git a/docs/source/en/training.md b/docs/source/en/training.md index d056076c5bfa..7f2a622b4840 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -173,96 +173,3 @@ model.fit(tf_dataset) ## Resources Refer to the Transformers [examples](https://github.com/huggingface/transformers/tree/main/examples) for more detailed training scripts on various tasks. You can also check out the [notebooks](./notebooks) for interactive examples. - -### Fine-Tuning with torch.compile and Padding-Free Data Collation - -In addition to optimizing inference, you can also enhance the training efficiency of large language models by leveraging torch.compile during fine-tuning and using a padding-free data collator. This approach can significantly speed up training and reduce computational overhead. 
- -Here's how you can fine-tune a Llama model using SFTTrainer from the TRL library, with torch_compile enabled and a padding-free data collator: - -``` -#################### IMPORTS ################### - -import math -import datasets -import dataclasses -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - TrainingArguments -) -from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM - -#################### MODEL LOADING WITH FLASH ATTENTION ################### - -model_name = "meta-llama/Llama-3.2-1B" -model = AutoModelForCausalLM.from_pretrained( - model_name, - attn_implementation="flash_attention_2" # Enables FlashAttention-2 -) -tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) - -#################### DATA PREPROCESSING (PADDING-FREE) ################### - -response_template = "\n### Label:" -response_template_ids = tokenizer.encode( - response_template, add_special_tokens=False -)[2:] # Exclude special tokens - -data_collator = DataCollatorForCompletionOnlyLM( - response_template_ids=response_template_ids, - tokenizer=tokenizer, - ignore_index=-100, - padding_free=True # Enables padding-free collation -) - -def format_dataset(example): - return { - "output": example["output"] + tokenizer.eos_token - } - -data_files = {"train": "path/to/dataset"} # Replace with your dataset path -json_dataset = datasets.load_dataset("json", data_files=data_files) -formatted_train_dataset = json_dataset["train"].map(format_dataset) - -################# TRAINING CONFIGURATION ############################ - -train_args = TrainingArguments( - num_train_epochs=5, - per_device_train_batch_size=4, - per_device_eval_batch_size=4, - gradient_accumulation_steps=4, - learning_rate=1e-5, - weight_decay=0.0, - warmup_ratio=0.03, - lr_scheduler_type="cosine", - logging_steps=1, - include_tokens_per_second=True, - save_strategy="epoch", - output_dir="output", - torch_compile=True, # Enables torch.compile - torch_compile_backend="inductor", - torch_compile_mode="default" -) - -# Convert TrainingArguments to SFTConfig -transformer_train_arg_fields = [x.name for x in dataclasses.fields(SFTConfig)] -transformer_kwargs = { - k: v - for k, v in train_args.to_dict().items() - if k in transformer_train_arg_fields -} -training_args = SFTConfig(**transformer_kwargs) - -####################### FINE-TUNING ##################### - -trainer = SFTTrainer( - model=model, - tokenizer=tokenizer, - train_dataset=formatted_train_dataset, - data_collator=data_collator, - dataset_text_field="output", - args=training_args, -) -trainer.train() -``` \ No newline at end of file From 4164a1b77d32d1d0cdc13b4a2a02b2e9cf3eeecd Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 12 Feb 2025 11:13:47 -0800 Subject: [PATCH 113/116] feedback --- docs/source/en/how_to_hack_models.md | 15 +++++ docs/source/en/optimizers.md | 66 ++++++++++++++++++++- docs/source/en/perf_infer_gpu_multi.md | 17 +++++- docs/source/en/quantization/bitsandbytes.md | 10 ++-- 4 files changed, 100 insertions(+), 8 deletions(-) diff --git a/docs/source/en/how_to_hack_models.md b/docs/source/en/how_to_hack_models.md index c57e141a1033..635698b5ab52 100644 --- a/docs/source/en/how_to_hack_models.md +++ b/docs/source/en/how_to_hack_models.md @@ -19,6 +19,21 @@ Another way to customize a model is to modify their components, rather than writ This guide will show you how to customize a models attention mechanism in order to apply [Low-Rank Adaptation 
(LoRA)](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) to it. +> [!TIP] +> The [clear_import_cache](https://github.com/huggingface/transformers/blob/9985d06add07a4cc691dc54a7e34f54205c04d40/src/transformers/utils/import_utils.py#L2286) utility is very useful when you're iteratively modifying and developing model code. It removes all cached Transformers modules and allows Python to reload the modified code without constantly restarting your environment. +> +> ```py +> from transformers import AutoModel +> from transformers.utils.import_utils import clear_import_cache +> +> model = AutoModel.from_pretrained("bert-base-uncased") +> # modifications to model code +> # clear cache to reload modified code +> clear_import_cache() +> # re-import to use updated code +> model = AutoModel.from_pretrained("bert-base-uncased") +> ``` + ## Attention class [Segment Anything](./model_doc/sam) is an image segmentation model, and it combines the query-key-value (`qkv`) projection in its attention mechanims. To reduce the number of trainable parameters and computational overhead, you can apply LoRA to the `qkv` projection. This requires splitting the `qkv` projection so that you can separately target the `q` and `v` with LoRA. diff --git a/docs/source/en/optimizers.md b/docs/source/en/optimizers.md index ae4637b2b6b0..a02b02c359c9 100644 --- a/docs/source/en/optimizers.md +++ b/docs/source/en/optimizers.md @@ -36,6 +36,65 @@ args = TrainingArguments( ) ``` +## APOLLO + +```bash +pip install apollo-torch +``` + +[Approximated Gradient Scaling for Memory Efficient LLM Optimization (APOLLO)](https://github.com/zhuhanqing/APOLLO) is a memory-efficient optimizer that allows full parameter learning for both pretraining and fine-tuning. It maintains AdamW-level performance with SGD-like memory efficiency. For extreme memory efficiency, you can use APOLLO-Mini, a rank 1 variant of APOLLO. APOLLO optimizers support: + +* Ultra-low rank efficiency. You can use a much lower rank than [GaLoRE](./trainer#galore), rank 1 is sufficient. +* Avoid expensive SVD computations. APOLLO leverages random projections to avoid training stalls. + +Use the `optim_target_modules` parameter to specify which layers to train. + +```diff +import torch +from transformers import TrainingArguments + +args = TrainingArguments( + output_dir="./test-apollo", + max_steps=100, + per_device_train_batch_size=2, ++ optim="apollo_adamw", ++ optim_target_modules=[r".*.attn.*", r".*.mlp.*"], + logging_strategy="steps", + logging_steps=1, + learning_rate=2e-5, + save_strategy="no", + run_name="apollo_adamw", +) +``` + +For additional training options, use `optim_args` to define hyperparameters like `rank`, `scale`, and more. Refer to the table below for a complete list of available hyperparameters. + +> [!TIP] +> The `scale` parameter can be set to `n/r`, where `n` is the original space dimension and `r` is the low-rank space dimension. You could achieve a similar effect by adjusting the learning rate while keeping `scale` at its default value. 
+ +| parameter | description | APOLLO | APOLLO-Mini | +|---|---|---|---| +| rank | rank of the auxiliary sub-space for gradient scaling | 256 | 1 | +| scale_type | how scaling factors are applied | `channel` (per-channel scaling) | `tensor` (per-tensor scaling) | +| scale | adjusts gradient updates to stabilize training | 1.0 | 128 | +| update_proj_gap | steps before updating projection matrices | 200 | 200 | +| proj | projection type | `random` | `random` | + +The example below enables the APOLLO-Mini optimizer. + +```py +from transformers import TrainingArguments + +args = TrainingArguments( + output_dir="./test-apollo_mini", + max_steps=100, + per_device_train_batch_size=2, + optim="apollo_adamw", + optim_target_modules=[r".*.attn.*", r".*.mlp.*"], + optim_args="proj=random,rank=1,scale=128.0,scale_type=tensor,update_proj_gap=200", +) +``` + ## GrokAdamW ```bash @@ -96,14 +155,17 @@ pip install schedulefree [Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682) replaces the base optimizers momentum with a combination of averaging and interpolation. Unlike a traditional scheduler, SFO completely removes the need to anneal the learning rate. -SFO supports both the AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. +SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps` or `warmup_ratio`. + +By default, it is recommended to set `lr_scheduler_type="constant"`. Other `lr_scheduler_type` values may also work, but combining SFO optimizers with other learning rate schedules could affect SFOs intended behavior and performance. ```diff args = TrainingArguments( output_dir="./test-schedulefree", max_steps=1000, per_device_train_batch_size=4, -+ optim="schedule_free_adamw", ++ optim="schedule_free_radamw, ++ lr_scheduler_type="constant", gradient_checkpointing=True, logging_strategy="steps", logging_steps=1, diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 19bb9f4394be..995e086e1508 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -18,7 +18,22 @@ rendered properly in your Markdown viewer. [Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice. > [!TIP] -> Tensor parallelism is currently only supported for [Llama](./model_doc/llama). Open a GitHub issue or pull request to add tensor parallelism support for another model. +> Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently below. +>
+> Supported models +> +> * [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2) +> * [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2) +> * [GLM](./model_doc/glm) +> * [Granite](./model_doc/granite) +> * [Llama](./model_doc/llama) +> * [Mistral](./model_doc/mistral) +> * [Mixtral](./model_doc/mixtral) +> * [OLMo](./model_doc/olmo) and [OLMo2](./model_doc/olmo2) +> * [Phi](./model_doc/phi) and [Phi-3](./model_doc/phi3) +> * [Qwen2](./model_doc/qwen2), [Qwen2Moe](./model_doc/qwen2_moe), and [Qwen2-VL](./model_doc/qwen2_5_vl) +> * [Starcoder2](./model_doc/starcoder2) +>
Set `tp_plan="auto"` in [`~AutoModel.from_pretrained`] to enable tensor parallelism for inference. diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md index 26070070d51d..2f87cb0dcd04 100644 --- a/docs/source/en/quantization/bitsandbytes.md +++ b/docs/source/en/quantization/bitsandbytes.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) features the LLM.int8 and QLoRA quantization to enable accessible large language model inference and training. -LLM.int8 matrix multiplication, or 8-bit quantization, is based on vector-wise quantization to quantize most of the weights to 8-bits and treating outliers with 16-bit matrix multiplication to reduce their degradative effect on model accuracy. +[LLM.int8()](https://hf.co/papers/2208.07339) is a quantization method that aims to make large language model inference more accessible without significant degradation. Unlike naive 8-bit quantization, which can result in loss of critical information and accuracy, LLM.int8() dynamically adapts to ensure sensitive components of the computation retain higher precision when needed. QLoRA, or 4-bit quantization, compresses a model even further to 4-bits and inserts a small set of trainable low-rank adaptation (LoRA) weights to allowing training. @@ -28,7 +28,7 @@ Run the command below to install bitsandbytes. pip install --upgrade transformers accelerate bitsandbytes ``` -Quantize a model by passing a [`BitsAndBytesConfig`] to [`~PreTrainedModel.from_pretrained`]. This works for any model in any modality, as long as it supports [Accelerate]https://huggingface.co/docs/accelerate/index() and contains [torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layers. +Quantize a model by passing a [`BitsAndBytesConfig`] to [`~PreTrainedModel.from_pretrained`]. This works for any model in any modality, as long as it supports [Accelerate](https://huggingface.co/docs/accelerate/index) and contains [torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layers. @@ -46,7 +46,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained( ) ``` -By default, all other modules such as [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file. +By default, all other modules such as [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) are set to the default torch dtype. You can change the data type of these modules with the `torch_dtype` parameter. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file. ```py import torch @@ -173,7 +173,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained( An "outlier" is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning). 
-To find the best threshold for your model, experiment with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]. +To find the best threshold for your model, experiment with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]. For example, setting the threshold to `0.0` significantly speeds up inference at the potential cost of some accuracy loss. ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig @@ -181,7 +181,7 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig model_id = "bigscience/bloom-1b7" quantization_config = BitsAndBytesConfig( - llm_int8_threshold=10.0, + llm_int8_threshold=0.0, llm_int8_enable_fp32_cpu_offload=True ) From 79fac81dc19031a0dfab7087c9afe6e657c22182 Mon Sep 17 00:00:00 2001 From: stevhliu Date: Wed, 12 Feb 2025 11:52:04 -0800 Subject: [PATCH 114/116] fix --- docs/source/en/perf_infer_gpu_multi.md | 32 ++++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md index 995e086e1508..e1e4aca8d4b3 100644 --- a/docs/source/en/perf_infer_gpu_multi.md +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -19,21 +19,23 @@ rendered properly in your Markdown viewer. > [!TIP] > Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently below. ->
-> Supported models -> -> * [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2) -> * [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2) -> * [GLM](./model_doc/glm) -> * [Granite](./model_doc/granite) -> * [Llama](./model_doc/llama) -> * [Mistral](./model_doc/mistral) -> * [Mixtral](./model_doc/mixtral) -> * [OLMo](./model_doc/olmo) and [OLMo2](./model_doc/olmo2) -> * [Phi](./model_doc/phi) and [Phi-3](./model_doc/phi3) -> * [Qwen2](./model_doc/qwen2), [Qwen2Moe](./model_doc/qwen2_moe), and [Qwen2-VL](./model_doc/qwen2_5_vl) -> * [Starcoder2](./model_doc/starcoder2) ->
+ +
+Supported models + +* [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2) +* [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2) +* [GLM](./model_doc/glm) +* [Granite](./model_doc/granite) +* [Llama](./model_doc/llama) +* [Mistral](./model_doc/mistral) +* [Mixtral](./model_doc/mixtral) +* [OLMo](./model_doc/olmo) and [OLMo2](./model_doc/olmo2) +* [Phi](./model_doc/phi) and [Phi-3](./model_doc/phi3) +* [Qwen2](./model_doc/qwen2), [Qwen2Moe](./model_doc/qwen2_moe), and [Qwen2-VL](./model_doc/qwen2_5_vl) +* [Starcoder2](./model_doc/starcoder2) + +
Set `tp_plan="auto"` in [`~AutoModel.from_pretrained`] to enable tensor parallelism for inference. From b88e9e315ececcc0906c99a796ff7e0313f71ebb Mon Sep 17 00:00:00 2001 From: stevhliu Date: Tue, 18 Feb 2025 14:54:02 -0800 Subject: [PATCH 115/116] update --- docs/source/en/_toctree.yml | 4 + docs/source/en/chat_template_advanced.md | 463 ------------------ docs/source/en/chat_template_basics.md | 287 ----------- docs/source/en/chat_template_multimodal.md | 289 ----------- .../en/chat_template_tools_and_documents.md | 410 ---------------- docs/source/en/chat_templating.md | 229 +++++++++ docs/source/en/chat_templating_multimodal.md | 105 +++- .../source/en/quantization/finegrained_fp8.md | 26 +- docs/source/en/quantization/overview.md | 2 + docs/source/en/quantization/spqr.md | 11 +- docs/source/en/trainer.md | 21 + 11 files changed, 374 insertions(+), 1473 deletions(-) delete mode 100644 docs/source/en/chat_template_advanced.md delete mode 100644 docs/source/en/chat_template_basics.md delete mode 100644 docs/source/en/chat_template_multimodal.md delete mode 100644 docs/source/en/chat_template_tools_and_documents.md create mode 100644 docs/source/en/chat_templating.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 205dfd38949f..a0407f38535d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -172,6 +172,8 @@ title: EETQ - local: quantization/fbgemm_fp8 title: FBGEMM + - local: quantization/finegrained_fp8 + title: Fine-grained FP8 - local: gguf title: GGUF - local: quantization/gptq @@ -186,6 +188,8 @@ title: Quanto - local: quantization/torchao title: torchao + - local: quantization/spqr + title: SpQR - local: quantization/vptq title: VPTQ - local: quantization/contribute diff --git a/docs/source/en/chat_template_advanced.md b/docs/source/en/chat_template_advanced.md deleted file mode 100644 index 5943709539e7..000000000000 --- a/docs/source/en/chat_template_advanced.md +++ /dev/null @@ -1,463 +0,0 @@ - - -# Advanced Usage and Customizing Your Chat Templates - -In this page, we’ll explore more advanced techniques for working with chat templates in Transformers. Whether you’re looking to write your own templates, create custom components, or optimize your templates for efficiency, we’ll cover everything you need to take your templates to the next level. Let’s dive into the tools and strategies that will help you get the most out of your chat models. - - -## How do chat templates work? - -The chat template for a model is stored on the `tokenizer.chat_template` attribute. Let's take a look at a `Zephyr` chat template, though note this -one is a little simplified from the actual one! - -``` -{%- for message in messages %} - {{- '<|' + message['role'] + '|>\n' }} - {{- message['content'] + eos_token }} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|assistant|>\n' }} -{%- endif %} -``` - -If you've never seen one of these before, this is a [Jinja template](https://jinja.palletsprojects.com/en/3.1.x/templates/). -Jinja is a templating language that allows you to write simple code that generates text. In many ways, the code and -syntax resembles Python. In pure Python, this template would look something like this: - -```python -for message in messages: - print(f'<|{message["role"]}|>') - print(message['content'] + eos_token) -if add_generation_prompt: - print('<|assistant|>') -``` - -Effectively, the template does three things: -1. For each message, print the role enclosed in `<|` and `|>`, like `<|user|>` or `<|assistant|>`. -2. 
Next, print the content of the message, followed by the end-of-sequence token. -3. Finally, if `add_generation_prompt` is set, print the assistant token, so that the model knows to start generating - an assistant response. - -This is a pretty simple template but Jinja gives you a lot of flexibility to do more complex things! Let's see a Jinja -template that can format inputs similarly to the way LLaMA formats them (note that the real LLaMA template includes -handling for default system messages and slightly different system message handling in general - don't use this one -in your actual code!) - -``` -{%- for message in messages %} - {%- if message['role'] == 'user' %} - {{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }} - {%- elif message['role'] == 'system' %} - {{- '<>\\n' + message['content'] + '\\n<>\\n\\n' }} - {%- elif message['role'] == 'assistant' %} - {{- ' ' + message['content'] + ' ' + eos_token }} - {%- endif %} -{%- endfor %} -``` - -Hopefully if you stare at this for a little bit you can see what this template is doing - it adds specific tokens like -`[INST]` and `[/INST]` based on the role of each message. User, assistant and system messages are clearly -distinguishable to the model because of the tokens they're wrapped in. - - -## How do I create a chat template? - -Simple, just write a jinja template and set `tokenizer.chat_template`. You may find it easier to start with an -existing template from another model and simply edit it for your needs! For example, we could take the LLaMA template -above and add "[ASST]" and "[/ASST]" to assistant messages: - -``` -{%- for message in messages %} - {%- if message['role'] == 'user' %} - {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} - {%- elif message['role'] == 'system' %} - {{- '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} - {%- elif message['role'] == 'assistant' %} - {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} - {%- endif %} -{%- endfor %} -``` - -Now, simply set the `tokenizer.chat_template` attribute. Next time you use [`~PreTrainedTokenizer.apply_chat_template`], it will -use your new template! This attribute will be saved in the `tokenizer_config.json` file, so you can use -[`~utils.PushToHubMixin.push_to_hub`] to upload your new template to the Hub and make sure everyone's using the right -template for your model! - -```python -template = tokenizer.chat_template -template = template.replace("SYS", "SYSTEM") # Change the system token -tokenizer.chat_template = template # Set the new template -tokenizer.push_to_hub("model_name") # Upload your new template to the Hub! -``` - -The method [`~PreTrainedTokenizer.apply_chat_template`] which uses your chat template is called by the [`TextGenerationPipeline`] class, so -once you set the correct chat template, your model will automatically become compatible with [`TextGenerationPipeline`]. - - -If you're fine-tuning a model for chat, in addition to setting a chat template, you should probably add any new chat -control tokens as special tokens in the tokenizer. Special tokens are never split, -ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. You -should also set the tokenizer's `eos_token` attribute to the token that marks the end of assistant generations in your -template. This will ensure that text generation tools can correctly figure out when to stop generating text. - - - -## Why do some models have multiple templates? 
- -Some models use different templates for different use cases. For example, they might use one template for normal chat -and another for tool-use, or retrieval-augmented generation. In these cases, `tokenizer.chat_template` is a dictionary. -This can cause some confusion, and where possible, we recommend using a single template for all use-cases. You can use -Jinja statements like `if tools is defined` and `{% macro %}` definitions to easily wrap multiple code paths in a -single template. - -When a tokenizer has multiple templates, `tokenizer.chat_template` will be a `dict`, where each key is the name -of a template. The `apply_chat_template` method has special handling for certain template names: Specifically, it will -look for a template named `default` in most cases, and will raise an error if it can't find one. However, if a template -named `tool_use` exists when the user has passed a `tools` argument, it will use that instead. To access templates -with other names, pass the name of the template you want to the `chat_template` argument of -`apply_chat_template()`. - -We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend -trying to put it all in a single template where possible! - - -## What template should I use? - -When setting the template for a model that's already been trained for chat, you should ensure that the template -exactly matches the message formatting that the model saw during training, or else you will probably experience -performance degradation. This is true even if you're training the model further - you will probably get the best -performance if you keep the chat tokens constant. This is very analogous to tokenization - you generally get the -best performance for inference or fine-tuning when you precisely match the tokenization used during training. - -If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, -you have a lot of freedom to choose an appropriate template! LLMs are smart enough to learn to handle lots of different -input formats. One popular choice is the `ChatML` format, and this is a good, flexible choice for many use-cases. -It looks like this: - -``` -{%- for message in messages %} - {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} -{%- endfor %} -``` - -If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes -handy support for [generation prompts](#what-are-generation-prompts), but note that it doesn't add BOS or EOS tokens! -If your model expects those, they won't be added automatically by `apply_chat_template` - in other words, the -text will be tokenized with `add_special_tokens=False`. This is to avoid potential conflicts between the template and -the `add_special_tokens` logic. If your model expects special tokens, make sure to add them to the template! - -```python -tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" -``` - -This template wraps each message in `<|im_start|>` and `<|im_end|>` tokens, and simply writes the role as a string, which -allows for flexibility in the roles you train with. 
The output looks like this: - -```text -<|im_start|>system -You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> -<|im_start|>user -How are you?<|im_end|> -<|im_start|>assistant -I'm doing great!<|im_end|> -``` - -The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense, -particularly if you want your model to operate well with [`TextGenerationPipeline`]. However, you are not limited -to these roles - templating is extremely flexible, and any string can be a role. - -## I want to add some chat templates! How should I get started? - -If you have any chat models, you should set their `tokenizer.chat_template` attribute and test it using -[`~PreTrainedTokenizer.apply_chat_template`], then push the updated tokenizer to the Hub. This applies even if you're -not the model owner - if you're using a model with an empty chat template, or one that's still using the default class -template, please open a [pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) to the model repository so that this attribute can be set properly! - -Once the attribute is set, that's it, you're done! `tokenizer.apply_chat_template` will now work correctly for that -model, which means it is also automatically supported in places like `TextGenerationPipeline`! - -By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of -open-source models. Formatting mismatches have been haunting the field and silently harming performance for too long - -it's time to put an end to them! - - - - -The easiest way to get started with writing Jinja templates is to take a look at some existing ones. You can use -`print(tokenizer.chat_template)` for any chat model to see what template it's using. In general, models that support tool use have -much more complex templates than other models - so when you're just getting started, they're probably a bad example -to learn from! You can also take a look at the -[Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for details -of general Jinja formatting and syntax. - - - -Jinja templates in `transformers` are identical to Jinja templates elsewhere. The main thing to know is that -the conversation history will be accessible inside your template as a variable called `messages`. -You will be able to access `messages` in your template just like you can in Python, which means you can loop over -it with `{% for message in messages %}` or access individual messages with `{{ messages[0] }}`, for example. - -You can also use the following tips to write clean, efficient Jinja templates: - -### Trimming whitespace - -By default, Jinja will print any whitespace that comes before or after a block. This can be a problem for chat -templates, which generally want to be very precise with whitespace! To avoid this, we strongly recommend writing -your templates like this: - -``` -{%- for message in messages %} - {{- message['role'] + message['content'] }} -{%- endfor %} -``` - -rather than like this: - -``` -{% for message in messages %} - {{ message['role'] + message['content'] }} -{% endfor %} -``` - -Adding `-` will strip any whitespace that comes before the block. The second example looks innocent, but the newline -and indentation may end up being included in the output, which is probably not what you want! 
- -### Special variables - -Inside your template, you will have access several special variables. The most important of these is `messages`, -which contains the chat history as a list of message dicts. However, there are several others. Not every -variable will be used in every template. The most common other variables are: - -- `tools` contains a list of tools in JSON schema format. Will be `None` or undefined if no tools are passed. -- `documents` contains a list of documents in the format `{"title": "Title", "contents": "Contents"}`, used for retrieval-augmented generation. Will be `None` or undefined if no documents are passed. -- `add_generation_prompt` is a bool that is `True` if the user has requested a generation prompt, and `False` otherwise. If this is set, your template should add the header for an assistant message to the end of the conversation. If your model doesn't have a specific header for assistant messages, you can ignore this flag. -- **Special tokens** like `bos_token` and `eos_token`. These are extracted from `tokenizer.special_tokens_map`. The exact tokens available inside each template will differ depending on the parent tokenizer. - - - -You can actually pass any `kwarg` to `apply_chat_template`, and it will be accessible inside the template as a variable. In general, -we recommend trying to stick to the core variables above, as it will make your model harder to use if users have -to write custom code to pass model-specific `kwargs`. However, we're aware that this field moves quickly, so if you -have a new use-case that doesn't fit in the core API, feel free to use a new `kwarg` for it! If a new `kwarg` -becomes common we may promote it into the core API and create a standard, documented format for it. - - - -### Callable functions - -There is also a short list of callable functions available to you inside your templates. These are: - -- `raise_exception(msg)`: Raises a `TemplateException`. This is useful for debugging, and for telling users when they're -doing something that your template doesn't support. -- `strftime_now(format_str)`: Equivalent to `datetime.now().strftime(format_str)` in Python. This is used for getting -the current date/time in a specific format, which is sometimes included in system messages. - -### Compatibility with non-Python Jinja - -There are multiple implementations of Jinja in various languages. They generally have the same syntax, -but a key difference is that when you're writing a template in Python you can use Python methods, such as -`.lower()` on strings or `.items()` on dicts. This will break if someone tries to use your template on a non-Python -implementation of Jinja. Non-Python implementations are particularly common in deployment environments, where JS -and Rust are very popular. - -Don't panic, though! There are a few easy changes you can make to your templates to ensure they're compatible across -all implementations of Jinja: - -- Replace Python methods with Jinja filters. These usually have the same name, for example `string.lower()` becomes - `string|lower`, and `dict.items()` becomes `dict|items`. One notable change is that `string.strip()` becomes `string|trim`. - See the [list of built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) - in the Jinja documentation for more. -- Replace `True`, `False` and `None`, which are Python-specific, with `true`, `false` and `none`. 
-- Directly rendering a dict or list may give different results in other implementations (for example, string entries - might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here. - -### Writing generation prompts - -We mentioned above that `add_generation_prompt` is a special variable that will be accessible inside your template, -and is controlled by the user setting the `add_generation_prompt` flag. If your model expects a header for -assistant messages, then your template must support adding the header when `add_generation_prompt` is set. - -Here is an example of a template that formats messages ChatML-style, with generation prompt support: - -```text -{{- bos_token }} -{%- for message in messages %} - {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} -{%- endif %} -``` - -The exact content of the assistant header will depend on your specific model, but it should always be **the string -that represents the start of an assistant message**, so that if the user applies your template with -`add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some -models do not need a generation prompt, because assistant messages always begin immediately after user messages. -This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]` -token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag. - -Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then -model generations will likely be severely degraded, or the model may display unusual behaviour like continuing -the final user message! - -### Writing and debugging larger templates - -When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script. -However, with new models and features like tool-use and RAG, some templates can be 100 lines long or more. When -writing templates like these, it's a good idea to write them in a separate file, using a text editor. You can easily -extract a chat template to a file: - -```python -open("template.jinja", "w").write(tokenizer.chat_template) -``` - -Or load the edited template back into the tokenizer: - -```python -tokenizer.chat_template = open("template.jinja").read() -``` - -As an added bonus, when you write a long, multi-line template in a separate file, line numbers in that file will -exactly correspond to line numbers in template parsing or execution errors. This will make it much easier to -identify the source of issues. - - - -## Writing templates for tools - -Although chat templates do not enforce a specific API for tools (or for anything, really), we recommend -template authors try to stick to a standard API where possible. The whole point of chat templates is to allow code -to be transferable across models, so deviating from the standard tools API means users will have to write -custom code to use tools with your model. Sometimes it's unavoidable, but often with clever templating you can -make the standard API work! - -Below, we'll list the elements of the standard API, and give tips on writing templates that will work well with it. 
- -### Tool definitions - -Your template should expect that the variable `tools` will either be null (if no tools are passed), or is a list -of JSON schema dicts. Our chat template methods allow users to pass tools as either JSON schema or Python functions, but when -functions are passed, we automatically generate JSON schema and pass that to your template. As a result, the -`tools` variable that your template receives will always be a list of JSON schema. Here is -a sample tool JSON schema: - -```json -{ - "type": "function", - "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", - "parameters": { - "type": "object", - "properties": { - "a": { - "type": "number", - "description": "The first number to multiply" - }, - "b": { - "type": "number", - "description": "The second number to multiply" - } - }, - "required": ["a", "b"] - } - } -} -``` - -And here is some example code for handling tools in your chat template. Remember, this is just an example for a -specific format - your model will probably need different formatting! - -```text -{%- if tools %} - {%- for tool in tools %} - {{- '' + tool['function']['name'] + '\n' }} - {%- for argument in tool['function']['parameters']['properties'] %} - {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} - {%- endfor %} - {{- '\n' }} - {%- endif %} -{%- endif %} -``` - -The specific tokens and tool descriptions your template renders should of course be chosen to match the ones your model -was trained with. There is no requirement that your **model** understands JSON schema input, only that your template can translate -JSON schema into your model's format. For example, [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) -was trained with tools defined using Python function headers, but the Command-R tool template accepts JSON schema, -converts types internally and renders the input tools as Python headers. You can do a lot with templates! - -### Tool calls - -Tool calls, if present, will be a list attached to a message with the "assistant" role. Note that `tool_calls` is -always a list, even though most tool-calling models only support single tool calls at a time, which means -the list will usually only have a single element. Here is a sample message dict containing a tool call: - -```json -{ - "role": "assistant", - "tool_calls": [ - { - "type": "function", - "function": { - "name": "multiply", - "arguments": { - "a": 5, - "b": 6 - } - } - } - ] -} -``` - -And a common pattern for handling them would be something like this: - -```text -{%- if message['role'] == 'assistant' and 'tool_calls' in message %} - {%- for tool_call in message['tool_calls'] %} - {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} - {%- endif %} - {%- endfor %} -{%- endif %} -``` - -Again, you should render the tool call with the formatting and special tokens that your model expects. - -### Tool responses - -Tool responses have a simple format: They are a message dict with the "tool" role, a "name" key giving the name -of the called function, and a "content" key containing the result of the tool call. Here is a sample tool response: - -```json -{ - "role": "tool", - "name": "multiply", - "content": "30" -} -``` - -You don't need to use all of the keys in the tool response. 
For example, if your model doesn't expect the function -name to be included in the tool response, then rendering it can be as simple as: - -```text -{%- if message['role'] == 'tool' %} - {{- "" + message['content'] + "" }} -{%- endif %} -``` - -Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care -to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! diff --git a/docs/source/en/chat_template_basics.md b/docs/source/en/chat_template_basics.md deleted file mode 100644 index 2179fa4779ad..000000000000 --- a/docs/source/en/chat_template_basics.md +++ /dev/null @@ -1,287 +0,0 @@ - - -# Getting Started with Chat Templates for Text LLMs - -An increasingly common use case for LLMs is **chat**. In a chat context, rather than continuing a single string -of text (as is the case with a standard language model), the model instead continues a conversation that consists -of one or more **messages**, each of which includes a **role**, like "user" or "assistant", as well as message text. - -Much like tokenization, different models expect very different input formats for chat. This is the reason we added -**chat templates** as a feature. Chat templates are part of the tokenizer for text-only LLMs or processor for multimodal LLMs. They specify how to convert conversations, represented as lists of messages, into a single tokenizable string in the format that the model expects. - -We'll explore the basic usage of chat templates with text-only LLMs in this page. For detailed guidance on multimodal models, we have a dedicated [documentation oage for multimodal models](./chat_template_multimodal), which covers how to work with image, video and audio inputs in your templates. - -Let's make this concrete with a quick example using the `mistralai/Mistral-7B-Instruct-v0.1` model: - -```python ->>> from transformers import AutoTokenizer ->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") - ->>> chat = [ -... {"role": "user", "content": "Hello, how are you?"}, -... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, -... {"role": "user", "content": "I'd like to show off how chat templating works!"}, -... ] - ->>> tokenizer.apply_chat_template(chat, tokenize=False) -"[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today? [INST] I'd like to show off how chat templating works! [/INST]" -``` - -Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of -user messages (but not assistant messages!), and the entire chat is condensed into a single string. -If we use `tokenize=True`, which is the default setting, that string will also be tokenized for us. - -Now, try the same code, but swap in the `HuggingFaceH4/zephyr-7b-beta` model instead, and you should get: - -```text -<|user|> -Hello, how are you? -<|assistant|> -I'm doing great. How can I help you today? -<|user|> -I'd like to show off how chat templating works! -``` - -Both Zephyr and Mistral-Instruct were fine-tuned from the same base model, `Mistral-7B-v0.1`. However, they were trained -with totally different chat formats. Without chat templates, you would have to write manual formatting code for each -model, and it's very easy to make minor errors that hurt performance! Chat templates handle the details of formatting -for you, allowing you to write universal code that works for any model. - - -## How do I use chat templates? 
- -As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role` -and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] or [`~ProcessorMixin.apply_chat_template`] method -depending on what type of model you are using. Once you do that, -you'll get output that's ready to go! When using chat templates as input for model generation, it's also a good idea -to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts). - -Here's an example of preparing input for `model.generate()`, using `Zephyr` again: - -```python -from transformers import AutoModelForCausalLM, AutoTokenizer - -checkpoint = "HuggingFaceH4/zephyr-7b-beta" -tokenizer = AutoTokenizer.from_pretrained(checkpoint) -model = AutoModelForCausalLM.from_pretrained(checkpoint) # You may want to use bfloat16 and/or move to GPU here - -messages = [ - { - "role": "system", - "content": "You are a friendly chatbot who always responds in the style of a pirate", - }, - {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, - ] -tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") -print(tokenizer.decode(tokenized_chat[0])) -``` -This will yield a string in the input format that Zephyr expects. -```text -<|system|> -You are a friendly chatbot who always responds in the style of a pirate -<|user|> -How many helicopters can a human eat in one sitting? -<|assistant|> -``` - -Now that our input is formatted correctly for Zephyr, we can use the model to generate a response to the user's question: - -```python -outputs = model.generate(tokenized_chat, max_new_tokens=128) -print(tokenizer.decode(outputs[0])) -``` - -This will yield: - -```text -<|system|> -You are a friendly chatbot who always responds in the style of a pirate -<|user|> -How many helicopters can a human eat in one sitting? -<|assistant|> -Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. -``` - -Arr, 'twas easy after all! - - -## Is there an automated pipeline for chat? - -Yes, there is! Our text generation pipelines support chat inputs, which makes it easy to use chat models. In the past, -we used to use a dedicated "ConversationalPipeline" class, but this has now been deprecated and its functionality -has been merged into the [`TextGenerationPipeline`]. Let's try the `Zephyr` example again, but this time using -a pipeline: - -```python -from transformers import pipeline - -pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta") -messages = [ - { - "role": "system", - "content": "You are a friendly chatbot who always responds in the style of a pirate", - }, - {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, -] -print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1]) # Print the assistant's response -``` - -```text -{'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. 
But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all."} -``` - -The pipeline will take care of all the details of tokenization and calling `apply_chat_template` for you - -once the model has a chat template, all you need to do is initialize the pipeline and pass it the list of messages! - - -## What are "generation prompts"? - -You may have noticed that the `apply_chat_template` method has an `add_generation_prompt` argument. This argument tells -the template to add tokens that indicate the start of a bot response. For example, consider the following chat: - -```python -messages = [ - {"role": "user", "content": "Hi there!"}, - {"role": "assistant", "content": "Nice to meet you!"}, - {"role": "user", "content": "Can I ask a question?"} -] -``` - -Here's what this will look like without a generation prompt, for a model that uses standard "ChatML" formatting: - -```python -tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) -"""<|im_start|>user -Hi there!<|im_end|> -<|im_start|>assistant -Nice to meet you!<|im_end|> -<|im_start|>user -Can I ask a question?<|im_end|> -""" -``` - -And here's what it looks like **with** a generation prompt: - -```python -tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) -"""<|im_start|>user -Hi there!<|im_end|> -<|im_start|>assistant -Nice to meet you!<|im_end|> -<|im_start|>user -Can I ask a question?<|im_end|> -<|im_start|>assistant -""" -``` - -Note that this time, we've added the tokens that indicate the start of a bot response. This ensures that when the model -generates text it will write a bot response instead of doing something unexpected, like continuing the user's -message. Remember, chat models are still just language models - they're trained to continue text, and chat is just a -special kind of text to them! You need to guide them with appropriate control tokens, so they know what they're -supposed to be doing. - -Not all models require generation prompts. Some models, like LLaMA, don't have any -special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact -effect that `add_generation_prompt` has will depend on the template being used. - - -## What does "continue_final_message" do? - -When passing a list of messages to `apply_chat_template` or `TextGenerationPipeline`, you can choose -to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done -by removing any end-of-sequence tokens that indicate the end of the final message, so that the model will simply -extend the final message when it begins to generate text. This is useful for "prefilling" the model's response. - -Here's an example: - -```python -chat = [ - {"role": "user", "content": "Can you format the answer in JSON?"}, - {"role": "assistant", "content": '{"name": "'}, -] - -formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True) -model.generate(**formatted_chat) -``` - -The model will generate text that continues the JSON string, rather than starting a new message. This approach -can be very useful for improving the accuracy of the model's instruction-following when you know how you want -it to start its replies. 
- -Because `add_generation_prompt` adds the tokens that start a new message, and `continue_final_message` removes any -end-of-message tokens from the final message, it does not make sense to use them together. As a result, you'll -get an error if you try! - - - -The default behaviour of `TextGenerationPipeline` is to set `add_generation_prompt=True` so that it starts a new -message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is -a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple -consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` -argument when calling the pipeline. - - - - -## Can I use chat templates in training? - -Yes! This is a good way to ensure that the chat template matches the tokens the model sees during training. -We recommend that you apply the chat template as a preprocessing step for your dataset. After this, you -can simply continue like any other language model training task. When training, you should usually set -`add_generation_prompt=False`, because the added tokens to prompt an assistant response will not be helpful during -training. Let's see an example: - -```python -from transformers import AutoTokenizer -from datasets import Dataset - -tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") - -chat1 = [ - {"role": "user", "content": "Which is bigger, the moon or the sun?"}, - {"role": "assistant", "content": "The sun."} -] -chat2 = [ - {"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, - {"role": "assistant", "content": "A bacterium."} -] - -dataset = Dataset.from_dict({"chat": [chat1, chat2]}) -dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) -print(dataset['formatted_chat'][0]) -``` -And we get: -```text -<|user|> -Which is bigger, the moon or the sun? -<|assistant|> -The sun. -``` - -From here, just continue training like you would with a standard language modelling task, using the `formatted_chat` column. - - - -By default, some tokenizers add special tokens like `` and `` to text they tokenize. Chat templates should -already include all the special tokens they need, and so additional special tokens will often be incorrect or -duplicated, which will hurt model performance. - -Therefore, if you format text with `apply_chat_template(tokenize=False)`, you should set the argument -`add_special_tokens=False` when you tokenize that text later. If you use `apply_chat_template(tokenize=True)`, you don't need to worry about this! - - - diff --git a/docs/source/en/chat_template_multimodal.md b/docs/source/en/chat_template_multimodal.md deleted file mode 100644 index 1b283449605b..000000000000 --- a/docs/source/en/chat_template_multimodal.md +++ /dev/null @@ -1,289 +0,0 @@ - - -# Multimodal Chat Templates for Vision and Audio LLMs - -In this section, we'll explore how to use chat templates with multimodal models, enabling your templates to handle a variety of inputs such as text, images, and audio. Multimodal models provide richer, more interactive experiences, and understanding how to effectively combine these inputs within your templates is key. We’ll walk through how to work with different modalities, configure your templates for optimal performance, and tackle common challenges along the way. 
Just like with text-only LLMs, multimodal models expect a chat with **messages**, each of which includes a **role** and **content**. However, for multimodal models, chat templates are a part of the [Processor](./main_classes/processors) class. Let's see how we can format our prompts when there are images or videos in the input along with text.

## Image inputs

For models such as [LLaVA](https://huggingface.co/llava-hf), the prompts can be formatted as below. Notice that the only difference from text-only models is that we need to also pass a placeholder for input images. To accommodate extra modalities, each **content** is a list containing either a text or an image **type**.

Let's make this concrete with a quick example using the `llava-hf/llava-onevision-qwen2-0.5b-ov-hf` model:

```python
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What are these?"},
        ],
    },
]

formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(formatted_prompt)
```

This yields a string in LLaVA's expected input format, with `<image>` placeholder tokens prepended before the text.

```text
'<|im_start|>system
You are a friendly chatbot who always responds in the style of a pirate<|im_end|><|im_start|>user <image>
What are these?<|im_end|>
```

### Image paths or URLs

To incorporate images into your chat templates, you can pass them as file paths or URLs. This method automatically loads the image, processes it, and prepares the necessary pixel values to create ready-to-use inputs for the model. This approach simplifies the integration of images, enabling seamless multimodal functionality.

Let's see how it works with an example using the same model as above. This time we'll indicate an image URL with the `"url"` key in the message's **content** and ask the chat template to `tokenize` and `return_dict`. Currently, "base64", "url", and "path" are supported image sources.

```python
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": "What are these?"},
        ],
    },
]

processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
print(processed_chat.keys())
```

This yields a dictionary with inputs processed and ready to be further passed into [`~GenerationMixin.generate`] to generate text.

```text
dict_keys(["input_ids", "attention_mask", "pixel_values", "image_sizes"])
```
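From here, generation works the same way as for text-only models. A minimal sketch, reusing the `model`, `processor`, and `processed_chat` objects from the example above (the `max_new_tokens` value is illustrative):

```python
# Minimal sketch: pass the processed multimodal inputs to generate() and decode the result.
# Assumes `model`, `processor`, and `processed_chat` from the example above.
outputs = model.generate(**processed_chat, max_new_tokens=128)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])
```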
## Video inputs

Some vision models support videos as inputs as well as images. The message format is very similar to the image-only format, with tiny differences to handle loading videos from a URL. We can continue using the same model as before since it supports videos.

### Sampling with a fixed number of frames

Here's an example of how to set up a conversation with video inputs. Notice the extra `kwargs` passed to `processor.apply_chat_template()`. The key parameter here is `num_frames`, which controls how many frames to sample uniformly from the video. Each model checkpoint has a maximum frame count it was trained with, and exceeding this limit can significantly impact generation quality. So, it's important to choose a frame count that fits both the model's capacity and your computational resources. If you don't specify `num_frames`, the entire video will be loaded without any frame sampling.

You also have the option to choose a specific framework to load the video, depending on your preferences or needs. Currently, we support `decord`, `pyav` (the default), `opencv`, and `torchvision`. For this example, we'll use `decord`, as it's a bit faster than `pyav`.

Note that if you are trying to load a video from a URL, you can decode the video only with `pyav` or `decord` as the backend.

```python
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"},
            {"type": "text", "text": "What do you see in this video?"},
        ],
    },
]

processed_chat = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    num_frames=32,
    video_load_backend="decord",
)
print(processed_chat.keys())
```

### Sampling with FPS

When working with long videos, you might want to sample more frames for better representation. Instead of a fixed number of frames, you can specify `video_fps`, which determines how many frames per second to extract. For example, if a video is **10 seconds long** and you set `video_fps=2`, the model will sample **20 frames** (2 per second, uniformly spaced).

Using the above model, we apply the chat template as follows to sample 2 frames per second:

```python
processed_chat = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    video_fps=2,
    video_load_backend="decord",
)
print(processed_chat.keys())
```

### Custom Frame Sampling with a Function

Not all models sample frames **uniformly** — some require more complex logic to determine which frames to use. If your model follows a different sampling strategy, you can **customize** frame selection by providing a function:

🔹 Use the `sample_indices_fn` argument to pass a **callable function** for sampling.
🔹 If provided, this function **overrides** standard `num_frames` and `fps` methods.
🔹 It receives all the arguments passed to `load_video` and must return **valid frame indices** to sample.
- -You should use `sample_indices_fn` when: - -- If you need a custom sampling strategy (e.g., **adaptive frame selection** instead of uniform sampling). -- If your model prioritizes **key moments** in a video rather than evenly spaced frames. - -Here’s an example of how to implement it: - - -```python - -def sample_indices_fn(metadata, **kwargs): - # samples only the first and the second frame - return [0, 1] - -processed_chat = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - sample_indices_fn=sample_indices_fn, - video_load_backend="decord", -) -print(processed_chat.keys()) -``` - -By using `sample_indices_fn`, you gain **full control** over frame selection, making your model **more adaptable** to different video scenarios. 🚀 - - -### List of image frames as video - -Sometimes, instead of having a full video file, you might only have a set of sampled frames stored as images. - -You can pass a list of image file paths, and the processor will automatically concatenate them into a video. Just make sure that all images have the same size, as they are assumed to be from the same video. - - -```python -frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"] -messages = [ - { - "role": "system", - "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], - }, - { - "role": "user", - "content": [ - {"type": "video", "path": frames_paths}, - {"type": "text", "text": "What do you see in this video?"}, - ], - }, -] - -processed_chat = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, -) -print(processed_chat.keys()) -``` - - -## Multimodal conversational pipeline - -[`ImageTextToTextPipeline`] currently accepts images as inputs but we are planning to add support for video inputs in the future. The pipeline supports chat inputs in the same format as we have seen above. Apart from that, the pipeline will accept chats in OpenAI format. This format is supported exclusively within the pipeline to make inference easier and more accessible. - -Here is how the OpenAI conversation format looks: - -```python -messages = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?", - }, - { - "type": "image_url", - "image_url": {"url": f"http://images.cocodataset.org/val2017/000000039769.jpg"}, - }, - ], - } -] -``` - -## Best Practices for Multimodal Template Configuration - - -To add a custom chat template for your multimodal LLM, simply create your template using [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with `processor.chat_template`. If you're new to writing chat templates or need some tips, check out our [tutorial here](./chat_template_advanced) for helpful guidance. - -In some cases, you may want your template to handle a **list of content** from multiple modalities, while still supporting a plain string for text-only inference. Here's an example of how you can achieve that, using the [Llama-Vision](https://huggingface.co/collections/meta-llama/metas-llama-32-multimodal-models-675bfd70e574a62dd0e4059b) chat template. 
- - -``` -{% for message in messages %} -{% if loop.index0 == 0 %}{{ bos_token }}{% endif %} -{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} -{% if message['content'] is string %} -{{ message['content'] }} -{% else %} -{% for content in message['content'] %} -{% if content['type'] == 'image' %} -{{ '<|image|>' }} -{% elif content['type'] == 'text' %} -{{ content['text'] }} -{% endif %} -{% endfor %} -{% endif %} -{{ '<|eot_id|>' }} -{% endfor %} -{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %} -``` diff --git a/docs/source/en/chat_template_tools_and_documents.md b/docs/source/en/chat_template_tools_and_documents.md deleted file mode 100644 index 6c5491a2484a..000000000000 --- a/docs/source/en/chat_template_tools_and_documents.md +++ /dev/null @@ -1,410 +0,0 @@ - - - -# Expanding Chat Templates with Tools and Documents - -The only argument that `apply_chat_template` requires is `messages`. However, you can pass any keyword -argument to `apply_chat_template` and it will be accessible inside the template. This gives you a lot of freedom to use -chat templates for many things. There are no restrictions on the names or the format of these arguments - you can pass -strings, lists, dicts or whatever else you want. - -That said, there are some common use-cases for these extra arguments, -such as passing tools for function calling, or documents for retrieval-augmented generation. In these common cases, -we have some opinionated recommendations about what the names and formats of these arguments should be, which are -described in the sections below. We encourage model authors to make their chat templates compatible with this format, -to make it easy to transfer tool-calling code between models. - -## Tool use / function calling - -"Tool use" LLMs can choose to call functions as external tools before generating an answer. When passing tools -to a tool-use model, you can simply pass a list of functions to the `tools` argument: - -```python -import datetime - -def current_time(): - """Get the current local time as a string.""" - return str(datetime.now()) - -def multiply(a: float, b: float): - """ - A function that multiplies two numbers - - Args: - a: The first number to multiply - b: The second number to multiply - """ - return a * b - -tools = [current_time, multiply] - -model_input = tokenizer.apply_chat_template( - messages, - tools=tools -) -``` - -In order for this to work correctly, you should write your functions in the format above, so that they can be parsed -correctly as tools. Specifically, you should follow these rules: - -- The function should have a descriptive name -- Every argument must have a type hint -- The function must have a docstring in the standard Google style (in other words, an initial function description - followed by an `Args:` block that describes the arguments, unless the function does not have any arguments.) -- Do not include types in the `Args:` block. In other words, write `a: The first number to multiply`, not - `a (int): The first number to multiply`. Type hints should go in the function header instead. -- The function can have a return type and a `Returns:` block in the docstring. However, these are optional - because most tool-use models ignore them. - -### Passing tool results to the model - -The sample code above is enough to list the available tools for your model, but what happens if it wants to actually use -one? If that happens, you should: - -1. 
Parse the model's output to get the tool name(s) and arguments. -2. Add the model's tool call(s) to the conversation. -3. Call the corresponding function(s) with those arguments. -4. Add the result(s) to the conversation - -### A complete tool use example - -Let's walk through a tool use example, step by step. For this example, we will use an 8B `Hermes-2-Pro` model, -as it is one of the highest-performing tool-use models in its size category at the time of writing. If you have the -memory, you can consider using a larger model instead like [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) -or [Mixtral-8x22B](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1), both of which also support tool use -and offer even stronger performance. - -First, let's load our model and tokenizer: - -```python -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B" - -tokenizer = AutoTokenizer.from_pretrained(checkpoint) -model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto") -``` - -Next, let's define a list of tools: - -```python -def get_current_temperature(location: str, unit: str) -> float: - """ - Get the current temperature at a location. - - Args: - location: The location to get the temperature for, in the format "City, Country" - unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"]) - Returns: - The current temperature at the specified location in the specified units, as a float. - """ - return 22. # A real function should probably actually get the temperature! - -def get_current_wind_speed(location: str) -> float: - """ - Get the current wind speed in km/h at a given location. - - Args: - location: The location to get the temperature for, in the format "City, Country" - Returns: - The current wind speed at the given location in km/h, as a float. - """ - return 6. # A real function should probably actually get the wind speed! - -tools = [get_current_temperature, get_current_wind_speed] -``` - -Now, let's set up a conversation for our bot: - -```python -messages = [ - {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, - {"role": "user", "content": "Hey, what's the temperature in Paris right now?"} -] -``` - -Now, let's apply the chat template and generate a response: - -```python -inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") -inputs = {k: v.to(model.device) for k, v in inputs.items()} -out = model.generate(**inputs, max_new_tokens=128) -print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) -``` - -And we get: - -```text - -{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"} -<|im_end|> -``` - -The model has called the function with valid arguments, in the format requested by the function docstring. It has -inferred that we're most likely referring to the Paris in France, and it remembered that, as the home of SI units, -the temperature in France should certainly be displayed in Celsius. - - - -The output format above is specific to the `Hermes-2-Pro` model we're using in this example. Other models may emit different -tool call formats, and you may need to do some manual parsing at this step. 
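As a rough illustration of that parsing step, the sketch below extracts a Hermes-style call from the decoded output (it assumes the model wraps calls in `<tool_call>` tags, which is Hermes-2-Pro's convention; other models will need a different pattern):

```python
import json
import re

# Rough sketch: pull the tool call JSON out of the decoded generation above.
# Assumes Hermes-style <tool_call>...</tool_call> tags; adjust the pattern for other models.
generated = tokenizer.decode(out[0][len(inputs["input_ids"][0]):], skip_special_tokens=False)
match = re.search(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", generated, re.DOTALL)
if match is not None:
    parsed_call = json.loads(match.group(1))
    print(parsed_call["name"], parsed_call["arguments"])
```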
For example, `Llama-3.1` models will emit -slightly different JSON, with `parameters` instead of `arguments`. Regardless of the format the model outputs, you -should add the tool call to the conversation in the format below, with `tool_calls`, `function` and `arguments` keys. - - - -Next, let's append the model's tool call to the conversation. - -```python -tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} -messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]}) -``` - - - -If you're familiar with the OpenAI API, you should pay attention to an important difference here - the `tool_call` is -a dict, but in the OpenAI API it's a JSON string. Passing a string may cause errors or strange model behaviour! - - - -Now that we've added the tool call to the conversation, we can call the function and append the result to the -conversation. Since we're just using a dummy function for this example that always returns 22.0, we can just append -that result directly. - -```python -messages.append({"role": "tool", "name": "get_current_temperature", "content": "22.0"}) -``` - - - -Some model architectures, notably Mistral/Mixtral, also require a `tool_call_id` here, which should be -9 randomly-generated alphanumeric characters, and assigned to the `id` key of the tool call -dictionary. The same key should also be assigned to the `tool_call_id` key of the tool response dictionary below, so -that tool calls can be matched to tool responses. So, for Mistral/Mixtral models, the code above would be: - -```python -tool_call_id = "9Ae3bDc2F" # Random ID, 9 alphanumeric characters -tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} -messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]}) -``` - -and - -```python -messages.append({"role": "tool", "tool_call_id": tool_call_id, "name": "get_current_temperature", "content": "22.0"}) -``` - - - -Finally, let's let the assistant read the function outputs and continue chatting with the user: - -```python -inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") -inputs = {k: v.to(model.device) for k, v in inputs.items()} -out = model.generate(**inputs, max_new_tokens=128) -print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) -``` - -And we get: - -```text -The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|> -``` - -Although this was a simple demo with dummy tools and a single call, the same technique works with -multiple real tools and longer conversations. This can be a powerful way to extend the capabilities of conversational -agents with real-time information, computational tools like calculators, or access to large databases. - -### Understanding tool schemas - -Each function you pass to the `tools` argument of `apply_chat_template` is converted into a -[JSON schema](https://json-schema.org/learn/getting-started-step-by-step). These schemas -are then passed to the model chat template. In other words, tool-use models do not see your functions directly, and they -never see the actual code inside them. What they care about is the function **definitions** and the **arguments** they -need to pass to them - they care about what the tools do and how to use them, not how they work! 
It is up to you -to read their outputs, detect if they have requested to use a tool, pass their arguments to the tool function, and -return the response in the chat. - -Generating JSON schemas to pass to the template should be automatic and invisible as long as your functions -follow the specification above, but if you encounter problems, or you simply want more control over the conversion, -you can handle the conversion manually. Here is an example of a manual schema conversion. - -```python -from transformers.utils import get_json_schema - -def multiply(a: float, b: float): - """ - A function that multiplies two numbers - - Args: - a: The first number to multiply - b: The second number to multiply - """ - return a * b - -schema = get_json_schema(multiply) -print(schema) -``` - -This will yield: - -```json -{ - "type": "function", - "function": { - "name": "multiply", - "description": "A function that multiplies two numbers", - "parameters": { - "type": "object", - "properties": { - "a": { - "type": "number", - "description": "The first number to multiply" - }, - "b": { - "type": "number", - "description": "The second number to multiply" - } - }, - "required": ["a", "b"] - } - } -} -``` - -If you wish, you can edit these schemas, or even write them from scratch yourself without using `get_json_schema` at -all. JSON schemas can be passed directly to the `tools` argument of -`apply_chat_template` - this gives you a lot of power to define precise schemas for more complex functions. Be careful, -though - the more complex your schemas, the more likely the model is to get confused when dealing with them! We -recommend simple function signatures where possible, keeping arguments (and especially complex, nested arguments) -to a minimum. - -Here is an example of defining schemas by hand, and passing them directly to `apply_chat_template`: - -```python -# A simple function that takes no arguments -current_time = { - "type": "function", - "function": { - "name": "current_time", - "description": "Get the current local time as a string.", - "parameters": { - 'type': 'object', - 'properties': {} - } - } -} - -# A more complete function that takes two numerical arguments -multiply = { - 'type': 'function', - 'function': { - 'name': 'multiply', - 'description': 'A function that multiplies two numbers', - 'parameters': { - 'type': 'object', - 'properties': { - 'a': { - 'type': 'number', - 'description': 'The first number to multiply' - }, - 'b': { - 'type': 'number', 'description': 'The second number to multiply' - } - }, - 'required': ['a', 'b'] - } - } -} - -model_input = tokenizer.apply_chat_template( - messages, - tools = [current_time, multiply] -) -``` - -## Retrieval-augmented generation - -"Retrieval-augmented generation" or "RAG" LLMs can search a corpus of documents for information before responding -to a query. This allows models to vastly expand their knowledge base beyond their limited context size. Our -recommendation for RAG models is that their template -should accept a `documents` argument. This should be a list of documents, where each "document" -is a single dict with `title` and `contents` keys, both of which are strings. Because this format is much simpler -than the JSON schemas used for tools, no helper functions are necessary. 
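Concretely, the `documents` argument is just a plain list of dicts along these lines (a minimal sketch with made-up titles and contents; note that the Command-R example below uses a `text` key instead of `contents`):

```python
# Minimal sketch of the documents format described above; the titles and contents are made up.
documents = [
    {"title": "Population figures", "contents": "The city had an estimated population of 2.1 million in 2023."},
    {"title": "Climate", "contents": "The region has a temperate climate with mild, wet winters."},
]
```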
- -Here's an example of a RAG template in action: - -```python -from transformers import AutoTokenizer, AutoModelForCausalLM - -# Load the model and tokenizer -model_id = "CohereForAI/c4ai-command-r-v01-4bit" -tokenizer = AutoTokenizer.from_pretrained(model_id) -model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") -device = model.device # Get the device the model is loaded on - -# Define conversation input -conversation = [ - {"role": "user", "content": "What has Man always dreamed of?"} -] - -# Define documents for retrieval-based generation -documents = [ - { - "title": "The Moon: Our Age-Old Foe", - "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." - }, - { - "title": "The Sun: Our Age-Old Friend", - "text": "Although often underappreciated, the sun provides several notable benefits..." - } -] - -# Tokenize conversation and documents using a RAG template, returning PyTorch tensors. -input_ids = tokenizer.apply_chat_template( - conversation=conversation, - documents=documents, - chat_template="rag", - tokenize=True, - add_generation_prompt=True, - return_tensors="pt").to(device) - -# Generate a response -gen_tokens = model.generate( - input_ids, - max_new_tokens=100, - do_sample=True, - temperature=0.3, - ) - -# Decode and print the generated text along with generation prompt -gen_text = tokenizer.decode(gen_tokens[0]) -print(gen_text) -``` - - - -The `documents` input for retrieval-augmented generation is not widely supported, and many models have chat templates which simply ignore this input. - -To verify if a model supports the `documents` input, you can read its model card, or `print(tokenizer.chat_template)` to see if the `documents` key is used anywhere. - -One model class that does support it, though, is Cohere's [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024), through their `rag` chat template. You can see additional examples of grounded generation using this feature in their model cards. - - - - diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md new file mode 100644 index 000000000000..d11b8fa5b404 --- /dev/null +++ b/docs/source/en/chat_templating.md @@ -0,0 +1,229 @@ + + +# Templates + +The [chat pipeline](./conversations) guide introduced [`TextGenerationPipeline`] and the concept of a chat prompt or chat template for conversing with a model. Underlying this high-level pipeline is the [`apply_chat_template`] method. A chat template is a part of the tokenizer and it specifies how to convert conversations into a single tokenizable string in the expected model format. + +In the example below, Mistral-7B-Instruct and Zephyr-7B are finetuned from the same base model but they’re trained with different chat formats. Without chat templates, you have to manually write formatting code for each model and even minor errors can hurt performance. Chat templates offer a universal way to format chat inputs to any model. + + + + +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") +chat = [ + {"role": "user", "content": "Hello, how are you?"}, + {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, + {"role": "user", "content": "I'd like to show off how chat templating works!"}, +] + +tokenizer.apply_chat_template(chat, tokenize=False) +``` +```md +[INST] Hello, how are you? [/INST]I'm doing great. 
How can I help you today? [INST] I'd like to show off how chat templating works! [/INST] +``` + + + + +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") +chat = [ + {"role": "user", "content": "Hello, how are you?"}, + {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, + {"role": "user", "content": "I'd like to show off how chat templating works!"}, +] + +tokenizer.apply_chat_template(chat, tokenize=False) +``` +```md +<|user|>\nHello, how are you?\n<|assistant|>\nI'm doing great. How can I help you today?\n<|user|>\nI'd like to show off how chat templating works!\n +``` + + + + +This guide explores [`apply_chat_template`] and chat templates in more detail. + +## apply_chat_template + +Chats should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker (usually between you and the system), and the `content` key contains your message. For the system, the `content` is a high-level description of how the model should behave and respond when you’re chatting with it. + +Pass your messages to [`apply_chat_template`] to tokenize and format them. You can set [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` to indicate the start of a message. + +```py +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") +model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.bfloat16) + +messages = [ + {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",}, + {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, + ] +tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") +print(tokenizer.decode(tokenized_chat[0])) +``` +```md +<|system|> +You are a friendly chatbot who always responds in the style of a pirate +<|user|> +How many helicopters can a human eat in one sitting? +<|assistant|> +``` + +Now pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response. + +```py +outputs = model.generate(tokenized_chat, max_new_tokens=128) +print(tokenizer.decode(outputs[0])) +``` +```md +<|system|> +You are a friendly chatbot who always responds in the style of a pirate +<|user|> +How many helicopters can a human eat in one sitting? +<|assistant|> +Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. +``` + +### add_generation_prompt +The [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) parameter adds tokens that indicate the start of a response. This ensures the chat model generates a system response instead of continuing a users message. + +Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the system response. 
In this case, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect. + +```py +tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) +tokenized_chat +``` +```md +<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +``` + +### continue_final_message + +The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued or not instead of starting a new one. It removes end of sequence tokens so that the model continues generation from the final message. + +This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy for instruction following when you know how to start its replies. + +```py +chat = [ + {"role": "user", "content": "Can you format the answer in JSON?"}, + {"role": "assistant", "content": '{"name": "'}, +] + +formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True) +model.generate(**formatted_chat) +``` + +> [!WARNING] +> You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error. + +[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the “assistant” role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) to the pipeline. + +## Multiple templates + +A model may have several different templates for different use cases. For example, a model may have a template for regular chat, tool use, and RAG. + +When there are multiple templates, the chat template is a dictionary. Each key corresponds to the name of a template. [`apply_chat_template`] handles multiple templates based on their name. It looks for a template named `default` in most cases and if it can’t find one, it raises an error. + +For a tool calling template, if a user passes a `tools` parameter and a `tool_use` template exists, the tool calling template is used instead of `default`. + +To access templates with other names, pass the template name to the `chat_template` parameter in [`apply_chat_template`]. 
For example, if you’re using a RAG template then set `chat_template="rag"`. + +It can be confusing to manage multiple templates though, so we recommend using a single template for all use cases. Use Jinja statements like `if tools is defined` and `{% macro %}` definitions to wrap multiple code paths in a single template. + +## Template selection + +It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you’re training the model further, performance is best if the chat tokens are kept constant. + +But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexbile enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template. + +```py +{%- for message in messages %} + {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} +{%- endfor %} +``` + +Set the template with the following logic to support [generation prompts](#add_generation_prompt). The template wraps each message with `<|im_start|>` and `<|im_end|>` tokens and writes the role as a string. This allows you to easily customize the roles you want to train with. + +```py +tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +``` + +The `user`, `system` and `assistant` roles are standard roles in chat templates. We recommend using these roles when it makes sense, especially if you’re using your model with the [`TextGenerationPipeline`]. + +```py +<|im_start|>system +You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> +<|im_start|>user +How are you?<|im_end|> +<|im_start|>assistant +I'm doing great!<|im_end|> +``` + +## Model training + +Training a model with a chat template is a good way to ensure a chat template matches the tokens a model is trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training. + +An example of preprocessing a dataset with a chat template is shown below. + +```py +from transformers import AutoTokenizer +from datasets import Dataset + +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + +chat1 = [ + {"role": "user", "content": "Which is bigger, the moon or the sun?"}, + {"role": "assistant", "content": "The sun."} +] +chat2 = [ + {"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, + {"role": "assistant", "content": "A bacterium."} +] + +dataset = Dataset.from_dict({"chat": [chat1, chat2]}) +dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) +print(dataset['formatted_chat'][0]) +``` +```md +<|user|> +Which is bigger, the moon or the sun? +<|assistant|> +The sun. 
+``` + +After this step, you can continue following the [training recipe](./tasks/language_modeling) for causal language models using the `formatted_chat` column. + +Some tokenizers add special `` and `` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` as well to avoid duplicating them. + +```py +apply_chat_template(messages, tokenize=False, add_special_tokens=False) +``` + +This isn’t an issue if `apply_chat_template(tokenize=True)`. diff --git a/docs/source/en/chat_templating_multimodal.md b/docs/source/en/chat_templating_multimodal.md index 4ac936b2a856..07ee8c828e41 100644 --- a/docs/source/en/chat_templating_multimodal.md +++ b/docs/source/en/chat_templating_multimodal.md @@ -72,8 +72,8 @@ pipeline(text=messages, max_new_tokens=50, return_full_text=False) For multimodal models that accept images like [LLaVA](./model_doc/llava), include the following in `content` as shown below. -- `"type": "image"` means the content is an image. -- `"url": ""` is a link to the image, but it could also be a file path (`"path"`). Images are automatically loaded, processed, and prepared into pixel values as inputs to the model. +- The content `"type"` can be an `"image"` or `"text"`. +- For images, it can be a link to the image (`"url"`), a file path (`"path"`), or `"base64"`. Images are automatically loaded, processed, and prepared into pixel values as inputs to the model. ```python from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration @@ -109,8 +109,11 @@ These inputs are now ready to be used in [`~GenerationMixin.generate`]. Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs). -- `"type": "video"` means the content is a video. -- `"url": ""` is a link to the video, , but it could also be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord). +- The content `"type"` should be `"video"` to indicate the the content is a video. +- For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord). + +> [!WARNING] +> Loading a video from `"url"` is only supported by the PyAV or Decord backends. ```python from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration @@ -134,12 +137,17 @@ messages = [ ] ``` -Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`]. +Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that controls the sampling process. + +The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html). + +The examples below uses Decord as the backend because it is a bit faster than PyAV. 
+ + + -- `num_frames` controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling. -- `video_load_backend` refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html). +The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling. -The example below uses Decord as the backend because it is a bit faster than PyAV. ```python processed_chat = processor.apply_chat_template( @@ -156,6 +164,87 @@ print(processed_chat.keys()) These inputs are now ready to be used in [`~GenerationMixin.generate`]. + + + +For longer videos, it may be better to sample more frames for better representation with the `video_fps` parameter. This determines how many frames per second to extract. As an example, if a video is 10 seconds long and `video_fps=2`, then the model samples 20 frames. In other words, 2 frames are uniformly sampled every 10 seconds. + +```py +processed_chat = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + video_fps=32, + video_load_backend="decord", +) +print(processed_chat.keys()) +``` + + + + +Some models don't sample frames *uniformly* and require more complex logic to determine which frames to use. For example, the model may have an *adaptive frame selection* or if the model prioritizes *key moments* in a video rather than evenly spaced frames. + +If a model has a different sampling strategy, you can write a function that customizes frame selection. The function should include the following requirements. + +- Use the `sample_indices_fn` parameter to pass a callable function for sampling. +- If provided, this function *overrides* the standard `num_frames` and `fps` parameters. +- The function receives all the parameters passed to `load_video` and must return valid frame indices to sample from. + +An example function is shown below. This gives you full control over frame selection, making the model more adaptable to different video scenarios. + +```py +def sample_indices_fn(metadata, **kwargs): + # samples only the first and the second frame + return [0, 1] + +processed_chat = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + sample_indices_fn=sample_indices_fn, + video_load_backend="decord", +) +print(processed_chat.keys()) +``` + + + + +Videos may also exist as a set of sampled frames stored as images rather than the full video file. + +In this case, pass a list of image file paths and the processor automatically concatenates them into a video. Make sure all images are the same size since they are assumed to be from the same video. 
+ +```py +frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"] +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "video", "path": frames_paths}, + {"type": "text", "text": "What do you see in this video?"}, + ], + }, +] + +processed_chat = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, +) +print(processed_chat.keys()) +``` + + + + ## Template configuration You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details. diff --git a/docs/source/en/quantization/finegrained_fp8.md b/docs/source/en/quantization/finegrained_fp8.md index 785e5e88e128..53e2a1cd3b8f 100644 --- a/docs/source/en/quantization/finegrained_fp8.md +++ b/docs/source/en/quantization/finegrained_fp8.md @@ -1,4 +1,4 @@ -