diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7e497d755a14..2aef0775fd7b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -170,8 +170,7 @@ jobs:
- store_artifacts:
path: ~/transformers/installed.txt
- run: python utils/check_copies.py
- - run: python utils/check_modular_conversion.py --num_workers 4
- - run: python utils/check_table.py
+ - run: python utils/check_modular_conversion.py
- run: python utils/check_dummies.py
- run: python utils/check_repo.py
- run: python utils/check_inits.py
@@ -181,7 +180,6 @@ jobs:
- run: make deps_table_check_updated
- run: python utils/update_metadata.py --check-only
- run: python utils/check_docstrings.py
- - run: python utils/check_support_list.py
workflows:
version: 2
diff --git a/Makefile b/Makefile
index 710c555b74f6..21152e985082 100644
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,6 @@ autogenerate_code: deps_table_update
repo-consistency:
python utils/check_copies.py
python utils/check_modular_conversion.py
- python utils/check_table.py
python utils/check_dummies.py
python utils/check_repo.py
python utils/check_inits.py
@@ -46,7 +45,6 @@ repo-consistency:
python utils/check_doctest_list.py
python utils/update_metadata.py --check-only
python utils/check_docstrings.py
- python utils/check_support_list.py
# this target runs checks on all files
@@ -82,7 +80,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
fix-copies:
python utils/check_copies.py --fix_and_overwrite
python utils/check_modular_conversion.py --fix_and_overwrite
- python utils/check_table.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
python utils/check_doctest_list.py --fix_and_overwrite
python utils/check_docstrings.py --fix_and_overwrite
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 4b34ccf0e3e9..a0407f38535d 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1,292 +1,307 @@
-- sections:
+- title: Get started
+ sections:
- local: index
- title: 🤗 Transformers
- - local: quicktour
- title: Quick tour
+ title: Transformers
- local: installation
title: Installation
- - local: add_new_model
- title: Adding a new model to `transformers`
- title: Get started
-- sections:
- - local: pipeline_tutorial
- title: Run inference with pipelines
- - local: autoclass_tutorial
- title: Write portable code with AutoClass
- - local: preprocessing
- title: Preprocess data
- - local: training
- title: Fine-tune a pretrained model
- - local: run_scripts
- title: Train with a script
- - local: accelerate
- title: Set up distributed training with 🤗 Accelerate
- - local: peft
- title: Load and train adapters with 🤗 PEFT
- - local: model_sharing
- title: Share your model
- - local: agents
- title: Agents 101
- - local: agents_advanced
- title: Agents, supercharged - Multi-agents, External tools, and more
- - local: llm_tutorial
- title: Generation with LLMs
- - local: conversations
- title: Chatting with Transformers
- title: Tutorials
-- sections:
- - isExpanded: false
- sections:
- - local: tasks/sequence_classification
- title: Text classification
- - local: tasks/token_classification
- title: Token classification
- - local: tasks/question_answering
- title: Question answering
- - local: tasks/language_modeling
- title: Causal language modeling
- - local: tasks/masked_language_modeling
- title: Masked language modeling
- - local: tasks/translation
- title: Translation
- - local: tasks/summarization
- title: Summarization
- - local: tasks/multiple_choice
- title: Multiple choice
- title: Natural Language Processing
- - isExpanded: false
+ - local: quicktour
+ title: Quickstart
+- title: Base classes
+ isExpanded: False
+ sections:
+ - title: Models
sections:
- - local: tasks/audio_classification
- title: Audio classification
- - local: tasks/asr
- title: Automatic speech recognition
- title: Audio
- - isExpanded: false
+ - local: models
+ title: Loading models
+ - local: custom_models
+ title: Customizing models
+ - local: how_to_hack_models
+ title: Customizing model components
+ - local: model_sharing
+ title: Sharing
+ - local: add_new_model
+ title: Adding a new model to Transformers
+ - local: modular_transformers
+ title: Modular Transformers
+ - local: task_summary
+ title: What 🤗 Transformers can do
+ - local: tasks_explained
+ title: How 🤗 Transformers solve tasks
+ - local: model_summary
+ title: The Transformer model family
+ - local: attention
+ title: Attention mechanisms
+ - title: Preprocessors
sections:
- - local: tasks/image_classification
- title: Image classification
- - local: tasks/semantic_segmentation
- title: Image segmentation
- - local: tasks/video_classification
- title: Video classification
- - local: tasks/object_detection
- title: Object detection
- - local: tasks/zero_shot_object_detection
- title: Zero-shot object detection
- - local: tasks/zero_shot_image_classification
- title: Zero-shot image classification
- - local: tasks/monocular_depth_estimation
- title: Depth estimation
- - local: tasks/image_to_image
- title: Image-to-Image
- - local: tasks/image_feature_extraction
- title: Image Feature Extraction
- - local: tasks/mask_generation
- title: Mask Generation
- - local: tasks/keypoint_detection
- title: Keypoint Detection
- - local: tasks/knowledge_distillation_for_image_classification
- title: Knowledge Distillation for Computer Vision
- title: Computer Vision
- - isExpanded: false
+ - local: fast_tokenizers
+ title: Tokenizers
+ - local: image_processors
+ title: Image processors
+ - local: backbones
+ title: Backbones
+ - local: feature_extractors
+ title: Feature extractors
+ - local: processors
+ title: Processors
+ - local: tokenizer_summary
+ title: Summary of the tokenizers
+ - local: pad_truncation
+ title: Padding and truncation
+- title: Inference
+ isExpanded: False
+ sections:
+ - title: Pipeline API
sections:
- - local: tasks/image_captioning
- title: Image captioning
- - local: tasks/document_question_answering
- title: Document Question Answering
- - local: tasks/visual_question_answering
- title: Visual Question Answering
- - local: tasks/text-to-speech
- title: Text to speech
- - local: tasks/image_text_to_text
- title: Image-text-to-text
- - local: tasks/video_text_to_text
- title: Video-text-to-text
- title: Multimodal
- - isExpanded: false
+ - local: pipeline_tutorial
+ title: Pipeline
+ - local: pipeline_gradio
+ title: Machine learning apps
+ - local: pipeline_webserver
+ title: Web server inference
+ - local: add_new_pipeline
+ title: Adding a new pipeline
+ - title: LLMs
sections:
+ - local: llm_tutorial
+ title: Text generation
- local: generation_strategies
- title: Customize the generation strategy
+ title: Generation strategies
+ - local: generation_features
+ title: Generation features
+ - local: tasks/prompting
+ title: Prompt engineering
+ - local: llm_optims
+ title: Optimizing inference
- local: kv_cache
- title: Best Practices for Generation with Cache
- title: Generation
- - isExpanded: false
+ title: KV cache strategies
+ - local: cache_explanation
+ title: Caching
+ - local: llm_tutorial_optimization
+ title: Getting the most out of LLMs
+ - local: perplexity
+ title: Perplexity of fixed-length models
+ - title: Chat with models
sections:
- - local: chat_template_basics
- title: Getting Started with Chat Templates for Text LLMs
- - local: chat_template_multimodal
- title: Multimodal Chat Templates for Vision and Audio LLMs
- - local: chat_template_tools_and_documents
- title: Expanding Chat Templates with Tools and Documents
- - local: chat_template_advanced
- title: Advanced Usage and Customizing Your Chat Templates
- title: Chat Templates
- - isExpanded: false
+ - local: conversations
+ title: Chat basics
+ - local: chat_templating
+ title: Templates
+ - local: chat_templating_multimodal
+ title: Multimodal templates
+ - local: chat_templating_writing
+ title: Template writing
+ - local: chat_extras
+ title: Tools and RAG
+ - title: Optimization
sections:
- - local: tasks/idefics
- title: Image tasks with IDEFICS
- - local: tasks/prompting
- title: LLM prompting guide
- title: Prompting
- title: Task Guides
-- sections:
- - local: fast_tokenizers
- title: Use fast tokenizers from 🤗 Tokenizers
- - local: multilingual
- title: Run inference with multilingual models
- - local: create_a_model
- title: Use model-specific APIs
- - local: custom_models
- title: Share a custom model
- - local: trainer
- title: Trainer
- - local: sagemaker
- title: Run training on Amazon SageMaker
- - local: serialization
- title: Export to ONNX
- - local: tflite
- title: Export to TFLite
- - local: torchscript
- title: Export to TorchScript
- - local: notebooks
- title: Notebooks with examples
- - local: community
- title: Community resources
- - local: troubleshooting
- title: Troubleshoot
- - local: gguf
- title: Interoperability with GGUF files
- - local: tiktoken
- title: Interoperability with TikToken files
- - local: modular_transformers
- title: Modularity in `transformers`
- - local: how_to_hack_models
- title: Model Hacking (overwriting a class to your usage)
- title: Developer guides
-- sections:
+ - local: perf_torch_compile
+ title: torch.compile
+ - local: perf_infer_gpu_one
+ title: GPU
+ - local: perf_infer_gpu_multi
+ title: Distributed GPU inference
+ - local: perf_infer_cpu
+ title: CPU
+ - local: tf_xla
+ title: XLA
+ - local: agents
+ title: Agents
+ - local: tools
+ title: Tools
+- title: Training
+ isExpanded: False
+ sections:
+ - title: Trainer API
+ sections:
+ - local: trainer
+ title: Trainer
+ - local: training
+ title: Fine-tuning
+ - local: optimizers
+ title: Optimizers
+ - local: hpo_train
+ title: Hyperparameter search
+ - title: Distributed training
+ sections:
+ - local: gpu_selection
+ title: GPU selection
+ - local: accelerate
+ title: Accelerate
+ - local: fsdp
+ title: FullyShardedDataParallel
+ - local: deepspeed
+ title: DeepSpeed
+ - local: debugging
+ title: Multi-GPU debugging
+ - local: perf_train_cpu_many
+ title: Distributed CPUs
+ - local: perf_train_gpu_many
+ title: Parallelism methods
+ - title: Hardware
+ sections:
+ - local: perf_train_gpu_one
+ title: GPU
+ - local: perf_train_cpu
+ title: CPU
+ - local: perf_train_tpu_tf
+ title: TPU
+ - local: perf_train_special
+ title: Apple Silicon
+ - local: perf_hardware
+ title: Build your own machine
+ - local: peft
+ title: PEFT
+ - local: model_memory_anatomy
+ title: Model training anatomy
+- title: Quantization
+ isExpanded: False
+ sections:
- local: quantization/overview
- title: Getting started
- - local: quantization/bitsandbytes
- title: bitsandbytes
- - local: quantization/gptq
- title: GPTQ
- - local: quantization/awq
- title: AWQ
+ title: Overview
- local: quantization/aqlm
title: AQLM
- - local: quantization/vptq
- title: SpQR
- - local: quantization/spqr
- title: VPTQ
- - local: quantization/quanto
- title: Quanto
+ - local: quantization/awq
+ title: AWQ
+ - local: quantization/bitnet
+ title: BitNet
+ - local: quantization/bitsandbytes
+ title: bitsandbytes
+ - local: quantization/compressed_tensors
+ title: compressed-tensors
- local: quantization/eetq
title: EETQ
+ - local: quantization/fbgemm_fp8
+ title: FBGEMM
+ - local: quantization/finegrained_fp8
+ title: Fine-grained FP8
+ - local: gguf
+ title: GGUF
+ - local: quantization/gptq
+ title: GPTQ
- local: quantization/higgs
title: HIGGS
- local: quantization/hqq
title: HQQ
- - local: quantization/fbgemm_fp8
- title: FBGEMM_FP8
- local: quantization/optimum
title: Optimum
+ - local: quantization/quanto
+ title: Quanto
- local: quantization/torchao
- title: TorchAO
- - local: quantization/bitnet
- title: BitNet
- - local: quantization/compressed_tensors
- title: compressed-tensors
- - local: quantization/finegrained_fp8
- title: Fine-grained FP8
+ title: torchao
+ - local: quantization/spqr
+ title: SpQR
+ - local: quantization/vptq
+ title: VPTQ
- local: quantization/contribute
- title: Contribute new quantization method
- title: Quantization Methods
-- sections:
- - local: performance
- title: Overview
- - local: llm_optims
- title: LLM inference optimization
- - sections:
- - local: perf_train_gpu_one
- title: Methods and tools for efficient training on a single GPU
- - local: perf_train_gpu_many
- title: Multiple GPUs and parallelism
- - local: fsdp
- title: Fully Sharded Data Parallel
- - local: deepspeed
- title: DeepSpeed
- - local: perf_train_cpu
- title: Efficient training on CPU
- - local: perf_train_cpu_many
- title: Distributed CPU training
- - local: perf_train_tpu_tf
- title: Training on TPU with TensorFlow
- - local: perf_train_special
- title: PyTorch training on Apple silicon
- - local: perf_hardware
- title: Custom hardware for training
- - local: hpo_train
- title: Hyperparameter Search using Trainer API
- title: Efficient training techniques
- - sections:
- - local: perf_infer_cpu
- title: CPU inference
- - local: perf_infer_gpu_one
- title: GPU inference
- - local: perf_infer_gpu_multi
- title: Multi-GPU inference
- title: Optimizing inference
- - local: big_models
- title: Instantiate a big model
- - local: debugging
- title: Debugging
- - local: tf_xla
- title: XLA Integration for TensorFlow Models
- - local: perf_torch_compile
- title: Optimize inference using `torch.compile()`
- title: Performance and scalability
-- sections:
+ title: Contribute
+- title: Export to production
+ isExpanded: False
+ sections:
+ - local: serialization
+ title: ONNX
+ - local: tflite
+ title: LiteRT
+ - local: executorch
+ title: ExecuTorch
+ - local: torchscript
+ title: TorchScript
+- title: Resources
+ isExpanded: False
+ sections:
+ - title: Task recipes
+ sections:
+ - title: Natural language processing
+ sections:
+ - local: tasks/sequence_classification
+ title: Text classification
+ - local: tasks/token_classification
+ title: Token classification
+ - local: tasks/question_answering
+ title: Question answering
+ - local: tasks/language_modeling
+ title: Causal language modeling
+ - local: tasks/masked_language_modeling
+ title: Masked language modeling
+ - local: tasks/translation
+ title: Translation
+ - local: tasks/summarization
+ title: Summarization
+ - local: tasks/multiple_choice
+ title: Multiple choice
+ - title: Audio
+ sections:
+ - local: tasks/audio_classification
+ title: Audio classification
+ - local: tasks/asr
+ title: Automatic speech recognition
+ - title: Computer vision
+ sections:
+ - local: tasks/image_classification
+ title: Image classification
+ - local: tasks/semantic_segmentation
+ title: Image segmentation
+ - local: tasks/video_classification
+ title: Video classification
+ - local: tasks/object_detection
+ title: Object detection
+ - local: tasks/zero_shot_object_detection
+ title: Zero-shot object detection
+ - local: tasks/zero_shot_image_classification
+ title: Zero-shot image classification
+ - local: tasks/monocular_depth_estimation
+ title: Depth estimation
+ - local: tasks/image_to_image
+ title: Image-to-Image
+ - local: tasks/image_feature_extraction
+ title: Image Feature Extraction
+ - local: tasks/mask_generation
+ title: Mask Generation
+ - local: tasks/keypoint_detection
+ title: Keypoint detection
+ - local: tasks/knowledge_distillation_for_image_classification
+ title: Knowledge Distillation for Computer Vision
+ - title: Multimodal
+ sections:
+ - local: tasks/image_captioning
+ title: Image captioning
+ - local: tasks/document_question_answering
+ title: Document Question Answering
+ - local: tasks/visual_question_answering
+ title: Visual Question Answering
+ - local: tasks/text-to-speech
+ title: Text to speech
+ - local: tasks/idefics
+ title: Image tasks with IDEFICS
+ - local: tasks/image_text_to_text
+ title: Image-text-to-text
+ - local: tasks/video_text_to_text
+ title: Video-text-to-text
+ - local: run_scripts
+ title: Training scripts
+ - local: glossary
+ title: Glossary
+ - local: philosophy
+ title: Philosophy
+ - local: notebooks
+ title: Notebooks with examples
+ - local: community
+ title: Community resources
+ - local: troubleshooting
+ title: Troubleshoot
+- title: Contribute
+ isExpanded: False
+ sections:
- local: contributing
- title: How to contribute to 🤗 Transformers?
- - local: add_new_model
- title: How to add a model to 🤗 Transformers?
- - local: add_new_pipeline
- title: How to add a pipeline to 🤗 Transformers?
+ title: Contribute to Transformers
- local: testing
- title: Testing
+ title: Transformers model tests
- local: pr_checks
- title: Checks on a Pull Request
- title: Contribute
-- sections:
- - local: philosophy
- title: Philosophy
- - local: glossary
- title: Glossary
- - local: task_summary
- title: What 🤗 Transformers can do
- - local: tasks_explained
- title: How 🤗 Transformers solve tasks
- - local: model_summary
- title: The Transformer model family
- - local: tokenizer_summary
- title: Summary of the tokenizers
- - local: attention
- title: Attention mechanisms
- - local: pad_truncation
- title: Padding and truncation
- - local: bertology
- title: BERTology
- - local: perplexity
- title: Perplexity of fixed-length models
- - local: pipeline_webserver
- title: Pipelines for webserver inference
- - local: model_memory_anatomy
- title: Model training anatomy
- - local: llm_tutorial_optimization
- title: Getting the most out of LLMs
- title: Conceptual guides
-- sections:
- - sections:
+ title: Pull request checks
+- title: API
+ isExpanded: False
+ sections:
+ - title: Main classes
+ sections:
- local: main_classes/agent
title: Agents and Tools
- local: model_doc/auto
@@ -313,6 +328,8 @@
title: Optimization
- local: main_classes/output
title: Model outputs
+ - local: main_classes/peft
+ title: PEFT
- local: main_classes/pipelines
title: Pipelines
- local: main_classes/processors
@@ -331,9 +348,9 @@
title: Feature Extractor
- local: main_classes/image_processor
title: Image Processor
- title: Main Classes
- - sections:
- - isExpanded: false
+ - title: Models
+ sections:
+ - title: Text models
sections:
- local: model_doc/albert
title: ALBERT
@@ -643,8 +660,7 @@
title: Zamba
- local: model_doc/zamba2
title: Zamba2
- title: Text models
- - isExpanded: false
+ - title: Vision models
sections:
- local: model_doc/beit
title: BEiT
@@ -772,8 +788,7 @@
title: YOLOS
- local: model_doc/zoedepth
title: ZoeDepth
- title: Vision models
- - isExpanded: false
+ - title: Audio models
sections:
- local: model_doc/audio-spectrogram-transformer
title: Audio Spectrogram Transformer
@@ -843,8 +858,7 @@
title: XLS-R
- local: model_doc/xlsr_wav2vec2
title: XLSR-Wav2Vec2
- title: Audio models
- - isExpanded: false
+ - title: Video models
sections:
- local: model_doc/timesformer
title: TimeSformer
@@ -852,8 +866,7 @@
title: VideoMAE
- local: model_doc/vivit
title: ViViT
- title: Video models
- - isExpanded: false
+ - title: Multimodal models
sections:
- local: model_doc/align
title: ALIGN
@@ -991,15 +1004,13 @@
title: VisualBERT
- local: model_doc/xclip
title: X-CLIP
- title: Multimodal models
- - isExpanded: false
+ - title: Reinforcement learning models
sections:
- local: model_doc/decision_transformer
title: Decision Transformer
- local: model_doc/trajectory_transformer
title: Trajectory Transformer
- title: Reinforcement learning models
- - isExpanded: false
+ - title: Time series models
sections:
- local: model_doc/autoformer
title: Autoformer
@@ -1011,14 +1022,12 @@
title: PatchTST
- local: model_doc/time_series_transformer
title: Time Series Transformer
- title: Time series models
- - isExpanded: false
+ - title: Graph models
sections:
- local: model_doc/graphormer
title: Graphormer
- title: Graph models
- title: Models
- - sections:
+ - title: Internal helpers
+ sections:
- local: internal/modeling_utils
title: Custom Layers and Utilities
- local: internal/pipelines_utils
@@ -1037,5 +1046,4 @@
title: General Utilities
- local: internal/time_series_utils
title: Utilities for Time Series
- title: Internal Helpers
- title: API
+
\ No newline at end of file
diff --git a/docs/source/en/accelerate.md b/docs/source/en/accelerate.md
index e0a7a9c65623..c0ad46f8ac91 100644
--- a/docs/source/en/accelerate.md
+++ b/docs/source/en/accelerate.md
@@ -1,4 +1,4 @@
-
-# Distributed training with 🤗 Accelerate
+# Accelerate
-As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment.
+[Accelerate](https://hf.co/docs/accelerate/index) is a library designed to simplify PyTorch distributed training on any type of setup by unifying the most common frameworks ([Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/)) under a single interface. [`Trainer`] is powered by Accelerate under the hood, enabling big model loading and distributed training.
-## Setup
-
-Get started by installing 🤗 Accelerate:
+This guide will show you two ways to use Accelerate with Transformers, using FSDP as the backend. The first method demonstrates distributed training with [`Trainer`], and the second method demonstrates adapting a PyTorch training loop. For more detailed information about Accelerate, please refer to the [documentation](https://hf.co/docs/accelerate/index).
```bash
pip install accelerate
```
-Then import and create an [`~accelerate.Accelerator`] object. The [`~accelerate.Accelerator`] will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device.
+Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-config) in the command line to answer a series of prompts about your training system. This creates and saves a configuration file to help Accelerate correctly set up training based on your setup.
-```py
->>> from accelerate import Accelerator
+```bash
+accelerate config
+```
->>> accelerator = Accelerator()
+Depending on your setup and the answers you provide, an example configuration file for distributed training with FSDP on one machine with two GPUs may look like the following.
+
+```yaml
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ fsdp_backward_prefetch_policy: BACKWARD_PRE
+ fsdp_forward_prefetch: false
+ fsdp_cpu_ram_efficient_loading: true
+ fsdp_offload_params: false
+ fsdp_sharding_strategy: FULL_SHARD
+ fsdp_state_dict_type: SHARDED_STATE_DICT
+ fsdp_sync_module_states: true
+ fsdp_transformer_layer_cls_to_wrap: BertLayer
+ fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 2
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
```
-## Prepare to accelerate
+## Trainer
-The next step is to pass all the relevant training objects to the [`~accelerate.Accelerator.prepare`] method. This includes your training and evaluation DataLoaders, a model and an optimizer:
+Pass the path to the saved configuration file to [`TrainingArguments`], and from there, pass your [`TrainingArguments`] to [`Trainer`].
```py
->>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
-... train_dataloader, eval_dataloader, model, optimizer
-... )
+from transformers import TrainingArguments, Trainer
+
+training_args = TrainingArguments(
+ output_dir="your-model",
+ learning_rate=2e-5,
+ per_device_train_batch_size=16,
+ per_device_eval_batch_size=16,
+ num_train_epochs=2,
+ fsdp_config="path/to/fsdp_config",
+    fsdp="full_shard",
+ weight_decay=0.01,
+ eval_strategy="epoch",
+ save_strategy="epoch",
+ load_best_model_at_end=True,
+ push_to_hub=True,
+)
+
+trainer = Trainer(
+ model=model,
+ args=training_args,
+ train_dataset=dataset["train"],
+ eval_dataset=dataset["test"],
+ processing_class=tokenizer,
+ data_collator=data_collator,
+ compute_metrics=compute_metrics,
+)
+
+trainer.train()
```
-## Backward
+## Native PyTorch
-The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method:
+Accelerate can also be added to any PyTorch training loop to enable distributed training. The [`~accelerate.Accelerator`] is the main entry point for adapting your PyTorch code to work with Accelerate. It automatically detects your distributed training setup and initializes all the necessary components for training. You don't need to explicitly place your model on a device because [`~accelerate.Accelerator`] knows which device to move your model to.
```py
->>> for epoch in range(num_epochs):
-... for batch in train_dataloader:
-... outputs = model(**batch)
-... loss = outputs.loss
-... accelerator.backward(loss)
-
-... optimizer.step()
-... lr_scheduler.step()
-... optimizer.zero_grad()
-... progress_bar.update(1)
+from accelerate import Accelerator
+
+accelerator = Accelerator()
+device = accelerator.device
```
-As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training!
+All PyTorch objects (model, optimizer, scheduler, dataloaders) should now be passed to the [`~accelerate.Accelerator.prepare`] method. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.optimizer.AcceleratedOptimizer`] and [`~accelerate.scheduler.AcceleratedScheduler`], and creates a new shardable dataloader.
-```diff
-+ from accelerate import Accelerator
- from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
+```py
+train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+ train_dataloader, eval_dataloader, model, optimizer
+)
+```
-+ accelerator = Accelerator()
+Replace `loss.backward` in your training loop with Accelerate's [`~accelerate.Accelerator.backward`] method to scale the gradients and determine the appropriate `backward` method to use depending on your framework (for example, DeepSpeed or Megatron).
- model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
- optimizer = AdamW(model.parameters(), lr=3e-5)
+```py
+for epoch in range(num_epochs):
+ for batch in train_dataloader:
+ outputs = model(**batch)
+ loss = outputs.loss
+ accelerator.backward(loss)
+ optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+```
-- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-- model.to(device)
+Combine everything into a function and make it callable as a script.
-+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
-+ train_dataloader, eval_dataloader, model, optimizer
-+ )
+```py
+from accelerate import Accelerator
+
+def main():
+ accelerator = Accelerator()
- num_epochs = 3
- num_training_steps = num_epochs * len(train_dataloader)
- lr_scheduler = get_scheduler(
- "linear",
- optimizer=optimizer,
- num_warmup_steps=0,
- num_training_steps=num_training_steps
+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
+ model, optimizer, training_dataloader, scheduler
)
- progress_bar = tqdm(range(num_training_steps))
-
- model.train()
- for epoch in range(num_epochs):
- for batch in train_dataloader:
-- batch = {k: v.to(device) for k, v in batch.items()}
- outputs = model(**batch)
- loss = outputs.loss
-- loss.backward()
-+ accelerator.backward(loss)
-
- optimizer.step()
- lr_scheduler.step()
- optimizer.zero_grad()
- progress_bar.update(1)
+ for batch in training_dataloader:
+ optimizer.zero_grad()
+ inputs, targets = batch
+ outputs = model(inputs)
+ loss = loss_function(outputs, targets)
+ accelerator.backward(loss)
+ optimizer.step()
+ scheduler.step()
+
+if __name__ == "__main__":
+ main()
```
-## Train
-
-Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory.
+From the command line, call [accelerate launch](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-launch) to run your training script. Any additional arguments or parameters can be passed here as well.
-### Train with a script
-
-If you are running your training from a script, run the following command to create and save a configuration file:
-
-```bash
-accelerate config
-```
-
-Then launch your training with:
+To launch your training script on two GPUs, add the `--num_processes` argument.
```bash
-accelerate launch train.py
-```
-
-### Train with a notebook
-
-🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to [`~accelerate.notebook_launcher`]:
-
-```py
->>> from accelerate import notebook_launcher
-
->>> notebook_launcher(training_function)
+accelerate launch --num_processes=2 your_script.py
```
-For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate).
+Refer to the [Launching Accelerate scripts](https://hf.co/docs/accelerate/main/en/basic_tutorials/launch) guide for more details.
diff --git a/docs/source/en/add_new_model.md b/docs/source/en/add_new_model.md
index 9aab36bb6fbe..bfab511972e7 100644
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@@ -1,4 +1,4 @@
-
-# How to add a model to 🤗 Transformers?
+# Adding a new model to Transformers
-The 🤗 Transformers library is often able to offer new models thanks to community contributors. But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have [PyTorch installed](https://pytorch.org/get-started/locally/)).
+> [!TIP]
+> Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
-Along the way, you'll:
+Many of the models in Transformers are contributed by developers and researchers. As an open-source first project, we're invested in empowering the community to actively and independently add more models.
-- get insights into open-source best practices
-- understand the design principles behind one of the most popular deep learning libraries
-- learn how to efficiently test large models
-- learn how to integrate Python utilities like `black`, `ruff`, and `make fix-copies` to ensure clean and readable code
+When you add a model to Transformers, you'll learn:
-A Hugging Face team member will be available to help you along the way so you'll never be alone. 🤗 ❤️
+- more about open-source best practices
+- about a model's architecture
+- about Transformers' design principles
+- how to efficiently test large models
+- how to use Python utilities like [Black](https://black.readthedocs.io/en/stable/) and [Ruff](https://docs.astral.sh/ruff/) to create clean and readable code
-To get started, open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue for the model you want to see in 🤗 Transformers. If you're not especially picky about contributing a specific model, you can filter by the [New model label](https://github.com/huggingface/transformers/labels/New%20model) to see if there are any unclaimed model requests and work on it.
+It is a challenging but rewarding process.
-Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already!
+This guide will walk you through adding an example BrandNewLlama PyTorch model to Transformers. Before you begin, it is a good idea to familiarize yourself with the library.
-## General overview of 🤗 Transformers
+## Transformers overview
-First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a
-chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we
-found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗
-Transformers while keeping maintenance costs at a reasonable level.
+Transformers is an opinionated library with its own unique philosophy and design choices. These choices help us sustainably scale and maintain Transformers.
-A good first starting point to better understand the library is to read the [documentation of our philosophy](philosophy). As a result of our way of working, there are some choices that we try to apply to all models:
+> [!TIP]
+> Learn more about our design principles on the [Philosophy](./philosophy) doc.
-- Composition is generally favored over-abstraction
-- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model
-- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only
- have to look into the respective `modeling_....py` file.
+Some of these design choices are:
-In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for
-inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the
-person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code.
+- composition over abstraction
+- duplicate code isn't always bad if it greatly improves readability and accessibility
+- model files are self-contained and all the necessary model code is found in the `modeling_mymodel.py` file
-With this in mind, let's go a bit deeper into the general library design.
+These design choices are important *for everyone* interacting with the model. They make the code easier to read, understand, and modify.
-### Overview of models
+This section describes how the model and configuration classes interact and the Transformers code style.
-To successfully add a model, it is important to understand the interaction between your model and its config,
-[`PreTrainedModel`], and [`PretrainedConfig`]. For exemplary purposes, we will
-call the model to be added to 🤗 Transformers `BrandNewBert`.
+### Model and configuration
-Let's take a look:
+All Transformers models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the model's blueprint.
-
+There are never more than two levels of abstraction for any model to keep the code readable. The example model here, BrandNewLlama, inherits from `BrandNewLlamaPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods.
-As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute
-minimum. There are never more than two levels of abstraction for any model in the library. `BrandNewBertModel`
-inherits from `BrandNewBertPreTrainedModel` which in turn inherits from [`PreTrainedModel`] and
-that's it. As a general rule, we want to make sure that a new model only depends on
-[`PreTrainedModel`]. The important functionalities that are automatically provided to every new
-model are [`~PreTrainedModel.from_pretrained`] and
-[`~PreTrainedModel.save_pretrained`], which are used for serialization and deserialization. All of the
-other important functionalities, such as `BrandNewBertModel.forward` should be completely defined in the new
-`modeling_brand_new_bert.py` script. Next, we want to make sure that a model with a specific head layer, such as
-`BrandNewBertForMaskedLM` does not inherit from `BrandNewBertModel`, but rather uses `BrandNewBertModel`
-as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a
-configuration class, called `BrandNewBertConfig`. This configuration is always stored as an attribute in
-[`PreTrainedModel`], and thus can be accessed via the `config` attribute for all classes
-inheriting from `BrandNewBertPreTrainedModel`:
+Other important functions like the forward method are defined in the `modeling.py` file.
-```python
-model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
-model.config # model has access to its config
+Specific model heads (for example, sequence classification or language modeling) should call the base model in the forward pass rather than inheriting from it to keep abstraction low.
+
+New models require a configuration, for example `BrandNewLlamaConfig`, that is stored as an attribute of [`PreTrainedModel`].
+
+```py
+model = BrandNewLlamaModel.from_pretrained("username/brand_new_llama")
+model.config
```
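+
+As a loose sketch of how these pieces fit together (all class and attribute names below are illustrative placeholders for the example BrandNewLlama model, not an existing implementation), the two-level inheritance and the head-calls-base pattern could look like this. A real model would also define its layers, `forward` methods, and weight initialization in full.
+
+```py
+from torch import nn
+from transformers import PretrainedConfig, PreTrainedModel
+
+class BrandNewLlamaConfig(PretrainedConfig):
+    model_type = "brand_new_llama"
+
+class BrandNewLlamaPreTrainedModel(PreTrainedModel):
+    config_class = BrandNewLlamaConfig
+    base_model_prefix = "model"
+
+class BrandNewLlamaModel(BrandNewLlamaPreTrainedModel):
+    def __init__(self, config: BrandNewLlamaConfig):
+        super().__init__(config)
+        # layers and the forward pass live here, in modeling_brand_new_llama.py
+
+class BrandNewLlamaForCausalLM(BrandNewLlamaPreTrainedModel):
+    def __init__(self, config: BrandNewLlamaConfig):
+        super().__init__(config)
+        # the head calls the base model as a component instead of inheriting from it
+        self.model = BrandNewLlamaModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+```
+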
-Similar to the model, the configuration inherits basic serialization and deserialization functionalities from
-[`PretrainedConfig`]. Note that the configuration and the model are always serialized into two
-different formats - the model to a *pytorch_model.bin* file and the configuration to a *config.json* file. Calling
-the model's [`~PreTrainedModel.save_pretrained`] will automatically call
-the config's [`~PretrainedConfig.save_pretrained`], so that both model and configuration are saved.
+[`PretrainedConfig`] provides the [`~PretrainedConfig.from_pretrained`] and [`~PretrainedConfig.save_pretrained`] methods.
+When you use [`PreTrainedModel.save_pretrained`], it automatically calls [`PretrainedConfig.save_pretrained`] so that both the model and configuration are saved together.
+
+A model is saved to a `model.safetensors` file and a configuration is saved to a `config.json` file.
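+
+For example, continuing the sketch above (the directory name is just a placeholder), saving the model writes both files and either class can be reloaded from the same directory.
+
+```py
+model.save_pretrained("brand_new_llama")  # writes model.safetensors and config.json
+model = BrandNewLlamaModel.from_pretrained("brand_new_llama")
+config = BrandNewLlamaConfig.from_pretrained("brand_new_llama")  # reads config.json only
+```
+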
### Code style
-When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our
-own regarding how code should be written :-)
-
-1. The forward pass of your model should be fully written in the modeling file while being fully independent of other
- models in the library. If you want to reuse a block from another model, copy the code and paste it with a
- `# Copied from` comment on top (see [here](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
- for a good example and [there](pr_checks#check-copies) for more documentation on Copied from).
-2. The code should be fully understandable, even by a non-native English speaker. This means you should pick
- descriptive variable names and avoid abbreviations. As an example, `activation` is preferred to `act`.
- One-letter variable names are strongly discouraged unless it's an index in a for loop.
-3. More generally we prefer longer explicit code to short magical one.
-4. Avoid subclassing `nn.Sequential` in PyTorch but subclass `nn.Module` and write the forward pass, so that anyone
- using your code can quickly debug it by adding print statements or breaking points.
-5. Your function signature should be type-annotated. For the rest, good variable names are way more readable and
- understandable than type annotations.
-
-### Overview of tokenizers
-
-Not quite ready yet :-( This section will be added soon!
-
-## Step-by-step recipe to add a model to 🤗 Transformers
-
-Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries
-of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model:
-
-1. [Porting GPT2 Model](https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28) by [Thomas](https://huggingface.co/thomwolf)
-2. [Porting WMT19 MT Model](https://huggingface.co/blog/porting-fsmt) by [Stas](https://huggingface.co/stas)
-
-From experience, we can tell you that the most important things to keep in mind when adding a model are:
-
-- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist
- somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy
- from. [grep](https://www.gnu.org/software/grep/) and [rg](https://github.com/BurntSushi/ripgrep) are your
- friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and
- your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code
- is based on XLM.
-- It's more of an engineering challenge than a scientific challenge. You should spend more time creating an
- efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper.
-- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so we at Hugging Face are more
- than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
- progress.
-
-In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers.
-
-The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do
-List:
-
-☐ (Optional) Understood the model's theoretical aspects
-☐ Prepared 🤗 Transformers dev environment
-☐ Set up debugging environment of the original repository
-☐ Created script that successfully runs the `forward()` pass using the original repository and checkpoint
-☐ Successfully added the model skeleton to 🤗 Transformers
-☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint
-☐ Successfully ran `forward()` pass in 🤗 Transformers that gives identical output to original checkpoint
-☐ Finished model tests in 🤗 Transformers
-☐ Successfully added tokenizer in 🤗 Transformers
-☐ Run end-to-end integration tests
-☐ Finished docs
-☐ Uploaded model weights to the Hub
-☐ Submitted the pull request
-☐ (Optional) Added a demo notebook
-
-To begin with, we usually recommend starting by getting a good theoretical understanding of `BrandNewBert`. However,
-if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive
-into the `BrandNewBert`'s code-base. This option might suit you better if your engineering skills are better than
-your theoretical skill, if you have trouble understanding `BrandNewBert`'s paper, or if you just enjoy programming
-much more than reading scientific papers.
-
-### 1. (Optional) Theoretical aspects of BrandNewBert
-
-You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
-sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is
-not to get a deep theoretical understanding of the paper, but to extract the necessary information required to
-effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the
-theoretical aspects, but rather focus on the practical ones, namely:
-
-- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like
- encoder-decoder model? Look at the [model_summary](model_summary) if you're not familiar with the differences between those.
-- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,*
- summarization?
-- What is the novel feature of the model that makes it different from BERT/GPT-2/BART?
-- Which of the already existing [🤗 Transformers models](https://huggingface.co/transformers/#contents) is most
- similar to *brand_new_bert*?
-- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used
- for BERT or BART?
-
-After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the
-Hugging Face team with any questions you might have. This might include questions regarding the model's architecture,
-its attention layer, etc. We will be more than happy to help you.
-
-### 2. Next prepare your environment
-
-1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the ‘Fork' button on the
- repository's page. This creates a copy of the code under your GitHub user account.
-
-2. Clone your `transformers` fork to your local disk, and add the base repository as a remote:
-
- ```bash
- git clone https://github.com/[your Github handle]/transformers.git
- cd transformers
- git remote add upstream https://github.com/huggingface/transformers.git
- ```
-
-3. Set up a development environment, for instance by running the following command:
-
- ```bash
- python -m venv .env
- source .env/bin/activate
- pip install -e ".[dev]"
- ```
-
- Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
- failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
- (PyTorch, TensorFlow and/or Flax) then do:
-
- ```bash
- pip install -e ".[quality]"
- ```
-
- which should be enough for most use cases. You can then return to the parent directory
-
- ```bash
- cd ..
- ```
-
-4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the
- instructions on https://pytorch.org/get-started/locally/.
-
- **Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.
-
-5. To port *brand_new_bert*, you will also need access to its original repository:
-
- ```bash
- git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git
- cd brand_new_bert
- pip install -e .
- ```
-
-Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers.
-
-### 3.-4. Run a pretrained checkpoint using the original repository
-
-At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very
-“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should
-be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people
-stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make
-it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement
-models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**.
-
-You should start thereby by diving into the original repository.
-
-Successfully running the official pretrained model in the original repository is often **the most difficult** step.
-From our experience, it is very important to spend some time getting familiar with the original code-base. You need to
-figure out the following:
-
-- Where to find the pretrained weights?
-- How to load the pretrained weights into the corresponding model?
-- How to run the tokenizer independently from the model?
-- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually,
- you only have to reimplement those functions.
-- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes,
- *e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers,
- *e.g.* *self-attention*, *cross-attention*...?
-- How can you debug the model in the original environment of the repo? Do you have to add *print* statements, can you
- work with an interactive debugger like *ipdb*, or should you use an efficient IDE to debug the model, like PyCharm?
-
-It is very important that before you start the porting process, you can **efficiently** debug code in the original
-repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or
-even a pull request in the original repository. The maintainers of this repository are most likely very happy about
-someone looking into their code!
-
-At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original
-model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to
-dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only
-at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the
-model also works as expected on GPU.
-
-In general, there are two possible debugging environments for running the original model
+Transformers prefers clean and readable code over a more abstracted code style. Some of the code style choices include the following (a short sketch after the list illustrates them):
-- [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
-- Local python scripts.
+- The code should be accessible to non-native English speakers. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One-letter variable names are highly discouraged unless it's an index in a for loop.
-Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split
-logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also,
-notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
-Face team for help. If you are familiar with Jupyter notebooks, we strongly recommend you work with them.
+- Explicit code is preferred - even if it's longer - over shorter code.
-The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend
-some time adjusting to the new programming environment and you might not be able to use your known debugging tools
-anymore, like `ipdb`.
+- Avoid subclassing [nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html). Subclass [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) instead so the code can be quickly debugged with print statements or breakpoints.
-For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
-single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in
-pseudocode):
+- Function signatures should be type-annotated. Otherwise, use good variable names so they're more understandable.
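+
+A minimal sketch of a few of these guidelines (descriptive names, an explicit `nn.Module` subclass, and type-annotated signatures) could look like the following. The module name is made up for illustration and not tied to any real model.
+
+```py
+import torch
+from torch import nn
+
+class BrandNewLlamaMLP(nn.Module):
+    # subclass nn.Module with an explicit forward pass instead of nn.Sequential
+    # so anyone can drop in print statements or breakpoints while debugging
+    def __init__(self, hidden_size: int, intermediate_size: int):
+        super().__init__()
+        self.up_proj = nn.Linear(hidden_size, intermediate_size)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size)
+        self.activation = nn.GELU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.up_proj(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return self.down_proj(hidden_states)
+```
+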
-```python
-model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
-input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
-original_output = model.predict(input_ids)
-```
+## New model addition issue
-Next, regarding the debugging strategy, there are generally a few from which to choose from:
+Open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue to add a specific model.
-- Decompose the original model into many small testable components and run a forward pass on each of those for
- verification
-- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
- those, and use intermediate print statements or breakpoints for verification
+> [!TIP]
+> Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub to view and add any existing model requests.
-Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code
-base.
+Now is a good time to get familiar with BrandNewLlama. It is helpful to read a model's research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading.
-If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original
-code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages
-to taking the more difficult road in the beginning:
+- What type of model is BrandNewLlama? Is it an encoder, decoder, or encoder-decoder model?
+- What tasks can BrandNewLlama be used for?
+- What makes BrandNewLlama different from other models?
+- What models in Transformers are most similar to BrandNewLlama?
+- What tokenizer does BrandNewLlama use?
-- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically
- for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead
- of relying on visual comparison via print statements
-- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting
- individual components and thus structure your work better
-- separating the model into logical meaningful components will help you to get a better overview of the model's design
- and thus to better understand the model
-- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue
- changing your code
+In addition to learning more about your model, use the tips below to help you add a model faster.
-[Lysandre's](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) integration checks for ELECTRA
-gives a nice example of how this can be done.
+> [!TIP]
+> Each contributor has a unique style and workflow for adding models to Transformers. For an example, take a look at how [Gemma](https://github.com/huggingface/transformers/pull/29167) was added.
-However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode,
-it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good
-example is [T5's MeshTensorFlow](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) library which is
-very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
-often relies on verifying print statements.
+- Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. [Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this.
+- This is more of an engineering challenge than a scientific one. Focus on the practical aspects (setting up an efficient debugging environment, for example) instead of the theoretical aspects of the model.
+- Don't be shy to ask for help! We are here to support you. 🤗
-No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the
-starting layers first and the ending layers last.
+## Dev environment
-It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
-layers in the following order:
+Click on the **Fork** button on the [Transformers](https://github.com/huggingface/transformers) repository to create your own copy to work on. Clone the repository to your local disk and add the base repository as the remote.
-1. Retrieve the input IDs passed to the model
-2. Retrieve the word embeddings
-3. Retrieve the input of the first Transformer layer
-4. Retrieve the output of the first Transformer layer
-5. Retrieve the output of the following n - 1 Transformer layers
-6. Retrieve the output of the whole BrandNewBert Model
+```bash
+git clone https://github.com/[your Github handle]/transformers.git
+cd transformers
+git remote add upstream https://github.com/huggingface/transformers.git
+```
-Input IDs should thereby consists of an array of integers, *e.g.* `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`
+Create a virtual environment and perform an [editable install](./installation#editable-install) of the library with the "dev" or development dependencies.
-The outputs of the following layers often consist of multi-dimensional float arrays and can look like this:
+```bash
+python -m venv .env
+source .env/bin/activate
+pip install -e ".[dev]"
+```
+
+As Transformers grows, so does its number of optional dependencies, and this command may fail. In that case, install the "quality" dependencies instead. Also make sure you have a deep learning framework installed.
+```bash
+pip install -e ".[quality]"
```
-[[
- [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024],
- [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132],
- [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648],
- ...,
- [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288],
- [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191],
- [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]],
+
+Return to the parent directory and clone and install the original BrandNewLlama repository.
+
+```bash
+git clone https://github.com/org_that_created_brand_new_llama_org/brand_new_llama.git
+cd brand_new_llama
+pip install -e .
```
-We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original
-model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001!
-Since it is normal that the exact same model written in different libraries can give a slightly different output
-depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives
-nearly the same output, they have to be almost identical. Therefore, you will certainly compare the intermediate
-outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of
-*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely
-important. Here is some advice to make your debugging environment as efficient as possible.
-
-- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
- probably take the time to write a longer script that decomposes the original model into smaller sub-components to
- retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
- TensorFlow print operations like [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) to output
- intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
- running the forward pass, *e.g.* check-out [this link](https://github.com/google/jax/issues/196).
-- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
- becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
- In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
- environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
- of your model
-- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
- find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
- `predict`, `evaluate`, `forward` or `__call__`. You don't want to debug a function that calls `forward`
- multiple times, *e.g.* to generate text, like `autoregressive_sample`, `generate`.
-- Try to separate the tokenization from the model's *forward* pass. If the original repository shows examples where
- you have to input a string, then try to find out where in the forward call the string input is changed to input ids
- and start from this point. This might mean that you have to possibly write a small script yourself or change the
- original code so that you can directly input the ids instead of an input string.
-- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
- random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
- environment is **deterministic** so that the dropout layers are not used. Or use *transformers.utils.set_seed*
- if the old and new implementations are in the same framework.
-
-The following section gives you more specific details/tips on how you can do this for *brand_new_bert*.
-
-### 5.-14. Port BrandNewBert to 🤗 Transformers
-
-Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork:
+Return to your clone of Transformers to begin porting BrandNewLlama.
```bash
cd transformers
```
-In the special case that you are adding a model whose architecture exactly matches the model architecture of an
-existing model you only have to add a conversion script as described in [this section](#write-a-conversion-script).
-In this case, you can just re-use the whole model architecture of the already existing model.
+There are two possible debugging environments for running the original model, a notebook ([Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) or [Jupyter](https://jupyter.org/)) or a local Python script.
+
+> [!WARNING]
+> We don't recommend setting up a GPU environment to run the original model because it can be expensive. Instead, work in a CPU environment first to verify the model works in Transformers. Once it does, then you can verify it on a GPU.
+
+Notebooks are great for executing code cell-by-cell, which can help split logical components from one another. They can also accelerate debugging cycles because intermediate results can be stored. You can also share notebooks when working with other contributors.
+
+The downside is that if you aren't used to working in notebooks, it can take some time to adjust to them.
+
+> [!TIP]
+> If the model architecture is identical to an existing model, skip ahead to add a [conversion script](#conversion-script), because you can reuse the architecture of the existing model.
-Otherwise, let's start generating a new model. We recommend using the following script to add a model starting from
-an existing model:
+Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.
```bash
transformers-cli add-new-model-like
```
-You will be prompted with a questionnaire to fill in the basic information of your model.
+## Create a pull request
-**Open a Pull Request on the main huggingface/transformers repo**
+Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request **[WIP] Add BrandNewLlama** so it's clear that this is a work in progress.
-Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull
-request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work
-side-by-side on integrating the model into 🤗 Transformers.
+Create a branch with a descriptive name from your main branch.
-You should do the following:
+```bash
+git checkout -b add_brand_new_llama
+```
-1. Create a branch with a descriptive name from your main branch
+Commit the code, and then fetch and rebase on the main branch.
- ```bash
- git checkout -b add_brand_new_bert
- ```
+```bash
+git add .
+git commit
+git fetch upstream
+git rebase upstream/main
+```
-2. Commit the automatically generated code:
+Push any changes to your branch and click on **Compare & pull request** to open a pull request on GitHub. Open the pull request as a *draft* to indicate it's a work in progress.
- ```bash
- git add .
- git commit
- ```
+```bash
+git push -u origin add_brand_new_llama
+```
-3. Fetch and rebase to current main
+Include relevant Hugging Face team members by adding their GitHub handles in the pull request for questions, feedback, comments, and reviews. Direct team members to specific parts of the code you want them to review by clicking on the **Files changed** tab, and then clicking on **+** to the left of the line number to add a comment. When a question or problem is solved, click on **Resolve** to indicate the issue is resolved. This keeps the conversation organized and clean.
- ```bash
- git fetch upstream
- git rebase upstream/main
- ```
+Remember to periodically commit and push your work, and update your work with the current main branch.
-4. Push the changes to your account using:
+```bash
+git fetch upstream
+git merge upstream/main
+```
- ```bash
- git push -u origin a-descriptive-name-for-my-changes
- ```
+## Original checkpoint
-5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
- GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
- future changes.
+Take some time to work on the original model implementation first to understand how it works.
-6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
+This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone!
-In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so
-that it shows in the pull request. Additionally, you should make sure to update your work with the current main from
-time to time by doing:
+Orient yourself with the original repository by doing the following.
-```bash
-git fetch upstream
-git merge upstream/main
+- Locate the pretrained weights.
+- Figure out how to load the pretrained weights into the model.
+- Figure out how to run the tokenizer independently of the model.
+- Trace one forward pass to understand which classes and functions are required. These are probably the only classes and functions you'll have to implement.
+- Locate all the important components (model class, model subclasses, self-attention layer, etc.) of the model.
+- Figure out how to debug the model in the original repository. Add print statements, use an interactive debugger like [ipdb](https://github.com/gotcha/ipdb), or an efficient integrated development environment (IDE) like [PyCharm](https://www.jetbrains.com/pycharm/).
+
+The last point is especially important because you'll need a thorough understanding of what's happening inside the original model before you can reimplement it in Transformers. Feel free to open issues and pull requests in the original repository if you encounter any issues.
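+
+For example, a quick way to inspect intermediate values in the original code is to drop into [ipdb](https://github.com/gotcha/ipdb) right before the line you care about.
+
+```py
+import ipdb
+
+# execution pauses here; inspect local variables and step through the original code interactively
+ipdb.set_trace()
+```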
+
+A good first step is to load a *small* pretrained checkpoint and try to reproduce a single forward pass with an example integer vector of inputs. For example, in pseudocode, this could look like the following.
+
+```py
+model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
+original_output = model.generate(input_ids)
```
-In general, all questions you might have regarding the model or your implementation should be asked in your PR and
-discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
-if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
-Face team can efficiently understand your problem or question.
+### Debugging
+
+If you run into issues, you'll need to choose one of the following debugging strategies depending on the original model's codebase.
+
+
+
+
+This strategy relies on breaking the original model into smaller sub-components, which works best when the code can easily be run in eager mode. While more difficult, there are some advantages to this approach.
+
+1. It is easier to compare the original model to your implementation later. You can automatically verify that each individual component matches its corresponding component in the Transformers implementation, instead of relying on a visual comparison based on print statements.
+2. It is easier to port individual components instead of the entire model.
+3. It is easier to understand how a model works by breaking it up into smaller parts.
+4. It is easier to prevent regressions at a later stage when you change your code thanks to component-by-component tests.
-To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you
-want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved,
-you can click on the “Resolve” button of the created comment.
+> [!TIP]
+> Refer to the ELECTRA [integration checks](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) for a good example of how to decompose a model into smaller components.
-In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
-on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
-Hugging Face team by Slack or email.
+
+
-**5. Adapt the generated models code for brand_new_bert**
+This strategy is viable when the original codebase is too complex, when it only allows intermediate components to be run in compiled mode, or when it would be too time-consuming (maybe even impossible) to separate the model into smaller sub-components.
-At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
-found in the generated files `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` and
-`src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.
+For example, the MeshTensorFlow implementation of [T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) is too complex and doesn't offer a simple way to decompose the model into its sub-components. In this situation, you'll have to rely on verifying print statements.
-Now you can finally start coding :). The generated code in
-`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` will either have the same architecture as BERT if
-it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
-you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
-BART?*". Implement those changes which often means changing the *self-attention* layer, the order of the normalization
-layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to
-get a better feeling of how your model should be implemented.
+
+
-**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
-advised to add a first *unclean*, copy-pasted version of the original code to
-`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` until you feel like all the necessary code is
-added. From our experience, it is much more efficient to quickly add a first version of the required code and
-improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
-has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
-following command should work:
+Whichever strategy you choose, it is recommended to debug the initial layers first and the final layers last. Retrieve the output, either with print statements or sub-component functions, of the following layers in this order.
-```python
-from transformers import BrandNewBertModel, BrandNewBertConfig
+1. input ids passed to the model
+2. word embeddings
+3. input of the first Transformer layer
+4. output of the first Transformer layer
+5. output of the following n-1 Transformer layers
+6. output of the whole model
-model = BrandNewBertModel(BrandNewBertConfig())
+The input ids should just be an array of integers like `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`.
+
+Layer outputs often consist of multi-dimensional float arrays.
+
+```py
+[[
+ [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024],
+ [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132],
+ [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648],
+ ...,
+ [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288],
+ [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191],
+ [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]],
```
-The above command will create a model according to the default parameters as defined in `BrandNewBertConfig()` with
-random weights, thus making sure that the `init()` methods of all components works.
+The output of the Transformers model must match the output of the original model within an error tolerance of *1e-3*. This accounts for small output differences that arise from using a different library framework. Compare the intermediate outputs of the original model with the Transformers implementation to ensure they're nearly identical. Having an *efficient* debugging environment is crucial for this step.
+
+Here are some tips for an efficient debugging environment.
+
+- How you debug intermediate results depends on the machine learning framework the original model repository is using. For PyTorch, you should write a script to decompose the original model into smaller sub-components to retrieve the intermediate values. For TensorFlow, you may need to use [tf.print](https://www.tensorflow.org/api_docs/python/tf/print). For Flax, make sure the model is *not jitted* during the forward pass (refer to this GitHub [Issue](https://github.com/google/jax/issues/196) for more details).
+
+- It is faster to debug with a smaller pretrained checkpoint versus a larger checkpoint where the forward pass takes more than 10 seconds. If only large checkpoints are available, create a dummy model with randomly initialized weights and save those weights to compare against the Transformers implementation.
+
+- Find the easiest way to call the model's forward pass. Ideally, this function (may be called `predict`, `evaluate`, `forward`, or `__call__`) should only call the forward pass *once*. It is more difficult to debug a function that calls the forward pass multiple times.
+
+- Separate tokenization from the forward pass. Locate where a string input is changed to input ids in the forward pass and start here. You may need to create a small script or modify the original code to directly input the input ids instead of an input string.
+
+- Ensure the model is *not* in training mode. This can produce random outputs due to multiple dropout layers in a model. The forward pass in your debugging environment should be *deterministic* so that the dropout layers aren't used.
+
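+Putting a few of these tips together, a minimal sketch of a PyTorch debugging setup for the original model could look like the code below. The checkpoint loader and the attribute names (`embeddings`, `layers`) are placeholders for whatever the original codebase actually uses.
+
+```py
+import torch
+
+torch.manual_seed(0)  # make any remaining randomness reproducible
+
+# hypothetical loader from the original repository
+model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+model.eval()  # disable dropout so the forward pass is deterministic
+
+# capture intermediate outputs with forward hooks instead of editing the original code
+intermediate = {}
+
+def save_output(name):
+    def hook(module, inputs, output):
+        intermediate[name] = output
+    return hook
+
+model.embeddings.register_forward_hook(save_output("word_embeddings"))
+model.layers[0].register_forward_hook(save_output("first_layer"))
+
+input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
+with torch.no_grad():
+    original_output = model(input_ids)
+
+print(intermediate["word_embeddings"])
+print(intermediate["first_layer"])
+```
+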
+Once you're able to run the original checkpoint, you're ready to start adapting the model code for Transformers.
+
+## Adapt the model code
+
+The `transformers-cli add-new-model-like` command should have generated a model and configuration file.
+
+- `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
+- `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
+
+The automatically generated code in the `modeling.py` file has the same architecture as Llama if you answered that it's a decoder-only model, or the same architecture as BART if you answered that it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on.
-Note that all random initialization should happen in the `_init_weights` method of your `BrandnewBertPreTrainedModel`
-class. It should initialize all leaf modules depending on the variables of the config. Here is an example with the
-BERT `_init_weights` method:
+### Model initialization
+
+At this point, your code doesn't have to be clean or even fully correct. It is more efficient to quickly create a first draft and then iteratively improve on it. The most important thing is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the `__init__` method works.
+
+```py
+from transformers import BrandNewLlamaModel, BrandNewLlamaConfig
+model = BrandNewLlamaModel(BrandNewLlamaConfig())
+```
+
+Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables.
```py
def _init_weights(self, module):
@@ -520,9 +326,9 @@ def _init_weights(self, module):
module.weight.data.fill_(1.0)
```
-You can have some more custom schemes if you need a special initialization for some modules. For instance, in
-`Wav2Vec2ForPreTraining`, the last two linear layers need to have the initialization of the regular PyTorch `nn.Linear`
-but all the other ones should use an initialization as above. This is coded like this:
+The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] requires its last two linear layers to use the standard [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) initialization while all the other layers use the initialization above.
+
+The `_is_hf_initialized` flag makes sure a submodule is only initialized once. Setting `_is_hf_initialized` to `True` for `module.project_q` and `module.project_hid` ensures the custom initialization is not overridden later, because the `_init_weights` function won't be applied to these modules.
```py
def _init_weights(self, module):
@@ -538,30 +344,34 @@ def _init_weights(self, module):
module.bias.data.zero_()
```
-The `_is_hf_initialized` flag is internally used to make sure we only initialize a submodule once. By setting it to
-`True` for `module.project_q` and `module.project_hid`, we make sure the custom initialization we did is not overridden later on,
-the `_init_weights` function won't be applied to them.
+### Convert checkpoints to Transformers
-**6. Write a conversion script**
+The original checkpoint must be converted to a Transformers compatible checkpoint.
-Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
-the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of
-*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
-existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
-the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
-slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
-existing conversion script for your model.
+> [!TIP]
+> Try looking for an existing conversion script to copy, adapt, and reuse for your model!
+>
+> - If you're porting a model from TensorFlow to PyTorch, a good starting point may be the BERT [conversion script](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91).
+> - If you're porting a model from PyTorch to PyTorch, a good starting point may be the BART [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py).
-- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script [here](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
-- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)
+Make sure **all** required weights are initialized, and print out all the checkpoint weights that weren't used for initialization to verify the model has been converted correctly.
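+
+A rough sketch of this bookkeeping is shown below. It assumes the original checkpoint was loaded as a flat dictionary of NumPy arrays (`original_state_dict`) and that you maintain a mapping between the original and Transformers weight names (`name_mapping`); both are hypothetical names for illustration.
+
+```py
+import torch
+
+converted_names = set()
+state_dict = model.state_dict()
+
+for original_name, array in original_state_dict.items():
+    transformers_name = name_mapping.get(original_name)  # hypothetical mapping between naming schemes
+    if transformers_name is None:
+        continue
+    state_dict[transformers_name].copy_(torch.from_numpy(array))
+    converted_names.add(original_name)
+
+unused_weights = sorted(set(original_state_dict) - converted_names)
+print(f"Checkpoint weights not used for initialization: {unused_weights}")
+```
+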
-In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
-name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
-PyTorch, called `SimpleModel` as follows:
+It is normal for the first conversion attempts to fail with a wrong shape or a wrong name assignment. This is most likely because of incorrect parameters in `BrandNewLlamaConfig`, a wrong architecture, a bug in the `__init__` method of your implementation, or because one of the checkpoint weights needs to be transposed.
-```python
-from torch import nn
+Keep iterating on the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file.
+```py
+model.save_pretrained("/path/to/converted/checkpoint/folder")
+```
+
+To help with the conversion, the next section briefly describes how PyTorch models store and define layer weights and names.
+
+#### PyTorch layer weights and names
+
+It is helpful to create a basic PyTorch model to understand how layer names are defined and weights are initialized.
+
+```py
+from torch import nn
class SimpleModel(nn.Module):
def __init__(self):
@@ -571,18 +381,11 @@ class SimpleModel(nn.Module):
self.layer_norm = nn.LayerNorm(10)
```
-Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`,
-`layer_norm` with random weights. We can print the model to see its architecture
+PyTorch layer names are defined by the class attribute name of the layer (`dense`, `intermediate`, `layer_norm`). Create an instance of `SimpleModel` to fill all the layers with random weights.
-```python
+```py
model = SimpleModel()
-
print(model)
-```
-
-This will print out the following:
-
-```
SimpleModel(
(dense): Linear(in_features=10, out_features=10, bias=True)
(intermediate): Linear(in_features=10, out_features=10, bias=True)
@@ -590,16 +393,10 @@ SimpleModel(
)
```
-We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
-values of a specific layer:
+The weight values of a specific layer are randomly initialized.
-```python
+```py
print(model.dense.weight.data)
-```
-
-to see that the weights were randomly initialized
-
-```
tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212,
-0.2077, 0.2157],
[ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190,
@@ -622,339 +419,205 @@ tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212,
0.2220, 0.2358]]).
```
-In the conversion script, you should fill those randomly initialized weights with the exact weights of the
-corresponding layer in the checkpoint. *E.g.*
+In the conversion script, the random weights should be replaced with the exact weights from the corresponding layer in the original checkpoint.
-```python
-# retrieve matching layer weights, e.g. by
-# recursive algorithm
+```py
+# retrieve matching layer weights with recursive algorithm
layer_name = "dense"
pretrained_weight = array_of_dense_layer
model_pointer = getattr(model, "dense")
-
model_pointer.weight.data = torch.from_numpy(pretrained_weight)
```
-While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
-pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
-statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like:
+Verify that each randomly initialized weight and its corresponding pretrained checkpoint weight have an identical **shape** and **name**. Add assert statements for the shape and print out the checkpoint weight names.
-```python
+```py
assert (
model_pointer.weight.shape == pretrained_weight.shape
), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
+
+logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
```
-Besides, you should also print out the names of both weights to make sure they match, *e.g.*
+When the shape or name doesn't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because the `BrandNewLlamaConfig` parameters don't exactly match those used by the original model. But it could also be that the PyTorch layer implementation requires the weights to be transposed first.
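+
+Some frameworks, TensorFlow for instance, store linear weights as `(in_features, out_features)` while PyTorch's [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) expects `(out_features, in_features)`. In that case, the conversion script needs to transpose the weight first. A minimal sketch, reusing the names from the example above:
+
+```py
+import torch
+
+# transpose first so the checkpoint weight matches nn.Linear's (out_features, in_features) layout
+model_pointer.weight.data = torch.from_numpy(pretrained_weight.T.copy())
+```
+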
-```python
-logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
+### Implement the forward pass
+
+Once the model loads correctly, implement the forward pass next. It takes some inputs and returns the model output.
+
+```py
+model = BrandNewLlamaModel.from_pretrained("/path/to/converted/checkpoint/folder")
+input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
+output = model(input_ids).last_hidden_state
```
-If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
-initialized layer of the 🤗 Transformers implementation.
+Don't be discouraged if your output isn't identical to the output from the original model or if the forward pass returns an error. First, make sure the forward pass doesn't throw any errors. Errors are often caused by wrong dimensions or by the wrong data type ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)).
-An incorrect shape is most likely due to an incorrect setting of the config parameters in `BrandNewBertConfig()` that
-do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
-PyTorch's implementation of a layer requires the weight to be transposed beforehand.
+The outputs should match within a precision of *1e-3*. First ensure the output shapes are identical, and then check that the output values are identical as well. Common reasons why the outputs aren't identical include:
-Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
-were not used for initialization to make sure the model is correctly converted. It is completely normal, that the
-conversion trials fail with either a wrong shape statement or a wrong name assignment. This is most likely because either
-you used incorrect parameters in `BrandNewBertConfig()`, have a wrong architecture in the 🤗 Transformers
-implementation, you have a bug in the `init()` functions of one of the components of the 🤗 Transformers
-implementation or you need to transpose one of the checkpoint weights.
+- Some layers were not added (activation layer or a residual connection).
+- The word embedding matrix is not tied.
+- The wrong positional embeddings are used because the original implementation includes an offset.
+- Dropout is applied during the forward pass. Fix this error by making sure `model.training` is `False` and passing `self.training` to [torch.nn.functional.dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout).
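+
+For instance, a minimal sketch of the recommended dropout pattern, with illustrative class and parameter names, looks like this.
+
+```py
+import torch
+from torch import nn
+
+class BlockWithDropout(nn.Module):
+    """Minimal example: dropout is gated on self.training so model.eval() disables it."""
+
+    def __init__(self, hidden_dropout_prob=0.1):
+        super().__init__()
+        self.hidden_dropout_prob = hidden_dropout_prob
+
+    def forward(self, hidden_states):
+        return nn.functional.dropout(hidden_states, p=self.hidden_dropout_prob, training=self.training)
+
+block = BlockWithDropout()
+block.eval()  # self.training is now False, so dropout is a no-op
+print(torch.allclose(block(torch.ones(2, 4)), torch.ones(2, 4)))  # True
+```
+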
-This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
-Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
-the model under a folder of your choice `/path/to/converted/checkpoint/folder` that should then contain both a
-`pytorch_model.bin` file and a `config.json` file:
+Compare the forward pass of the original model and your implementation to check if there are any differences. Ideally, debug and print out the intermediate outputs of both implementations of the forward pass to pinpoint where the original implementation differs from yours.
-```python
-model.save_pretrained("/path/to/converted/checkpoint/folder")
-```
+1. Make sure the hardcoded `input_ids` in both implementations are identical.
+2. Verify the outputs of the first transformation of `input_ids` (usually the word embeddings) are identical, and work your way through to the last layer.
-**7. Implement the forward pass**
+Any difference between the two implementations should point to the bug in your implementation.
-Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
-sure that the forward pass is correctly implemented. In [Get familiar with the original repository](#3-4-run-a-pretrained-checkpoint-using-the-original-repository), you have already created a script that runs a forward
-pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
-implementation instead of the original one. It should look as follows:
+One of the best strategies is to add many print statements to the same positions in both implementations, and then successively remove them when they output identical values for the intermediate outputs.
-```python
-model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
-input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
-output = model(input_ids).last_hidden_states
-```
-
-It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
-same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
-you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
-used leading to a *Dimensionality mismatch* error or that the wrong data type object is used, *e.g.* `torch.long`
-instead of `torch.float32`. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve
-certain errors.
-
-The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
-equivalent to a precision of `1e-3`. First, you should ensure that the output shapes are identical, *i.e.*
-`outputs.shape` should yield the same value for the script of the 🤗 Transformers implementation and the original
-implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult
-parts of adding a new model. Common mistakes why the outputs are not identical are:
-
-- Some layers were not added, *i.e.* an *activation* layer was not added, or the residual connection was forgotten
-- The word embedding matrix was not tied
-- The wrong positional embeddings are used because the original implementation uses on offset
-- Dropout is applied during the forward pass. To fix this make sure *model.training is False* and that no dropout
- layer is falsely activated during the forward pass, *i.e.* pass *self.training* to [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)
-
-The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
-Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
-intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗
-Transformers implementation shows a different output than the original implementation. First, make sure that the
-hard-coded `input_ids` in both scripts are identical. Next, verify that the outputs of the first transformation of
-the `input_ids` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
-network. At some point, you will notice a difference between the two implementations, which should point you to the bug
-in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
-in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
-respectively, and to successively remove print statements showing the same values for intermediate presentations.
-
-When you're confident that both implementations yield the same output, verify the outputs with
-`torch.allclose(original_output, output, atol=1e-3)`, you're done with the most difficult part! Congratulations - the
-work left to be done should be a cakewalk 😊.
-
-**8. Adding all necessary model tests**
-
-At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
-fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all
-common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
-the same `tests/models/brand_new_bert/test_modeling_brand_new_bert.py`. Run this test file to verify that all common
-tests pass:
+When both implementations produce the same output, verify the outputs are within a precision of *1e-3*.
-```bash
-pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
+```py
+torch.allclose(original_output, output, atol=1e-3)
```
-Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that
+This is typically the most difficult part of the process. Congratulations if you've made it this far!
-- a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
-- b) Future changes to your model will not break any important feature of the model.
+And if you're stuck or struggling with this step, don't hesitate to ask for help on your pull request.
-At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
-you used earlier to implement the model to 🤗 Transformers. A template of those model tests has already added by the
-Cookiecutter, called `BrandNewBertModelIntegrationTests` and only has to be filled out by you. To ensure that those
-tests are passing, run
+### Add model tests
+
+While the model works, you still need to add tests to ensure it is compatible with Transformers. Tests are important because they help users understand your work by looking at specific tests, and because they prevent your model from breaking in the future if any changes are made.
+
+The `transformers-cli add-new-model-like` command should have added a test file for your model. Run the test file below to make sure all common tests pass.
```bash
-RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
+pytest tests/models/brand_new_llama/test_modeling_brand_new_llama.py
```
-
+The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewLlamaModelIntegrationTests`, was added by the `transformers-cli add-new-model-like` command and should be filled out. To ensure it passes, run the following command.
+
+
+
+
+```bash
+RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests
+```
-In case you are using Windows, you should replace `RUN_SLOW=1` with `SET RUN_SLOW=1`
+
+
-
+```bash
+SET RUN_SLOW=1
+pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests
+```
-Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under
-`BrandNewBertModelTester`/`BrandNewBertModelTest`. This part is often forgotten but is extremely useful in two
-ways:
+
+
-- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
- special features of *brand_new_bert* should work.
-- Future contributors can quickly test changes to the model by running those special tests.
+All features unique to BrandNewLlama should be tested in a separate test under `BrandNewLlamaModelTester/BrandNewLlamaModelTest`. This test is often overlooked, but it is extremely important because:
+- it helps transfer the knowledge you acquired during the process to the community by showing how the model's novel features work
+- future contributors can quickly test changes to the model by running these special tests
-**9. Implement the tokenizer**
+## Implement tokenizer
-Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent to or very similar to an
-already existing tokenizer of 🤗 Transformers.
+> [!TIP]
+> We recommend adding a fast tokenizer ([`PreTrainedTokenizerFast`]) to give users the best performance. Feel free to tag [@ArthurZucker](https://github.com/ArthurZucker) or [@itazap](https://github.com/itazap) in your PR for help on how to add [`PreTrainedTokenizerFast`].
-It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗
-Transformers' implementation of the tokenizer.
+With the model out of the way, it's time to focus on the tokenizer. The tokenizer should be identical or very similar to an existing tokenizer in Transformers.
-To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
-that inputs a string and returns the `input_ids`. It could look similar to this (in pseudo-code):
+Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to the code below.
-```python
+```py
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
-model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
+model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/")
input_ids = model.tokenize(input_str)
```
-You might have to take a deeper look again into the original repository to find the correct tokenizer function or you
-might even have to do changes to your clone of the original repository to only output the `input_ids`. Having written
-a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be
-created. It should look similar to this:
+You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look similar to the following.
-```python
-from transformers import BrandNewBertTokenizer
+```py
+from transformers import BrandNewLlamaTokenizer
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
-
-tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
-
+tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/")
input_ids = tokenizer(input_str).input_ids
```
-When both `input_ids` yield the same values, as a final step a tokenizer test file should also be added.
+When both implementations have the same `input_ids`, add a tokenizer test file. This file is analogous to the modeling test files. The tokenizer test files should contain a couple of hardcoded integration tests.
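+
+A hardcoded tokenizer integration test could look roughly like the sketch below; the expected ids are placeholders you would replace with the output of the original tokenizer.
+
+```py
+from transformers import BrandNewLlamaTokenizer
+
+def test_tokenizer_integration():
+    tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/")
+    input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+    expected_input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]  # placeholder, copy the ids produced by the original tokenizer
+    assert tokenizer(input_str).input_ids == expected_input_ids
+```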
+
+## Integration tests
-Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
-contain a couple of hard-coded integration tests.
+Now that you have a model and tokenizer, add end-to-end integration tests for the model and tokenizer to `tests/models/brand_new_llama/test_modeling_brand_new_llama.py`.
-**10. Run End-to-end integration tests**
+The test should provide a meaningful text-to-text example to show the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair.
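+
+For example, an end-to-end generation test could look roughly like the sketch below. The prompt, the expected output, and the `BrandNewLlamaForCausalLM` head class are illustrative placeholders.
+
+```py
+import torch
+from transformers import BrandNewLlamaForCausalLM, BrandNewLlamaTokenizer
+
+def test_end_to_end_generation():
+    tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/converted/checkpoint/folder")
+    model = BrandNewLlamaForCausalLM.from_pretrained("/path/to/converted/checkpoint/folder")
+    inputs = tokenizer("Translate to German: Hello, how are you?", return_tensors="pt")
+    with torch.no_grad():
+        generated = model.generate(**inputs, max_new_tokens=20)
+    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
+    assert decoded == "Hallo, wie geht es dir?"  # placeholder, copy the output of the original implementation
+```
+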
-Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
-tokenizer to `tests/models/brand_new_bert/test_modeling_brand_new_bert.py` in 🤗 Transformers.
-Such a test should show on a meaningful
-text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
-include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
-of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a
-final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
-happen that you forgot to add some `.to(self.device)` statements to internal tensors of the model, which in such a
-test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those
-tests for you.
+If the checkpoint hasn't been fine-tuned on a downstream task, then the model tests are sufficient.
-**11. Add Docstring**
+Finally, try to make sure your tests can run on a GPU by adding `.to(self.device)` statements to the model's internal tensors. If you don't have access to a GPU, we can take care of that for you.
-Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
-a nice docstring and a doc page. The Cookiecutter should have added a template file called
-`docs/source/model_doc/brand_new_bert.md` that you should fill out. Users of your model will usually first look at
-this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
-the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
-regarding the docstrings.
+## Add documentation
-Next, make sure that the docstring added to `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` is
-correct and included all necessary inputs and outputs. We have a detailed guide about writing documentation and our docstring format [here](writing-documentation). It is always good to remind oneself that documentation should
-be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
-point of the community with the model.
+Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. The `transformers-cli add-new-model-like` command added a template file, `docs/source/en/model_doc/brand_new_llama.md`, that you can fill out with information about your model.
-**Code refactor**
+This is generally a user's first interaction with a model, so the documentation should be clear and concise. It is often very useful to add examples of how the model should be used.
-Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
-incorrect code style by running:
+Make sure the docstrings added to `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py` include all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings.
+
+## Refactor
+
+Time to tidy things up and make sure the code style is consistent with the rest of the library. Run the following command to automatically fix incorrect styles.
```bash
make style
```
-and verify that your coding style passes the quality check:
+To verify the code style passes quality checks, run the command below.
```bash
make quality
```
-There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in
-the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
-naming. The Hugging Face team will surely help you if you're stuck here.
-
-Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
-tests passing, now it's a good time to go over the added code again and do some refactoring.
-
-You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎
-
-**12. Upload the models to the model hub**
-
-In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each
-uploaded model checkpoint. You can get familiar with the hub functionalities by reading our [Model sharing and uploading Page](model_sharing). You should work alongside the Hugging Face team here to decide on a fitting name for each
-checkpoint and to get the required access rights to be able to upload the model under the author's organization of
-*brand_new_bert*. The `push_to_hub` method, present in all models in `transformers`, is a quick and efficient way to push your checkpoint to the hub. A little snippet is pasted below:
-
-```python
-brand_new_bert.push_to_hub("brand_new_bert")
-# Uncomment the following line to push to an organization.
-# brand_new_bert.push_to_hub("/brand_new_bert")
-```
-
-It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
-specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint
-pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to
-correctly use the model.
+There may be other failing tests or checks (missing docstring or incorrect naming) on your pull request due to Transformers' strict design tests. We can help you with these issues if you're stuck.
-**13. (Optional) Add notebook**
+After ensuring the code runs correctly, you may want to refactor it to make it more readable or cleaner.
-It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or
-fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community.
+## Upload to the Hub
-**14. Submit your finished PR**
+Convert and upload all checkpoints to the [Hub](https://hf.co/models). Add a model card to provide more transparency and context about the model. The model card should highlight specific characteristics of a checkpoint, how the model was trained, and code examples of how to use it.
-You're done programming now and can move to the last step, which is getting your PR merged into main. Usually, the
-Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished
-PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your
-reviewer.
+> [!TIP]
+> In many cases, adding an interactive notebook users can run is a great way to showcase how to use the model for inference or fine-tune it on a downstream task. While not required, including a notebook can drive greater adoption of your model.
-### Share your work!!
+You should also consult with the Transformers team to decide on an appropriate name for the model and to get the required access rights to upload the model.
-Now, it's time to get some credit from the community for your work! Having completed a model addition is a major
-contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be
-used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share
-your achievements with the community.
+Use the [`~PreTrainedModel.push_to_hub`] method to upload the model.
-**You have made another model that is super easy to access for everyone in the community! 🤯**
-
-## Model additions and their timeline: when is a model added to transformers?
-
-We aim for `transformers` to have support for new model architectures and checkpoints as early as possible:
-availability can range from day-0 (and hour-0) releases for some models, to a few days/weeks for others.
-
-The availability of this is usually up to the model contributors, as well as how excited the community is for the
-architecture.
-
-We can split the model architecture possibilities in four sections:
-- Day-0 integration
-- Same-week integration
-- Post-release integration
-- Hub-first release
-
-Let's dive into each of these and see how we (the transformers team) can help you contribute your architecture and get
-your architecture to be very easily used by all members of the community.
-
-### Day-0 integration
-
-For a day-0 integration to work, we'll usually want to work hand-in-hand with you directly. In order to keep your
-architecture private until your checkpoints and release are ready, we'll work together in a private fork of
-transformers.
+```py
+brand_new_llama.push_to_hub("brand_new_llama")
+```
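+
+To push the checkpoint under an organization instead of your personal namespace, prefix the repository name with the organization (the organization name below is a placeholder).
+
+```py
+brand_new_llama.push_to_hub("brand-new-llama-org/brand_new_llama")
+```
+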
-If you plan on having a transformers-first release, this is a great option: we run CI ahead of time, ensure the
-documentation is clear, and we aim to optimize your model as much as possible (providing quantization, optimizing it
-with Flash-Attention/SDPA, optimizing the KV cache, etc).
+Refer to the [Sharing](./model_sharing) guide for more information about uploading models to the Hub.
-We can also lend you a hand in adding the model, reviewing it early, and help you make sure the `transformers`
-API works as expected!
+## Merge your model
-If this is the path you wish to go with, we ask for you to reach out in advance, especially if the architecture is
-particularly novel (at least a few days, but a few weeks will enable the absolute best integration). In order to reach
-out, please contact transformers@huggingface.co 🤗.
+You're finally ready to merge your pull request and officially add the model to Transformers! Make sure all the tests are passing and all comments and feedback have been addressed.
-### Same-week integration
+Congratulations on adding a new model to Transformers! 🥳
-A same-week integration usually happens when model authors do not reach out; but we see significant community
-requests.
+This is a very significant contribution. Your work makes Transformers more accessible to developers and researchers around the world. You should be proud of your contribution and share your accomplishment with the community!
-In order to specify you'd like for us to integrate a specific model, we'll redirect you to our
-[issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml)
-where you can request a specific model.
+## Model addition timeline
-The more activity on the issue, the faster/more likely we are to integrate the model!
+There are four timelines for model additions depending on the model contributor and community demand for an architecture.
-### Post-release integration
+- **day-0 integration**: If you plan on having a Transformers-first release, this is a great option because we can ensure the documentation is clear and optimize your model as much as possible (quantization, FlashAttention, KV-cache, etc.). We can also help you add the model, provide early reviews and make sure it works as expected.
-A post-release integration usually happens when there has not been sufficient activity/requests to warrant a same-week
-integration, or that we lack the sufficient bandwidth to integrate it.
+ Reach out to transformers@huggingface.co a few days (preferably weeks) in advance, especially if an architecture is particularly novel, to ensure model integration. We'll work together on a private fork of Transformers until your checkpoint and release is ready.
-We very gladly welcome community contributions in those instances; more than half of the library was contributed
-by contributors external to Hugging Face. If this is something that is interesting to you, we recommend that you look
-at our [open issues tagged with "New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22).
+- **same week integration**: Models with significant requests/demand are usually added the same week if the model author doesn't reach out.
-We recommend you try your hand at a heavily requested model as this will multiply the impact of your contribution.
-We'll be there to help you in case that's your first contribution 🤗.
+ Use the [issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml) to request a specific model to add. The more activity on the issue, the faster and more likely we'll integrate it.
-### Code-on-Hub release
+- **post-release integration**: Models without significant requests/demand, or models we don't have the bandwidth to integrate ourselves, are added post-release.
-Finally, transformers has a "remote-code" possibility, in which contributions are not made within the toolkit, but on
-the Hub. This can be particularly interesting for groups that are using `transformers` as a backbone for their project,
-but don't have the bandwidth to contribute the model to transformers directly.
+ This is a good opportunity if you're interested in contributing a model to Transformers. Take a look at open issues tagged with ["New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22). Feel free to give the most requested models a try first to multiply the impact of your contribution. We'll be there to help you each step of the way!
-In case the model is very successful, then we'll very likely end up integrating it in `transformers` at the end - as this
-provides better documentation, CI, maintenance, and optimizations - but this remains a great way to make your model
-accessible day-0 with minimal friction.
+- **Hub-first release**: The Transformers [remote-code](./models#custom-models) feature allows Transformers-based projects to be shared directly on the Hub. This is a good option if you don't have the bandwidth to add a model directly to Transformers.
-This guide is a great starting point for a Hub-first release: [Custom models](./custom_models)
\ No newline at end of file
+ If a model ends up being very popular, then it's very likely that we'll integrate it in Transformers ourselves to enable better support (documentation, maintenance, optimization, etc.) for it. A Hub-first release is the most frictionless way to add a model.
\ No newline at end of file
diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md
index e8234c565b26..60ef43dab585 100644
--- a/docs/source/en/add_new_pipeline.md
+++ b/docs/source/en/add_new_pipeline.md
@@ -1,4 +1,4 @@
-
-# How to create a custom pipeline?
+# Adding a new pipeline
-In this guide, we will see how to create a custom pipeline and share it on the [Hub](https://hf.co/models) or add it to the
-🤗 Transformers library.
+Make [`Pipeline`] your own by subclassing it and implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it.
-First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
-dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
-as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the
-pipeline (`preprocess`).
+This guide will walk you through the process of adding a new pipeline to Transformers.
-Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of
-`postprocess` method.
+## Design choices
-Start by inheriting the base class `Pipeline` with the 4 methods needed to implement `preprocess`,
-`_forward`, `postprocess`, and `_sanitize_parameters`.
+At a minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline.
+Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because pure Python objects are more broadly compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in pure Python is the simplest and best option because it's easier to work with.
-```python
-from transformers import Pipeline
+Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data.
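+
+As a rough sketch (the helper below is illustrative and not part of the Transformers API), normalizing a flexible audio input inside `preprocess` might look like this:
+
+```py
+import requests
+
+
+def _load_audio_bytes(inputs):
+    # accept raw bytes, a URL, or a local filename and always return raw bytes
+    if isinstance(inputs, bytes):
+        return inputs
+    if isinstance(inputs, str) and inputs.startswith(("http://", "https://")):
+        return requests.get(inputs).content
+    with open(inputs, "rb") as f:
+        return f.read()
+```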
+
+## Create a pipeline
+
+With an input and output decided, you can start implementing [`Pipeline`]. Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods.
+```py
+from transformers import Pipeline
class MyPipeline(Pipeline):
def _sanitize_parameters(self, **kwargs):
- preprocess_kwargs = {}
- if "maybe_arg" in kwargs:
- preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
- return preprocess_kwargs, {}, {}
- def preprocess(self, inputs, maybe_arg=2):
- model_input = Tensor(inputs["input_ids"])
- return {"model_input": model_input}
+ def preprocess(self, inputs, maybe_arg=2):
def _forward(self, model_inputs):
- # model_inputs == {"model_input": model_input}
- outputs = self.model(**model_inputs)
- # Maybe {"logits": Tensor(...)}
- return outputs
def postprocess(self, model_outputs):
- best_class = model_outputs["logits"].softmax(-1)
- return best_class
```
-The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing
-pre/postprocessing on the CPU on different threads
-
-`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
-contain more information and is usually a `Dict`.
-
-`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred
-called method as it contains safeguards to make sure everything is working on the expected device. If anything is
-linked to a real model it belongs in the `_forward` method, anything else is in the preprocess/postprocess.
-
-`postprocess` methods will take the output of `_forward` and turn it into the final output that was decided
-earlier.
-
-`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
-time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.
+1. `preprocess` takes the inputs and transforms them into the appropriate input format for the model.
-The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`,
-`_forward`, and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
-allows to keep the default arguments in the function definition which is always more "natural".
-
-A classic example would be a `top_k` argument in the post processing in classification tasks.
+```py
+def preprocess(self, inputs, maybe_arg=2):
+ model_input = Tensor(inputs["input_ids"])
+ return {"model_input": model_input}
+```
-```python
->>> pipe = pipeline("my-new-task")
->>> pipe("This is a test")
-[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
-{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+2. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`.
->>> pipe("This is a test", top_k=2)
-[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+```py
+def _forward(self, model_inputs):
+ outputs = self.model(**model_inputs)
+ return outputs
```
-In order to achieve that, we'll update our `postprocess` method with a default parameter to `5`. and edit
-`_sanitize_parameters` to allow this new parameter.
-
+3. `postprocess` generates the final output from the model's output in `_forward`.
-```python
+```py
def postprocess(self, model_outputs, top_k=5):
best_class = model_outputs["logits"].softmax(-1)
- # Add logic to handle top_k
return best_class
+```
+
+4. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural.
+For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`.
+```py
def _sanitize_parameters(self, **kwargs):
preprocess_kwargs = {}
if "maybe_arg" in kwargs:
@@ -110,55 +84,61 @@ def _sanitize_parameters(self, **kwargs):
return preprocess_kwargs, {}, postprocess_kwargs
```
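+
+Pieced together, a complete `_sanitize_parameters` that routes `maybe_arg` to `preprocess` and `top_k` to `postprocess` could look like the following sketch:
+
+```py
+def _sanitize_parameters(self, **kwargs):
+    preprocess_kwargs = {}
+    if "maybe_arg" in kwargs:
+        # forwarded to preprocess(inputs, maybe_arg=...)
+        preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+
+    postprocess_kwargs = {}
+    if "top_k" in kwargs:
+        # forwarded to postprocess(model_outputs, top_k=...)
+        postprocess_kwargs["top_k"] = kwargs["top_k"]
+    return preprocess_kwargs, {}, postprocess_kwargs
+```
+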
-Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
-without requiring users to understand new kinds of objects. It's also relatively common to support many different types
-of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes)
+Now the pipeline can return the `top_k` most likely labels if a user chooses to.
+```py
+from transformers import pipeline
+pipeline = pipeline("my-task")
+# returns 3 most likely labels
+pipeline("This is the best meal I've ever had", top_k=3)
+# returns 5 most likely labels by default
+pipeline("This is the best meal I've ever had")
+```
+
+## Register a pipeline
-## Adding it to the list of supported tasks
+Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines:
-To register your `new-task` to the list of supported tasks, you have to add it to the `PIPELINE_REGISTRY`:
+- the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either framework)
+- a default model, pinned to a specific revision (a branch name or commit hash) where the model works as expected, with `default`
+- the expected input with `type`
-```python
+```py
from transformers.pipelines import PIPELINE_REGISTRY
+from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
PIPELINE_REGISTRY.register_pipeline(
"new-task",
pipeline_class=MyPipeline,
pt_model=AutoModelForSequenceClassification,
+ tf_model=TFAutoModelForSequenceClassification,
+ default={"pt": ("user/awesome-model", "branch-name")},
+ type="text",
)
```
-You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took `"abcdef"`) as well as the type:
+## Share your pipeline
-```python
-PIPELINE_REGISTRY.register_pipeline(
- "new-task",
- pipeline_class=MyPipeline,
- pt_model=AutoModelForSequenceClassification,
- default={"pt": ("user/awesome_model", "abcdef")},
- type="text", # current support type: text, audio, image, multimodal
-)
-```
+Share your pipeline with the community on the [Hub](https://hf.co), or add it directly to Transformers.
-## Share your pipeline on the Hub
+It's faster to upload your pipeline code to the Hub because it doesn't require a review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure your [`Pipeline`] works.
-To share your custom pipeline on the Hub, you just have to save the custom code of your `Pipeline` subclass in a
-python file. For instance, let's say we want to use a custom pipeline for sentence pair classification like this:
+### Upload to the Hub
+
+Add your pipeline code to the Hub in a Python file.
+
+For example, a custom pipeline for sentence pair classification might look like the code below. The implementation works for both PyTorch and TensorFlow models.
```py
import numpy as np
-
from transformers import Pipeline
-
def softmax(outputs):
maxes = np.max(outputs, axis=-1, keepdims=True)
shifted_exp = np.exp(outputs - maxes)
return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
-
class PairClassificationPipeline(Pipeline):
def _sanitize_parameters(self, **kwargs):
preprocess_kwargs = {}
@@ -183,8 +163,7 @@ class PairClassificationPipeline(Pipeline):
return {"label": label, "score": score, "logits": logits}
```
-The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in
-a file named `pair_classification.py`, we can then import it and register it like this.
+Save the code in a file named `pair_classification.py`, and import and register it as shown below.
```py
from pair_classification import PairClassificationPipeline
@@ -215,56 +194,36 @@ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5f
},
```
-Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been
-fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not.
+Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub. The Python file containing the code is copied to the Hub, and the pipeline's model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace.
```py
from transformers import pipeline
-classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
+pipeline = pipeline(task="pair-classification", model="sgugger/finetuned-bert-mrpc")
+pipeline.push_to_hub("pair-classification-pipeline")
```
-Then we can share it on the Hub by using the `push_to_hub` method:
-
-```py
-classifier.push_to_hub("test-dynamic-pipeline")
-```
-
-This will copy the file where you defined `PairClassificationPipeline` inside the folder `"test-dynamic-pipeline"`,
-along with saving the model and tokenizer of the pipeline, before pushing everything into the repository
-`{your_username}/test-dynamic-pipeline`. After that, anyone can use it as long as they provide the option
-`trust_remote_code=True`:
+To use the pipeline, add `trust_remote_code=True` when loading the pipeline.
```py
from transformers import pipeline
-classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True)
+pipeline = pipeline(model="{your_username}/pair-classification-pipeline", trust_remote_code=True)
```
-## Add the pipeline to 🤗 Transformers
+### Add to Transformers
+
+Adding a custom pipeline to Transformers requires adding tests to make sure everything works as expected, and requesting a review from the Transformers team.
-If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the `pipelines` submodule
-with the code of your pipeline, then add it to the list of tasks defined in `pipelines/__init__.py`.
+Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py).
-Then you will need to add tests. Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples of the other tests.
+Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline.
-The `run_pipeline_test` function will be very generic and run on small random models on every possible
-architecture as defined by `model_mapping` and `tf_model_mapping`.
+The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models.
-This is very important to test future compatibility, meaning if someone adds a new model for
-`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
-impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the
-output of the pipeline TYPE.
+You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to simply match the type of the pipeline output instead.
-You also *need* to implement 2 (ideally 4) tests.
+Finally, you should also implement the following 4 tests.
-- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
- and test the pipeline outputs. The results should be the same as `test_small_model_tf`.
-- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
- and test the pipeline outputs. The results should be the same as `test_small_model_pt`.
-- `test_large_model_pt` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
- make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
- sure there is no drift in future releases.
-- `test_large_model_tf` (`optional`): Tests the pipeline on a real pipeline where the results are supposed to
- make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
- sure there is no drift in future releases.
+1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150): use a small model for these tests to make sure they return the correct outputs. The results don't have to make sense, but both tests should return the same result (see the sketch after this list).
+1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) and [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220): use a realistic model for these tests to make sure they return meaningful results. These tests are slow and should be marked as slow.
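+
+As a rough sketch (the class name, checkpoint, and assertions below are illustrative and assume the `pair-classification` task from this guide has been registered), a small-model test might look like this:
+
+```py
+import unittest
+
+from transformers import pipeline
+from transformers.testing_utils import require_torch
+
+
+class PairClassificationPipelineTests(unittest.TestCase):
+    @require_torch
+    def test_small_model_pt(self):
+        # a tiny random checkpoint keeps the test fast; the predictions don't need to make sense
+        pair_classifier = pipeline(
+            "pair-classification", model="hf-internal-testing/tiny-random-bert"
+        )
+        outputs = pair_classifier("I like pizza", second_text="pizza is great")
+        # check the output structure rather than exact values because the model weights are random
+        self.assertEqual(sorted(outputs.keys()), ["label", "logits", "score"])
+```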
diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
index 56c9184980f4..2a061eba3385 100644
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@@ -13,211 +13,135 @@ specific language governing permissions and limitations under the License.
rendered properly in your Markdown viewer.
-->
-# Agents and tools
-[[open-in-colab]]
-
-### What is an agent?
+> [!TIP]
+> Agents and tools are being spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. These docs will be deprecated in the future!
-Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
+# Agents
-One approach to overcome this weakness is to create an *agent*.
+[[open-in-colab]]
-An agent is a system that uses an LLM as its engine, and it has access to functions called *tools*.
+An agent is a system where a large language model (LLM) can execute more complex tasks through *planning* and using *tools*.
-These *tools* are functions for performing a task, and they contain all necessary description for the agent to properly use them.
+- Planning helps an LLM reason its way through a task by breaking it down into smaller subtasks. For example, [`CodeAgent`] plans a series of actions to take and then generates Python code to execute all the actions at once.
-The agent can be programmed to:
-- devise a series of actions/tools and run them all at once, like the [`CodeAgent`]
-- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one, like the [`ReactJsonAgent`]
+ Another planning method is self-reflection, where the agent refines its previous actions to improve its performance. The [`ReactJsonAgent`] is an example of this type of planning, and it's based on the [ReAct](https://hf.co/papers/2210.03629) framework. This agent plans and executes actions one at a time based on the feedback it receives from each action.
-### Types of agents
+- Tools give an LLM access to external functions or APIs that it can use to help it complete a task. For example, [gradio-tools](https://github.com/freddyaboulton/gradio-tools) gives an LLM access to any of the [Gradio](https://www.gradio.app/) apps available on Hugging Face [Spaces](https://hf.co/spaces). These apps can be used for a wide range of tasks such as image generation, video generation, audio transcription, and more.
-#### Code agent
+To use agents in Transformers, make sure you have the extra `agents` dependencies installed.
-This agent has a planning step, then generates python code to execute all its actions at once. It natively handles different input and output types for its tools, thus it is the recommended choice for multimodal tasks.
+```bash
+pip install transformers[agents]
+```
-#### React agents
+Create an agent instance (refer to the [Agents](./main_classes/agent#agents) API for supported agents in Transformers) and a list of tools available for it to use, then [`~ReactAgent.run`] the agent on your task. The example below demonstrates how a ReAct agent reasons through a task.
-This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations.
+```py
+from transformers import ReactCodeAgent
-We implement two versions of ReactJsonAgent:
-- [`ReactJsonAgent`] generates tool calls as a JSON in its output.
-- [`ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance.
+agent = ReactCodeAgent(tools=[])
+agent.run(
+ "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
+)
+```
-> [!TIP]
-> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about ReAct agents.
-
-
-
-
-
-
-
-
-For example, here is how a ReAct Code agent would work its way through the following question.
-
-```py3
->>> agent.run(
-... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
-... )
-=====New task=====
+```bash
+======== New task ========
How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
-====Agent is executing the code below:
-bert_blocks = search(query="number of blocks in BERT base encoder")
-print("BERT blocks:", bert_blocks)
+==== Agent is executing the code below:
+bert_layers = 12 # BERT base encoder has 12 layers
+attention_layers = 6 # Encoder in Attention is All You Need has 6 layers
+layer_diff = bert_layers - attention_layers
+print("The difference in layers between BERT base encoder and Attention is All You Need is", layer_diff)
====
Print outputs:
-BERT blocks: twelve encoder blocks
+The difference in layers between BERT base encoder and Attention is All You Need is 6
-====Agent is executing the code below:
-attention_layer = search(query="number of layers in Attention is All You Need")
-print("Attention layers:", attention_layer)
+==== Agent is executing the code below:
+final_answer("BERT base encoder has {} more layers than the encoder from Attention is All You Need.".format(layer_diff))
====
Print outputs:
-Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
-
-====Agent is executing the code below:
-bert_blocks = 12
-attention_layers = 6
-diff = bert_blocks - attention_layers
-print("Difference in blocks:", diff)
-final_answer(diff)
-====
-
-Print outputs:
-Difference in blocks: 6
-
-Final answer: 6
-```
-### How can I build an agent?
-
-To initialize an agent, you need these arguments:
-
-- an LLM to power your agent - the agent is not exactly the LLM, it’s more like the agent is a program that uses an LLM as its engine.
-- a system prompt: what the LLM engine will be prompted with to generate its output
-- a toolbox from which the agent pick tools to execute
-- a parser to extract from the LLM output which tools are to call and with which arguments
-
-Upon initialization of the agent system, the tool attributes are used to generate a tool description, then baked into the agent’s `system_prompt` to let it know which tools it can use and why.
-
-To start with, please install the `agents` extras in order to install all default dependencies.
-
-```bash
-pip install transformers[agents]
+>>> Final answer:
+BERT base encoder has 6 more layers than the encoder from Attention is All You Need.
```
-Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
+This guide walks you through how to initialize an agent in more detail.
-```python
-from huggingface_hub import login, InferenceClient
+## LLM
-login("")
+An agent uses an LLM to plan and execute a task; it is the engine that powers the agent. To choose and build your own LLM engine, you need a method that:
-client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+1. accepts messages in the [chat template](./chat_templating) format, `List[Dict[str, str]]`, and returns a string
+2. stops generating outputs when it encounters the sequences passed in `stop_sequences`
+```py
+from huggingface_hub import InferenceClient
+
+# the client that powers llm_engine; any client exposing a chat completion method works
+client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
def llm_engine(messages, stop_sequences=["Task"]) -> str:
response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
answer = response.choices[0].message.content
return answer
```
-You could use any `llm_engine` method as long as:
-1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
-2. it stops generating outputs at the sequences passed in the argument `stop_sequences`
+Next, initialize an engine to load a model. To run an agent locally, create a [`TransformersEngine`] to load a preinitialized [`Pipeline`].
-Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.
+However, you could also leverage Hugging Face's powerful inference infrastructure, [Inference API](https://hf.co/docs/api-inference/index) or [Inference Endpoints](https://hf.co/docs/inference-endpoints/index), to run your model. This is useful for loading larger models that are typically required for agentic behavior. In this case, load the [`HfApiEngine`] to run the agent.
-You will also need a `tools` argument which accepts a list of `Tools` - it can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`.
+The agent requires a list of tools it can use to complete a task. If you aren't using any additional tools, pass an empty list. The default tools provided by Transformers aren't added automatically; set `add_base_tools=True` to add them on top of your `tools` list.
-Now you can create an agent, like [`CodeAgent`], and run it. You can also create a [`TransformersEngine`] with a pre-initialized pipeline to run inference on your local machine using `transformers`.
-For convenience, since agentic behaviours generally require stronger models such as `Llama-3.1-70B-Instruct` that are harder to run locally for now, we also provide the [`HfApiEngine`] class that initializes a `huggingface_hub.InferenceClient` under the hood.
+
+
-```python
-from transformers import CodeAgent, HfApiEngine
-
-llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine, CodeAgent
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").to("cuda")
+pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+llm_engine = TransformersEngine(pipeline)
+agent = CodeAgent(tools=[], llm_engine=llm_engine)
agent.run(
- "Could you translate this sentence from French, say it out loud and return the audio.",
- sentence="Où est la boulangerie la plus proche?",
+ "What causes bread to rise?",
)
```
-This will be handy in case of emergency baguette need!
-You can even leave the argument `llm_engine` undefined, and an [`HfApiEngine`] will be created by default.
-
-```python
-from transformers import CodeAgent
+
+
-agent = CodeAgent(tools=[], add_base_tools=True)
+```py
+from transformers import CodeAgent, HfApiEngine
+llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
+agent = CodeAgent(tools=[], llm_engine=llm_engine)
agent.run(
- "Could you translate this sentence from French, say it out loud and give me the audio.",
+ "Could you translate this sentence from French, say it out loud and return the audio.",
sentence="Où est la boulangerie la plus proche?",
)
```
-Note that we used an additional `sentence` argument: you can pass text as additional arguments to the model.
+
+
-You can also use this to indicate the path to local or remote files for the model to use:
+The agent supports [constrained generation](https://hf.co/docs/text-generation-inference/conceptual/guidance) for generating outputs according to a specific structure with the `grammar` parameter. Set `grammar` when initializing an agent, and it is passed to each `llm_engine` call, so your `llm_engine` method should accept a `grammar` argument.
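+
+As a rough illustration (the exact grammar format depends on your inference backend, and the parameter names below are assumptions rather than a definitive recipe), the engine method only needs to accept and forward a `grammar` keyword:
+
+```py
+def llm_engine(messages, stop_sequences=["Task"], grammar=None) -> str:
+    # forward the grammar to the backend to constrain the generated output
+    response = client.chat_completion(
+        messages, stop=stop_sequences, max_tokens=1000, response_format=grammar
+    )
+    return response.choices[0].message.content
+```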
+
+Lastly, an agent accepts additional inputs such as text and audio. In the [`HfApiEngine`] example above, the agent accepted a sentence to translate. But you could also pass a path to a local or remote file for the agent to access. The example below demonstrates how to pass a path to an audio file.
```py
from transformers import ReactCodeAgent
-agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-
-agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+agent = ReactCodeAgent(tools=[], llm_engine=llm_engine)
+agent.run("Why doesn't he know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
```
+## System prompt
-The prompt and output parser were automatically defined, but you can easily inspect them by calling the `system_prompt_template` on your agent.
+A system prompt describes how an agent should behave, the tools it has access to, and the expected output format.
-```python
-print(agent.system_prompt_template)
-```
-
-It's important to explain as clearly as possible the task you want to perform.
-Every [`~Agent.run`] operation is independent, and since an agent is powered by an LLM, minor variations in your prompt might yield completely different results.
-You can also run an agent consecutively for different tasks: each time the attributes `agent.task` and `agent.logs` will be re-initialized.
-
-
-#### Code execution
+Tools are defined by the `<<tool_descriptions>>` token which is dynamically replaced during runtime with the actual tool. The tool description is derived from the tool name, description, inputs, output type, and a Jinja2 template. Refer to the [Tools](./tools) guide for more information about how to describe tools.
-A Python interpreter executes the code on a set of inputs passed along with your tools.
-This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed.
-
-The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue.
-You can still authorize additional imports by passing the authorized modules as a list of strings in argument `additional_authorized_imports` upon initialization of your [`ReactCodeAgent`] or [`CodeAgent`]:
+The example below is the system prompt for [`ReactCodeAgent`].
```py
->>> from transformers import ReactCodeAgent
-
->>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
->>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
-
-(...)
-'Hugging Face – Blog'
-```
-
-The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
-
-> [!WARNING]
-> The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports!
-
-### The system prompt
-
-An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`ReactCodeAgent`] (below version is slightly simplified).
-
-```text
You will be given a task to solve as best you can.
You have access to the following tools:
<<tool_descriptions>>
@@ -249,183 +173,125 @@ Remember to make sure that variables you use are all defined.
Now Begin!
```
-The system prompt includes:
-- An *introduction* that explains how the agent should behave and what tools are.
-- A description of all the tools that is defined by a `<>` token that is dynamically replaced at runtime with the tools defined/chosen by the user.
- - The tool description comes from the tool attributes, `name`, `description`, `inputs` and `output_type`, and a simple `jinja2` template that you can refine.
-- The expected output format.
-
-You could improve the system prompt, for example, by adding an explanation of the output format.
+The system prompt can be tailored to the intended task. For example, you can add a better explanation of the output format or you can overwrite the system prompt template entirely with your own custom system prompt as shown below.
-For maximum flexibility, you can overwrite the whole system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter.
+> [!WARNING]
+> If you're writing a custom system prompt, make sure to include `<<tool_descriptions>>` in the template so the agent is aware of the available tools.
-```python
+```py
from transformers import ReactJsonAgent
from transformers.agents import PythonInterpreterTool
agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
```
-> [!WARNING]
-> Please make sure to define the `<>` string somewhere in the `template` so the agent is aware
-of the available tools.
-
-
-### Inspecting an agent run
-
-Here are a few useful attributes to inspect what happened after a run:
-- `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`.
-- Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcripted by this method.
-
-## Tools
-
-A tool is an atomic function to be used by an agent.
-
-You can for instance check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action.
-
-When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why.
-
-### Default toolbox
-
-Transformers comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`:
+## Code execution
-- **Document question answering**: given a document (such as a PDF) in image format, answer a question on this document ([Donut](./model_doc/donut))
-- **Image question answering**: given an image, answer a question on this image ([VILT](./model_doc/vilt))
-- **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper))
-- **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5))
-- **Translation**: translates a given sentence from source language to target language.
-- **DuckDuckGo search***: performs a web search using DuckDuckGo browser.
-- **Python code interpreter**: runs your the LLM generated Python code in a secure environment. This tool will only be added to [`ReactJsonAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code
+For safety, the only functions that can be called are the tools you provide (and the default Transformers tools) and the `print` function. The interpreter also doesn't allow importing modules that aren't on a safe list.
+To import modules that aren't on the list, pass them as a list of strings to the `additional_authorized_imports` parameter when initializing an agent.
-You can manually use a tool by calling the [`load_tool`] function and a task to perform.
-
-
-```python
-from transformers import load_tool
+```py
+from transformers import ReactCodeAgent
-tool = load_tool("text-to-speech")
-audio = tool("This is a text to speech tool")
+agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
```
+Code execution stops if the code tries to perform an illegal operation, such as importing a module that isn't authorized, or if the code generated by the agent raises a regular Python error.
-### Create a new tool
-
-You can create your own tool for use cases not covered by the default tools from Hugging Face.
-For example, let's create a tool that returns the most downloaded model for a given task from the Hub.
-
-You'll start with the code below.
-
-```python
-from huggingface_hub import list_models
-
-task = "text-classification"
-
-model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-print(model.id)
-```
-
-This code can quickly be converted into a tool, just by wrapping it in a function and adding the `tool` decorator:
+> [!WARNING]
+> An LLM can generate arbitrary code that will then be executed, so don't add any unsafe imports!
+## Multi-agent
-```py
-from transformers import tool
-
-@tool
-def model_download_tool(task: str) -> str:
- """
- This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
- It returns the name of the checkpoint.
-
- Args:
- task: The task for which
- """
- model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1)))
- return model.id
-```
+[Multi-agent](https://hf.co/papers/2308.08155) refers to multiple agents working together to solve a task. Performance is typically better because each agent is specialized for a particular subtask.
-The function needs:
-- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_tool`.
-- Type hints on both inputs and output
-- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint).
-All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
+A multi-agent system is created with the [`ManagedAgent`] class, where a *manager agent* oversees how other agents work together. Each [`ManagedAgent`] requires an agent along with its name and description. These are added to the manager agent's system prompt, which lets it know how to call and use the managed agents.
-> [!TIP]
-> This definition format is the same as tool schemas used in `apply_chat_template`, the only difference is the added `tool` decorator: read more on our tool use API [here](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template).
+The multi-agent example below creates a web search agent that is managed by another [`ReactCodeAgent`].
-Then you can directly initialize your agent:
```py
-from transformers import CodeAgent
-agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
-agent.run(
- "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
+
+llm_engine = HfApiEngine()
+web_agent = ReactCodeAgent(tools=[DuckDuckGoSearchTool()], llm_engine=llm_engine)
+managed_web_agent = ManagedAgent(
+ agent=web_agent,
+ name="web_search",
+ description="Runs web searches for you. Give it your query as an argument."
)
-```
-
-You get the following:
-```text
-======== New task ========
-Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
-==== Agent is executing the code below:
-most_downloaded_model = model_download_tool(task="text-to-video")
-print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
-====
-```
-
-And the output:
-`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`
-
-### Manage your agent's toolbox
-
-If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool.
-
-Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox.
-
-```python
-from transformers import CodeAgent
-
-agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
-agent.toolbox.add_tool(model_download_tool)
-```
-Now we can leverage both the new tool and the previous text-to-speech tool:
-
-```python
-agent.run(
- "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
+manager_agent = ReactCodeAgent(
+ tools=[], llm_engine=llm_engine, managed_agents=[managed_web_agent]
)
+manager_agent.run("Who is the CEO of Hugging Face?")
```
+## Gradio integration
-| **Audio** |
-|------------------------------------------------------------------------------------------------------------------------------------------------------|
-|