Bring head to pytorch/torchtune (#18)
* Llama 3.3 70B (pytorch#2124)

* Llama 3.3 readme updates (pytorch#2125)

* update configs (pytorch#2107)

Co-authored-by: Felipe Mello <[email protected]>

* Reduce logging output for distributed KD (pytorch#2120)

* Support Early Exit Loss and/or Layer Dropout (pytorch#1076) (see the layer-dropout sketch after this list)

Co-authored-by: ebsmothers <[email protected]>

* Update checkpointing directory (pytorch#2074)

Co-authored-by: Felipe Mello <[email protected]>
Co-authored-by: vancoyendall <[email protected]>

* pass correct arg (pytorch#2127)

Co-authored-by: Felipe Mello <[email protected]>

* update configs (pytorch#2128)

Co-authored-by: Felipe Mello <[email protected]>

* fix qat_lora_test (pytorch#2131)

Co-authored-by: Felipe Mello <[email protected]>

* guard ckpt imports (pytorch#2133)

Co-authored-by: Felipe Mello <[email protected]>

* [bug fix] add parents=True (pytorch#2136)

Co-authored-by: Felipe Mello <[email protected]>

* [bug fix] re-add model (pytorch#2135)

Co-authored-by: Felipe Mello <[email protected]>

* Update save sizes into GiB (pytorch#2143)

* [bug fix] remove config download when source is kaggle (pytorch#2144)

Co-authored-by: Felipe Mello <[email protected]>

* [fix] remove "with_suffix" (pytorch#2146)

Co-authored-by: Felipe Mello <[email protected]>

* DoRA fixes (pytorch#2139)

Co-authored-by: Mircea Mironenco <[email protected]>

* [Fix] Llama 3.2 Vision decoder_trainable flag fixed (pytorch#2150)

* Small readme, config updates (pytorch#2157)

* Using `FormattedCheckpointFiles` in configs (pytorch#2147)

* Move ``get_world_size_and_rank`` to utils (pytorch#2155)

* Faster intermediate checkpoints with DCP async save in TorchTune (pytorch#2006) (see the async-save sketch after this list)

Co-authored-by: Saurabh Mishra <[email protected]>

* torchdata integration - multi-dataset and streaming support (pytorch#1929)

* Allow higher version of lm-eval (pytorch#2165)

* Using `FormattedCheckpointFiles` in configs... round 2 (pytorch#2167)

* [EZ] Fix set_torch_num_threads in multi-node. (pytorch#2164)

---------

Co-authored-by: Philip Bontrager <[email protected]>
Co-authored-by: ebsmothers <[email protected]>
Co-authored-by: Felipe Mello <[email protected]>
Co-authored-by: Felipe Mello <[email protected]>
Co-authored-by: Joe Cummings <[email protected]>
Co-authored-by: Mostafa Elhoushi <[email protected]>
Co-authored-by: vancoyendall <[email protected]>
Co-authored-by: Mircea Mironenco <[email protected]>
Co-authored-by: salman <[email protected]>
Co-authored-by: Saurabh Mishra <[email protected]>
Co-authored-by: Saurabh Mishra <[email protected]>
Co-authored-by: Andrew Ho <[email protected]>
Co-authored-by: Eugen Hotaj <[email protected]>
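Reference sketch for the layer-dropout entry above (pytorch#1076): this shows the general stochastic-depth idea with illustrative names, not the recipe's actual API.

import torch

def maybe_drop_layer(layer, x: torch.Tensor, p_drop: float, training: bool) -> torch.Tensor:
    # With probability p_drop during training, bypass the layer and return
    # the input unchanged (the residual path); always run the layer at eval.
    if training and torch.rand(()).item() < p_drop:
        return x
    return layer(x)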
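Reference sketch for the async-save entry above (pytorch#2006): the point is that staging the state dict returns control to the training loop while the write to storage completes in the background. This assumes a PyTorch build that ships torch.distributed.checkpoint.async_save; the helper name and path are illustrative.

import torch.distributed.checkpoint as dcp

def save_intermediate(state_dict: dict, step: int, prev_future=None):
    # Block only on the previous async save so writes never overlap, then
    # kick off the next one and immediately resume training.
    if prev_future is not None:
        prev_future.result()
    return dcp.async_save(state_dict, checkpoint_id=f"/tmp/ckpts/step_{step}")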
14 people authored Dec 18, 2024
1 parent 21c5f1a commit 0fb4536
Showing 76 changed files with 3,282 additions and 729 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gpu_test.yaml
@@ -53,7 +53,7 @@ jobs:
       - name: Install remaining dependencies
        run: |
          python -m pip install -e ".[dev]"
-         python -m pip install lm-eval==0.4.5
+         python -m pip install lm-eval>=0.4.5
      - name: Run recipe and unit tests with coverage
        run: pytest tests --ignore tests/torchtune/modules/_export --with-integration --cov=. --cov-report=xml --durations=20 -vv
      - name: Upload Coverage to Codecov
2 changes: 1 addition & 1 deletion .github/workflows/recipe_test.yaml
@@ -42,7 +42,7 @@ jobs:
        run: |
          python -m pip install torch torchvision torchao
          python -m pip install -e ".[dev]"
-         python -m pip install lm-eval==0.4.5
+         python -m pip install lm-eval>=0.4.5
      - name: Run recipe tests with coverage
        run: pytest tests -m integration_test --cov=. --cov-report=xml --durations=20 -vv
      - name: Upload Coverage to Codecov
2 changes: 1 addition & 1 deletion .github/workflows/regression_test.yaml
@@ -58,7 +58,7 @@ jobs:
      - name: Install remaining dependencies
        run: |
          python -m pip install -e ".[dev]"
-         python -m pip install lm-eval==0.4.5
+         python -m pip install lm-eval>=0.4.5
      - name: Run regression tests with coverage
        run: pytest tests -m slow_integration_test --silence-s3-logs --cov=. --cov-report=xml --durations=20 -vv
      - name: Upload Coverage to Codecov
2 changes: 1 addition & 1 deletion README.md
@@ -140,7 +140,7 @@ loss=torchtune.modules.loss.CEWithChunkedOutputLoss \
enable_activation_checkpointing=True \
optimizer_in_bwd=False \
enable_activation_offloading=True \
-optimizer._component_=torch.optim.AdamW \
+optimizer=torch.optim.AdamW \
tokenizer.max_seq_len=4096 \
gradient_accumulation_steps=1 \
epochs=1 \
1 change: 0 additions & 1 deletion docs/source/api_ref_training.rst
@@ -52,7 +52,6 @@ Utilities for enabling and working with distributed training.

    init_distributed
    is_distributed
-   get_world_size_and_rank
    gather_cpu_state_dict

.. _ac_label:
1 change: 1 addition & 0 deletions docs/source/api_ref_utilities.rst
@@ -18,3 +18,4 @@ Miscellaneous
    get_device
    get_logger
    torch_version_ge
+   get_world_size_and_rank
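After this move the helper lives under torchtune.utils. A minimal usage sketch, assuming the relocated import path shown above and the (world_size, rank) return order the name suggests:

from torchtune.utils import get_world_size_and_rank

world_size, rank = get_world_size_and_rank()
if rank == 0:
    print(f"Running on {world_size} rank(s)")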
20 changes: 3 additions & 17 deletions recipes/configs/llama2/70B_lora.yaml
@@ -29,23 +29,9 @@ tokenizer:
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-70b-hf
-  checkpoint_files: [
-    pytorch_model-00001-of-00015.bin,
-    pytorch_model-00002-of-00015.bin,
-    pytorch_model-00003-of-00015.bin,
-    pytorch_model-00004-of-00015.bin,
-    pytorch_model-00005-of-00015.bin,
-    pytorch_model-00006-of-00015.bin,
-    pytorch_model-00007-of-00015.bin,
-    pytorch_model-00008-of-00015.bin,
-    pytorch_model-00009-of-00015.bin,
-    pytorch_model-00010-of-00015.bin,
-    pytorch_model-00011-of-00015.bin,
-    pytorch_model-00012-of-00015.bin,
-    pytorch_model-00013-of-00015.bin,
-    pytorch_model-00014-of-00015.bin,
-    pytorch_model-00015-of-00015.bin,
-  ]
+  checkpoint_files:
+    filename_format: pytorch_model-{}-of-{}.bin
+    max_filename: "00015"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA2
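For reference, a minimal sketch of how a filename_format/max_filename pair expands back into the explicit shard list these configs used to spell out. Illustrative only; torchtune's FormattedCheckpointFiles utility (pytorch#2147) is the real implementation.

def expand_checkpoint_files(filename_format: str, max_filename: str) -> list[str]:
    n = int(max_filename)          # "00015" -> 15 shards
    width = len(max_filename)      # zero-pad indices to the same width
    return [
        filename_format.format(str(i).zfill(width), max_filename)
        for i in range(1, n + 1)
    ]

# expand_checkpoint_files("pytorch_model-{}-of-{}.bin", "00015")
# -> ["pytorch_model-00001-of-00015.bin", ..., "pytorch_model-00015-of-00015.bin"]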
20 changes: 3 additions & 17 deletions recipes/configs/llama2/70B_qlora.yaml
@@ -34,23 +34,9 @@ tokenizer:
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-70b-hf
-  checkpoint_files: [
-    pytorch_model-00001-of-00015.bin,
-    pytorch_model-00002-of-00015.bin,
-    pytorch_model-00003-of-00015.bin,
-    pytorch_model-00004-of-00015.bin,
-    pytorch_model-00005-of-00015.bin,
-    pytorch_model-00006-of-00015.bin,
-    pytorch_model-00007-of-00015.bin,
-    pytorch_model-00008-of-00015.bin,
-    pytorch_model-00009-of-00015.bin,
-    pytorch_model-00010-of-00015.bin,
-    pytorch_model-00011-of-00015.bin,
-    pytorch_model-00012-of-00015.bin,
-    pytorch_model-00013-of-00015.bin,
-    pytorch_model-00014-of-00015.bin,
-    pytorch_model-00015-of-00015.bin,
-  ]
+  checkpoint_files:
+    filename_format: pytorch_model-{}-of-{}.bin
+    max_filename: "00015"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA2
35 changes: 3 additions & 32 deletions recipes/configs/llama3/70B_full.yaml
@@ -39,38 +39,9 @@ model:
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3-70B-Instruct
-  checkpoint_files: [
-    model-00001-of-00030.safetensors,
-    model-00002-of-00030.safetensors,
-    model-00003-of-00030.safetensors,
-    model-00004-of-00030.safetensors,
-    model-00005-of-00030.safetensors,
-    model-00006-of-00030.safetensors,
-    model-00007-of-00030.safetensors,
-    model-00008-of-00030.safetensors,
-    model-00009-of-00030.safetensors,
-    model-00010-of-00030.safetensors,
-    model-00011-of-00030.safetensors,
-    model-00012-of-00030.safetensors,
-    model-00013-of-00030.safetensors,
-    model-00014-of-00030.safetensors,
-    model-00015-of-00030.safetensors,
-    model-00016-of-00030.safetensors,
-    model-00017-of-00030.safetensors,
-    model-00018-of-00030.safetensors,
-    model-00019-of-00030.safetensors,
-    model-00020-of-00030.safetensors,
-    model-00021-of-00030.safetensors,
-    model-00022-of-00030.safetensors,
-    model-00023-of-00030.safetensors,
-    model-00024-of-00030.safetensors,
-    model-00025-of-00030.safetensors,
-    model-00026-of-00030.safetensors,
-    model-00027-of-00030.safetensors,
-    model-00028-of-00030.safetensors,
-    model-00029-of-00030.safetensors,
-    model-00030-of-00030.safetensors,
-  ]
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3
35 changes: 3 additions & 32 deletions recipes/configs/llama3/70B_lora.yaml
@@ -29,38 +29,9 @@ tokenizer:
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3-70B-Instruct
-  checkpoint_files: [
-    model-00001-of-00030.safetensors,
-    model-00002-of-00030.safetensors,
-    model-00003-of-00030.safetensors,
-    model-00004-of-00030.safetensors,
-    model-00005-of-00030.safetensors,
-    model-00006-of-00030.safetensors,
-    model-00007-of-00030.safetensors,
-    model-00008-of-00030.safetensors,
-    model-00009-of-00030.safetensors,
-    model-00010-of-00030.safetensors,
-    model-00011-of-00030.safetensors,
-    model-00012-of-00030.safetensors,
-    model-00013-of-00030.safetensors,
-    model-00014-of-00030.safetensors,
-    model-00015-of-00030.safetensors,
-    model-00016-of-00030.safetensors,
-    model-00017-of-00030.safetensors,
-    model-00018-of-00030.safetensors,
-    model-00019-of-00030.safetensors,
-    model-00020-of-00030.safetensors,
-    model-00021-of-00030.safetensors,
-    model-00022-of-00030.safetensors,
-    model-00023-of-00030.safetensors,
-    model-00024-of-00030.safetensors,
-    model-00025-of-00030.safetensors,
-    model-00026-of-00030.safetensors,
-    model-00027-of-00030.safetensors,
-    model-00028-of-00030.safetensors,
-    model-00029-of-00030.safetensors,
-    model-00030-of-00030.safetensors,
-  ]
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3
2 changes: 1 addition & 1 deletion recipes/configs/llama3/8B_dora.yaml
@@ -61,7 +61,7 @@ lr_scheduler:
  num_warmup_steps: 100

loss:
-  _component_: torch.nn.CrossEntropyLoss
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
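A note on the loss swap above: chunking the cross-entropy keeps peak memory down by never materializing the full [tokens, vocab] slice for the loss at once. A minimal sketch of the idea; it mirrors the concept behind CEWithChunkedOutputLoss, not torchtune's exact implementation.

import torch
import torch.nn.functional as F

def chunked_cross_entropy(logits: torch.Tensor, labels: torch.Tensor,
                          num_chunks: int = 8, ignore_index: int = -100) -> torch.Tensor:
    """logits: [batch, seq, vocab]; labels: [batch, seq]."""
    total = logits.new_zeros(())   # running sum of per-token losses
    count = logits.new_zeros(())   # number of non-ignored tokens
    for lc, yc in zip(logits.chunk(num_chunks, dim=1),
                      labels.chunk(num_chunks, dim=1)):
        # Only one [batch * chunk_len, vocab] slice is live at a time.
        total = total + F.cross_entropy(
            lc.reshape(-1, lc.size(-1)), yc.reshape(-1),
            ignore_index=ignore_index, reduction="sum",
        )
        count = count + (yc != ignore_index).sum()
    return total / count.clamp(min=1)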
2 changes: 1 addition & 1 deletion recipes/configs/llama3/8B_dora_single_device.yaml
@@ -63,7 +63,7 @@ lr_scheduler:
  num_warmup_steps: 100

loss:
-  _component_: torch.nn.CrossEntropyLoss
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
2 changes: 1 addition & 1 deletion recipes/configs/llama3/8B_qdora_single_device.yaml
@@ -64,7 +64,7 @@ lr_scheduler:
  num_warmup_steps: 100

loss:
-  _component_: torch.nn.CrossEntropyLoss
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
4 changes: 2 additions & 2 deletions recipes/configs/llama3_1/405B_qlora.yaml
@@ -34,7 +34,7 @@ checkpointer:
  checkpoint_dir: /tmp/Meta-Llama-3.1-405B-Instruct/
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
-    max_filename: 00191
+    max_filename: "00191"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3
@@ -61,7 +61,7 @@ lr_scheduler:
  num_warmup_steps: 100

loss:
-  _component_: torch.nn.CrossEntropyLoss
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

fsdp:
  cpu_offload: False
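The quoting fix above matters because an unquoted zero-padded scalar is loader-dependent: a YAML 1.2 loader can read 00191 as the integer 191, silently dropping the padding that filename_format relies on. A quick check (assumes PyYAML is installed; PyYAML happens to keep the unquoted form a string, other loaders may not):

import yaml

# Unquoted: result type depends on the loader's resolver rules.
print(repr(yaml.safe_load("max_filename: 00191")["max_filename"]))
# Quoted: always the string "00191", for every loader.
print(repr(yaml.safe_load('max_filename: "00191"')["max_filename"]))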
35 changes: 3 additions & 32 deletions recipes/configs/llama3_1/70B_full.yaml
@@ -38,38 +38,9 @@ model:
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
-  checkpoint_files: [
-    model-00001-of-00030.safetensors,
-    model-00002-of-00030.safetensors,
-    model-00003-of-00030.safetensors,
-    model-00004-of-00030.safetensors,
-    model-00005-of-00030.safetensors,
-    model-00006-of-00030.safetensors,
-    model-00007-of-00030.safetensors,
-    model-00008-of-00030.safetensors,
-    model-00009-of-00030.safetensors,
-    model-00010-of-00030.safetensors,
-    model-00011-of-00030.safetensors,
-    model-00012-of-00030.safetensors,
-    model-00013-of-00030.safetensors,
-    model-00014-of-00030.safetensors,
-    model-00015-of-00030.safetensors,
-    model-00016-of-00030.safetensors,
-    model-00017-of-00030.safetensors,
-    model-00018-of-00030.safetensors,
-    model-00019-of-00030.safetensors,
-    model-00020-of-00030.safetensors,
-    model-00021-of-00030.safetensors,
-    model-00022-of-00030.safetensors,
-    model-00023-of-00030.safetensors,
-    model-00024-of-00030.safetensors,
-    model-00025-of-00030.safetensors,
-    model-00026-of-00030.safetensors,
-    model-00027-of-00030.safetensors,
-    model-00028-of-00030.safetensors,
-    model-00029-of-00030.safetensors,
-    model-00030-of-00030.safetensors,
-  ]
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3
35 changes: 3 additions & 32 deletions recipes/configs/llama3_1/70B_lora.yaml
@@ -28,38 +28,9 @@ tokenizer:
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
-  checkpoint_files: [
-    model-00001-of-00030.safetensors,
-    model-00002-of-00030.safetensors,
-    model-00003-of-00030.safetensors,
-    model-00004-of-00030.safetensors,
-    model-00005-of-00030.safetensors,
-    model-00006-of-00030.safetensors,
-    model-00007-of-00030.safetensors,
-    model-00008-of-00030.safetensors,
-    model-00009-of-00030.safetensors,
-    model-00010-of-00030.safetensors,
-    model-00011-of-00030.safetensors,
-    model-00012-of-00030.safetensors,
-    model-00013-of-00030.safetensors,
-    model-00014-of-00030.safetensors,
-    model-00015-of-00030.safetensors,
-    model-00016-of-00030.safetensors,
-    model-00017-of-00030.safetensors,
-    model-00018-of-00030.safetensors,
-    model-00019-of-00030.safetensors,
-    model-00020-of-00030.safetensors,
-    model-00021-of-00030.safetensors,
-    model-00022-of-00030.safetensors,
-    model-00023-of-00030.safetensors,
-    model-00024-of-00030.safetensors,
-    model-00025-of-00030.safetensors,
-    model-00026-of-00030.safetensors,
-    model-00027-of-00030.safetensors,
-    model-00028-of-00030.safetensors,
-    model-00029-of-00030.safetensors,
-    model-00030-of-00030.safetensors,
-  ]
+  checkpoint_files:
+    filename_format: model-{}-of-{}.safetensors
+    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3
5 changes: 5 additions & 0 deletions recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -19,6 +19,11 @@
#
# This config works only for training on single device.

+output_dir: /tmp/torchtune/llama3_2_1B/full_single_device # /tmp may be deleted by your system. Change it to your preference.
+
# Model Arguments
model:
  _component_: torchtune.models.llama3_2.llama3_2_1b
+
+output_dir: /tmp/torchtune/llama3_2_1B/full_single_device # /tmp may be deleted by your system. Change it to your preference.
+
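Placing output_dir at the top of each config is what lines like output_dir: ${output_dir} (seen in the checkpointer sections above) resolve against via interpolation. A small demonstration, assuming omegaconf, which torchtune's config system builds on:

from omegaconf import OmegaConf

cfg = OmegaConf.create("""
output_dir: /tmp/torchtune/llama3_2_1B/full_single_device
checkpointer:
  output_dir: ${output_dir}
""")
# The nested value resolves against the top-level key on access.
print(cfg.checkpointer.output_dir)  # /tmp/torchtune/llama3_2_1B/full_single_device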
4 changes: 2 additions & 2 deletions recipes/configs/llama3_2_vision/11B_evaluation.yaml
@@ -3,8 +3,8 @@
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct --ignore-patterns "original/consolidated*"
#
-# It also assumes that you've downloaded the EleutherAI Eval Harness (v0.4.5):
-# pip install lm_eval==0.4.5
+# It also assumes that you've downloaded the EleutherAI Eval Harness (v0.4.5 or higher):
+# pip install lm_eval
#
# To launch, run the following command from root torchtune directory:
# tune run eleuther_eval --config llama3_2_vision/11B_evaluation
(Diffs for the remaining 58 changed files are not shown.)
