From 26d274b45222e2d443aff788f8b478d684d3bc2d Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 28 Jul 2022 12:02:16 -0400
Subject: [PATCH] Use main_process_first

---
 examples/by_feature/checkpointing.py               | 12 +++++++-----
 examples/by_feature/cross_validation.py            | 12 +++++++-----
 examples/by_feature/fsdp_with_peak_mem_tracking.py | 12 +++++++-----
 examples/by_feature/gradient_accumulation.py       | 12 +++++++-----
 examples/by_feature/memory.py                      | 12 +++++++-----
 examples/by_feature/multi_process_metrics.py       | 12 +++++++-----
 examples/by_feature/tracking.py                    | 12 +++++++-----
 examples/complete_nlp_example.py                   | 12 +++++++-----
 examples/nlp_example.py                            | 12 +++++++-----
 9 files changed, 63 insertions(+), 45 deletions(-)

diff --git a/examples/by_feature/checkpointing.py b/examples/by_feature/checkpointing.py
index 5d31dd8f433..1b462a1f42f 100644
--- a/examples/by_feature/checkpointing.py
+++ b/examples/by_feature/checkpointing.py
@@ -72,11 +72,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/by_feature/cross_validation.py b/examples/by_feature/cross_validation.py
index dc41b91e571..6dd61bbf812 100644
--- a/examples/by_feature/cross_validation.py
+++ b/examples/by_feature/cross_validation.py
@@ -92,11 +92,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/by_feature/fsdp_with_peak_mem_tracking.py b/examples/by_feature/fsdp_with_peak_mem_tracking.py
index b95fb9908cd..a279e64eb67 100644
--- a/examples/by_feature/fsdp_with_peak_mem_tracking.py
+++ b/examples/by_feature/fsdp_with_peak_mem_tracking.py
@@ -127,11 +127,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/by_feature/gradient_accumulation.py b/examples/by_feature/gradient_accumulation.py
index 580d07a57a0..69856c97b1c 100644
--- a/examples/by_feature/gradient_accumulation.py
+++ b/examples/by_feature/gradient_accumulation.py
@@ -67,11 +67,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/by_feature/memory.py b/examples/by_feature/memory.py
index d85319552aa..91f3e41d156 100644
--- a/examples/by_feature/memory.py
+++ b/examples/by_feature/memory.py
@@ -72,11 +72,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/by_feature/multi_process_metrics.py b/examples/by_feature/multi_process_metrics.py
index cf581c73d17..522cc571b5c 100644
--- a/examples/by_feature/multi_process_metrics.py
+++ b/examples/by_feature/multi_process_metrics.py
@@ -74,11 +74,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/by_feature/tracking.py b/examples/by_feature/tracking.py
index 0da8c437ab9..d7248682dbf 100644
--- a/examples/by_feature/tracking.py
+++ b/examples/by_feature/tracking.py
@@ -72,11 +72,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/complete_nlp_example.py b/examples/complete_nlp_example.py
index fc0fae90ba6..000da603804 100644
--- a/examples/complete_nlp_example.py
+++ b/examples/complete_nlp_example.py
@@ -91,11 +91,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library
diff --git a/examples/nlp_example.py b/examples/nlp_example.py
index 33c0ed7aa4c..b1e7cba270a 100644
--- a/examples/nlp_example.py
+++ b/examples/nlp_example.py
@@ -65,11 +65,13 @@ def tokenize_function(examples):
         return outputs
 
     # Apply the method we just defined to all the examples in all the splits of the dataset
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        remove_columns=["idx", "sentence1", "sentence2"],
-    )
+    # starting with the main process first:
+    with accelerator.main_process_first():
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=["idx", "sentence1", "sentence2"],
+        )
 
     # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the
     # transformers library