From 3f6fe2083f59099456ef47f9f7a42c2722d9a778 Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Tue, 23 Mar 2021 19:31:59 +0530
Subject: [PATCH 1/9] Fix checkpoint callback issue for TPUs

---
 .../plugins/training_type/tpu_spawn.py            |  3 ++-
 tests/models/test_tpu.py                          | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index c883ff504f24d..2ed04b09d7344 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -130,7 +130,8 @@ def barrier(self, name: Optional[str] = None) -> None:
             rendezvous(f"pl.Trainer.{name}")
 
     def transfer_distrib_spawn_state_on_fit_end(self, results):
-        best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path
+        checkpoint_callback = self.lightning_module.trainer.checkpoint_callback
+        best_model_path = checkpoint_callback.best_model_path if checkpoint_callback else None
 
         if self.mp_queue is not None:
             rank_zero_warn("cleaning up ddp environment...")
diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index 5358b9f881048..e62199d1f7572 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -383,3 +383,18 @@ def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir):
         mock_clip_grad_norm.assert_called()
     else:
         mock_clip_grad_norm.assert_not_called()
+
+
+@RunIf(tpu=True)
+@pl_multi_process_test
+def test_if_test_works_with_checkpoint_false(tmpdir):
+    """
+    Ensure that model trains properly when
+    `checkpoint_callback` is set to False.
+    """
+
+    # Train a model on TPU
+    model = BoringModel()
+    trainer = Trainer(max_epochs=1, tpu_cores=8, default_root_dir=tmpdir, fast_dev_run=True, checkpoint_callback=False)
+    trainer.fit(model)
+    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

From c2dc663bd664f5c6f49a1d943064770008cf7f12 Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Tue, 23 Mar 2021 19:37:48 +0530
Subject: [PATCH 2/9] update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c542b854af104..beebd2ee59e23 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -168,6 +168,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed comparing required versions ([#6434](https://github.com/PyTorchLightning/pytorch-lightning/pull/6434))
 
 
+- Fixed checkpoint callback issue with TPUs when set False ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654))
+
+
 ## [1.2.4] - 2021-03-16
 
 ### Changed

From 6541db64f06feb812bbc6dd3c3066aee6caff89d Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Wed, 24 Mar 2021 23:50:54 +0530
Subject: [PATCH 3/9] add barrier

---
 pytorch_lightning/plugins/training_type/tpu_spawn.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index 2ed04b09d7344..866cd17f21c3b 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -17,7 +17,6 @@
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 import torch
-import torch.distributed as torch_distrib
 import torch.multiprocessing as mp
 
 from pytorch_lightning.core.lightning import LightningModule
@@ -109,13 +108,15 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
 
         # replace trainer save_checkpoint to use `xm.save`
         trainer.save_checkpoint = self.save_checkpoint
-        self.barrier()
+        self.barrier("pre run stage")
 
         results = trainer.run_stage()
 
         self.__save_end_of_training_weights(self.lightning_module)
         self.transfer_distrib_spawn_state_on_fit_end(results)
 
+        self.barrier("end process")
+
     def __save_end_of_training_weights(self, model: LightningModule) -> None:
         # when training ends on these platforms dump weights to get out of the main process
         if on_colab_kaggle():
@@ -126,8 +127,7 @@ def model_to_device(self) -> None:
         self._model.to(xm.xla_device())
 
     def barrier(self, name: Optional[str] = None) -> None:
-        if torch_distrib.is_initialized():
-            rendezvous(f"pl.Trainer.{name}")
+        rendezvous(name)
 
     def transfer_distrib_spawn_state_on_fit_end(self, results):
         checkpoint_callback = self.lightning_module.trainer.checkpoint_callback

From d47fa9b32e146c6f8d783e9daf90781d2fd766e3 Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Wed, 24 Mar 2021 23:57:02 +0530
Subject: [PATCH 4/9] apply code suggestions

---
 CHANGELOG.md             | 5 ++++-
 tests/models/test_tpu.py | 5 +----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa5254c9d7c2a..5229fd565ab71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -188,7 +188,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed comparing required versions ([#6434](https://github.com/PyTorchLightning/pytorch-lightning/pull/6434))
 
 
-- Fixed checkpoint callback issue with TPUs when set False ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654))
+- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654))
+
+
+- Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654))
 
 
 - Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657))
diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index e62199d1f7572..5e7ead6e6cc96 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -388,10 +388,7 @@ def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir):
 @RunIf(tpu=True)
 @pl_multi_process_test
 def test_if_test_works_with_checkpoint_false(tmpdir):
-    """
-    Ensure that model trains properly when
-    `checkpoint_callback` is set to False.
-    """
+    """Ensure that model trains properly when `checkpoint_callback` is set to False."""
 
     # Train a model on TPU
     model = BoringModel()

From 312b84e364e33b074c7620265ae55fc469152a56 Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Thu, 25 Mar 2021 00:32:42 +0530
Subject: [PATCH 5/9] update trainer test

---
 pytorch_lightning/trainer/trainer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 644b2f52b13ea..74781f7e17864 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -57,7 +57,7 @@
 from pytorch_lightning.trainer.training_loop import TrainLoop
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
 from pytorch_lightning.tuner.tuning import Tuner
-from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities import DeviceType, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -983,7 +983,8 @@ def __load_ckpt_weights(
                     ' specify a path for a checkpoint `.{fn}(ckpt_path=PATH)`'
                 )
 
-            self.training_type_plugin.barrier()
+            if not self._device_type == DeviceType.TPU:
+                self.training_type_plugin.barrier()
 
             ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
             model.load_state_dict(ckpt['state_dict'])

From 6a4ee36c55aa2e4314f7ea2f43a12714b9998242 Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Thu, 25 Mar 2021 10:52:28 +0530
Subject: [PATCH 6/9] remove spaces

---
 pytorch_lightning/plugins/training_type/tpu_spawn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index 05e3628ea2e48..a8706d54cb5c9 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -108,14 +108,14 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
 
         # replace trainer save_checkpoint to use `xm.save`
         trainer.save_checkpoint = self.save_checkpoint
-        self.barrier("pre run stage")
+        self.barrier("pre-run-stage")
 
         results = trainer.run_stage()
 
         self.__save_end_of_training_weights(self.lightning_module)
         self.transfer_distrib_spawn_state_on_fit_end(results)
 
-        self.barrier("end process")
+        self.barrier("end-process")
 
     def __save_end_of_training_weights(self, model: LightningModule) -> None:
         # when training ends on these platforms dump weights to get out of the main process

From 38dc8e2ce22c7f0298c505b42ca3993d4b07e3e5 Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Thu, 25 Mar 2021 14:00:58 +0530
Subject: [PATCH 7/9] fix tpu tests

---
 tests/models/test_tpu.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index 5e7ead6e6cc96..975e5746df85c 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -357,13 +357,14 @@ def test_reduce(rank):
     xmp.spawn(test_reduce, nprocs=8, start_method='fork')
 
 
-@pytest.mark.parametrize("clip_val", [0, 10])
+@pytest.mark.parametrize("clip_val", [10])
 @RunIf(tpu=True)
 @pl_multi_process_test
 @mock.patch("pytorch_lightning.accelerators.tpu.xla_clip_grad_norm_")
 def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir):
     """
     Ensure that clip gradients is only called if the value is greater than 0.
+    TODO: Fix (test fails with parametrize)
     """
     tutils.reset_seed()
     trainer_options = dict(

From e18dfe429d124e575f59a9ce6e1da06060e6ae04 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <Borda@users.noreply.github.com>
Date: Thu, 25 Mar 2021 10:45:21 +0100
Subject: [PATCH 8/9] Apply suggestions from code review

---
 tests/models/test_tpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index 975e5746df85c..b2ed0db87d8d5 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -357,9 +357,9 @@ def test_reduce(rank):
     xmp.spawn(test_reduce, nprocs=8, start_method='fork')
 
 
-@pytest.mark.parametrize("clip_val", [10])
 @RunIf(tpu=True)
 @pl_multi_process_test
+@pytest.mark.parametrize("clip_val", [10])
 @mock.patch("pytorch_lightning.accelerators.tpu.xla_clip_grad_norm_")
 def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir):
     """

From 80f15c155c881fadf7a75d09323c32494badbeec Mon Sep 17 00:00:00 2001
From: Kaushik Bokka <kaushikbokka@gmail.com>
Date: Thu, 25 Mar 2021 15:46:22 +0530
Subject: [PATCH 9/9] add comment

---
 pytorch_lightning/trainer/trainer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 74781f7e17864..98f4727fb9eec 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -983,6 +983,7 @@ def __load_ckpt_weights(
                     ' specify a path for a checkpoint `.{fn}(ckpt_path=PATH)`'
                 )
 
+            # only one process running at this point for TPUs, as spawn isn't triggered yet
             if not self._device_type == DeviceType.TPU:
                 self.training_type_plugin.barrier()