
Clean up #2467

Merged 49 commits on Jul 3, 2020
Commits (49)
b32f6d6  Fixes #2455 (williamFalcon, Jul 2, 2020)
59dff54  Fixes #2455 (williamFalcon, Jul 2, 2020)
4d2c127  Fixes #2455 (williamFalcon, Jul 2, 2020)
2ab5928  Fixes #2455 (williamFalcon, Jul 2, 2020)
0264783  Fixes #2455 (williamFalcon, Jul 2, 2020)
80988a3  Fixes #2455 (williamFalcon, Jul 2, 2020)
f30358c  Fixes #2455 (williamFalcon, Jul 2, 2020)
c2afd05  Fixes #2455 (williamFalcon, Jul 2, 2020)
b3e5cfb  Fixes #2455 (williamFalcon, Jul 2, 2020)
e399545  Fixes #2455 (williamFalcon, Jul 2, 2020)
77c5daa  Fixes #2455 (williamFalcon, Jul 2, 2020)
9874b5e  Fixes #2455 (williamFalcon, Jul 2, 2020)
beeee3a  Fixes #2455 (williamFalcon, Jul 2, 2020)
f8736b5  Fixes #2455 (williamFalcon, Jul 2, 2020)
0f70120  Fixes #2455 (williamFalcon, Jul 2, 2020)
26936bb  Fixes #2455 (williamFalcon, Jul 2, 2020)
4610f68  Fixes #2455 (williamFalcon, Jul 2, 2020)
e0ddc90  Fixes #2455 (williamFalcon, Jul 2, 2020)
f113088  Fixes #2455 (williamFalcon, Jul 2, 2020)
cc8d1cd  Fixes #2455 (williamFalcon, Jul 2, 2020)
fc1254b  Fixes #2455 (williamFalcon, Jul 2, 2020)
4492804  Fixes #2455 (williamFalcon, Jul 2, 2020)
c59df13  Fixes #2455 (williamFalcon, Jul 2, 2020)
bea5171  Fixes #2455 (williamFalcon, Jul 2, 2020)
6d2e0c5  Fixes #2455 (williamFalcon, Jul 2, 2020)
ffa65ad  Fixes #2455 (williamFalcon, Jul 2, 2020)
7d5af1c  Fixes #2455 (williamFalcon, Jul 2, 2020)
c907e36  added early stop tpu test (williamFalcon, Jul 2, 2020)
ce37587  added early stop tpu test (williamFalcon, Jul 2, 2020)
6c77aef  added early stop tpu test (williamFalcon, Jul 2, 2020)
6cd4fdc  added early stop tpu test (williamFalcon, Jul 2, 2020)
7fdc7ec  added early stop tpu test (williamFalcon, Jul 2, 2020)
7879fe2  added early stop tpu test (williamFalcon, Jul 2, 2020)
3d77c36  added early stop tpu test (williamFalcon, Jul 2, 2020)
2ff19ba  added early stop tpu test (williamFalcon, Jul 2, 2020)
57b601b  added early stop tpu test (williamFalcon, Jul 2, 2020)
9dcc73e  added early stop tpu test (williamFalcon, Jul 2, 2020)
82df22e  added early stop tpu test (williamFalcon, Jul 2, 2020)
fafe7af  added early stop tpu test (williamFalcon, Jul 2, 2020)
5af7f69  added early stop tpu test (williamFalcon, Jul 2, 2020)
fef08e2  added early stop tpu test (williamFalcon, Jul 2, 2020)
b4bbe1c  added early stop tpu test (williamFalcon, Jul 2, 2020)
7f711a4  added early stop tpu test (williamFalcon, Jul 2, 2020)
51d4740  added early stop tpu test (williamFalcon, Jul 3, 2020)
58b66bc  added early stop tpu test (williamFalcon, Jul 3, 2020)
50b5874  added early stop tpu test (williamFalcon, Jul 3, 2020)
c75e71b  added early stop tpu test (williamFalcon, Jul 3, 2020)
71ab1f6  added early stop tpu test (williamFalcon, Jul 3, 2020)
43fa463  added early stop tpu test (williamFalcon, Jul 3, 2020)
36 changes: 33 additions & 3 deletions pytorch_lightning/callbacks/early_stopping.py
@@ -9,13 +9,22 @@

 import numpy as np
 import torch
+import torch.distributed as dist

 from pytorch_lightning import _logger as log
 from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.utilities import rank_zero_warn

 torch_inf = torch.tensor(np.Inf)

+try:
+    import torch_xla
+    import torch_xla.core.xla_model as xm
+except ImportError:
+    XLA_AVAILABLE = False
+else:
+    XLA_AVAILABLE = True
+

 class EarlyStopping(Callback):
     r"""
@@ -138,17 +147,38 @@ def _run_early_stopping_check(self, trainer, pl_module):

         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
-            current = torch.tensor(current)
+            current = torch.tensor(current, device=pl_module.device)

-        if self.monitor_op(current - self.min_delta, self.best_score):
+        if self.monitor_op(current - self.min_delta, self.best_score.to(pl_module.device)):
             self.best_score = current
             self.wait_count = 0
         else:
             self.wait_count += 1
-            if self.wait_count >= self.patience:
+            should_stop = self.wait_count >= self.patience
+
+            if bool(should_stop):
                 self.stopped_epoch = trainer.current_epoch
                 trainer.should_stop = True
+
+        # stop every ddp process if any world process decides to stop
+        self._stop_distributed_training(trainer, pl_module)

Review comment:
The function name is misleading. This does not stop training, it just updates the trainer.should_stop state.
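
Following up on that naming point, a minimal standalone sketch (hypothetical, not code from this PR) of the same logic under a name that describes what it actually does:

import torch
import torch.distributed as dist


def sync_should_stop(trainer, pl_module):
    """Reconcile trainer.should_stop across DDP ranks; does not itself stop training."""
    if trainer.use_ddp or trainer.use_ddp2:
        stop = torch.tensor(int(trainer.should_stop), device=pl_module.device)
        # SUM of the 0/1 flags tells every rank how many ranks requested a stop.
        dist.all_reduce(stop, op=dist.ReduceOp.SUM)
        trainer.should_stop = stop.item() == trainer.world_size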


+    def _stop_distributed_training(self, trainer, pl_module):
+
+        # in ddp make sure all processes stop when one is flagged
+        if trainer.use_ddp or trainer.use_ddp2:
+            stop = torch.tensor(int(trainer.should_stop), device=pl_module.device)
+            dist.all_reduce(stop, op=dist.reduce_op.SUM)
+            dist.barrier()

Review comment:
@williamFalcon Is a barrier needed after an all reduce?
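
On the question above: in the default synchronous mode, all_reduce is itself a collective that every rank must enter before any rank receives the result, so the reduced value is already consistent across ranks when the call returns, and the extra barrier() should not be needed for correctness on CPU backends (CUDA collectives are stream-ordered, which is a separate subtlety). A standalone sketch, not part of this PR:

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Each rank contributes its local stop flag; pretend only rank 0 wants to stop.
    stop = torch.tensor(int(rank == 0))
    dist.all_reduce(stop, op=dist.ReduceOp.SUM)

    # all_reduce blocks until every rank has contributed, so by this point
    # all ranks hold the same sum; no additional barrier is required.
    print(f"rank {rank}: reduced stop flag = {stop.item()}")
    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    mp.spawn(worker, args=(world_size,), nprocs=world_size)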

+            trainer.should_stop = stop == trainer.world_size
+
+        # if trainer.use_tpu:
+        #     stop = torch.tensor(int(trainer.should_stop), device=pl_module.device)
+        #     xm.all_reduce('sum', [stop])
+        #     print(type(stop))
+        #     torch_xla.core.xla_model.rendezvous("pl.EarlyStoppingCallback.stop_distributed_training_check")
+        #     trainer.should_stop = stop.item() == trainer.world_size

     def on_train_end(self, trainer, pl_module):
         if self.stopped_epoch > 0 and self.verbose > 0:
             rank_zero_warn('Displayed epoch numbers by `EarlyStopping` start from "1" until v0.6.x,'
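
One detail worth flagging in the DDP branch above: the in-code comment says processes stop "when one is flagged", but stop == trainer.world_size only becomes true when every rank has flagged. A small standalone sketch of the two policies, using a plain tensor to stand in for the all-reduced flag:

import torch

world_size = 4
flags = torch.tensor([1, 0, 0, 1])  # per-rank should_stop values; two of four ranks want to stop
stop = flags.sum()                  # every rank sees this same sum after all_reduce(SUM)

stop_if_any = stop.item() > 0            # "any rank stops all", what the comment describes
stop_if_all = stop.item() == world_size  # what the diff's condition implements
print(stop_if_any, stop_if_all)          # True False
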
24 changes: 24 additions & 0 deletions tests/models/test_gpu.py
@@ -58,6 +58,30 @@ def test_multi_gpu_model(tmpdir, backend):
     memory.get_memory_profile('min_max')


+@pytest.mark.spawn
+@pytest.mark.parametrize("backend", ['dp', 'ddp', 'ddp2'])
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_multi_gpu_early_stop(tmpdir, backend):
+    """Make sure DDP works with early stopping."""
+    tutils.set_random_master_port()
+
+    trainer_options = dict(
+        default_root_dir=tmpdir,
+        early_stop_callback=True,
+        max_epochs=50,
+        limit_train_batches=10,
+        limit_val_batches=10,
+        gpus=[0, 1],
+        distributed_backend=backend,
+    )
+
+    model = EvalModelTemplate()
+    # tutils.run_model_test(trainer_options, model)
+    trainer = Trainer(**trainer_options)
+    result = trainer.fit(model)
+    assert result


 @pytest.mark.spawn
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
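
A side note on early_stop_callback=True as used above: in the Lightning API of this era it enabled a default EarlyStopping callback. A hedged sketch of the roughly equivalent explicit form; the monitor and patience values here are illustrative assumptions, not taken from the PR:

# Sketch only; assumes the 0.8-era API where Trainer's early_stop_callback
# accepted either a bool or an EarlyStopping instance.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3)  # illustrative values
trainer = Trainer(
    max_epochs=50,
    limit_train_batches=10,
    limit_val_batches=10,
    early_stop_callback=early_stop,
)
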
21 changes: 21 additions & 0 deletions tests/models/test_tpu.py
@@ -19,6 +19,27 @@
     TPU_AVAILABLE = True


+@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
+@pytest.mark.parametrize(['tpu_cores', 'expected_device'], [
+    pytest.param([1], 'xla:1'),
+    pytest.param([8], 'xla:8'),
+])
+def test_early_stop_checkpoints_on_tpu(tmpdir, tpu_cores, expected_device):
+    """Test that early stopping works on single- and multi-core TPU."""
+    model = EvalModelTemplate()
+    trainer = Trainer(
+        early_stop_callback=True,
+        default_root_dir=tmpdir,
+        progress_bar_refresh_rate=0,
+        max_epochs=50,
+        limit_train_batches=10,
+        limit_val_batches=10,
+        tpu_cores=tpu_cores,
+    )
+    trainer.fit(model)
+    assert torch_xla._XLAC._xla_get_default_device() == expected_device


 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
 @pytest.mark.parametrize(['tpu_cores', 'expected_device'], [
     pytest.param([1], 'xla:1'),
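
A closing note on the assertion style in the TPU test above: torch_xla._XLAC._xla_get_default_device() reaches into a private module. A sketch of the same check through the public helper, assuming a TPU/XLA runtime is present:

import torch_xla.core.xla_model as xm

# xm.xla_device() returns the current default XLA device as a torch.device,
# e.g. 'xla:1' on a single-core run.
device = xm.xla_device()
assert str(device).startswith('xla:')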