Supporting Adding DDP Communication Hooks #6736

Merged · 56 commits · Apr 7, 2021
Changes from 1 commit
Commits (56):
89f284d
Fix some test errors
Mar 23, 2021
80cfbff
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 23, 2021
536c132
checkpoint consolidation
Mar 24, 2021
f172101
Update ddp_spawn.py
shuyingsunshine21 Mar 24, 2021
bf70e43
Update test_metric_result_integration.py
shuyingsunshine21 Mar 24, 2021
ea74906
Update test_results.py
shuyingsunshine21 Mar 24, 2021
a9aae99
Update utils.py
shuyingsunshine21 Mar 24, 2021
70fe5da
Update utils.py
shuyingsunshine21 Mar 24, 2021
0d23d75
Update test_all_gather_grad.py
shuyingsunshine21 Mar 24, 2021
ca6f98b
Update test_all_gather_grad.py
shuyingsunshine21 Mar 24, 2021
c5053da
Merge pull request #1 from shuyingsunshine21/shuyingsunshine21-checkp…
shuyingsunshine21 Mar 24, 2021
9d4a2b8
Update test_results.py
shuyingsunshine21 Mar 24, 2021
7635b4f
Revert "Update test_results.py"
shuyingsunshine21 Mar 24, 2021
d64f90c
Revert "Merge pull request #1 from shuyingsunshine21/shuyingsunshine2…
shuyingsunshine21 Mar 24, 2021
dcdcd29
Revert "Update test_all_gather_grad.py"
shuyingsunshine21 Mar 24, 2021
8651d54
Revert "Update utils.py"
shuyingsunshine21 Mar 24, 2021
15f4b9e
Revert "Update utils.py"
shuyingsunshine21 Mar 24, 2021
250d0aa
Revert "Update test_results.py"
shuyingsunshine21 Mar 24, 2021
6c095b2
Revert "Update test_metric_result_integration.py"
shuyingsunshine21 Mar 24, 2021
8222dc9
Revert "Update ddp_spawn.py"
shuyingsunshine21 Mar 24, 2021
3a9fde9
Revert "checkpoint consolidation"
shuyingsunshine21 Mar 24, 2021
7a369f4
Revert "Revert "checkpoint consolidation""
shuyingsunshine21 Mar 24, 2021
b4a0b9e
Revert "Revert "Revert "checkpoint consolidation"""
shuyingsunshine21 Mar 24, 2021
5cf1db1
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 24, 2021
0ce7e05
Revert "Revert "Update ddp_spawn.py""
shuyingsunshine21 Mar 24, 2021
fe9736d
Revert "Revert "Update test_metric_result_integration.py""
shuyingsunshine21 Mar 24, 2021
c314ef6
Revert "Revert "Update test_results.py""
shuyingsunshine21 Mar 24, 2021
c3feda0
Revert "Revert "Update utils.py""
shuyingsunshine21 Mar 24, 2021
c759477
Revert "Revert "Update test_all_gather_grad.py""
shuyingsunshine21 Mar 24, 2021
7a8e540
Merge branch 'master' of https://github.com/shuyingsunshine21/pytorch…
Mar 24, 2021
ab8b849
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 24, 2021
4e67db2
modify distributed environment to make test pass
Mar 24, 2021
67b6188
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Mar 25, 2021
1e41d5b
add DDP communication hook
Mar 30, 2021
6833b87
remove test related setting
Mar 30, 2021
f856d31
remove more test related setting
Mar 30, 2021
14a0a1b
fix ddp comm hook util import issue
Mar 30, 2021
8998469
comments
Mar 30, 2021
a17947b
one more fix for test_custom_plugin
Mar 30, 2021
91a945a
fix ddp spwan
Mar 30, 2021
78c6925
fix sgd
Mar 30, 2021
443f223
address comments and add tests
Mar 30, 2021
f8d0603
1. add is gpu checking 2. modify test a bit 3. formatting
Mar 31, 2021
f06285f
formatting nit
Mar 31, 2021
b607ebd
fix conda 3.7 1.7 issue for no torch.distributed.algorithms module
Mar 31, 2021
6cc9dfa
need at least 1.8.0
Apr 1, 2021
b12a16b
minor fix
Apr 1, 2021
25ccb82
modify changelog
Apr 1, 2021
35d49bc
changelog should link to PR number instead of issue number
Apr 1, 2021
dc5c55c
refine a bit on doc for register_ddp_comm_hook function, like ddp_com…
Apr 1, 2021
fb184b2
move single device checking before call register_ddp_comm_hook
Apr 1, 2021
bf44378
formatting
Apr 2, 2021
d529985
comments
Apr 5, 2021
b8105be
typo
Apr 5, 2021
e32a11d
pre-commit formatting
Apr 6, 2021
2275b45
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
Apr 6, 2021
2 changes: 1 addition & 1 deletion — pytorch_lightning/plugins/training_type/ddp.py

@@ -236,7 +236,7 @@ def pre_configure_ddp(self):
         self._ddp_kwargs["find_unused_parameters"] = True

     def _register_ddp_hooks(self) -> None:
-        # currently, DDP communication hooks only work with NCCL backend and singlge process single device mode
+        # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
         # https://github.com/pytorch/pytorch/blob/e6779d4357ae94cc9f9fedb83a87eb6126016769/
         # torch/nn/parallel/distributed.py#L1040
         if (

(carmocca marked this conversation as resolved.)
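For reference, the guard shown above sits directly in front of the hook registration. A minimal standalone sketch of the same pattern against plain PyTorch — the helper name register_fp16_hook_if_supported is illustrative only; register_comm_hook and the default_hooks module are the actual torch.distributed APIs this PR builds on (PyTorch >= 1.8):

import torch
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default
from torch.nn.parallel import DistributedDataParallel


def register_fp16_hook_if_supported(model: DistributedDataParallel) -> None:
    # Hypothetical helper. Comm hooks require the NCCL backend and SPSD mode,
    # i.e. exactly one GPU per process (len(device_ids) == 1).
    if (
        torch.distributed.get_backend(model.process_group) == "nccl"
        and model.device_ids is not None
        and len(model.device_ids) == 1
    ):
        # fp16_compress_hook casts each gradient bucket to FP16 before the
        # all-reduce and restores the original dtype afterwards.
        model.register_comm_hook(state=None, hook=default.fp16_compress_hook)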
2 changes: 1 addition & 1 deletion — pytorch_lightning/plugins/training_type/ddp_spawn.py

@@ -193,7 +193,7 @@ def pre_configure_ddp(self):
         self._ddp_kwargs["find_unused_parameters"] = True

     def _register_ddp_hooks(self) -> None:
-        # currently, DDP communication hooks only work with NCCL backend and singlge process single device mode
+        # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode
         # https://github.com/pytorch/pytorch/blob/e6779d4357ae94cc9f9fedb83a87eb6126016769/
         # torch/nn/parallel/distributed.py#L1040
         if (

(carmocca marked this conversation as resolved.)
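The ddp_spawn.py change mirrors the ddp.py one, so both launch paths share the same registration guard. From the user side, the feature this PR adds is enabled through the plugin constructor; a hedged usage sketch, assuming the ddp_comm_hook argument introduced in this PR and PyTorch's built-in FP16 compression hook:

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPPlugin
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default

# Roughly halve all-reduce traffic by compressing gradients to FP16 on the wire.
trainer = Trainer(
    gpus=2,
    plugins=[DDPPlugin(ddp_comm_hook=default.fp16_compress_hook)],
)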
35 changes: 31 additions & 4 deletions — tests/plugins/test_ddp_plugin_with_comm_hook.py

@@ -13,7 +13,7 @@
 # limitations under the License.
 import torch
 from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DDPPlugin
+from pytorch_lightning.plugins import DDPPlugin, DDPSpawnPlugin
 from pytorch_lightning.trainer.states import TrainerState
 from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_8
 from tests.helpers import BoringModel
@@ -26,7 +26,7 @@
 )


-@RunIf(skip_windows=True, min_torch="1.8.0", min_gpus=2)
+@RunIf(skip_windows=True, min_torch="1.8.0", min_gpus=2, special=True)
 def test_ddp_fp16_compress_comm_hook(tmpdir):
     """Test for DDP FP16 compress hook."""
     model = BoringModel()
@@ -53,7 +53,7 @@ def test_ddp_fp16_compress_comm_hook(tmpdir):
     ), f"Training failed with {trainer.state}"


-@RunIf(skip_windows=True, min_torch="1.8.0", min_gpus=2)
+@RunIf(skip_windows=True, min_torch="1.8.0", min_gpus=2, special=True)
 def test_ddp_sgd_comm_hook(tmpdir):
     """Test for DDP SGD comm hook."""
     model = BoringModel()
@@ -81,7 +81,7 @@ def test_ddp_sgd_comm_hook(tmpdir):
     ), f"Training failed with {trainer.state}"


-@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2)
+@RunIf(skip_windows=True, min_torch="1.9.0", min_gpus=2, special=True)
 def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir):
     """Test for DDP FP16 compress wrapper for SGD hook."""
     model = BoringModel()
@@ -110,3 +110,30 @@ def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir):
     assert (
         trainer.state == TrainerState.FINISHED
     ), f"Training failed with {trainer.state}"


@RunIf(skip_windows=True, min_torch="1.8.0", min_gpus=2, special=True)
def test_ddp_spawn_fp16_compress_comm_hook(tmpdir):
"""Test for DDP Spawn FP16 compress hook."""
model = BoringModel()
training_type_plugin = DDPSpawnPlugin(
ddp_comm_hook=default.fp16_compress_hook,
sync_batchnorm=True,
)
trainer = Trainer(
max_epochs=1,
gpus=2,
plugins=[training_type_plugin],
default_root_dir=tmpdir,
sync_batchnorm=True,
fast_dev_run=True,
)
trainer.fit(model)
trainer_comm_hook = (
trainer.accelerator.training_type_plugin._model.get_ddp_logging_data().comm_hook
)
expected_comm_hook = default.fp16_compress_hook.__qualname__
assert trainer_comm_hook == expected_comm_hook
assert (
trainer.state == TrainerState.FINISHED
), f"Training failed with {trainer.state}"