FSDP integration #6152

Closed
wants to merge 77 commits
Changes from 1 commit
77 commits
78f1eb4
Add initial FSDP integration
Feb 23, 2021
c36e00a
Fix error in refactor
Feb 23, 2021
59dbb83
update
tchaton Feb 24, 2021
19a1440
Revert "update"
Feb 24, 2021
3b38615
Address reviews
Feb 24, 2021
5ff06ab
Fix doc string
Feb 24, 2021
36434f0
Even moar code review
Feb 24, 2021
c61a190
Add deprecation
Feb 24, 2021
1c4f011
Merge branch 'master' into feat/fsdp
Feb 25, 2021
02599e6
Fix name of test
Feb 25, 2021
e79977a
Integrate nesting, fix bugs across implementation
Mar 1, 2021
d15d4b5
Merge branch 'master' into feat/fsdp
Mar 2, 2021
ebf1818
Formatting types
Mar 2, 2021
290e8fd
Add additional tests for accelerator model
Mar 2, 2021
5c5f762
Fix import
Mar 2, 2021
d28438b
Few test fixes, expose params
Mar 3, 2021
ab591a8
Allow training_type_plugin to delay optimizer configure
Mar 3, 2021
23ccdb8
Merge branch 'feat/fsdp_2n' into feat/fsdp
Mar 3, 2021
a60f2c0
Add missing references to trainer, add a CPU accelerator based test
Mar 3, 2021
3d4e6df
Merge branch 'feat/fsdp_2n' into feat/fsdp
Mar 4, 2021
516bd04
Update for latest API changes to fairscale
Mar 9, 2021
9f8864f
Add base hook for model parallel
Mar 23, 2021
eac5344
fix callback signature
kaushikb11 Mar 25, 2021
32df0cb
Simplify hook
Mar 25, 2021
282a133
Add hook logic
Mar 25, 2021
7a94e72
add tests
kaushikb11 Mar 25, 2021
8091481
add property setter
kaushikb11 Mar 25, 2021
633fc77
add logic for being called once
kaushikb11 Mar 25, 2021
c99a36f
Update changelog
kaushikb11 Mar 25, 2021
a68c8d7
Merge branch 'master' into feat/model_parallel_hook
kaushikb11 Mar 25, 2021
9529a22
Fix
kaushikb11 Mar 25, 2021
3c1c782
fix return type
kaushikb11 Mar 25, 2021
7daba43
Merge branch 'master' into feat/fsdp
Mar 25, 2021
87ec222
Fix property name
Mar 25, 2021
966b2e5
Merge branch 'feat/model_parallel_hook' into feat/fsdp
Mar 25, 2021
5f6e039
Update wrapper, use latest fixes for hooks
Mar 25, 2021
b512e72
Swap hook order
Mar 25, 2021
8ba82df
Merge branch 'master' into feat/fsdp
Mar 29, 2021
1e5ca37
Small changes
Mar 29, 2021
936dc1a
Fixes
Mar 29, 2021
a6de18e
Remove activation checkpointing
Apr 1, 2021
8684f94
Turn off auto wrap by default
Apr 1, 2021
76091ae
Move to trainer.model
Apr 7, 2021
226d498
fix reference
Apr 7, 2021
cd63c10
Merge branch 'master' into feat/fsdp
Apr 7, 2021
b881e2f
Remove flag
Apr 7, 2021
e8959be
Fix imports
Apr 7, 2021
52478ac
Fix versions, update docs
Apr 7, 2021
b7f1896
Fix clip gradients
Apr 8, 2021
a62f8d8
Merge branch 'master' into feat/fsdp
Apr 10, 2021
69c33f1
Merge branch 'master' into feat/fsdp
Apr 14, 2021
9fa26c0
Fixes
Apr 14, 2021
56f23ce
pull
Apr 14, 2021
9ca3f0c
Few changes across the board
Apr 14, 2021
b53ba36
Fix imports
Apr 14, 2021
0da5249
Set none
Apr 14, 2021
90c6479
Swap to warnings
Apr 14, 2021
69d8178
Remove fairscale from container
Apr 14, 2021
a459d10
pull
Apr 14, 2021
a7842d9
Update dockers/base-cuda/Dockerfile
Apr 14, 2021
48ee83f
Add defaults, add test to ensure nested wrapper is set correctly
Apr 15, 2021
57a696c
Remove deprecation as this will be removed completely
Apr 15, 2021
36889b8
Check for nested FSDP wrappers, and omit wrapping algorithm
Apr 16, 2021
89b8cb5
Merge branch 'master' into feat/fsdp
Apr 16, 2021
0c1d2de
Update pytorch_lightning/trainer/connectors/accelerator_connector.py
Apr 21, 2021
592bb28
Address code review points
Apr 21, 2021
4e230c9
Merge branch 'master' into feat/fsdp
Apr 26, 2021
ca8e586
Add back missing model that was removed from clipping signature
Apr 26, 2021
54f501d
Do not pass model through, accelerator does it
Apr 26, 2021
02925cc
Merge branch 'master' into feat/fsdp
Apr 27, 2021
b67f1a9
Fix merge
Apr 27, 2021
132eb64
Fix imports
Apr 27, 2021
e6ce3cf
Changes to precision plugin
Apr 27, 2021
01153af
Require 2 GPU for multi gpu test
Apr 27, 2021
6cfe57d
Merge branch 'master' into feat/fsdp
May 2, 2021
efa81ab
Use callback in test, swap to DynamicLossScaler from fairscale to tes…
May 4, 2021
78d52b5
Disable loss scaler for now
May 4, 2021
Few test fixes, expose params
SeanNaren committed Mar 3, 2021
commit d28438b4895f91c38ef1084ebbf738f519d43cb5
40 changes: 26 additions & 14 deletions pytorch_lightning/plugins/training_type/fully_sharded.py
@@ -22,7 +22,7 @@
from pytorch_lightning.utilities.exceptions import MisconfigurationException

if _FAIRSCALE_FULLY_SHARDED_AVAILABLE:
from fairscale.nn import enable_wrap
from fairscale.nn import auto_wrap, enable_wrap, wrap
from fairscale.nn.data_parallel import FullyShardedDataParallel

from pytorch_lightning.overrides.fairscale import (
@@ -42,6 +42,9 @@ def __init__(
fp32_reduce_scatter: Optional[bool] = None,
compute_dtype: Optional[torch.dtype] = None,
bucket_cap_mb: int = 25,
automatic_module_wrap: bool = False,
min_num_params: int = 1e8,
activation_checkpoint: bool = False,
parallel_devices: Optional[List[torch.device]] = None,
num_nodes: int = 1,
cluster_environment: ClusterEnvironment = None,
@@ -112,6 +115,9 @@ def __init__(
self.fp32_reduce_scatter = fp32_reduce_scatter
self.compute_dtype = compute_dtype
self.bucket_cap_mb = bucket_cap_mb
self.automatic_module_wrap = automatic_module_wrap
self.min_num_params = min_num_params
self.activation_checkpoint = activation_checkpoint
self._process_group = None

@property
@@ -128,18 +134,6 @@ def configure_ddp(self):
torch.cuda.set_device(self.root_device)

with enable_wrap(
cpu_offload=self.cpu_offload,
flatten_parameters=self.flatten_parameters,
move_grads_to_cpu=self.move_grads_to_cpu,
mixed_precision=precision == "mixed",
process_group=self.process_group
):
# todo: this should somehow be incorporated as a general hook.
# currently this also means you have to use fully sharded to load the model as well.
self.lightning_module.trainer.call_hook("on_distributed_model_setup")

self.model = FullyShardedDataParallel(
LightningFullyShardedDataModule(self.model),
process_group=self.process_group,
cpu_offload=self.cpu_offload,
move_grads_to_cpu=self.move_grads_to_cpu,
@@ -149,8 +143,26 @@
fp32_reduce_scatter=self.fp32_reduce_scatter,
compute_dtype=self.compute_dtype,
bucket_cap_mb=self.bucket_cap_mb,
)
):
# Allow user to manually wrap the lightning modules, and any internal modules
# todo: this should somehow be incorporated as a general hook.
# currently this also means you have to use fully sharded to load the model as well.
self.lightning_module.trainer.call_hook("on_distributed_model_setup")
if self.automatic_module_wrap:
self.model = auto_wrap(
LightningFullyShardedDataModule(self.model),
min_num_params=self.min_num_params,
activation_checkpoint=self.activation_checkpoint
)
if not isinstance(self.model, FullyShardedDataParallel):
self.model = wrap(self.model, activation_checkpoint=self.activation_checkpoint)
else:
self.model = wrap(
LightningFullyShardedDataModule(self.model), activation_checkpoint=self.activation_checkpoint
)

if not self.cpu_offload:
# When using CPU Offload, FSDP will manage the CUDA movement for us
super().model_to_device()
# setup optimizers after fully sharded has wrapped the lightning module
self.lightning_module.trainer.accelerator.setup_optimizers(self.lightning_module.trainer)
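
For reference, a minimal usage sketch of the options this commit exposes on the plugin. The argument names mirror the __init__ changes above; the concrete values are illustrative only, not recommendations from the PR.

# Sketch (not part of the diff): constructing the plugin with the newly exposed
# automatic_module_wrap / min_num_params / activation_checkpoint arguments.
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import FullyShardedPlugin

plugin = FullyShardedPlugin(
    automatic_module_wrap=True,   # run fairscale's auto_wrap over the module in configure_ddp
    min_num_params=1e8,           # only auto-wrap submodules with at least this many parameters
    activation_checkpoint=False,  # forwarded to fairscale's wrap/auto_wrap
)

trainer = Trainer(gpus=1, precision=16, plugins=plugin)

When automatic_module_wrap is left at its default (False), the configure_ddp branch above instead wraps the whole LightningModule in a single FullyShardedDataParallel instance via wrap().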
69 changes: 30 additions & 39 deletions tests/plugins/test_fully_sharded_plugin.py
@@ -5,7 +5,6 @@
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin, FullyShardedPlugin
from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -16,29 +15,16 @@
from fairscale.nn import auto_wrap, FullyShardedDataParallel


@pytest.mark.parametrize(["plugin"], [("ddp_fully_sharded", )])
@pytest.mark.skipif(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE, reason="Fairscale is not available")
def test_sharded_ddp_choice(tmpdir, plugin):
def test_sharded_ddp_choice(tmpdir):
"""
Test to ensure that plugin is correctly chosen
"""

class CB(Callback):

def on_fit_start(self, trainer, pl_module):
if plugin == 'ddp_fully_sharded':
assert isinstance(trainer.accelerator.training_type_plugin, FullyShardedPlugin)
raise SystemExit()

model = BoringModel()
trainer = Trainer(
fast_dev_run=True,
plugins=plugin,
callbacks=[CB()],
plugins='ddp_fully_sharded',
)

with pytest.raises(SystemExit):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, FullyShardedPlugin)


@pytest.mark.skipif(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE, reason="Fairscale is not available")
@@ -60,36 +46,24 @@ def test_invalid_apex_sharded(tmpdir):
trainer.fit(model)


@pytest.mark.parametrize(["plugin"], [("ddp_fully_sharded", )])
@pytest.mark.skipif(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE, reason="Fairscale is not available")
@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
@mock.patch('torch.cuda.device_count', return_value=1)
@mock.patch('torch.cuda.is_available', return_value=True)
@RunIf(amp_native=True)
def test_ddp_choice_sharded_amp(device_count_mock, mock_cuda_available, plugin, tmpdir):
def test_ddp_choice_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
"""
Test to ensure that plugin native amp plugin is correctly chosen when using sharded
"""

class CB(Callback):

def on_fit_start(self, trainer, pl_module):
if plugin == 'ddp_fully_sharded':
assert isinstance(trainer.accelerator.training_type_plugin, FullyShardedPlugin)
assert isinstance(trainer.accelerator.precision_plugin, FullyShardedNativeMixedPrecisionPlugin)
raise SystemExit()

model = BoringModel()
trainer = Trainer(
fast_dev_run=True,
gpus=1,
precision=16,
plugins=plugin,
callbacks=[CB()],
plugins='ddp_fully_sharded',
)

with pytest.raises(SystemExit):
trainer.fit(model)
assert isinstance(trainer.accelerator.training_type_plugin, FullyShardedPlugin)
assert isinstance(trainer.accelerator.precision_plugin, FullyShardedNativeMixedPrecisionPlugin)


@pytest.mark.skipif(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE, reason="Fairscale is not available")
Expand All @@ -98,7 +72,13 @@ def test_fully_sharded_plugin_checkpoint(tmpdir):
"""
Test to ensure that checkpoint is saved correctly when using a single GPU.
"""
model = BoringModel()

class TestModel(BoringModel):

def configure_optimizers(self):
return torch.optim.SGD(self.accelerator_model.parameters(), lr=0.1)

model = TestModel()
trainer = Trainer(
gpus=1,
plugins='ddp_fully_sharded',
@@ -111,27 +91,32 @@ def test_fully_sharded_plugin_checkpoint(tmpdir):
_assert_save_equality(tmpdir, trainer)


@pytest.mark.parametrize('automatic_module_wrap', [True, False])
@pytest.mark.skipif(not _FAIRSCALE_FULLY_SHARDED_AVAILABLE, reason="Fairscale is not available")
@RunIf(min_gpus=1, skip_windows=True)
def test_fully_sharded_plugin_checkpoint_autowrap(tmpdir):
def test_fully_sharded_plugin_checkpoint_manual_autowrap(automatic_module_wrap, tmpdir):
"""
Test to ensure that checkpoint is saved correctly when using auto_wrap.
Test to ensure that checkpoint is saved correctly when using automatic, and manual auto_wrap.
"""

class TestModel(BoringModel):

def on_distributed_model_setup(self) -> None:
self.layer = auto_wrap(self.layer, min_num_params=1)
if not automatic_module_wrap:
self.layer = auto_wrap(self.layer, min_num_params=1)

def on_train_start(self) -> None:
assert isinstance(self.layer, FullyShardedDataParallel)
assert isinstance(self.accelerator_model, FullyShardedDataParallel)

def configure_optimizers(self):
return torch.optim.SGD(self.accelerator_model.parameters(), lr=0.1)

model = TestModel()

trainer = Trainer(
gpus=1,
plugins='ddp_fully_sharded',
plugins=FullyShardedPlugin(automatic_module_wrap=automatic_module_wrap, min_num_params=1),
fast_dev_run=True,
precision=16,
)
@@ -150,7 +135,13 @@ def test_fully_sharded_plugin_checkpoint_multi_gpu(tmpdir):
"""
Test to ensure that checkpoint is saved correctly when using multiple GPUs
"""
model = BoringModel()

class TestModel(BoringModel):

def configure_optimizers(self):
return torch.optim.SGD(self.accelerator_model.parameters(), lr=0.1)

model = TestModel()
trainer = Trainer(
gpus=2,
plugins='fully_sharded',
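
To make the manual-wrap path in the tests above easier to follow, here is a sketch of a module that wraps its own submodule in the on_distributed_model_setup hook and builds its optimizer against the sharded parameters. The BoringModel import path is an assumption for illustration; everything else mirrors the test code in this diff.

# Sketch mirroring test_fully_sharded_plugin_checkpoint_manual_autowrap above.
import torch
from fairscale.nn import auto_wrap, FullyShardedDataParallel

from pytorch_lightning import Trainer
from tests.helpers.boring_model import BoringModel  # import path assumed


class ManualWrapModel(BoringModel):

    def on_distributed_model_setup(self) -> None:
        # wrap the inner layer by hand instead of using automatic_module_wrap
        self.layer = auto_wrap(self.layer, min_num_params=1)

    def on_train_start(self) -> None:
        # after configure_ddp both the layer and the full model are sharded
        assert isinstance(self.layer, FullyShardedDataParallel)
        assert isinstance(self.accelerator_model, FullyShardedDataParallel)

    def configure_optimizers(self):
        # optimizers must be created from the wrapped (flattened) parameters
        return torch.optim.SGD(self.accelerator_model.parameters(), lr=0.1)


trainer = Trainer(gpus=1, plugins='ddp_fully_sharded', precision=16, fast_dev_run=True)
trainer.fit(ManualWrapModel())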