FSDP integration #6152
@SeanNaren, after getting detailed memory usage, I finally figured out why the full model originally fits in one GPU but OOMs when checkpointing: in `checkpoint_connector` (https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/trainer/connectors/checkpoint_connector.py#L270-L277) we collect the full state dict again, which doubles the memory footprint.
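To make the doubling concrete, here is a rough sketch (illustrative only, not the actual connector code, and assuming FairScale's FSDP wrapper): under FSDP, `state_dict()` all-gathers every shard into full tensors, so collecting it allocates a second full copy of the weights on top of the model itself.

```python
# Illustrative sketch only: the checkpoint path effectively does something like this.
import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

def dump_checkpoint_naively(model: FSDP) -> dict:
    # state_dict() all-gathers the sharded parameters into full tensors on this
    # rank, so a second full copy of the weights briefly lives in GPU memory
    # alongside the model -- enough to push a "just fits" model into OOM.
    return {"state_dict": model.state_dict()}
```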
One easy workaround for now is to add an extra full-parameter summon there, but this is not ideal: we would summon the full parameters twice, which is unnecessary.
I feel we should modify that file to let the training type plugin control the collection, something like `trainer.accelerator.training_type_plugin.state_dict()`, especially since we would like to collect only the sharded state dict in the future.
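A minimal sketch of the proposed delegation (the class and attribute names below are assumptions for illustration, not the final Lightning API): the checkpoint connector asks the training type plugin for the state dict, so an FSDP plugin can return the sharded (local) state dict instead of gathering the full parameters.

```python
# Sketch only: hook names and signatures here are hypothetical.
from typing import Dict
import torch

class TrainingTypePlugin:
    def __init__(self, model: torch.nn.Module):
        self.model = model

    def state_dict(self) -> Dict[str, torch.Tensor]:
        # Default behaviour: full state dict, same as the connector does today.
        return self.model.state_dict()

class FullyShardedPlugin(TrainingTypePlugin):
    # Assumes self.model is the FairScale FSDP-wrapped module.
    def state_dict(self) -> Dict[str, torch.Tensor]:
        # FairScale FSDP exposes local_state_dict(); returning only this rank's
        # shard avoids the all-gather and the memory doubling described above.
        return self.model.local_state_dict()

# The checkpoint connector would then do, roughly:
#   checkpoint["state_dict"] = trainer.accelerator.training_type_plugin.state_dict()
```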
cc @ananthsub
@min-xu-ai, I think this is the root cause of the OOM; facebookresearch/fairscale#658 should not be the problem (and for setting `state_dict_device=torch.device("cpu")`, the CPU OOM should be a similar issue, as we also double the model storage in CPU memory).
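For completeness, the FairScale option quoted above is a constructor argument on `FullyShardedDataParallel`; a minimal sketch of wrapping with it (this assumes a distributed process group and device setup already exist, as they do inside the plugin):

```python
import torch
from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP

def wrap_with_cpu_state_dict(module: torch.nn.Module) -> FSDP:
    # Gathered state-dict tensors are placed on CPU, so the duplicate full copy
    # described above lands in host RAM (and can CPU-OOM) instead of GPU memory.
    return FSDP(module, state_dict_device=torch.device("cpu"))
```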
Thanks @shuyingsunshine21 for your help here! This makes sense, since we're allocating new memory.
I agree with allowing the training type plugin to return the state dict; we already rely on the accelerator to dump the optimizer dicts. I'm happy to make the change!
@SeanNaren, thanks, no worries. If you have not already made the change, I could help send a small PR for that.