Lightning-AI · awaelchli · Oct 18, 2023 · Oct 18, 2023 · Oct 18, 2023 · Oct 18, 2023
@@ -32,9 +32,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed mismatching reduce-type in FSDP when using mixed precision ([#18818](https://github.com/Lightning-AI/lightning/pull/18818))
 - Fixed false-positive warnings about method calls on the Fabric-wrapped module ([#18819](https://github.com/Lightning-AI/lightning/pull/18819))
 
 
+
 ## [2.1.0] - 2023-10-11
 
 ### Added

@@ -81,12 +81,8 @@ def mixed_precision_config(self) -> "TorchMixedPrecision":
         # With PyTorch < 2.0, FSDP uses the noneness of `param_dtype` as a proxy for the `_uses_param_mixed_precision`
         # property. In order to avoid FSDP assertion failures, we therefore avoid setting `param_dtype` to
         # `torch.float32` here with PyTorch < 2.0.
-        if self.precision == "16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.float16
-        elif self.precision == "bf16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.bfloat16
+        if self.precision in ("16-mixed", "bf16-mixed"):
+            param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
         elif self.precision == "16-true":
             param_dtype = reduce_dtype = buffer_dtype = torch.float16
         elif self.precision == "bf16-true":

@@ -32,6 +32,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed an issue when replacing an existing `last.ckpt` file with a symlink ([#18793](https://github.com/Lightning-AI/lightning/pull/18793))
 
+- Fixed mismatching reduce-type in FSDP when using mixed precision ([#18818](https://github.com/Lightning-AI/lightning/pull/18818))
 
 - Fixed an issue where the `BatchSizeFinder` `steps_per_trial` parameter ended up defining how many validation batches to run during the entire training ([#18394](https://github.com/Lightning-AI/lightning/issues/18394))
 

@@ -90,12 +90,8 @@ def mixed_precision_config(self) -> "TorchMixedPrecision":
         # With PyTorch < 2.0, FSDP uses the noneness of `param_dtype` as a proxy for the `_uses_param_mixed_precision`
         # property. In order to avoid FSDP assertion failures, we therefore avoid setting `param_dtype` to
         # `torch.float32` here with PyTorch < 2.0.
-        if self.precision == "16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.float16
-        elif self.precision == "bf16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.bfloat16
+        if self.precision in ("16-mixed", "bf16-mixed"):
+            param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
         elif self.precision == "16-true":
             param_dtype = reduce_dtype = buffer_dtype = torch.float16
         elif self.precision == "bf16-true":

@@ -27,20 +27,16 @@
         ("16-true", (torch.float16, torch.float16, torch.float16)),
         ("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)),
         pytest.param(
-            "16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
-        ),
-        pytest.param(
-            "16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"
+            "16-mixed", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
         ),
+        pytest.param("16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"),
         pytest.param(
             "bf16-mixed",
-            (torch.float32, torch.bfloat16, torch.bfloat16),
+            (torch.float32, torch.float32, torch.float32),
             marks=RunIf(min_torch="2.0"),
             id="bf16-mixed-ge2_0",
         ),
-        pytest.param(
-            "bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"
-        ),
+        pytest.param("bf16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"),
         pytest.param(
             "32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0"
         ),

@@ -46,11 +46,9 @@ def step(self, model, batch):
         precision = self._precision
         assert isinstance(precision, FSDPPrecision)
         if precision.precision == "16-mixed":
-            param_dtype = torch.float32
-            reduce_dtype = buffer_dtype = torch.float16
+            param_dtype = reduce_dtype = buffer_dtype = torch.float32
         elif precision.precision == "bf16-mixed":
-            param_dtype = torch.float32
-            reduce_dtype = buffer_dtype = torch.bfloat16
+            param_dtype = reduce_dtype = buffer_dtype = torch.float32
         elif precision.precision == "16-true":
             param_dtype = reduce_dtype = buffer_dtype = torch.float16
         elif precision.precision == "bf16-true":

@@ -27,20 +27,16 @@
         ("16-true", (torch.float16, torch.float16, torch.float16)),
         ("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)),
         pytest.param(
-            "16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
-        ),
-        pytest.param(
-            "16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"
+            "16-mixed", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
         ),
+        pytest.param("16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"),
         pytest.param(
             "bf16-mixed",
-            (torch.float32, torch.bfloat16, torch.bfloat16),
+            (torch.float32, torch.float32, torch.float32),
             marks=RunIf(min_torch="2.0"),
             id="bf16-mixed-ge2_0",
         ),
-        pytest.param(
-            "bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"
-        ),
+        pytest.param("bf16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"),
         pytest.param(
             "32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0"
         ),

@@ -83,11 +83,9 @@ def _assert_layer_fsdp_instance(self) -> None:
         assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecisionPlugin)
 
         if self.trainer.precision == "16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.float16
+            param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
         elif self.trainer.precision == "bf16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.bfloat16
+            param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
         elif self.trainer.precision == "16-true":
             param_dtype = reduce_dtype = buffer_dtype = torch.float16
         elif self.trainer.precision == "bf16-true":
@@ -146,11 +144,9 @@ def _assert_layer_fsdp_instance(self) -> None:
         assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecisionPlugin)
 
         if self.trainer.precision == "16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.float16
+            param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
         elif self.trainer.precision == "bf16-mixed":
-            param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
-            reduce_dtype = buffer_dtype = torch.bfloat16
+            param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
         elif self.trainer.precision == "16-true":
             param_dtype = reduce_dtype = buffer_dtype = torch.float16
         elif self.trainer.precision == "bf16-true":
Original file line number	Diff line number	Diff line change
Expand Up		@@ -32,6 +32,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

		- Fixed an issue when replacing an existing `last.ckpt` file with a symlink ([#18793](https://github.com/Lightning-AI/lightning/pull/18793))

		- Fixed mismatching reduce-type in FSDP when using mixed precision ([#18818](https://github.com/Lightning-AI/lightning/pull/18818))

		- Fixed an issue where the `BatchSizeFinder` `steps_per_trial` parameter ended up defining how many validation batches to run during the entire training ([#18394](https://github.com/Lightning-AI/lightning/issues/18394))

Expand Down