Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reduce type in FSDP mixed precision #18818

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Fixed a mismatched reduce dtype in FSDP when using mixed precision ([#18818](https://github.com/Lightning-AI/lightning/pull/18818))
- Fixed false-positive warnings about method calls on the Fabric-wrapped module ([#18819](https://github.com/Lightning-AI/lightning/pull/18819))



## [2.1.0] - 2023-10-11

### Added
Expand Down
8 changes: 2 additions & 6 deletions src/lightning/fabric/plugins/precision/fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,8 @@ def mixed_precision_config(self) -> "TorchMixedPrecision":
# With PyTorch < 2.0, FSDP uses whether `param_dtype` is `None` as a proxy for the `_uses_param_mixed_precision`
# property. To avoid FSDP assertion failures, we therefore do not set `param_dtype` to `torch.float32` here
# when running with PyTorch < 2.0.
if self.precision == "16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.float16
elif self.precision == "bf16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.bfloat16
if self.precision in ("16-mixed", "bf16-mixed"):
param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
elif self.precision == "16-true":
param_dtype = reduce_dtype = buffer_dtype = torch.float16
elif self.precision == "bf16-true":
Expand Down
1 change: 1 addition & 0 deletions src/lightning/pytorch/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Fixed an issue when replacing an existing `last.ckpt` file with a symlink ([#18793](https://github.com/Lightning-AI/lightning/pull/18793))

- Fixed a mismatched reduce dtype in FSDP when using mixed precision ([#18818](https://github.com/Lightning-AI/lightning/pull/18818))

- Fixed an issue when `BatchSizeFinder` `steps_per_trial` parameter ends up defining how many validation batches to run during the entire training ([#18394](https://github.com/Lightning-AI/lightning/issues/18394))

Expand Down
8 changes: 2 additions & 6 deletions src/lightning/pytorch/plugins/precision/fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,8 @@ def mixed_precision_config(self) -> "TorchMixedPrecision":
# With PyTorch < 2.0, FSDP uses whether `param_dtype` is `None` as a proxy for the `_uses_param_mixed_precision`
# property. To avoid FSDP assertion failures, we therefore do not set `param_dtype` to `torch.float32` here
# when running with PyTorch < 2.0.
if self.precision == "16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.float16
elif self.precision == "bf16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.bfloat16
if self.precision in ("16-mixed", "bf16-mixed"):
param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
elif self.precision == "16-true":
param_dtype = reduce_dtype = buffer_dtype = torch.float16
elif self.precision == "bf16-true":
Expand Down
12 changes: 4 additions & 8 deletions tests/tests_fabric/plugins/precision/test_fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,16 @@
("16-true", (torch.float16, torch.float16, torch.float16)),
("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)),
pytest.param(
"16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
),
pytest.param(
"16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"
"16-mixed", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
),
pytest.param("16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"),
pytest.param(
"bf16-mixed",
(torch.float32, torch.bfloat16, torch.bfloat16),
(torch.float32, torch.float32, torch.float32),
marks=RunIf(min_torch="2.0"),
id="bf16-mixed-ge2_0",
),
pytest.param(
"bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"
),
pytest.param("bf16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"),
pytest.param(
"32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0"
),
Expand Down
6 changes: 2 additions & 4 deletions tests/tests_fabric/strategies/test_fsdp_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,9 @@ def step(self, model, batch):
precision = self._precision
assert isinstance(precision, FSDPPrecision)
if precision.precision == "16-mixed":
param_dtype = torch.float32
reduce_dtype = buffer_dtype = torch.float16
param_dtype = reduce_dtype = buffer_dtype = torch.float32
elif precision.precision == "bf16-mixed":
param_dtype = torch.float32
reduce_dtype = buffer_dtype = torch.bfloat16
param_dtype = reduce_dtype = buffer_dtype = torch.float32
elif precision.precision == "16-true":
param_dtype = reduce_dtype = buffer_dtype = torch.float16
elif precision.precision == "bf16-true":
Expand Down
12 changes: 4 additions & 8 deletions tests/tests_pytorch/plugins/precision/test_fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,16 @@
("16-true", (torch.float16, torch.float16, torch.float16)),
("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)),
pytest.param(
"16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
),
pytest.param(
"16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"
"16-mixed", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0"
),
pytest.param("16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0"),
pytest.param(
"bf16-mixed",
(torch.float32, torch.bfloat16, torch.bfloat16),
(torch.float32, torch.float32, torch.float32),
marks=RunIf(min_torch="2.0"),
id="bf16-mixed-ge2_0",
),
pytest.param(
"bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"
),
pytest.param("bf16-mixed", (None, None, None), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0"),
pytest.param(
"32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0"
),
Expand Down
12 changes: 4 additions & 8 deletions tests/tests_pytorch/strategies/test_fsdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,9 @@ def _assert_layer_fsdp_instance(self) -> None:
assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecisionPlugin)

if self.trainer.precision == "16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.float16
param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
elif self.trainer.precision == "bf16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.bfloat16
param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
elif self.trainer.precision == "16-true":
param_dtype = reduce_dtype = buffer_dtype = torch.float16
elif self.trainer.precision == "bf16-true":
Expand Down Expand Up @@ -146,11 +144,9 @@ def _assert_layer_fsdp_instance(self) -> None:
assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecisionPlugin)

if self.trainer.precision == "16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.float16
param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
elif self.trainer.precision == "bf16-mixed":
param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
reduce_dtype = buffer_dtype = torch.bfloat16
param_dtype = reduce_dtype = buffer_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32
elif self.trainer.precision == "16-true":
param_dtype = reduce_dtype = buffer_dtype = torch.float16
elif self.trainer.precision == "bf16-true":
Expand Down