From 349c0c351f1021ca51689e40629fbbe51ae4ccea Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 02:53:21 -0700 Subject: [PATCH 01/13] validate --- src/lightning/fabric/strategies/deepspeed.py | 12 ++++++++++++ src/lightning/pytorch/strategies/deepspeed.py | 3 ++- .../trainer/connectors/accelerator_connector.py | 1 - 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 66b4c3ca6efa4..5cd569a7bff36 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -598,6 +598,7 @@ def _setup_distributed(self) -> None: f"The DeepSpeed strategy is only supported on CUDA GPUs but `{self.accelerator.__class__.__name__}`" " is used." ) + _validate_device_index_selection(self.parallel_devices) reset_seed() self._set_world_ranks() self._init_deepspeed_distributed() @@ -831,3 +832,14 @@ def _validate_state_keys(state: Dict[str, Any]) -> None: " values being overwritten by DeepSpeed. Consider changing the name of these keys to something else: " + ", ".join(colliding_keys) ) + + +def _validate_device_index_selection(parallel_devices: List[torch.device]) -> None: + selected_device_indices = [device.index for device in parallel_devices] + expected_device_indices = list(range(len(parallel_devices))) + if selected_device_indices != expected_device_indices: + raise ValueError( + f"The selected device indices {selected_device_indices!r} don't match the local rank values of the." + " If you need to select GPUs at a specific index, set the `CUDA_VISIBLE_DEVICES` environment variable" + f" instead. For example: `CUDA_VISIBLE_DEVICES={','.join(selected_device_indices)}`." + ) diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index 4a8a30a18fe27..488e85dc5bbf8 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -30,7 +30,7 @@ import lightning.pytorch as pl from lightning.fabric.plugins import ClusterEnvironment from lightning.fabric.strategies import _StrategyRegistry -from lightning.fabric.strategies.deepspeed import _DEEPSPEED_AVAILABLE +from lightning.fabric.strategies.deepspeed import _DEEPSPEED_AVAILABLE, _validate_device_index_selection from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed from lightning.fabric.utilities.types import _PATH, LRScheduler, ReduceLROnPlateau @@ -325,6 +325,7 @@ def _load_config(self, config: Optional[Union[_PATH, Dict[str, Any]]]) -> Option return config def setup_distributed(self) -> None: + _validate_device_index_selection(self.parallel_devices) reset_seed() self.set_world_ranks() self._init_deepspeed_distributed() diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index 42f46cd75047d..7e21f6216c1e3 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -444,7 +444,6 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._num_nodes_flag > 1: return "ddp" if len(self._parallel_devices) <= 1: - # TODO: Change this once gpu accelerator was renamed to cuda accelerator if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") ): From 45ef33743b3759d465cd6e4c31b62f2e72b8dd26 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 02:54:48 -0700 Subject: [PATCH 02/13] x --- src/lightning/fabric/strategies/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 5cd569a7bff36..b8d628cb9ff53 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -841,5 +841,5 @@ def _validate_device_index_selection(parallel_devices: List[torch.device]) -> No raise ValueError( f"The selected device indices {selected_device_indices!r} don't match the local rank values of the." " If you need to select GPUs at a specific index, set the `CUDA_VISIBLE_DEVICES` environment variable" - f" instead. For example: `CUDA_VISIBLE_DEVICES={','.join(selected_device_indices)}`." + f" instead. For example: `CUDA_VISIBLE_DEVICES={','.join(str(i) for i in selected_device_indices)}`." ) From 463719e892e9fcf1d6e7835e9e20e77a554467ec Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 02:57:07 -0700 Subject: [PATCH 03/13] x --- src/lightning/fabric/strategies/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index b8d628cb9ff53..9ba6d6fa50bf9 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -839,7 +839,7 @@ def _validate_device_index_selection(parallel_devices: List[torch.device]) -> No expected_device_indices = list(range(len(parallel_devices))) if selected_device_indices != expected_device_indices: raise ValueError( - f"The selected device indices {selected_device_indices!r} don't match the local rank values of the." + f"The selected device indices {selected_device_indices!r} don't match the local rank values of processes." " If you need to select GPUs at a specific index, set the `CUDA_VISIBLE_DEVICES` environment variable" f" instead. For example: `CUDA_VISIBLE_DEVICES={','.join(str(i) for i in selected_device_indices)}`." ) From 93d7c717c526032a41db0e7f744bc07459e260dc Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 03:07:33 -0700 Subject: [PATCH 04/13] add test --- .../tests_fabric/strategies/test_deepspeed.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/tests_fabric/strategies/test_deepspeed.py b/tests/tests_fabric/strategies/test_deepspeed.py index 0e4d29dcca368..7a64ff080a7ea 100644 --- a/tests/tests_fabric/strategies/test_deepspeed.py +++ b/tests/tests_fabric/strategies/test_deepspeed.py @@ -22,7 +22,7 @@ from torch.optim import Optimizer from lightning.fabric import Fabric -from lightning.fabric.accelerators import CPUAccelerator +from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator from lightning.fabric.strategies import DeepSpeedStrategy from tests_fabric.helpers.runif import RunIf @@ -349,3 +349,19 @@ def test_deepspeed_save_filter(tmp_path): fabric = Fabric(devices=1, strategy="deepspeed") with pytest.raises(TypeError, match="manages the state serialization internally"): fabric.save(tmp_path, {}, filter={}) + + +@RunIf(deepspeed=True) +@pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]]) +def test_validate_parallel_devices_indices(device_indices): + """Test that the strategy validates that it doesn't support selecting specific devices by index. DeepSpeed + doesn't support it and needs the index to match to the local rank of the process.""" + strategy = DeepSpeedStrategy( + accelerator=CUDAAccelerator(), + parallel_devices=[torch.device("cuda", i) for i in device_indices] + ) + with pytest.raises( + RuntimeError, + match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") + ): + strategy.setup_environment() From 899ed8745c7e77b9733a012ba87b1d460fac3ab6 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 03:07:39 -0700 Subject: [PATCH 05/13] change error --- src/lightning/fabric/strategies/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index 9ba6d6fa50bf9..b299ca377f209 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -838,7 +838,7 @@ def _validate_device_index_selection(parallel_devices: List[torch.device]) -> No selected_device_indices = [device.index for device in parallel_devices] expected_device_indices = list(range(len(parallel_devices))) if selected_device_indices != expected_device_indices: - raise ValueError( + raise RuntimeError( f"The selected device indices {selected_device_indices!r} don't match the local rank values of processes." " If you need to select GPUs at a specific index, set the `CUDA_VISIBLE_DEVICES` environment variable" f" instead. For example: `CUDA_VISIBLE_DEVICES={','.join(str(i) for i in selected_device_indices)}`." From 78940c07eb19c923e66404671ca308144e66c7f5 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 03:09:27 -0700 Subject: [PATCH 06/13] add test --- .../strategies/test_deepspeed_strategy.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 569f9fcafe81e..6ba1a60db4b91 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -15,6 +15,7 @@ import json import logging import os +from re import escape from typing import Any, Dict from unittest import mock @@ -30,13 +31,14 @@ from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from lightning.pytorch.loggers import CSVLogger from lightning.pytorch.plugins import DeepSpeedPrecisionPlugin -from lightning.pytorch.strategies import DeepSpeedStrategy -from lightning.pytorch.strategies.deepspeed import _DEEPSPEED_AVAILABLE +from lightning.pytorch.strategies.deepspeed import DeepSpeedStrategy, _DEEPSPEED_AVAILABLE from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_11 as _TM_GE_0_11 from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf +from pytorch.accelerators import CUDAAccelerator + if _DEEPSPEED_AVAILABLE: import deepspeed from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer @@ -1307,3 +1309,19 @@ def transfer_batch_to_device(self, batch, *args, **kwargs): batch = trainer.strategy.batch_to_device(batch) assert batch.is_cuda assert batch.dtype is torch.float16 + + +@RunIf(deepspeed=True) +@pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]]) +def test_validate_parallel_devices_indices(device_indices): + """Test that the strategy validates that it doesn't support selecting specific devices by index. DeepSpeed + doesn't support it and needs the index to match to the local rank of the process.""" + strategy = DeepSpeedStrategy( + accelerator=CUDAAccelerator(), + parallel_devices=[torch.device("cuda", i) for i in device_indices] + ) + with pytest.raises( + RuntimeError, + match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") + ): + strategy.setup_environment() From 73b72d5c4aa93162f72063d24dcc75357ef04431 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 30 Jun 2023 10:17:14 +0000 Subject: [PATCH 07/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_fabric/strategies/test_deepspeed.py | 12 ++++++------ .../strategies/test_deepspeed_strategy.py | 17 ++++++++--------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tests/tests_fabric/strategies/test_deepspeed.py b/tests/tests_fabric/strategies/test_deepspeed.py index 7a64ff080a7ea..c0adc578073c9 100644 --- a/tests/tests_fabric/strategies/test_deepspeed.py +++ b/tests/tests_fabric/strategies/test_deepspeed.py @@ -354,14 +354,14 @@ def test_deepspeed_save_filter(tmp_path): @RunIf(deepspeed=True) @pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]]) def test_validate_parallel_devices_indices(device_indices): - """Test that the strategy validates that it doesn't support selecting specific devices by index. DeepSpeed - doesn't support it and needs the index to match to the local rank of the process.""" + """Test that the strategy validates that it doesn't support selecting specific devices by index. + + DeepSpeed doesn't support it and needs the index to match to the local rank of the process. + """ strategy = DeepSpeedStrategy( - accelerator=CUDAAccelerator(), - parallel_devices=[torch.device("cuda", i) for i in device_indices] + accelerator=CUDAAccelerator(), parallel_devices=[torch.device("cuda", i) for i in device_indices] ) with pytest.raises( - RuntimeError, - match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") + RuntimeError, match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") ): strategy.setup_environment() diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 6ba1a60db4b91..cb319e8a9b868 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -22,6 +22,7 @@ import pytest import torch import torch.nn.functional as F +from pytorch.accelerators import CUDAAccelerator from torch import nn, Tensor from torch.utils.data import DataLoader from torchmetrics import Accuracy @@ -31,14 +32,12 @@ from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from lightning.pytorch.loggers import CSVLogger from lightning.pytorch.plugins import DeepSpeedPrecisionPlugin -from lightning.pytorch.strategies.deepspeed import DeepSpeedStrategy, _DEEPSPEED_AVAILABLE +from lightning.pytorch.strategies.deepspeed import _DEEPSPEED_AVAILABLE, DeepSpeedStrategy from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_11 as _TM_GE_0_11 from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf -from pytorch.accelerators import CUDAAccelerator - if _DEEPSPEED_AVAILABLE: import deepspeed from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer @@ -1314,14 +1313,14 @@ def transfer_batch_to_device(self, batch, *args, **kwargs): @RunIf(deepspeed=True) @pytest.mark.parametrize("device_indices", [[1], [1, 0], [0, 2], [3, 2, 1]]) def test_validate_parallel_devices_indices(device_indices): - """Test that the strategy validates that it doesn't support selecting specific devices by index. DeepSpeed - doesn't support it and needs the index to match to the local rank of the process.""" + """Test that the strategy validates that it doesn't support selecting specific devices by index. + + DeepSpeed doesn't support it and needs the index to match to the local rank of the process. + """ strategy = DeepSpeedStrategy( - accelerator=CUDAAccelerator(), - parallel_devices=[torch.device("cuda", i) for i in device_indices] + accelerator=CUDAAccelerator(), parallel_devices=[torch.device("cuda", i) for i in device_indices] ) with pytest.raises( - RuntimeError, - match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") + RuntimeError, match=escape(f"device indices {device_indices!r} don't match the local rank values of processes") ): strategy.setup_environment() From fa3736ba6dd3249014742a4f2d963330e22f5c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 30 Jun 2023 21:34:52 -0400 Subject: [PATCH 08/13] remove test --- .../strategies/test_deepspeed_integration.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/tests_fabric/strategies/test_deepspeed_integration.py b/tests/tests_fabric/strategies/test_deepspeed_integration.py index 0279b1c72089c..311b14231a4f7 100644 --- a/tests/tests_fabric/strategies/test_deepspeed_integration.py +++ b/tests/tests_fabric/strategies/test_deepspeed_integration.py @@ -264,22 +264,6 @@ def test_deepspeed_env_variables_on_platforms(_, deepspeed_dist_mock, platform): assert os.environ["LOCAL_RANK"] == str(strategy.local_rank) -@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) -def test_deepspeed_specific_gpu_device_index(): - """Test that the DeepSpeed strategy can run on specific device indices.""" - - class RunFabric(BoringFabric): - def step(self, model, batch): - assert self.device.type == "cuda" - assert self.device.index == 1 - assert batch.device.index == 1 - assert model.device.index == 1 - return super().step(model, batch) - - fabric = RunFabric(accelerator="cuda", devices=[1], strategy="deepspeed") - fabric.run() - - @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True, bf16_cuda=True) def test_deepspeed_with_bfloat16_precision(): """Test that the DeepSpeed strategy works with bfloat16 precision.""" From 1c6f9bf5cadf528344f3523c5831b63866814f35 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 18:37:45 -0700 Subject: [PATCH 09/13] comment --- src/lightning/fabric/connector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lightning/fabric/connector.py b/src/lightning/fabric/connector.py index f99418185712f..d007ff26231cf 100644 --- a/src/lightning/fabric/connector.py +++ b/src/lightning/fabric/connector.py @@ -377,7 +377,6 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._num_nodes_flag > 1: return "ddp" if len(self._parallel_devices) <= 1: - # TODO: Change this once gpu accelerator was renamed to cuda accelerator if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") ): From 0a33d971a35d854e10976ae3938a89c5935bb381 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 18:38:47 -0700 Subject: [PATCH 10/13] import --- tests/tests_pytorch/strategies/test_deepspeed_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index cb319e8a9b868..a159697531215 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -22,12 +22,12 @@ import pytest import torch import torch.nn.functional as F -from pytorch.accelerators import CUDAAccelerator from torch import nn, Tensor from torch.utils.data import DataLoader from torchmetrics import Accuracy from lightning.pytorch import LightningDataModule, LightningModule, Trainer +from lightning.pytorch.accelerators import CUDAAccelerator from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from lightning.pytorch.loggers import CSVLogger From d39023bba11fa890ca784b067699dd33aa574964 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 18:41:34 -0700 Subject: [PATCH 11/13] changelog --- src/lightning/fabric/CHANGELOG.md | 3 +++ src/lightning/pytorch/CHANGELOG.md | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 6e75f250f5d1f..2c032198ea03d 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -80,6 +80,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Automatically call `xla_model.mark_step()` after `optimizer.step()` with XLA ([#17883](https://github.com/Lightning-AI/lightning/pull/17883)) +- Added validation against misconfigured device selection when using the DeepSpeed strategy ([#17952](https://github.com/Lightning-AI/lightning/pull/17952)) + + ### Changed - Allow using iterable-style datasets with TPUs ([#17331](https://github.com/Lightning-AI/lightning/pull/17331)) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 26121858746eb..4748550900d8d 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -59,6 +59,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Automatically call `xla_model.mark_step()` before saving checkpoints with XLA ([#17882](https://github.com/Lightning-AI/lightning/pull/17882)) + +- Added validation against misconfigured device selection when using the DeepSpeed strategy ([#17952](https://github.com/Lightning-AI/lightning/pull/17952)) + + ### Changed - Removed the limitation to call `self.trainer.model.parameters()` in `LightningModule.configure_optimizers()` ([#17309](https://github.com/Lightning-AI/lightning/pull/17309)) From c0bf4d4ef6a69a9eeba190b3a7887bf1949efb6c Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 18:50:27 -0700 Subject: [PATCH 12/13] typing --- src/lightning/fabric/strategies/deepspeed.py | 1 + src/lightning/pytorch/strategies/deepspeed.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lightning/fabric/strategies/deepspeed.py b/src/lightning/fabric/strategies/deepspeed.py index b299ca377f209..75d8ed1546f4f 100644 --- a/src/lightning/fabric/strategies/deepspeed.py +++ b/src/lightning/fabric/strategies/deepspeed.py @@ -598,6 +598,7 @@ def _setup_distributed(self) -> None: f"The DeepSpeed strategy is only supported on CUDA GPUs but `{self.accelerator.__class__.__name__}`" " is used." ) + assert self.parallel_devices is not None _validate_device_index_selection(self.parallel_devices) reset_seed() self._set_world_ranks() diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index 488e85dc5bbf8..516e02ff79eeb 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -325,6 +325,7 @@ def _load_config(self, config: Optional[Union[_PATH, Dict[str, Any]]]) -> Option return config def setup_distributed(self) -> None: + assert self.parallel_devices is not None _validate_device_index_selection(self.parallel_devices) reset_seed() self.set_world_ranks() From 949c6b101c854f09b235054d0fe97844688e4f38 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 30 Jun 2023 18:51:50 -0700 Subject: [PATCH 13/13] delete test --- .../strategies/test_deepspeed_strategy.py | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index a159697531215..6aaf93034957c 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -1156,48 +1156,6 @@ def test_deepspeed_gradient_clip_by_value(tmpdir): trainer.fit(model) -@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) -def test_specific_gpu_device_id(tmpdir): - class TestCallback(Callback): - def on_train_start(self, *_) -> None: - assert model.device.index == 1 - - def on_train_batch_start( - self, - trainer: Trainer, - pl_module: LightningModule, - batch: Any, - *_, - ) -> None: - assert batch.device.index == 1 - - def on_test_start(self, *_) -> None: - assert model.device.index == 1 - - def on_test_batch_start( - self, - trainer: Trainer, - pl_module: LightningModule, - batch: Any, - *_, - ) -> None: - assert batch.device.index == 1 - - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - fast_dev_run=True, - accelerator="gpu", - devices=[1], - strategy="deepspeed", - callbacks=TestCallback(), - enable_progress_bar=False, - enable_model_summary=False, - ) - trainer.fit(model) - trainer.test(model) - - @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multi_save_same_filepath(tmpdir): """Test that verifies that deepspeed saves only latest checkpoint in the specified path and deletes the old