diff --git a/CHANGELOG.md b/CHANGELOG.md
index d12fc25e39ade..745dee8ad8031 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -489,6 +489,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed missing arguments when saving hyperparameters from the parent class but not from the child class ([#9800](https://github.com/PyTorchLightning/pytorch-lightning/pull/9800))
 
 
+- Fixed DeepSpeed GPU device IDs ([#9847](https://github.com/PyTorchLightning/pytorch-lightning/pull/9847))
+
+
 - Reset `val_dataloader` in `tuner/batch_size_scaling` ([#9857](https://github.com/PyTorchLightning/pytorch-lightning/pull/9857))
 
 
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index f706e5f33346d..a1d9e346f1217 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
 import contextlib
 import json
 import logging
@@ -429,6 +430,7 @@ def _initialize_deepspeed_train(self, model):
 
         model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
         model, deepspeed_optimizer, _, deepspeed_scheduler = deepspeed.initialize(
+            args=argparse.Namespace(device_rank=self.root_device.index),
             config=self.config,
             model=model,
             model_parameters=model_parameters,
@@ -505,6 +507,7 @@ def _initialize_deepspeed_inference(self, model):
         # Remove all module hooks before initializing new model
         remove_module_hooks(model)
         model, _, _, _ = deepspeed.initialize(
+            args=argparse.Namespace(device_rank=self.root_device.index),
             config=inference_config,
             model=model,
             optimizer=optimizer,
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 830e7bc72b07d..f3b4733fdc803 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -970,3 +970,40 @@ def test_different_accumulate_grad_batches_fails(tmpdir):
         MisconfigurationException, match="DeepSpeed currently does not support different `accumulate_grad_batches`"
     ):
         trainer.fit(model)
+
+
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_specific_gpu_device_id(tmpdir):
+    class TestCallback(Callback):
+        def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            assert model.device.index == 1
+
+        def on_train_batch_start(
+            self,
+            trainer: Trainer,
+            pl_module: LightningModule,
+            batch: Any,
+            batch_idx: int,
+            dataloader_idx: int,
+        ) -> None:
+            assert batch.device.index == 1
+
+        def on_test_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
+            assert model.device.index == 1
+
+        def on_test_batch_start(
+            self,
+            trainer: Trainer,
+            pl_module: LightningModule,
+            batch: Any,
+            batch_idx: int,
+            dataloader_idx: int,
+        ) -> None:
+            assert batch.device.index == 1
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir, fast_dev_run=True, gpus=[1], plugins="deepspeed", callbacks=TestCallback()
+    )
+    trainer.fit(model)
+    trainer.test(model)
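
For context on the pattern the patch uses: `deepspeed.initialize` accepts an `args` namespace, and when that namespace carries a `device_rank` attribute DeepSpeed places the engine on that CUDA device instead of defaulting to the process-local rank (which is why `gpus=[1]` previously ended up on device 0). Below is a minimal sketch of the same call outside Lightning; the toy model and the config dict are illustrative assumptions, and it would need a 2-GPU machine with `deepspeed` installed, run with the distributed environment set up (e.g. under the `deepspeed` launcher).

```python
# Minimal sketch (not part of the PR): pin a DeepSpeed engine to a specific
# GPU via `args.device_rank`, mirroring the fix in deepspeed.py above.
# The model and config here are illustrative assumptions only.
import argparse

import deepspeed
import torch

model = torch.nn.Linear(32, 2)
config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

engine, optimizer, _, _ = deepspeed.initialize(
    # Without `device_rank`, DeepSpeed falls back to the local rank,
    # i.e. cuda:0 in a single-process run.
    args=argparse.Namespace(device_rank=1),
    config=config,
    model=model,
    model_parameters=model.parameters(),
)

# The engine's parameters should now live on GPU 1, which is what the new
# test asserts through `model.device.index == 1` and `batch.device.index == 1`.
assert next(engine.module.parameters()).device.index == 1
```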