From de1401d5872b7d1ce2078abd5eb669937e797695 Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 13 Jan 2022 20:15:13 -0500 Subject: [PATCH 01/32] update tests for v2 --- .../data/horovod/train_default_model.py | 2 +- tests/models/test_amp.py | 31 +++++-- tests/models/test_cpu.py | 4 +- tests/models/test_gpu.py | 80 ++++++++++--------- tests/models/test_hooks.py | 18 +++-- tests/models/test_horovod.py | 20 +---- tests/models/test_onnx.py | 5 +- tests/models/test_restore.py | 8 +- tests/models/test_tpu.py | 77 +++++++++++------- 9 files changed, 140 insertions(+), 105 deletions(-) diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 4527f337af365..2c2515851bbf3 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -100,7 +100,7 @@ def training_epoch_end(self, outputs) -> None: trainer._checkpoint_connector.restore(checkpoint_path) if on_gpu: - trainer = Trainer(gpus=1, strategy="horovod", max_epochs=1) + trainer = Trainer(accelerator="gpu", devices=1, strategy="horovod", max_epochs=1) # Test the root_gpu property assert trainer.root_gpu == hvd.local_rank() diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 917bb4d224194..3fb42fb0ce29e 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -77,13 +77,18 @@ def _assert_autocast_enabled(self): ], ) @pytest.mark.parametrize("precision", [16, "bf16"]) -@pytest.mark.parametrize("num_processes", [1, 2]) -def test_amp_cpus(tmpdir, strategy, precision, num_processes): +@pytest.mark.parametrize("devices", [1, 2]) +def test_amp_cpus(tmpdir, strategy, precision, devices): """Make sure combinations of AMP and training types work if supported.""" tutils.reset_seed() trainer = Trainer( - default_root_dir=tmpdir, num_processes=num_processes, max_epochs=1, strategy=strategy, precision=precision + default_root_dir=tmpdir, + accelerator="cpu", + devices=devices, + max_epochs=1, + strategy=strategy, + precision=precision, ) model = AMPTestModel() @@ -97,12 +102,19 @@ def test_amp_cpus(tmpdir, strategy, precision, num_processes): @RunIf(min_gpus=2, min_torch="1.10") @pytest.mark.parametrize("strategy", [None, "dp", "ddp_spawn"]) @pytest.mark.parametrize("precision", [16, "bf16"]) -@pytest.mark.parametrize("gpus", [1, 2]) -def test_amp_gpus(tmpdir, strategy, precision, gpus): +@pytest.mark.parametrize("devices", [1, 2]) +def test_amp_gpus(tmpdir, strategy, precision, devices): """Make sure combinations of AMP and training types work if supported.""" tutils.reset_seed() - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, gpus=gpus, strategy=strategy, precision=precision) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + accelerator="gpu", + devices=devices, + strategy=strategy, + precision=precision, + ) model = AMPTestModel() trainer.fit(model) @@ -141,7 +153,8 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - gpus=[0], + accelerator="gpu", + devices=[0], strategy="ddp_spawn", precision=16, callbacks=[checkpoint], @@ -195,7 +208,9 @@ def configure_optimizers(self): model = CustomModel() model.training_epoch_end = None - trainer = Trainer(default_root_dir=tmpdir, max_steps=5, precision=16, amp_backend="apex", gpus=1) + trainer = Trainer( + default_root_dir=tmpdir, max_steps=5, precision=16, amp_backend="apex", accelerator="gpu", devices=1 + ) assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) assert 
trainer.state.finished, f"Training failed with {trainer.state}" diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 22e31c442d7dd..d8d155dd269b8 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -136,8 +136,8 @@ def test_multi_cpu_model_ddp(tmpdir): max_epochs=1, limit_train_batches=0.4, limit_val_batches=0.2, - gpus=None, - num_processes=2, + accelerator="cpu", + devices=2, strategy="ddp_spawn", ) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index d17322e191ff1..da6934081ef72 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -47,7 +47,8 @@ def test_multi_gpu_none_backend(tmpdir): max_epochs=1, limit_train_batches=0.2, limit_val_batches=0.2, - gpus=2, + accelerator="gpu", + devices=2, ) dm = ClassifDataModule() @@ -56,8 +57,8 @@ def test_multi_gpu_none_backend(tmpdir): @RunIf(min_gpus=2) -@pytest.mark.parametrize("gpus", [1, [0], [1]]) -def test_single_gpu_model(tmpdir, gpus): +@pytest.mark.parametrize("devices", [1, [0], [1]]) +def test_single_gpu_model(tmpdir, devices): """Make sure single GPU works (DP mode).""" trainer_options = dict( default_root_dir=tmpdir, @@ -65,7 +66,8 @@ def test_single_gpu_model(tmpdir, gpus): max_epochs=1, limit_train_batches=0.1, limit_val_batches=0.1, - gpus=gpus, + accelerator="gpu", + devices=devices, ) model = BoringModel() @@ -93,7 +95,7 @@ def device_count(): @pytest.mark.parametrize( - ["gpus", "expected_num_gpus", "strategy"], + ["devices", "expected_num_gpus", "strategy"], [ pytest.param(None, 0, None, id="None - expect 0 gpu to use."), pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."), @@ -103,23 +105,23 @@ def device_count(): pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"), ], ) -def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, strategy): - assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus +def test_trainer_gpu_parse(mocked_device_count, devices, expected_num_gpus, strategy): + assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).num_gpus == expected_num_gpus @pytest.mark.parametrize( - ["gpus", "expected_num_gpus", "strategy"], + ["devices", "expected_num_gpus", "strategy"], [ pytest.param(None, 0, None, id="None - expect 0 gpu to use."), pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."), ], ) -def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, strategy): - assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus +def test_trainer_num_gpu_0(mocked_device_count_0, devices, expected_num_gpus, strategy): + assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).num_gpus == expected_num_gpus @pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "strategy"], + ["devices", "expected_root_gpu", "strategy"], [ pytest.param(None, None, "ddp", id="None is None"), pytest.param(0, None, "ddp", id="O gpus, expect gpu root device to be None."), @@ -129,25 +131,25 @@ def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, strat pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), ], ) -def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, strategy): - assert Trainer(gpus=gpus, strategy=strategy).root_gpu == expected_root_gpu +def test_root_gpu_property(mocked_device_count, devices, expected_root_gpu, strategy): + assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu @pytest.mark.parametrize( - 
["gpus", "expected_root_gpu", "strategy"], + ["devices", "expected_root_gpu", "strategy"], [ pytest.param(None, None, None, id="None is None"), pytest.param(None, None, "ddp", id="None is None"), pytest.param(0, None, "ddp", id="None is None"), ], ) -def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, strategy): - assert Trainer(gpus=gpus, strategy=strategy).root_gpu == expected_root_gpu +def test_root_gpu_property_0_passing(mocked_device_count_0, devices, expected_root_gpu, strategy): + assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu # Asking for a gpu when non are available will result in a MisconfigurationException @pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "strategy"], + ["devices", "expected_root_gpu", "strategy"], [ (1, None, "ddp"), (3, None, "ddp"), @@ -158,13 +160,13 @@ def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_ ("-1", None, "ddp"), ], ) -def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, strategy): +def test_root_gpu_property_0_raising(mocked_device_count_0, devices, expected_root_gpu, strategy): with pytest.raises(MisconfigurationException): - Trainer(gpus=gpus, strategy=strategy) + Trainer(accelerator="gpu", devices=devices, strategy=strategy) @pytest.mark.parametrize( - ["gpus", "expected_root_gpu"], + ["devices", "expected_root_gpu"], [ pytest.param(None, None, id="No gpus, expect gpu root device to be None"), pytest.param([0], 0, id="Oth gpu, expect gpu root device to be 0."), @@ -173,12 +175,12 @@ def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_ pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."), ], ) -def test_determine_root_gpu_device(gpus, expected_root_gpu): - assert device_parser.determine_root_gpu_device(gpus) == expected_root_gpu +def test_determine_root_gpu_device(devices, expected_root_gpu): + assert device_parser.determine_root_gpu_device(devices) == expected_root_gpu @pytest.mark.parametrize( - ["gpus", "expected_gpu_ids"], + ["devices", "expected_gpu_ids"], [ (None, None), (0, None), @@ -196,20 +198,20 @@ def test_determine_root_gpu_device(gpus, expected_root_gpu): pytest.param("-1", list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), ], ) -def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): - assert device_parser.parse_gpu_ids(gpus) == expected_gpu_ids +def test_parse_gpu_ids(mocked_device_count, devices, expected_gpu_ids): + assert device_parser.parse_gpu_ids(devices) == expected_gpu_ids -@pytest.mark.parametrize("gpus", [0.1, -2, False, [-1], [None], ["0"], [0, 0]]) -def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): +@pytest.mark.parametrize("devices", [0.1, -2, False, [-1], [None], ["0"], [0, 0]]) +def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, devices): with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(gpus) + device_parser.parse_gpu_ids(devices) -@pytest.mark.parametrize("gpus", [[1, 2, 19], -1, "-1"]) -def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus): +@pytest.mark.parametrize("devices", [[1, 2, 19], -1, "-1"]) +def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, devices): with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(gpus) + device_parser.parse_gpu_ids(devices) def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): @@ -217,10 +219,10 @@ def 
test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): device_parser.parse_gpu_ids([1, 2, 19]) -@pytest.mark.parametrize("gpus", [-1, "-1"]) -def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, gpus): +@pytest.mark.parametrize("devices", [-1, "-1"]) +def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, devices): with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(gpus) + device_parser.parse_gpu_ids(devices) @mock.patch.dict( @@ -236,19 +238,19 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun ) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("torch.cuda.is_available", return_value=True) -@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"]) -def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus): +@pytest.mark.parametrize("devices", [[0, 1, 2], 2, "0"]) +def test_torchelastic_gpu_parsing(mocked_device_count, devices): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit sanitizing the gpus as only one of the GPUs is visible.""" - trainer = Trainer(gpus=gpus) + trainer = Trainer(accelerator="gpu", devices=devices) assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus) - assert trainer.gpus == gpus + assert trainer.devices == devices @RunIf(min_gpus=1) def test_single_gpu_batch_parse(): - trainer = Trainer(gpus=1) + trainer = Trainer(accelerator="gpu", devices=1) # non-transferrable types primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}] diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index cdf94d18171f7..1388313ecccc9 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -156,7 +156,7 @@ def transfer_batch_to_device(self, batch, device, dataloader_idx): model = CurrentTestModel() batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long))) - trainer = Trainer(gpus=1) + trainer = Trainer(accelerator="gpu", devices=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead model_getter_mock.return_value = model @@ -203,7 +203,8 @@ def train_dataloader(self): max_epochs=1, enable_model_summary=False, strategy="ddp", - gpus=2, + accelerator="gpu", + devices=2, ) trainer.fit(model) @@ -437,10 +438,17 @@ def _predict_batch(trainer, model, batches): [ {}, # these precision plugins modify the optimization flow, so testing them explicitly - pytest.param(dict(gpus=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), - pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) + dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), + marks=RunIf(min_gpus=1) + ), + pytest.param( + dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), + marks=RunIf(amp_apex=True, min_gpus=1) + ), + pytest.param( + dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), + marks=RunIf(deepspeed=True, min_gpus=1, standalone=True), ), ], ) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 5d553394fed9c..a3d213ef9e920 100644 --- a/tests/models/test_horovod.py +++ 
b/tests/models/test_horovod.py @@ -55,7 +55,9 @@ def test_nccl_is_available_on_gpu_environment(): def _run_horovod(trainer_options): """Execute the training script across multiple workers in parallel.""" - devices = trainer_options.get("devices", 1) + num_processes = trainer_options.get("devices", 2) + # for Horovod, we interpret `gpus` to be set per worker + trainer_options.update(accelerator="gpu" if on_gpu else "cpu") tutils.reset_seed() # TODO: Find out why coverage breaks CI. # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' @@ -151,22 +153,6 @@ def test_horovod_multi_gpu(tmpdir): _run_horovod(trainer_options) -@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True) -def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir): - trainer_options = dict( - default_root_dir=tmpdir, - enable_progress_bar=False, - max_epochs=1, - limit_train_batches=4, - limit_val_batches=0, - accumulate_grad_batches=2, - accelerator="gpu", - devices=2, - strategy="horovod", - ) - _run_horovod(trainer_options) - - @RunIf(horovod=True, skip_windows=True) def test_horovod_raises_unsupported_accumulate_grad_batches(tmpdir): """Ensure MisConfigurationException for different `accumulate_grad_batches` at different epochs for Horovod diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index d111b266fb115..ee750d113cc59 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -43,7 +43,7 @@ def test_model_saves_with_input_sample(tmpdir): def test_model_saves_on_gpu(tmpdir): """Test that model saves on gpu.""" model = BoringModel() - trainer = Trainer(gpus=1, fast_dev_run=True) + trainer = Trainer(accelerator="gpu", devices=1, fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -96,7 +96,8 @@ def test_model_saves_on_multi_gpu(tmpdir): max_epochs=1, limit_train_batches=10, limit_val_batches=10, - gpus=[0, 1], + accelerator="gpu", + devices=[0, 1], strategy="ddp_spawn", enable_progress_bar=False, ) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index e5259c4047ad2..20212af558f3e 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -399,7 +399,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): limit_val_batches=5, callbacks=[checkpoint], logger=logger, - gpus=[0, 1], + accelerator="gpu", + devices=[0, 1], strategy="dp", default_root_dir=tmpdir, ) @@ -445,7 +446,8 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): limit_val_batches=2, callbacks=[checkpoint], logger=logger, - gpus=[0, 1], + accelerator="gpu", + devices=[0, 1], strategy="ddp_spawn", default_root_dir=tmpdir, ) @@ -564,7 +566,7 @@ def test_dp_resume(tmpdir): model = CustomClassificationModelDP(lr=0.1) dm = ClassifDataModule() - trainer_options = dict(max_epochs=1, gpus=2, strategy="dp", default_root_dir=tmpdir) + trainer_options = dict(max_epochs=1, accelerator="gpu", devices=2, strategy="dp", default_root_dir=tmpdir) # get logger logger = tutils.get_default_logger(tmpdir) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 90b255f73f5aa..2d65db791ebdd 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -57,7 +57,8 @@ def test_model_tpu_cores_1(tmpdir): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=2, - tpu_cores=1, + accelerator="tpu", + devices=1, limit_train_batches=4, limit_val_batches=4, ) @@ -76,7 +77,8 @@ def test_model_tpu_index(tmpdir, tpu_core): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=2, - 
tpu_cores=[tpu_core], + accelerator="tpu", + devices=[tpu_core], limit_train_batches=4, limit_val_batches=4, ) @@ -95,7 +97,8 @@ def test_model_tpu_cores_8(tmpdir): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=1, - tpu_cores=8, + accelerator="tpu", + devices=8, limit_train_batches=4, limit_val_batches=4, ) @@ -115,7 +118,8 @@ def test_model_16bit_tpu_cores_1(tmpdir): precision=16, enable_progress_bar=False, max_epochs=2, - tpu_cores=1, + accelerator="tpu", + devices=1, limit_train_batches=8, limit_val_batches=2, ) @@ -135,7 +139,8 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): precision=16, enable_progress_bar=False, max_epochs=2, - tpu_cores=[tpu_core], + accelerator="tpu", + devices=[tpu_core], limit_train_batches=4, limit_val_batches=2, ) @@ -155,7 +160,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): precision=16, enable_progress_bar=False, max_epochs=1, - tpu_cores=8, + accelerator="tpu", + devices=8, limit_train_batches=4, limit_val_batches=4, ) @@ -185,7 +191,8 @@ def validation_step(self, *args, **kwargs): max_epochs=2, limit_train_batches=2, limit_val_batches=2, - tpu_cores=8, + accelerator="tpu", + devices=8, ) trainer.fit(model) trainer.test(dataloaders=DataLoader(RandomDataset(32, 2000), batch_size=32)) @@ -200,7 +207,8 @@ def test_tpu_grad_norm(tmpdir): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=4, - tpu_cores=1, + accelerator="tpu", + devices=1, limit_train_batches=0.4, limit_val_batches=0.4, gradient_clip_val=0.5, @@ -219,7 +227,8 @@ def test_tpu_clip_grad_by_value(tmpdir): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=4, - tpu_cores=1, + accelerator="tpu", + devices=1, limit_train_batches=10, limit_val_batches=10, gradient_clip_val=0.5, @@ -237,40 +246,41 @@ def test_dataloaders_passed_to_fit(tmpdir): tutils.reset_seed() model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8) + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="tpu", devices=8) trainer.fit(model, train_dataloaders=model.train_dataloader(), val_dataloaders=model.val_dataloader()) assert trainer.state.finished, f"Training failed with {trainer.state}" @pytest.mark.parametrize( - ["tpu_cores", "expected_tpu_id"], + ["devices", "expected_tpu_id"], [(1, None), (8, None), ([1], 1), ([8], 8)], ) @RunIf(tpu=True) -def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id): +def test_tpu_id_to_be_as_expected(devices, expected_tpu_id): """Test if trainer.tpu_id is set as expected.""" - assert Trainer(tpu_cores=tpu_cores)._accelerator_connector.tpu_id == expected_tpu_id + assert Trainer(accelerator="tpu", devices=devices)._accelerator_connector.tpu_id == expected_tpu_id +@RunIf(tpu=True) def test_tpu_misconfiguration(): """Test if trainer.tpu_id is set as expected.""" with pytest.raises(MisconfigurationException, match="`tpu_cores` can only be"): - Trainer(tpu_cores=[1, 8]) + Trainer(accelerator="tpu", devices=[1, 8]) @pytest.mark.skipif(_TPU_AVAILABLE, reason="test requires missing TPU") def test_exception_when_no_tpu_found(tmpdir): """Test if exception is thrown when xla devices are not available.""" - with pytest.raises(MisconfigurationException, match="No TPU devices were found."): - Trainer(tpu_cores=8) + with pytest.raises(MisconfigurationException, match="but TPUs are not available."): + Trainer(accelerator="tpu", devices=8) -@pytest.mark.parametrize("tpu_cores", [1, 8, [1]]) +@pytest.mark.parametrize("devices", [1, 8, [1]]) @RunIf(tpu=True) -def test_accelerator_set_when_using_tpu(tmpdir, 
tpu_cores): +def test_accelerator_set_when_using_tpu(tmpdir, devices): """Test if the accelerator is set to `tpu` when tpu_cores is not None.""" - assert isinstance(Trainer(tpu_cores=tpu_cores).accelerator, TPUAccelerator) + assert isinstance(Trainer(accelerator="tpu", devices=devices).accelerator, TPUAccelerator) @RunIf(tpu=True) @@ -279,7 +289,7 @@ def test_broadcast_on_tpu(): """Checks if an object from the main process is broadcasted to other processes correctly.""" def test_broadcast(rank): - trainer = Trainer(tpu_cores=8) + trainer = Trainer(accelerator="tpu", devices=8) assert isinstance(trainer.accelerator, TPUAccelerator) assert isinstance(trainer.strategy, TPUSpawnStrategy) obj = ("ver_0.5", "logger_name", rank) @@ -310,9 +320,9 @@ def test_broadcast(rank): def test_tpu_choice(tmpdir, tpu_cores, expected_tpu_id, error_expected): if error_expected: with pytest.raises(MisconfigurationException, match=r".*tpu_cores` can only be 1, 8 or [<1-8>]*"): - Trainer(default_root_dir=tmpdir, tpu_cores=tpu_cores) + Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=tpu_cores) else: - trainer = Trainer(default_root_dir=tmpdir, tpu_cores=tpu_cores) + trainer = Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=tpu_cores) assert trainer._accelerator_connector.tpu_id == expected_tpu_id @@ -341,7 +351,7 @@ def test_tpu_reduce(): """Test tpu spawn reduce operation.""" def test_reduce(rank): - trainer = Trainer(tpu_cores=8) + trainer = Trainer(accelerator="tpu", devices=8) # faster this way reduce_ops = ["mean", "AVG", "undefined", "sum", ReduceOp.SUM, ReduceOp.MAX] for reduce_op in reduce_ops: @@ -372,7 +382,8 @@ def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=1, - tpu_cores=1, + accelerator="tpu", + devices=1, precision=16, limit_train_batches=4, limit_val_batches=4, @@ -394,7 +405,14 @@ def test_if_test_works_with_checkpoint_false(tmpdir): # Train a model on TPU model = BoringModel() - trainer = Trainer(max_epochs=1, tpu_cores=8, default_root_dir=tmpdir, fast_dev_run=True, enable_checkpointing=False) + trainer = Trainer( + max_epochs=1, + accelerator="tpu", + devices=8, + default_root_dir=tmpdir, + fast_dev_run=True, + enable_checkpointing=False, + ) trainer.fit(model) assert trainer.state.finished, f"Training failed with {trainer.state}" @@ -430,7 +448,8 @@ def teardown(self, stage): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=4, - tpu_cores=8, + accelerator="tpu", + devices=8, limit_train_batches=0.4, limit_val_batches=0.4, strategy=TPUSpawnStrategy(debug=True), @@ -457,7 +476,8 @@ def teardown(self, stage): default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=4, - tpu_cores=8, + accelerator="tpu", + devices=8, limit_train_batches=0.4, limit_val_batches=0.4, ) @@ -469,6 +489,7 @@ def teardown(self, stage): @RunIf(tpu=True) @pl_multi_process_test def test_device_type_when_training_plugin_tpu_passed(tmpdir): - trainer = Trainer(strategy=TPUSpawnStrategy(), tpu_cores=8) + + trainer = Trainer(strategy=TPUSpawnStrategy(), accelerator="tpu", devices=8) assert isinstance(trainer.strategy, TPUSpawnStrategy) assert isinstance(trainer.accelerator, TPUAccelerator) From b5814090cef33afc415e693c745efea3cc9a634b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jan 2022 01:37:33 +0000 Subject: [PATCH 02/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- tests/models/test_hooks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 1388313ecccc9..67c762af321ad 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -438,13 +438,9 @@ def _predict_batch(trainer, model, batches): [ {}, # these precision plugins modify the optimization flow, so testing them explicitly + pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), - marks=RunIf(min_gpus=1) - ), - pytest.param( - dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), - marks=RunIf(amp_apex=True, min_gpus=1) + dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) ), pytest.param( dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), From 5257653c53b071a99f4b9c1eb79803592c6f83ab Mon Sep 17 00:00:00 2001 From: Kyle Date: Tue, 8 Feb 2022 21:09:02 -0500 Subject: [PATCH 03/32] get rid of devices = 0 or devices = None --- tests/models/test_gpu.py | 40 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index da6934081ef72..cfd46e87b58e1 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -97,53 +97,29 @@ def device_count(): @pytest.mark.parametrize( ["devices", "expected_num_gpus", "strategy"], [ - pytest.param(None, 0, None, id="None - expect 0 gpu to use."), - pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."), pytest.param(1, 1, None, id="1st gpu, expect 1 gpu to use."), pytest.param(-1, PRETEND_N_OF_GPUS, "ddp", id="-1 - use all gpus"), pytest.param("-1", PRETEND_N_OF_GPUS, "ddp", id="'-1' - use all gpus"), pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"), ], ) -def test_trainer_gpu_parse(mocked_device_count, devices, expected_num_gpus, strategy): - assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).num_gpus == expected_num_gpus - - -@pytest.mark.parametrize( - ["devices", "expected_num_gpus", "strategy"], - [ - pytest.param(None, 0, None, id="None - expect 0 gpu to use."), - pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."), - ], -) -def test_trainer_num_gpu_0(mocked_device_count_0, devices, expected_num_gpus, strategy): +@mock.patch("torch.cuda.is_available", return_value=True) +def test_trainer_gpu_parse(_, mocked_device_count, devices, expected_num_gpus, strategy): assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).num_gpus == expected_num_gpus @pytest.mark.parametrize( ["devices", "expected_root_gpu", "strategy"], [ - pytest.param(None, None, "ddp", id="None is None"), - pytest.param(0, None, "ddp", id="O gpus, expect gpu root device to be None."), pytest.param(1, 0, "ddp", id="1 gpu, expect gpu root device to be 0."), pytest.param(-1, 0, "ddp", id="-1 - use all gpus, expect gpu root device to be 0."), pytest.param("-1", 0, "ddp", id="'-1' - use all gpus, expect gpu root device to be 0."), pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), ], ) -def test_root_gpu_property(mocked_device_count, devices, expected_root_gpu, strategy): - assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu - - -@pytest.mark.parametrize( - ["devices", 
"expected_root_gpu", "strategy"], - [ - pytest.param(None, None, None, id="None is None"), - pytest.param(None, None, "ddp", id="None is None"), - pytest.param(0, None, "ddp", id="None is None"), - ], -) -def test_root_gpu_property_0_passing(mocked_device_count_0, devices, expected_root_gpu, strategy): +@mock.patch("torch.cuda.is_available", return_value=True) +@mock.patch("torch.cuda.device_count", return_value=3) +def test_root_gpu_property(_, mocked_device_count, devices, expected_root_gpu, strategy): assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu @@ -236,10 +212,10 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun "LOCAL_WORLD_SIZE": "2", }, ) -@mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("torch.cuda.is_available", return_value=True) -@pytest.mark.parametrize("devices", [[0, 1, 2], 2, "0"]) -def test_torchelastic_gpu_parsing(mocked_device_count, devices): +@mock.patch("torch.cuda.device_count", return_value=1) +@pytest.mark.parametrize("devices", [[0, 1, 2], 2]) +def test_torchelastic_gpu_parsing(_, mocked_device_count, devices): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit sanitizing the gpus as only one of the GPUs is visible.""" trainer = Trainer(accelerator="gpu", devices=devices) From 9e75a0f74d95609fbfcdb2d0da08cf3c021c3c47 Mon Sep 17 00:00:00 2001 From: Kyle Date: Tue, 8 Feb 2022 22:00:51 -0500 Subject: [PATCH 04/32] use gpu when accelerator="gpu" --- tests/models/test_hooks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 67c762af321ad..15e31e3f017cb 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -497,9 +497,11 @@ def training_step(self, batch, batch_idx): "state_dict": ANY, "loops": ANY, } - if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": - saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if "gpus" in kwargs else "cpu") + if kwargs.get("amp_backend") == "native": + saved_ckpt["native_amp_scaling_state"] = ANY + elif kwargs.get("amp_backend") == "apex": + saved_ckpt["amp_scaling_state"] = ANY + device = torch.device("cuda:0" if kwargs["accelerator"] == "gpu" else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From 84caafc0e83e0c34aa644cd7a5deed5521d69281 Mon Sep 17 00:00:00 2001 From: Jv Kyle Eclarin Date: Tue, 8 Feb 2022 22:54:46 -0500 Subject: [PATCH 05/32] use the right way of getting kwargs --- tests/models/test_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 15e31e3f017cb..aee49273a11a6 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -501,7 +501,7 @@ def training_step(self, batch, batch_idx): saved_ckpt["native_amp_scaling_state"] = ANY elif kwargs.get("amp_backend") == "apex": saved_ckpt["amp_scaling_state"] = ANY - device = torch.device("cuda:0" if kwargs["accelerator"] == "gpu" else "cpu") + device = torch.device("cuda:0" if kwargs.get("accelerator") == "gpu" else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From 0b199dd685e05f12b4368e2e6783473b777a0c38 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 9 Feb 2022 23:08:03 -0500 
Subject: [PATCH 06/32] use devices instead of tpu_cores --- tests/models/test_tpu.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 2d65db791ebdd..f4db46f98c5c7 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -50,7 +50,7 @@ def val_dataloader(self): @RunIf(tpu=True) @pl_multi_process_test -def test_model_tpu_cores_1(tmpdir): +def test_model_devices_1(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -90,7 +90,7 @@ def test_model_tpu_index(tmpdir, tpu_core): @RunIf(tpu=True) @pl_multi_process_test -def test_model_tpu_cores_8(tmpdir): +def test_model_devices_8(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -110,7 +110,7 @@ def test_model_tpu_cores_8(tmpdir): @RunIf(tpu=True) @pl_multi_process_test -def test_model_16bit_tpu_cores_1(tmpdir): +def test_model_16bit_devices_1(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -152,7 +152,7 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): @RunIf(tpu=True) @pl_multi_process_test -def test_model_16bit_tpu_cores_8(tmpdir): +def test_model_16bit_devices_8(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -300,7 +300,7 @@ def test_broadcast(rank): @pytest.mark.parametrize( - ["tpu_cores", "expected_tpu_id", "error_expected"], + ["devices", "expected_tpu_id", "error_expected"], [ (1, None, False), (8, None, False), @@ -317,23 +317,23 @@ def test_broadcast(rank): ) @RunIf(tpu=True) @pl_multi_process_test -def test_tpu_choice(tmpdir, tpu_cores, expected_tpu_id, error_expected): +def test_tpu_choice(tmpdir, devices, expected_tpu_id, error_expected): if error_expected: with pytest.raises(MisconfigurationException, match=r".*tpu_cores` can only be 1, 8 or [<1-8>]*"): - Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=tpu_cores) + Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=devices) else: - trainer = Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=tpu_cores) + trainer = Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=devices) assert trainer._accelerator_connector.tpu_id == expected_tpu_id @pytest.mark.parametrize( ["cli_args", "expected"], - [("--tpu_cores=8", {"tpu_cores": 8}), ("--tpu_cores=1,", {"tpu_cores": "1,"})], + [("--devices=8", {"devices": 8}), ("--devices=1,", {"devices": "1,"})], ) @RunIf(tpu=True) @pl_multi_process_test -def test_tpu_cores_with_argparse(cli_args, expected): - """Test passing tpu_cores in command line.""" +def test_devices_with_argparse(cli_args, expected): + """Test passing devices in command line.""" cli_args = cli_args.split(" ") if cli_args else [] with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): parser = ArgumentParser(add_help=False) From 2318b433ff29f78e13836bee367f903dc0e810f3 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 9 Feb 2022 23:08:52 -0500 Subject: [PATCH 07/32] switch mocked to match function --- tests/models/test_gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index cfd46e87b58e1..aa02719236182 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -117,8 +117,8 @@ def test_trainer_gpu_parse(_, mocked_device_count, devices, expected_num_gpus, s pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), ], ) 
-@mock.patch("torch.cuda.is_available", return_value=True) @mock.patch("torch.cuda.device_count", return_value=3) +@mock.patch("torch.cuda.is_available", return_value=True) def test_root_gpu_property(_, mocked_device_count, devices, expected_root_gpu, strategy): assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu @@ -212,8 +212,8 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun "LOCAL_WORLD_SIZE": "2", }, ) -@mock.patch("torch.cuda.is_available", return_value=True) @mock.patch("torch.cuda.device_count", return_value=1) +@mock.patch("torch.cuda.is_available", return_value=True) @pytest.mark.parametrize("devices", [[0, 1, 2], 2]) def test_torchelastic_gpu_parsing(_, mocked_device_count, devices): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit From 63b1dccd2b6b58f78777000f6e927cb7b857f8f0 Mon Sep 17 00:00:00 2001 From: Kyle Date: Wed, 9 Feb 2022 23:58:14 -0500 Subject: [PATCH 08/32] add accelerator --- tests/models/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index f4db46f98c5c7..204432b88fa61 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -328,7 +328,7 @@ def test_tpu_choice(tmpdir, devices, expected_tpu_id, error_expected): @pytest.mark.parametrize( ["cli_args", "expected"], - [("--devices=8", {"devices": 8}), ("--devices=1,", {"devices": "1,"})], + [("--accelerator=gpu --devices=8", {"devices": 8}), ("--accelerator=gpu --devices=1,", {"devices": "1,"})], ) @RunIf(tpu=True) @pl_multi_process_test From 81aa3300a129f63a87033ae365dd5cb509a01b81 Mon Sep 17 00:00:00 2001 From: Kyle Date: Sun, 13 Mar 2022 00:36:17 -0500 Subject: [PATCH 09/32] revert tests --- tests/models/test_gpu.py | 8 ++++---- tests/models/test_tpu.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index aa02719236182..9d780edec2e65 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -214,14 +214,14 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun ) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("torch.cuda.is_available", return_value=True) -@pytest.mark.parametrize("devices", [[0, 1, 2], 2]) -def test_torchelastic_gpu_parsing(_, mocked_device_count, devices): +@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"]) +def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit sanitizing the gpus as only one of the GPUs is visible.""" - trainer = Trainer(accelerator="gpu", devices=devices) + trainer = Trainer(gpus=gpus) assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) assert trainer.data_parallel_device_ids == device_parser.parse_gpu_ids(gpus) - assert trainer.devices == devices + assert trainer.gpus == gpus @RunIf(min_gpus=1) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 204432b88fa61..5920b77298e1e 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -272,8 +272,8 @@ def test_tpu_misconfiguration(): def test_exception_when_no_tpu_found(tmpdir): """Test if exception is thrown when xla devices are not available.""" - with pytest.raises(MisconfigurationException, match="but TPUs are not available."): - 
Trainer(accelerator="tpu", devices=8) + with pytest.raises(MisconfigurationException, match="No TPU devices were found."): + Trainer(tpu_cores=8) @pytest.mark.parametrize("devices", [1, 8, [1]]) From dcd3b03c023cd9b9d824505229303e739fcbe042 Mon Sep 17 00:00:00 2001 From: Kyle Date: Sun, 13 Mar 2022 01:17:12 -0500 Subject: [PATCH 10/32] put back a few more tests --- tests/models/test_gpu.py | 12 ++++++++++++ tests/models/test_horovod.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 9d780edec2e65..865a236fe87c5 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -123,6 +123,18 @@ def test_root_gpu_property(_, mocked_device_count, devices, expected_root_gpu, s assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu +@pytest.mark.parametrize( + ["gpus", "expected_root_gpu", "strategy"], + [ + pytest.param(None, None, None, id="None is None"), + pytest.param(None, None, "ddp", id="None is None"), + pytest.param(0, None, "ddp", id="None is None"), + ], +) +def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, strategy): + assert Trainer(gpus=gpus, strategy=strategy).root_gpu == expected_root_gpu + + # Asking for a gpu when non are available will result in a MisconfigurationException @pytest.mark.parametrize( ["devices", "expected_root_gpu", "strategy"], diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index a3d213ef9e920..910190fcbfa4e 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -153,6 +153,22 @@ def test_horovod_multi_gpu(tmpdir): _run_horovod(trainer_options) +@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True) +def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir): + trainer_options = dict( + default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=1, + limit_train_batches=4, + limit_val_batches=0, + accumulate_grad_batches=2, + accelerator="gpu", + devices=2, + strategy="horovod", + ) + _run_horovod(trainer_options) + + @RunIf(horovod=True, skip_windows=True) def test_horovod_raises_unsupported_accumulate_grad_batches(tmpdir): """Ensure MisConfigurationException for different `accumulate_grad_batches` at different epochs for Horovod From d9f352b48829cdc7165e4828291cc202d85b3b66 Mon Sep 17 00:00:00 2001 From: Kyle Date: Sun, 13 Mar 2022 01:22:38 -0500 Subject: [PATCH 11/32] revert test_horovod --- tests/models/test_horovod.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 910190fcbfa4e..5d553394fed9c 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -55,9 +55,7 @@ def test_nccl_is_available_on_gpu_environment(): def _run_horovod(trainer_options): """Execute the training script across multiple workers in parallel.""" - num_processes = trainer_options.get("devices", 2) - # for Horovod, we interpret `gpus` to be set per worker - trainer_options.update(accelerator="gpu" if on_gpu else "cpu") + devices = trainer_options.get("devices", 1) tutils.reset_seed() # TODO: Find out why coverage breaks CI. 
# append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' From 5e19aa385a57802687cfee02eeda5f46a9db6599 Mon Sep 17 00:00:00 2001 From: Kyle Date: Sun, 13 Mar 2022 01:27:42 -0500 Subject: [PATCH 12/32] revert tpu test --- tests/models/test_tpu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 5920b77298e1e..0cd108a07a9be 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -276,11 +276,11 @@ def test_exception_when_no_tpu_found(tmpdir): Trainer(tpu_cores=8) -@pytest.mark.parametrize("devices", [1, 8, [1]]) +@pytest.mark.parametrize("tpu_cores", [1, 8, [1]]) @RunIf(tpu=True) -def test_accelerator_set_when_using_tpu(tmpdir, devices): +def test_accelerator_set_when_using_tpu(tmpdir, tpu_cores): """Test if the accelerator is set to `tpu` when tpu_cores is not None.""" - assert isinstance(Trainer(accelerator="tpu", devices=devices).accelerator, TPUAccelerator) + assert isinstance(Trainer(tpu_cores=tpu_cores).accelerator, TPUAccelerator) @RunIf(tpu=True) From c1f3df1eec03b5bda043633d096be637410d326f Mon Sep 17 00:00:00 2001 From: Kyle Date: Mon, 14 Mar 2022 14:57:44 -0400 Subject: [PATCH 13/32] use tpu instead of gpu --- tests/models/test_tpu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 0cd108a07a9be..96569da016876 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -328,12 +328,12 @@ def test_tpu_choice(tmpdir, devices, expected_tpu_id, error_expected): @pytest.mark.parametrize( ["cli_args", "expected"], - [("--accelerator=gpu --devices=8", {"devices": 8}), ("--accelerator=gpu --devices=1,", {"devices": "1,"})], + [("--tpu_cores=8", {"tpu_cores": 8}), ("--tpu_cores=1,", {"tpu_cores": "1,"})], ) @RunIf(tpu=True) @pl_multi_process_test -def test_devices_with_argparse(cli_args, expected): - """Test passing devices in command line.""" +def test_tpu_cores_with_argparse(cli_args, expected): + """Test passing tpu_cores in command line.""" cli_args = cli_args.split(" ") if cli_args else [] with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): parser = ArgumentParser(add_help=False) From 26e5e66d63a3b8058cdb9223f5c881165e21eb7f Mon Sep 17 00:00:00 2001 From: Kyle Date: Mon, 14 Mar 2022 15:03:20 -0400 Subject: [PATCH 14/32] use devices --- tests/models/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 96569da016876..2a857b174baac 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -328,7 +328,7 @@ def test_tpu_choice(tmpdir, devices, expected_tpu_id, error_expected): @pytest.mark.parametrize( ["cli_args", "expected"], - [("--tpu_cores=8", {"tpu_cores": 8}), ("--tpu_cores=1,", {"tpu_cores": "1,"})], + [("--accelerator=tpu --devices=8", {"devices": 8}), ("--accelerator=tpu --devices=1,", {"devices": "1,"})], ) @RunIf(tpu=True) @pl_multi_process_test From 88e9752d9c549a87e6e6ff17443c787d0b6de98b Mon Sep 17 00:00:00 2001 From: Kyle Date: Mon, 14 Mar 2022 15:36:02 -0400 Subject: [PATCH 15/32] devices always returns an int or List[int] --- tests/models/test_tpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 2a857b174baac..bd0dba0be416e 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -306,9 +306,9 @@ def test_broadcast(rank): (8, None, False), ([1], 1, False), ([8], 8, 
False), - ("1,", 1, False), - ("1", None, False), - ("9, ", 9, True), + (1, 1, False), + (1, None, False), + (9, 9, True), ([9], 9, True), ([0], 0, True), (2, None, True), @@ -328,7 +328,7 @@ def test_tpu_choice(tmpdir, devices, expected_tpu_id, error_expected): @pytest.mark.parametrize( ["cli_args", "expected"], - [("--accelerator=tpu --devices=8", {"devices": 8}), ("--accelerator=tpu --devices=1,", {"devices": "1,"})], + [("--accelerator=tpu --devices=8", {"devices": 8}), ("--accelerator=tpu --devices=1,", {"devices": 1})], ) @RunIf(tpu=True) @pl_multi_process_test From 1723b833a7f44702cc5a9b5626fad65f786968cb Mon Sep 17 00:00:00 2001 From: Kyle Date: Mon, 14 Mar 2022 16:11:20 -0400 Subject: [PATCH 16/32] revert tests --- tests/models/test_tpu.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index bd0dba0be416e..2efaae65a395f 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -300,15 +300,15 @@ def test_broadcast(rank): @pytest.mark.parametrize( - ["devices", "expected_tpu_id", "error_expected"], + ["tpu_cores", "expected_tpu_id", "error_expected"], [ (1, None, False), (8, None, False), ([1], 1, False), ([8], 8, False), - (1, 1, False), - (1, None, False), - (9, 9, True), + ("1,", 1, False), + ("1", None, False), + ("9, ", 9, True), ([9], 9, True), ([0], 0, True), (2, None, True), @@ -317,18 +317,18 @@ def test_broadcast(rank): ) @RunIf(tpu=True) @pl_multi_process_test -def test_tpu_choice(tmpdir, devices, expected_tpu_id, error_expected): +def test_tpu_choice(tmpdir, tpu_cores, expected_tpu_id, error_expected): if error_expected: with pytest.raises(MisconfigurationException, match=r".*tpu_cores` can only be 1, 8 or [<1-8>]*"): - Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=devices) + Trainer(default_root_dir=tmpdir, tpu_cores=tpu_cores) else: - trainer = Trainer(default_root_dir=tmpdir, accelerator="tpu", devices=devices) + trainer = Trainer(default_root_dir=tmpdir, tpu_cores=tpu_cores) assert trainer._accelerator_connector.tpu_id == expected_tpu_id @pytest.mark.parametrize( ["cli_args", "expected"], - [("--accelerator=tpu --devices=8", {"devices": 8}), ("--accelerator=tpu --devices=1,", {"devices": 1})], + [("--tpu_cores=8", {"tpu_cores": 8}), ("--tpu_cores=1,", {"tpu_cores": "1,"})], ) @RunIf(tpu=True) @pl_multi_process_test From d35aba7651dc8c12b80875c1f7112e395d76a0cc Mon Sep 17 00:00:00 2001 From: Kyle Date: Mon, 14 Mar 2022 22:39:56 -0400 Subject: [PATCH 17/32] use devices instead of tpu_cores --- tests/models/test_tpu.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 2efaae65a395f..65daad59ae556 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -67,10 +67,10 @@ def test_model_devices_1(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) -@pytest.mark.parametrize("tpu_core", [1, 5]) +@pytest.mark.parametrize("devices", [1, 5]) @RunIf(tpu=True) @pl_multi_process_test -def test_model_tpu_index(tmpdir, tpu_core): +def test_model_tpu_index(tmpdir, devices): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -78,7 +78,7 @@ def test_model_tpu_index(tmpdir, tpu_core): enable_progress_bar=False, max_epochs=2, accelerator="tpu", - devices=[tpu_core], + devices=[devices], limit_train_batches=4, limit_val_batches=4, ) @@ -128,10 +128,10 @@ def test_model_16bit_devices_1(tmpdir): 
tpipes.run_model_test(trainer_options, model, on_gpu=False) -@pytest.mark.parametrize("tpu_core", [1, 5]) +@pytest.mark.parametrize("devices", [1, 5]) @RunIf(tpu=True) @pl_multi_process_test -def test_model_16bit_tpu_index(tmpdir, tpu_core): +def test_model_16bit_tpu_index(tmpdir, devices): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, precision=16, enable_progress_bar=False, max_epochs=2, accelerator="tpu", - devices=[tpu_core], + devices=[devices], limit_train_batches=4, limit_val_batches=2, ) From 263813060dff9044e05c65849df226724e6a3333 Mon Sep 17 00:00:00 2001 From: Kyle Date: Tue, 15 Mar 2022 01:55:50 -0400 Subject: [PATCH 18/32] revert some tests so we can use deprecation warning instead --- tests/models/test_tpu.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 65daad59ae556..87fd29f1c0209 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -67,18 +67,17 @@ def test_model_devices_1(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) -@pytest.mark.parametrize("devices", [1, 5]) +@pytest.mark.parametrize("tpu_core", [1, 5]) @RunIf(tpu=True) @pl_multi_process_test -def test_model_tpu_index(tmpdir, devices): +def test_model_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, enable_progress_bar=False, max_epochs=2, - accelerator="tpu", - devices=[devices], + tpu_cores=[tpu_core], limit_train_batches=4, limit_val_batches=4, ) @@ -128,10 +127,10 @@ def test_model_16bit_devices_1(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False) -@pytest.mark.parametrize("devices", [1, 5]) +@pytest.mark.parametrize("tpu_core", [1, 5]) @RunIf(tpu=True) @pl_multi_process_test -def test_model_16bit_tpu_index(tmpdir, devices): +def test_model_16bit_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -139,8 +138,7 @@ def test_model_16bit_tpu_index(tmpdir, devices): precision=16, enable_progress_bar=False, max_epochs=2, - accelerator="tpu", - devices=[devices], + tpu_cores=[tpu_core], limit_train_batches=4, limit_val_batches=2, ) From 7bb59ac36cfdf2f11841a9483f04070ed144ba53 Mon Sep 17 00:00:00 2001 From: Kyle Date: Tue, 22 Mar 2022 02:15:59 -0400 Subject: [PATCH 19/32] match current changes directly --- tests/models/test_hooks.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index aee49273a11a6..2c21aa66c5937 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -442,10 +442,6 @@ def _predict_batch(trainer, model, batches): pytest.param( dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) ), - pytest.param( - dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), - marks=RunIf(deepspeed=True, min_gpus=1, standalone=True), - ), ], ) @pytest.mark.parametrize("automatic_optimization", (True, False)) @@ -497,11 +493,9 @@ def training_step(self, batch, batch_idx): "state_dict": ANY, "loops": ANY, } - if kwargs.get("amp_backend") == "native": - saved_ckpt["native_amp_scaling_state"] = ANY - elif kwargs.get("amp_backend") == "apex": - saved_ckpt["amp_scaling_state"] = ANY + if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": + saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if kwargs.get("accelerator") == "gpu" 
else "cpu") + if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": + saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY + device = torch.device("cuda:0" if "gpus" in kwargs else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From 1e18c21b3b22a9d9a84564e3dd018d6437da74ea Mon Sep 17 00:00:00 2001 From: Jv Kyle Eclarin Date: Thu, 24 Mar 2022 01:20:39 -0400 Subject: [PATCH 20/32] revert tpu test save the best for last --- tests/models/test_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 281339ba3df70..eba6dbd611bbb 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -440,7 +440,7 @@ def _predict_batch(trainer, model, batches): # these precision plugins modify the optimization flow, so testing them explicitly pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) + dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) ), ], ) From c61b4f7f524e39311b294d699772f186eaa1f99f Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 24 Mar 2022 19:38:20 -0400 Subject: [PATCH 21/32] use accelerator="gpu" --- tests/models/test_hooks.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index eba6dbd611bbb..c605e15561ba2 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -439,8 +439,9 @@ def _predict_batch(trainer, model, batches): {}, # these precision plugins modify the optimization flow, so testing them explicitly pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) + dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) ), ], ) @@ -495,7 +496,7 @@ def training_step(self, batch, batch_idx): } if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if "gpus" in kwargs else "cpu") + device = torch.device("cuda:0" if kwargs["accelerator"] == "gpu" else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), @@ -553,7 +554,6 @@ def training_step(self, batch, batch_idx): dict(name="training_epoch_end", args=([dict(loss=ANY)] * train_batches,)), dict(name="Callback.on_train_epoch_end", args=(trainer, model)), # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_train_epoch_end` - dict(name="Callback.state_dict"), dict(name="Callback.on_save_checkpoint", args=(trainer, model, saved_ckpt)), dict(name="on_save_checkpoint", args=(saved_ckpt,)), dict(name="on_train_epoch_end"), @@ -627,7 +627,6 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): dict(name="setup", kwargs=dict(stage="fit")), dict(name="on_load_checkpoint", args=(loaded_ckpt,)), 
dict(name="Callback.on_load_checkpoint", args=(trainer, model, {"foo": True})), - dict(name="Callback.load_state_dict", args=({"foo": True},)), dict(name="configure_sharded_model"), dict(name="Callback.on_configure_sharded_model", args=(trainer, model)), dict(name="configure_optimizers"), @@ -649,7 +648,6 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): *model._train_batch(trainer, model, steps_after_reload, current_batch=1, current_epoch=1), dict(name="training_epoch_end", args=([dict(loss=ANY)] * train_batches,)), dict(name="Callback.on_train_epoch_end", args=(trainer, model)), - dict(name="Callback.state_dict"), dict(name="Callback.on_save_checkpoint", args=(trainer, model, saved_ckpt)), dict(name="on_save_checkpoint", args=(saved_ckpt,)), dict(name="on_train_epoch_end"), From 548426ed5eac3509981f9673777ff303c9b5b08f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Mar 2022 23:39:39 +0000 Subject: [PATCH 22/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/models/test_hooks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index c605e15561ba2..99397de7d5e90 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -439,9 +439,12 @@ def _predict_batch(trainer, model, batches): {}, # these precision plugins modify the optimization flow, so testing them explicitly pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), - pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) + dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) + ), + pytest.param( + dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), + marks=RunIf(deepspeed=True, min_gpus=1, standalone=True), ), ], ) From 8e71f4174f2c4effdccd9b69e4ae1a148e78dfb8 Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 24 Mar 2022 19:44:39 -0400 Subject: [PATCH 23/32] use if statements for readability --- tests/models/test_hooks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index c605e15561ba2..6cd54238ddf2d 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -496,7 +496,9 @@ def training_step(self, batch, batch_idx): } if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if kwargs["accelerator"] == "gpu" else "cpu") + device = torch.device("cpu") + if "accelerator" in kwargs: + device = torch.device("cuda:0") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From fc7d4a7d4e2b374e65e80b03fd84272047c1d82f Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 24 Mar 2022 20:01:42 -0400 Subject: [PATCH 24/32] revert change for now --- tests/models/test_hooks.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index ee6c6e20f3373..423ff7a1f28b6 100644 --- a/tests/models/test_hooks.py +++ 
b/tests/models/test_hooks.py @@ -438,13 +438,10 @@ def _predict_batch(trainer, model, batches): [ {}, # these precision plugins modify the optimization flow, so testing them explicitly - pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) - ), - pytest.param( - dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), - marks=RunIf(deepspeed=True, min_gpus=1, standalone=True), + dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) ), ], ) @@ -499,9 +496,7 @@ def training_step(self, batch, batch_idx): } if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cpu") - if "accelerator" in kwargs: - device = torch.device("cuda:0") + device = torch.device("cuda:0" if "gpus" in kwargs else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From 88d3c0c003e68ad97abfc06709c3e0a839ade160 Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 24 Mar 2022 20:51:24 -0400 Subject: [PATCH 25/32] try accelerator=gpu again, but with the right indent --- tests/models/test_hooks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 423ff7a1f28b6..9eda9d9d60043 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -438,10 +438,10 @@ def _predict_batch(trainer, model, batches): [ {}, # these precision plugins modify the optimization flow, so testing them explicitly - pytest.param(dict(gpus=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), - pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), + pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) + dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) ), ], ) @@ -496,7 +496,7 @@ def training_step(self, batch, batch_idx): } if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if "gpus" in kwargs else "cpu") + device = torch.device("cuda:0" if "accelerator" in kwargs else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From 68cdb795097b526f425282b0af6de7f4aad12c77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Mar 2022 00:52:48 +0000 Subject: [PATCH 26/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/models/test_hooks.py | 7 +++++-- 1 file changed, 5 
insertions(+), 2 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 9eda9d9d60043..e4acdef811589 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -439,9 +439,12 @@ def _predict_batch(trainer, model, batches): {}, # these precision plugins modify the optimization flow, so testing them explicitly pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), - pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) + dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) + ), + pytest.param( + dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), + marks=RunIf(deepspeed=True, min_gpus=1, standalone=True), ), ], ) From 46088a105259ea85dfcfc994f7446e7816b4ffe8 Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 24 Mar 2022 21:56:03 -0400 Subject: [PATCH 27/32] revert test again; it wasn't the indent --- tests/models/test_hooks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 9eda9d9d60043..eb8802db42ed1 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -438,10 +438,10 @@ def _predict_batch(trainer, model, batches): [ {}, # these precision plugins modify the optimization flow, so testing them explicitly - pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), - pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) + dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) ), ], ) @@ -496,7 +496,7 @@ def training_step(self, batch, batch_idx): } if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex": saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY - device = torch.device("cuda:0" if "accelerator" in kwargs else "cpu") + device = torch.device("cuda:0" if "gpus" in kwargs else "cpu") expected = [ dict(name="Callback.on_init_start", args=(trainer,)), dict(name="Callback.on_init_end", args=(trainer,)), From dfb7d36473b9a869081fe5b59bdb5710180bbe0c Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 24 Mar 2022 22:00:38 -0400 Subject: [PATCH 28/32] revert test, it wasn't the indent --- tests/models/test_hooks.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 67c762af321ad..eb8802db42ed1 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -438,13 +438,10 @@ def _predict_batch(trainer, model, batches): [ {}, # these precision plugins modify the optimization flow, so testing them explicitly - pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, 
amp_backend="native"), marks=RunIf(min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)), pytest.param( - dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1) - ), - pytest.param( - dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"), - marks=RunIf(deepspeed=True, min_gpus=1, standalone=True), + dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True) ), ], ) From 45e6dd7fc41cb7cba934aa923a0e80567214b464 Mon Sep 17 00:00:00 2001 From: Kyle Date: Fri, 25 Mar 2022 11:30:49 -0400 Subject: [PATCH 29/32] add Callback.state_dict back --- tests/models/test_hooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index eb8802db42ed1..ab80a07740cc8 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -554,6 +554,7 @@ def training_step(self, batch, batch_idx): dict(name="training_epoch_end", args=([dict(loss=ANY)] * train_batches,)), dict(name="Callback.on_train_epoch_end", args=(trainer, model)), # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_train_epoch_end` + dict(name="Callback.state_dict"), dict(name="Callback.on_save_checkpoint", args=(trainer, model, saved_ckpt)), dict(name="on_save_checkpoint", args=(saved_ckpt,)), dict(name="on_train_epoch_end"), From a3582b99edc7b28e0d9d89f9f20547508dc39ca8 Mon Sep 17 00:00:00 2001 From: Kyle Date: Fri, 25 Mar 2022 11:48:30 -0400 Subject: [PATCH 30/32] add Callback.load_state_dict back to test_trainer_model_hook_system_fit_no_val_and_resume --- tests/models/test_hooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 6de9716d3d26f..548221fa880d8 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -627,6 +627,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage="fit")), dict(name="setup", kwargs=dict(stage="fit")), dict(name="on_load_checkpoint", args=(loaded_ckpt,)), + dict(name="Callback.load_state_dict", args=({"foo": True},)), dict(name="Callback.on_load_checkpoint", args=(trainer, model, {"foo": True})), dict(name="configure_sharded_model"), dict(name="Callback.on_configure_sharded_model", args=(trainer, model)), From d05cc21ef80dbc5367b4ee64d7a6eb8300924ef9 Mon Sep 17 00:00:00 2001 From: Kyle Date: Fri, 25 Mar 2022 12:12:24 -0400 Subject: [PATCH 31/32] wow, missed a few things when reverting the test --- tests/models/test_hooks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 548221fa880d8..97b6c8a0f09a3 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -627,8 +627,8 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): dict(name="Callback.setup", args=(trainer, model), kwargs=dict(stage="fit")), dict(name="setup", kwargs=dict(stage="fit")), dict(name="on_load_checkpoint", args=(loaded_ckpt,)), - dict(name="Callback.load_state_dict", args=({"foo": True},)), dict(name="Callback.on_load_checkpoint", args=(trainer, model, {"foo": True})), + dict(name="Callback.load_state_dict", args=({"foo": True},)), dict(name="configure_sharded_model"), dict(name="Callback.on_configure_sharded_model", args=(trainer, model)), dict(name="configure_optimizers"), @@ -650,6 +650,7 @@ def 
test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): *model._train_batch(trainer, model, steps_after_reload, current_batch=1, current_epoch=1), dict(name="training_epoch_end", args=([dict(loss=ANY)] * train_batches,)), dict(name="Callback.on_train_epoch_end", args=(trainer, model)), + dict(name="Callback.state_dict"), dict(name="Callback.on_save_checkpoint", args=(trainer, model, saved_ckpt)), dict(name="on_save_checkpoint", args=(saved_ckpt,)), dict(name="on_train_epoch_end"), From 2fbd71da5d24909b084ae70e8d67316aa36658db Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 26 Mar 2022 19:28:20 +0530 Subject: [PATCH 32/32] Apply suggestions from code review --- tests/models/test_tpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 078b7e2808339..3b5c53a4d3397 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -50,7 +50,7 @@ def val_dataloader(self): @RunIf(tpu=True) @pl_multi_process_test -def test_model_devices_1(tmpdir): +def test_model_tpu_devices_1(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -90,7 +90,7 @@ def test_model_tpu_index(tmpdir, tpu_core): @RunIf(tpu=True) @pl_multi_process_test -def test_model_devices_8(tmpdir): +def test_model_tpu_devices_8(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -110,7 +110,7 @@ def test_model_devices_8(tmpdir): @RunIf(tpu=True) @pl_multi_process_test -def test_model_16bit_devices_1(tmpdir): +def test_model_16bit_tpu_devices_1(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict( @@ -152,7 +152,7 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): @RunIf(tpu=True) @pl_multi_process_test -def test_model_16bit_devices_8(tmpdir): +def test_model_16bit_tpu_devices_8(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() trainer_options = dict(