Update tests/models/*.py to use devices instead of gpus or ipus #11470

Merged
merged 46 commits into master from update_models on Mar 26, 2022
Changes from 19 commits
Commits (46)
de1401d
update tests for v2
mathemusician Jan 14, 2022
b581409
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 14, 2022
5257653
get rid of devices = 0 or devices = None
mathemusician Feb 9, 2022
9e75a0f
use gpu when accelerator="gpu"
mathemusician Feb 9, 2022
84caafc
use the right way of getting kwargs
mathemusician Feb 9, 2022
0b199dd
use devices instead of tpu_cores
mathemusician Feb 10, 2022
2318b43
switch mocked to match function
mathemusician Feb 10, 2022
63b1dcc
add accelerator
mathemusician Feb 10, 2022
81aa330
revert tests
mathemusician Mar 13, 2022
dcd3b03
put back a few more tests
mathemusician Mar 13, 2022
d9f352b
revert test_horovod
mathemusician Mar 13, 2022
5e19aa3
revert tpu test
mathemusician Mar 13, 2022
e86d265
Merge branch 'PyTorchLightning:master' into update_models
mathemusician Mar 14, 2022
c1f3df1
use tpu instead of gpu
mathemusician Mar 14, 2022
12041e1
Merge branch 'update_models' of https://github.com/mathemusician/pyto…
mathemusician Mar 14, 2022
26e5e66
use devices
mathemusician Mar 14, 2022
88e9752
devices always returns an int or List[int]
mathemusician Mar 14, 2022
1723b83
revert tests
mathemusician Mar 14, 2022
d35aba7
use devices instead of tpu_cores
mathemusician Mar 15, 2022
2638130
revert some tests so we can use depreacation warning instead
mathemusician Mar 15, 2022
9691028
Merge branch 'PyTorchLightning:master' into update_models
mathemusician Mar 18, 2022
c952dff
Merge branch 'master' into update_models
mathemusician Mar 22, 2022
7bb59ac
match current changes directly
mathemusician Mar 22, 2022
4d37f7d
Merge branch 'update_models' of https://github.com/mathemusician/pyto…
mathemusician Mar 22, 2022
ef61e37
Merge branch 'master' into update_models
mathemusician Mar 22, 2022
1e18c21
revert tpu test
mathemusician Mar 24, 2022
bee050c
Merge branch 'PyTorchLightning:master' into update_models
mathemusician Mar 24, 2022
c61b4f7
use accelerator="gpu"
mathemusician Mar 24, 2022
b6226af
Merge branch 'PyTorchLightning:master' into update_models
mathemusician Mar 24, 2022
548426e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 24, 2022
8e71f41
use if statements for readability
mathemusician Mar 24, 2022
0cd25ad
Merge branch 'update_models' of https://github.com/mathemusician/pyto…
mathemusician Mar 24, 2022
fc7d4a7
revert change for now
mathemusician Mar 25, 2022
88d3c0c
try accelerator=gpu again, but with the right indent
mathemusician Mar 25, 2022
68cdb79
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 25, 2022
46088a1
revert test again; it wasn't the indent
mathemusician Mar 25, 2022
8f69374
Merge branch 'update_models' of https://github.com/mathemusician/pyto…
mathemusician Mar 25, 2022
dfb7d36
revert test, it wasn't the indent
mathemusician Mar 25, 2022
e5de2d3
Merge branch 'master' into update_models
mathemusician Mar 25, 2022
45e6dd7
add Callback.state_dict back
mathemusician Mar 25, 2022
33a8ecc
Merge branch 'update_models' of https://github.com/mathemusician/pyto…
mathemusician Mar 25, 2022
a3582b9
add Callback.load_state_dict back to test_trainer_model_hook_system_f…
mathemusician Mar 25, 2022
d05cc21
wow, missed a few things when reverting the test
mathemusician Mar 25, 2022
c8d3819
Merge branch 'PyTorchLightning:master' into update_models
mathemusician Mar 25, 2022
5c812f3
Merge branch 'PyTorchLightning:master' into update_models
mathemusician Mar 25, 2022
2fbd71d
Apply suggestions from code review
rohitgr7 Mar 26, 2022
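All of the file diffs below apply the same migration: the deprecated `gpus`, `num_processes`, and `tpu_cores` Trainer arguments (and, per the PR title, `ipus`) are replaced by an explicit `accelerator` plus `devices`. A minimal sketch of the pattern, assuming a PyTorch Lightning version that already accepts `accelerator`/`devices`; the surrounding flags are illustrative and not copied from these tests:

import torch
from pytorch_lightning import Trainer

# Old style (deprecated by this migration):
#   Trainer(gpus=2)            # GPU count
#   Trainer(num_processes=2)   # CPU processes
#   Trainer(tpu_cores=8)       # TPU cores

# New style: name the accelerator explicitly and pass the count or ids via `devices`.
if torch.cuda.is_available():
    trainer = Trainer(accelerator="gpu", devices=2, max_epochs=1)
else:
    trainer = Trainer(accelerator="cpu", devices=2, max_epochs=1)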
2 changes: 1 addition & 1 deletion tests/models/data/horovod/train_default_model.py
@@ -100,7 +100,7 @@ def training_epoch_end(self, outputs) -> None:
trainer._checkpoint_connector.restore(checkpoint_path)

if on_gpu:
trainer = Trainer(gpus=1, strategy="horovod", max_epochs=1)
trainer = Trainer(accelerator="gpu", devices=1, strategy="horovod", max_epochs=1)
# Test the root_gpu property
assert trainer.root_gpu == hvd.local_rank()

31 changes: 23 additions & 8 deletions tests/models/test_amp.py
@@ -77,13 +77,18 @@ def _assert_autocast_enabled(self):
],
)
@pytest.mark.parametrize("precision", [16, "bf16"])
@pytest.mark.parametrize("num_processes", [1, 2])
def test_amp_cpus(tmpdir, strategy, precision, num_processes):
@pytest.mark.parametrize("devices", [1, 2])
def test_amp_cpus(tmpdir, strategy, precision, devices):
"""Make sure combinations of AMP and training types work if supported."""
tutils.reset_seed()

trainer = Trainer(
default_root_dir=tmpdir, num_processes=num_processes, max_epochs=1, strategy=strategy, precision=precision
default_root_dir=tmpdir,
accelerator="cpu",
devices=devices,
max_epochs=1,
strategy=strategy,
precision=precision,
)

model = AMPTestModel()
@@ -97,12 +102,19 @@ def test_amp_cpus(tmpdir, strategy, precision, num_processes):
@RunIf(min_gpus=2, min_torch="1.10")
@pytest.mark.parametrize("strategy", [None, "dp", "ddp_spawn"])
@pytest.mark.parametrize("precision", [16, "bf16"])
@pytest.mark.parametrize("gpus", [1, 2])
def test_amp_gpus(tmpdir, strategy, precision, gpus):
@pytest.mark.parametrize("devices", [1, 2])
def test_amp_gpus(tmpdir, strategy, precision, devices):
"""Make sure combinations of AMP and training types work if supported."""
tutils.reset_seed()

trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, gpus=gpus, strategy=strategy, precision=precision)
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
accelerator="gpu",
devices=devices,
strategy=strategy,
precision=precision,
)

model = AMPTestModel()
trainer.fit(model)
@@ -141,7 +153,8 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir):
trainer = Trainer(
default_root_dir=tmpdir,
max_epochs=1,
gpus=[0],
accelerator="gpu",
devices=[0],
strategy="ddp_spawn",
precision=16,
callbacks=[checkpoint],
@@ -195,7 +208,9 @@ def configure_optimizers(self):
model = CustomModel()
model.training_epoch_end = None

trainer = Trainer(default_root_dir=tmpdir, max_steps=5, precision=16, amp_backend="apex", gpus=1)
trainer = Trainer(
default_root_dir=tmpdir, max_steps=5, precision=16, amp_backend="apex", accelerator="gpu", devices=1
)
assert str(trainer.amp_backend) == "AMPType.APEX"
trainer.fit(model)
assert trainer.state.finished, f"Training failed with {trainer.state}"
4 changes: 2 additions & 2 deletions tests/models/test_cpu.py
@@ -136,8 +136,8 @@ def test_multi_cpu_model_ddp(tmpdir):
max_epochs=1,
limit_train_batches=0.4,
limit_val_batches=0.2,
gpus=None,
num_processes=2,
accelerator="cpu",
devices=2,
strategy="ddp_spawn",
)

78 changes: 34 additions & 44 deletions tests/models/test_gpu.py
@@ -47,7 +47,8 @@ def test_multi_gpu_none_backend(tmpdir):
max_epochs=1,
limit_train_batches=0.2,
limit_val_batches=0.2,
gpus=2,
accelerator="gpu",
devices=2,
)

dm = ClassifDataModule()
@@ -56,16 +57,17 @@


@RunIf(min_gpus=2)
@pytest.mark.parametrize("gpus", [1, [0], [1]])
def test_single_gpu_model(tmpdir, gpus):
@pytest.mark.parametrize("devices", [1, [0], [1]])
def test_single_gpu_model(tmpdir, devices):
"""Make sure single GPU works (DP mode)."""
trainer_options = dict(
default_root_dir=tmpdir,
enable_progress_bar=False,
max_epochs=1,
limit_train_batches=0.1,
limit_val_batches=0.1,
gpus=gpus,
accelerator="gpu",
devices=devices,
)

model = BoringModel()
@@ -93,44 +95,32 @@ def device_count():


@pytest.mark.parametrize(
["gpus", "expected_num_gpus", "strategy"],
["devices", "expected_num_gpus", "strategy"],
[
pytest.param(None, 0, None, id="None - expect 0 gpu to use."),
pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."),
pytest.param(1, 1, None, id="1st gpu, expect 1 gpu to use."),
pytest.param(-1, PRETEND_N_OF_GPUS, "ddp", id="-1 - use all gpus"),
pytest.param("-1", PRETEND_N_OF_GPUS, "ddp", id="'-1' - use all gpus"),
pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"),
],
)
def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, strategy):
assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus


@pytest.mark.parametrize(
["gpus", "expected_num_gpus", "strategy"],
[
pytest.param(None, 0, None, id="None - expect 0 gpu to use."),
pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."),
],
)
def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, strategy):
assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus
@mock.patch("torch.cuda.is_available", return_value=True)
def test_trainer_gpu_parse(_, mocked_device_count, devices, expected_num_gpus, strategy):
assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).num_gpus == expected_num_gpus


@pytest.mark.parametrize(
["gpus", "expected_root_gpu", "strategy"],
["devices", "expected_root_gpu", "strategy"],
[
pytest.param(None, None, "ddp", id="None is None"),
pytest.param(0, None, "ddp", id="O gpus, expect gpu root device to be None."),
pytest.param(1, 0, "ddp", id="1 gpu, expect gpu root device to be 0."),
pytest.param(-1, 0, "ddp", id="-1 - use all gpus, expect gpu root device to be 0."),
pytest.param("-1", 0, "ddp", id="'-1' - use all gpus, expect gpu root device to be 0."),
pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"),
],
)
def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, strategy):
assert Trainer(gpus=gpus, strategy=strategy).root_gpu == expected_root_gpu
@mock.patch("torch.cuda.device_count", return_value=3)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_root_gpu_property(_, mocked_device_count, devices, expected_root_gpu, strategy):
assert Trainer(accelerator="gpu", devices=devices, strategy=strategy).root_gpu == expected_root_gpu


@pytest.mark.parametrize(
@@ -147,7 +137,7 @@ def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_

# Asking for a gpu when none are available will result in a MisconfigurationException
@pytest.mark.parametrize(
["gpus", "expected_root_gpu", "strategy"],
["devices", "expected_root_gpu", "strategy"],
[
(1, None, "ddp"),
(3, None, "ddp"),
@@ -158,13 +148,13 @@ def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_
("-1", None, "ddp"),
],
)
def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, strategy):
def test_root_gpu_property_0_raising(mocked_device_count_0, devices, expected_root_gpu, strategy):
with pytest.raises(MisconfigurationException):
Trainer(gpus=gpus, strategy=strategy)
Trainer(accelerator="gpu", devices=devices, strategy=strategy)


@pytest.mark.parametrize(
["gpus", "expected_root_gpu"],
["devices", "expected_root_gpu"],
[
pytest.param(None, None, id="No gpus, expect gpu root device to be None"),
pytest.param([0], 0, id="Oth gpu, expect gpu root device to be 0."),
@@ -173,12 +163,12 @@ def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_
pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."),
],
)
def test_determine_root_gpu_device(gpus, expected_root_gpu):
assert device_parser.determine_root_gpu_device(gpus) == expected_root_gpu
def test_determine_root_gpu_device(devices, expected_root_gpu):
assert device_parser.determine_root_gpu_device(devices) == expected_root_gpu


@pytest.mark.parametrize(
["gpus", "expected_gpu_ids"],
["devices", "expected_gpu_ids"],
[
(None, None),
(0, None),
@@ -196,31 +186,31 @@ def test_determine_root_gpu_device(gpus, expected_root_gpu):
pytest.param("-1", list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"),
],
)
def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids):
assert device_parser.parse_gpu_ids(gpus) == expected_gpu_ids
def test_parse_gpu_ids(mocked_device_count, devices, expected_gpu_ids):
assert device_parser.parse_gpu_ids(devices) == expected_gpu_ids


@pytest.mark.parametrize("gpus", [0.1, -2, False, [-1], [None], ["0"], [0, 0]])
def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus):
@pytest.mark.parametrize("devices", [0.1, -2, False, [-1], [None], ["0"], [0, 0]])
def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, devices):
with pytest.raises(MisconfigurationException):
device_parser.parse_gpu_ids(gpus)
device_parser.parse_gpu_ids(devices)


@pytest.mark.parametrize("gpus", [[1, 2, 19], -1, "-1"])
def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus):
@pytest.mark.parametrize("devices", [[1, 2, 19], -1, "-1"])
def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, devices):
with pytest.raises(MisconfigurationException):
device_parser.parse_gpu_ids(gpus)
device_parser.parse_gpu_ids(devices)


def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count):
with pytest.raises(MisconfigurationException):
device_parser.parse_gpu_ids([1, 2, 19])


@pytest.mark.parametrize("gpus", [-1, "-1"])
def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, gpus):
@pytest.mark.parametrize("devices", [-1, "-1"])
def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, devices):
with pytest.raises(MisconfigurationException):
device_parser.parse_gpu_ids(gpus)
device_parser.parse_gpu_ids(devices)


@mock.patch.dict(
@@ -248,7 +238,7 @@ def test_torchelastic_gpu_parsing(mocked_device_count, mocked_is_available, gpus

@RunIf(min_gpus=1)
def test_single_gpu_batch_parse():
trainer = Trainer(gpus=1)
trainer = Trainer(accelerator="gpu", devices=1)

# non-transferrable types
primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]
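The mocked parsing tests in tests/models/test_gpu.py above exercise accelerator="gpu" on machines with no physical GPU by patching torch.cuda. A condensed sketch of that pattern, assuming the Lightning version under test still exposes Trainer.num_gpus; the test name and device counts here are illustrative:

from unittest import mock

from pytorch_lightning import Trainer


@mock.patch("torch.cuda.device_count", return_value=3)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_devices_parsed_without_real_gpus(mock_is_available, mock_device_count):
    # With CUDA mocked as present, accelerator="gpu" is accepted on a CPU-only
    # CI machine, so device parsing can still be asserted.
    trainer = Trainer(accelerator="gpu", devices=2, strategy="ddp")
    assert trainer.num_gpus == 2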
22 changes: 14 additions & 8 deletions tests/models/test_hooks.py
@@ -156,7 +156,7 @@ def transfer_batch_to_device(self, batch, device, dataloader_idx):
model = CurrentTestModel()
batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long)))

trainer = Trainer(gpus=1)
trainer = Trainer(accelerator="gpu", devices=1)
# running .fit() would require us to implement custom data loaders, we mock the model reference instead

model_getter_mock.return_value = model
@@ -203,7 +204,8 @@ def train_dataloader(self):
max_epochs=1,
enable_model_summary=False,
strategy="ddp",
gpus=2,
accelerator="gpu",
devices=2,
)
trainer.fit(model)

@@ -437,10 +438,13 @@ def _predict_batch(trainer, model, batches):
[
{},
# these precision plugins modify the optimization flow, so testing them explicitly
pytest.param(dict(gpus=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)),
pytest.param(dict(gpus=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)),
pytest.param(dict(accelerator="gpu", devices=1, precision=16, amp_backend="native"), marks=RunIf(min_gpus=1)),
pytest.param(
dict(gpus=1, precision=16, strategy="deepspeed"), marks=RunIf(deepspeed=True, min_gpus=1, standalone=True)
dict(accelerator="gpu", devices=1, precision=16, amp_backend="apex"), marks=RunIf(amp_apex=True, min_gpus=1)
),
pytest.param(
dict(accelerator="gpu", devices=1, precision=16, strategy="deepspeed"),
marks=RunIf(deepspeed=True, min_gpus=1, standalone=True),
),
],
)
@@ -493,9 +497,11 @@ def training_step(self, batch, batch_idx):
"state_dict": ANY,
"loops": ANY,
}
if kwargs.get("amp_backend") == "native" or kwargs.get("amp_backend") == "apex":
saved_ckpt[trainer.precision_plugin.__class__.__qualname__] = ANY
device = torch.device("cuda:0" if "gpus" in kwargs else "cpu")
if kwargs.get("amp_backend") == "native":
saved_ckpt["native_amp_scaling_state"] = ANY
elif kwargs.get("amp_backend") == "apex":
saved_ckpt["amp_scaling_state"] = ANY
device = torch.device("cuda:0" if kwargs.get("accelerator") == "gpu" else "cpu")
expected = [
dict(name="Callback.on_init_start", args=(trainer,)),
dict(name="Callback.on_init_end", args=(trainer,)),
5 changes: 3 additions & 2 deletions tests/models/test_onnx.py
@@ -43,7 +43,7 @@ def test_model_saves_with_input_sample(tmpdir):
def test_model_saves_on_gpu(tmpdir):
"""Test that model saves on gpu."""
model = BoringModel()
trainer = Trainer(gpus=1, fast_dev_run=True)
trainer = Trainer(accelerator="gpu", devices=1, fast_dev_run=True)
trainer.fit(model)

file_path = os.path.join(tmpdir, "model.onnx")
@@ -96,7 +96,8 @@ def test_model_saves_on_multi_gpu(tmpdir):
max_epochs=1,
limit_train_batches=10,
limit_val_batches=10,
gpus=[0, 1],
accelerator="gpu",
devices=[0, 1],
strategy="ddp_spawn",
enable_progress_bar=False,
)
8 changes: 5 additions & 3 deletions tests/models/test_restore.py
@@ -399,7 +399,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
limit_val_batches=5,
callbacks=[checkpoint],
logger=logger,
gpus=[0, 1],
accelerator="gpu",
devices=[0, 1],
strategy="dp",
default_root_dir=tmpdir,
)
@@ -445,7 +446,8 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
limit_val_batches=2,
callbacks=[checkpoint],
logger=logger,
gpus=[0, 1],
accelerator="gpu",
devices=[0, 1],
strategy="ddp_spawn",
default_root_dir=tmpdir,
)
@@ -564,7 +566,7 @@ def test_dp_resume(tmpdir):
model = CustomClassificationModelDP(lr=0.1)
dm = ClassifDataModule()

trainer_options = dict(max_epochs=1, gpus=2, strategy="dp", default_root_dir=tmpdir)
trainer_options = dict(max_epochs=1, accelerator="gpu", devices=2, strategy="dp", default_root_dir=tmpdir)

# get logger
logger = tutils.get_default_logger(tmpdir)