Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Horovod] Fix Reduce for Horovod #6585

Closed
wants to merge 12 commits into from
Prev Previous commit
Next Next commit
Merge branch 'master' into horovod-reduce
  • Loading branch information
carmocca committed Apr 19, 2021
commit 3b293a758fd58ae84aa7630f9641a0090eeaee91
6 changes: 3 additions & 3 deletions tests/models/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _run_horovod(trainer_options, on_gpu=False):
# for Horovod, we interpret `gpus` to be set per worker
trainer_options.update(gpus=1 if on_gpu else None)
tutils.reset_seed()
# TODO: find why coverage breaks CI
# TODO: Find out why coverage breaks CI.
# append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''
# str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,
cmdline = [
Expand Down Expand Up @@ -275,6 +275,7 @@ def get_optimizer_params(optimizer):
assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])


# todo: need to be fixed :]
@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
@RunIf(skip_windows=True, horovod=True)
def test_result_reduce_horovod(tmpdir):
Expand Down Expand Up @@ -325,9 +326,8 @@ def training_epoch_end(self, outputs) -> None:
horovod.run(hvd_test_fn, np=2)


@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
# todo: need to be fixed :]
@RunIf(skip_windows=True, horovod=True, num_gpus=2)
def test_accuracy_metric_horovod():
num_batches = 10
batch_size = 16
threshold = 0.5
Expand Down
85 changes: 59 additions & 26 deletions tests/special_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,62 @@ set -e

# this environment variable allows special tests to run
export PL_RUNNING_SPECIAL_TESTS=1
DEFAULTS="-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no"
python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp
python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_invalid_deepspeed_defaults_no_precision
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_warn_deepspeed_override_backward
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_run_configure_optimizers
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_config
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_custom_precision_params
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_assert_config_zero_offload_disabled
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu
python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual
python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp
python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic
python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance
python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection
python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp
python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp
python ${DEFAULTS} tests/trainer/logging_/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp
python ${DEFAULTS} tests/callbacks/test_pruning.py::test_pruning_callback_ddp
python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_trainer_ddp
python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp
python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler
python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model
python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_distributed
nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
# python arguments shared by every test invocation below
defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no'

# find tests marked as `@RunIf(special=True)`
# NOTE: the pattern is passed via `--regexp` BEFORE the path operands so the
# command also works with grep implementations that do not permute arguments
# (BSD grep, GNU grep under POSIXLY_CORRECT)
grep_output=$(grep --recursive --line-number --word-regexp --regexp 'special=True' 'tests' 'benchmarks')

# file paths: first `:`-separated field of each `path:lineno:match` grep line,
# read into an array one entry per line (robust against word-splitting/globbing)
mapfile -t files_arr < <(echo "$grep_output" | cut -f1 -d:)
# matching line numbers: second field
mapfile -t linenos_arr < <(echo "$grep_output" | cut -f2 -d:)

# tests to skip - space separated
blocklist='test_pytorch_profiler_nested_emit_nvtx'
report=''

# for every `@RunIf(special=True)` marker found, locate the next test def
# below it and either run it or record why it was skipped
for i in "${!files_arr[@]}"; do
  file=${files_arr[$i]}
  lineno=${linenos_arr[$i]}

  # get code from the `@RunIf(special=True)` line to EOF
  test_code=$(tail -n +"$lineno" "$file")

  # read line by line
  while read -r line; do
    # if it's a test definition
    if [[ $line == def\ test_* ]]; then
      # extract the name: drop the leading `def ` and everything from `(` on
      # ($line is quoted so glob characters / whitespace cannot mangle it)
      test_name=$(echo "$line" | cut -c 5- | cut -f1 -d\()

      # check blocklist
      if echo "$blocklist" | grep --word-regexp "$test_name" > /dev/null; then
        report+="Skipped\t$file:$lineno::$test_name\n"
        break
      fi

      # SPECIAL_PATTERN allows filtering the tests to run when debugging.
      # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only
      # tests with `foo_bar` in their name
      if [[ $line != *$SPECIAL_PATTERN* ]]; then
        report+="Skipped\t$file:$lineno::$test_name\n"
        break
      fi

      # run the test
      report+="Ran\t$file:$lineno::$test_name\n"
      python ${defaults} "${file}::${test_name}"
      break
    fi
  done < <(echo "$test_code")
done

# profiled run: kept out of the loop because it must go through nvprof
nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx

# echo test report
printf '=%.s' {1..80}
# `%b` expands the literal `\t`/`\n` sequences accumulated in $report while
# keeping $report out of the format string — a `%` in a file or test name
# would otherwise be interpreted as a printf conversion specifier
printf '%b' "\n$report"
printf '=%.s' {1..80}
printf '\n'
You are viewing a condensed version of this merge commit. You can view the full changes here.