From 7ac3f5fc99998ef1dfbaf324289878032a8ac691 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Fri, 13 Jan 2023 12:04:48 +0100
Subject: [PATCH 01/10] Adding missing pre-commit requirement to tests.txt

---
 requirements/tests.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/tests.txt b/requirements/tests.txt
index debf7eb171..c54422187e 100644
--- a/requirements/tests.txt
+++ b/requirements/tests.txt
@@ -1,4 +1,5 @@
 coverage
 lmdb
 parameterized
+pre-commit
 pytest

From d29d73c17acc8dde4a0a44561f7873e981042341 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Fri, 13 Jan 2023 14:07:05 +0100
Subject: [PATCH 02/10] Added support for setting a timeout for distributed learning

---
 mmengine/dist/utils.py    | 15 +++++++++++++++
 mmengine/runner/runner.py |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index f5ec3cf51d..8c39e3c8c8 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import datetime
 import functools
 import os
 import subprocess
@@ -50,6 +51,20 @@ def init_dist(launcher, backend='nccl', **kwargs) -> None:
             'gloo' and 'mpi'. Defaults to 'nccl'.
         **kwargs: keyword arguments are passed to ``init_process_group``.
     """
+    timeout = kwargs.get('timeout', None)
+    if timeout:
+        # If a timeout (in seconds) is specified, it must be converted
+        # to a timedelta object before forwarding the call to
+        # the respective backend, because they expect a timedelta object.
+        if type(timeout) == int:
+            kwargs['timeout'] = datetime.timedelta(
+                seconds=int(kwargs['timeout']))
+        else:
+            raise TypeError(
+                f'Timeout for distributed training must be provided as '
+                f"integer (timeout in seconds), but we've received the type "
+                f'{type(timeout)}. Please specify the timeout like this: '
+                f"dist_cfg=dict(backend='nccl', timeout=1800)")
     if mp.get_start_method(allow_none=True) is None:
         mp.set_start_method('spawn')
     if launcher == 'pytorch':
diff --git a/mmengine/runner/runner.py b/mmengine/runner/runner.py
index b706bfc710..0b891aacff 100644
--- a/mmengine/runner/runner.py
+++ b/mmengine/runner/runner.py
@@ -625,7 +625,7 @@ def setup_env(self, env_cfg: Dict) -> None:
                     mp_start_method='fork',
                     opencv_num_threads=0
                 ),
-                dist_cfg=dict(backend='nccl'),
+                dist_cfg=dict(backend='nccl', timeout=1800),
                 resource_limit=4096
             )

From c8f01e37dc573021cc2751627fe8f090a23a27a0 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Fri, 13 Jan 2023 16:44:15 +0100
Subject: [PATCH 03/10] Adding documentation about how to change the runtime timeout into the distributed manual.

---
 docs/en/advanced_tutorials/distributed.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/en/advanced_tutorials/distributed.md b/docs/en/advanced_tutorials/distributed.md
index 8edae584fa..c76de3bc4a 100644
--- a/docs/en/advanced_tutorials/distributed.md
+++ b/docs/en/advanced_tutorials/distributed.md
@@ -23,6 +23,16 @@ We will detail on these APIs in the following chapters.
 
 - [init_dist](mmengine.dist.init_dist): Launch function of distributed training. Currently it supports 3 launchers including pytorch, slurm and MPI. It also setup the given communication backends, defaults to NCCL.
 
+If you need to change the runtime timeout (default=30 minutes) for distributed operations that take very long, you can specify a different timeout in your runtime configuration like this:
+
+```python
+env_cfg = dict(
+    cudnn_benchmark=True,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl', timeout="10800"), # Sets the timeout to 3h (10800 seconds)
+)
+```
+
 ## Query and control
 
 The query and control functions are all argument free.

From 1859ecddef15e4deeb4471f59eca4ae0f40a2eb9 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Fri, 13 Jan 2023 17:02:20 +0100
Subject: [PATCH 04/10] Fixed type in documentation to correctly specify an integer

---
 docs/en/advanced_tutorials/distributed.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/advanced_tutorials/distributed.md b/docs/en/advanced_tutorials/distributed.md
index c76de3bc4a..76bc108231 100644
--- a/docs/en/advanced_tutorials/distributed.md
+++ b/docs/en/advanced_tutorials/distributed.md
@@ -29,7 +29,7 @@ If you need to change the runtime timeout (default=30 minutes) for distributed o
 env_cfg = dict(
     cudnn_benchmark=True,
     mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
-    dist_cfg=dict(backend='nccl', timeout="10800"), # Sets the timeout to 3h (10800 seconds)
+    dist_cfg=dict(backend='nccl', timeout=10800), # Sets the timeout to 3h (10800 seconds)
 )
 ```

From 76044bcdfab792ba209fe1aed5b5669d5934a3da Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Fri, 13 Jan 2023 17:03:22 +0100
Subject: [PATCH 05/10] Removing type-cast after checking the correct type already before

---
 mmengine/dist/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index 8c39e3c8c8..9e600c2d05 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -57,8 +57,7 @@ def init_dist(launcher, backend='nccl', **kwargs) -> None:
         # to a timedelta object before forwarding the call to
         # the respective backend, because they expect a timedelta object.
         if type(timeout) == int:
-            kwargs['timeout'] = datetime.timedelta(
-                seconds=int(kwargs['timeout']))
+            kwargs['timeout'] = datetime.timedelta(seconds=kwargs['timeout'])
         else:
             raise TypeError(
                 f'Timeout for distributed training must be provided as '

From 235c3d36602b0590ec0a278b936723bbef984341 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Mon, 16 Jan 2023 11:46:03 +0100
Subject: [PATCH 06/10] Update mmengine/dist/utils.py

Adding an explicit `is not None` to the check

Co-authored-by: Mashiro <57566630+HAOCHENYE@users.noreply.github.com>
---
 mmengine/dist/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index 9e600c2d05..80ef72eafb 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -52,7 +52,7 @@ def init_dist(launcher, backend='nccl', **kwargs) -> None:
         **kwargs: keyword arguments are passed to ``init_process_group``.
     """
     timeout = kwargs.get('timeout', None)
-    if timeout:
+    if timeout is not None:
         # If a timeout (in seconds) is specified, it must be converted
         # to a timedelta object before forwarding the call to
         # the respective backend, because they expect a timedelta object.
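
Note: taken together, patches 02-06 above make `init_dist` accept `timeout` as an integer number of seconds in `dist_cfg` and convert it into the `datetime.timedelta` that the underlying `init_process_group` call expects. The stand-alone sketch below reproduces that conversion outside mmengine for readers who want to experiment with it; the helper name `normalize_timeout` is invented here for illustration and does not exist in the codebase.

```python
import datetime


def normalize_timeout(kwargs):
    """Hypothetical stand-alone helper (not part of mmengine) mirroring what
    init_dist() does in the patches above: turn a number of seconds into the
    datetime.timedelta expected by init_process_group()."""
    timeout = kwargs.get('timeout', None)
    if timeout is not None:
        try:
            # timedelta accepts ints and floats, but raises TypeError for
            # strings and other unsupported types.
            kwargs['timeout'] = datetime.timedelta(seconds=timeout)
        except TypeError as exception:
            raise TypeError(
                'Timeout for distributed training must be provided in '
                f'seconds, but got {type(timeout)}. Specify it like this: '
                "dist_cfg=dict(backend='nccl', timeout=1800)") from exception
    return kwargs


print(normalize_timeout({'backend': 'nccl', 'timeout': 1800}))
# {'backend': 'nccl', 'timeout': datetime.timedelta(seconds=1800)}
print(normalize_timeout({'backend': 'nccl'}))  # left untouched
```

Patches 07-09 below replace the explicit integer check with a try/except around the `timedelta` construction, which raises a similar error for unsupported types while also accepting float values such as `timeout=1800.5`.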
From d2981673e8e26c7e2fe0cc183bb558d3911fb1f9 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Mon, 16 Jan 2023 11:53:36 +0100
Subject: [PATCH 07/10] Removing explicit type check and replacing it with more pythonic way of assuming it is the right type and handling the exception if the type doesn't match.

---
 mmengine/dist/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index 80ef72eafb..b02e3b0dd5 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -56,14 +56,14 @@ def init_dist(launcher, backend='nccl', **kwargs) -> None:
         # If a timeout (in seconds) is specified, it must be converted
         # to a timedelta object before forwarding the call to
         # the respective backend, because they expect a timedelta object.
-        if type(timeout) == int:
+        try:
             kwargs['timeout'] = datetime.timedelta(seconds=kwargs['timeout'])
-        else:
+        except TypeError as exception:
             raise TypeError(
                 f'Timeout for distributed training must be provided as '
-                f"integer (timeout in seconds), but we've received the type "
+                f"timeout in seconds, but we've received the type "
                 f'{type(timeout)}. Please specify the timeout like this: '
-                f"dist_cfg=dict(backend='nccl', timeout=1800)")
+                f"dist_cfg=dict(backend='nccl', timeout=1800)") from exception
     if mp.get_start_method(allow_none=True) is None:
         mp.set_start_method('spawn')
     if launcher == 'pytorch':

From 5a9e42197df4b618e54ab7d52935430c37e94c81 Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Mon, 16 Jan 2023 11:54:00 +0100
Subject: [PATCH 08/10] Removing pre-commit from test requirements again

---
 requirements/tests.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements/tests.txt b/requirements/tests.txt
index c54422187e..debf7eb171 100644
--- a/requirements/tests.txt
+++ b/requirements/tests.txt
@@ -1,5 +1,4 @@
 coverage
 lmdb
 parameterized
-pre-commit
 pytest

From 1b85228928288c75f1556a65ef5c3c034b56473e Mon Sep 17 00:00:00 2001
From: Alexander Pacha
Date: Mon, 16 Jan 2023 11:55:41 +0100
Subject: [PATCH 09/10] Simplified the code according to suggestions from PR

---
 mmengine/dist/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mmengine/dist/utils.py b/mmengine/dist/utils.py
index b02e3b0dd5..864e57b26f 100644
--- a/mmengine/dist/utils.py
+++ b/mmengine/dist/utils.py
@@ -57,7 +57,7 @@ def init_dist(launcher, backend='nccl', **kwargs) -> None:
         # to a timedelta object before forwarding the call to
         # the respective backend, because they expect a timedelta object.
         try:
-            kwargs['timeout'] = datetime.timedelta(seconds=kwargs['timeout'])
+            kwargs['timeout'] = datetime.timedelta(seconds=timeout)
         except TypeError as exception:
             raise TypeError(
                 f'Timeout for distributed training must be provided as '

From cdd1a36f792d99b7bb2335100921c1612b45bd82 Mon Sep 17 00:00:00 2001
From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com>
Date: Fri, 3 Feb 2023 15:35:47 +0800
Subject: [PATCH 10/10] Update distributed.md

---
 docs/en/advanced_tutorials/distributed.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/en/advanced_tutorials/distributed.md b/docs/en/advanced_tutorials/distributed.md
index 76bc108231..56b9252e92 100644
--- a/docs/en/advanced_tutorials/distributed.md
+++ b/docs/en/advanced_tutorials/distributed.md
@@ -23,15 +23,16 @@ We will detail on these APIs in the following chapters.
 
 - [init_dist](mmengine.dist.init_dist): Launch function of distributed training. Currently it supports 3 launchers including pytorch, slurm and MPI. It also setup the given communication backends, defaults to NCCL.
 
-If you need to change the runtime timeout (default=30 minutes) for distributed operations that take very long, you can specify a different timeout in your runtime configuration like this:
+  If you need to change the runtime timeout (default=30 minutes) for distributed operations that take very long, you can specify a different timeout in your `env_cfg` configuration passing in [Runner](mmengine.runner.Runner) like this:
 
-```python
-env_cfg = dict(
-    cudnn_benchmark=True,
-    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
-    dist_cfg=dict(backend='nccl', timeout=10800), # Sets the timeout to 3h (10800 seconds)
-)
-```
+  ```python
+  env_cfg = dict(
+      cudnn_benchmark=True,
+      mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+      dist_cfg=dict(backend='nccl', timeout=10800), # Sets the timeout to 3h (10800 seconds)
+  )
+  runner = Runner(xxx, env_cfg=env_cfg)
+  ```
 
 ## Query and control
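
As the `init_dist` docstring above states, the keyword arguments in `dist_cfg` are forwarded to `torch.distributed.init_process_group`, with the integer `timeout` converted to a `timedelta` first. For reference, the sketch below shows a rough plain-PyTorch equivalent of `dist_cfg=dict(backend='nccl', timeout=10800)`; it is an illustration rather than code from this PR, and it assumes the process was started by a launcher such as `torchrun` that sets the usual `RANK`, `WORLD_SIZE` and `MASTER_ADDR` environment variables.

```python
import datetime

import torch.distributed as dist

# Roughly what dist_cfg=dict(backend='nccl', timeout=10800) amounts to once
# the integer seconds have been converted to a timedelta: the NCCL process
# group is initialised with a 3 hour timeout instead of the default
# 30 minutes.
dist.init_process_group(
    backend='nccl',
    timeout=datetime.timedelta(seconds=10800),
)
```

Note that for the NCCL backend the timeout is typically only enforced when blocking wait or asynchronous error handling is enabled; see the PyTorch distributed documentation for details.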