From a23460778955ff57e6fd61f4b1d9eacb49722ddb Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Thu, 1 Dec 2022 02:33:16 +0900
Subject: [PATCH 01/28] Bump ray from 1.9.1 to 2.1.0

Signed-off-by: Junhwa Song
---
 requirements/runtime.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index ab4bf3ac..7cf3a24b 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1,5 +1,5 @@
 mmcv-full>=1.4.7
 pandas
 protobuf<=3.20
-ray[default]==1.9.1
+ray[default]==2.1.0
 tabulate

From f6e85e4d4f0fcb0120e29388d5260173505c6cbf Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Thu, 1 Dec 2022 14:19:14 +0900
Subject: [PATCH 02/28] Init

---
 siatune/apis/tune.py            | 58 +-------------------
 siatune/mm/tasks/mmtrainbase.py | 24 +++-----
 siatune/ray/__init__.py         |  5 ++
 siatune/ray/tuner.py            | 97 +++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+), 70 deletions(-)
 create mode 100644 siatune/ray/tuner.py

diff --git a/siatune/apis/tune.py b/siatune/apis/tune.py
index 40bbc467..60cf7b73 100644
--- a/siatune/apis/tune.py
+++ b/siatune/apis/tune.py
@@ -1,16 +1,10 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-from os import path as osp
-
-import mmcv
 import ray
 from mmcv.utils import Config
 
 from siatune.mm.tasks import BaseTask
-from siatune.ray.callbacks import build_callback
-from siatune.ray.schedulers import build_scheduler
-from siatune.ray.searchers import build_searcher
-from siatune.ray.spaces import build_space
-from siatune.ray.stoppers import build_stopper
+from siatune.ray import Tuner
 
 
 def tune(task_processor: BaseTask, tune_config: Config,
@@ -29,51 +23,5 @@ def tune(task_processor: BaseTask, tune_config: Config,
     trainable_cfg = tune_config.get('trainable', dict())
     trainable = task_processor.create_trainable(**trainable_cfg)
 
-    assert hasattr(tune_config, 'metric')
-    assert hasattr(tune_config, 'mode') and tune_config.mode in ['min', 'max']
-
-    tune_artifact_dir = osp.join(tune_config.work_dir, 'artifact')
-    mmcv.mkdir_or_exist(tune_artifact_dir)
-
-    stopper = tune_config.get('stop', None)
-    if stopper is not None:
-        stopper = build_stopper(stopper)
-
-    space = tune_config.get('space', None)
-    if space is not None:
-        space = build_space(space)
-
-    resources_per_trial = None
-    if not hasattr(trainable, 'default_resource_request'):
-        resources_per_trial = dict(
-            gpu=task_processor.num_workers *
-            task_processor.num_gpus_per_worker,
-            cpu=task_processor.num_workers *
-            task_processor.num_cpus_per_worker)
-
-    searcher = tune_config.get('searcher', None)
-    if searcher is not None:
-        searcher = build_searcher(searcher)
-
-    scheduler = tune_config.get('scheduler', None)
-    if scheduler is not None:
-        scheduler = build_scheduler(scheduler)
-
-    callbacks = tune_config.get('callbacks', None)
-    if callbacks is not None:
-        callbacks = [build_callback(callback) for callback in callbacks]
-
-    return ray.tune.run(
-        trainable,
-        name=exp_name,
-        metric=tune_config.metric,
-        mode=tune_config.mode,
-        stop=stopper,
-        config=space,
-        resources_per_trial=resources_per_trial,
-        num_samples=tune_config.get('num_samples', -1),
-        local_dir=tune_artifact_dir,
-        search_alg=searcher,
-        scheduler=scheduler,
-        raise_on_failed_trial=tune_config.get('raise_on_failed_trial', False),
-        callbacks=callbacks)
+    tuner = Tuner.from_cfg(tune_config, trainable)
+    return tuner.fit()
diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index fac8c4f8..227ea0b1 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -4,9 +4,9 @@
 from functools import partial
 
 import mmcv
-import ray
 import torch
-from ray.tune.integration.torch import DistributedTrainableCreator
+from ray.air.config import ScalingConfig
+from ray.train.torch import TorchTrainer
 
 from .base import BaseTask
 from .builder import TASKS
@@ -78,8 +78,7 @@ def context_aware_run(self,
     def create_trainable(
         self,
         backend: str = 'nccl',
-        timeout_s: int = 1800,
-    ) -> ray.tune.trainable:
+    ) -> TorchTrainer:
         """Get ray trainable task.
 
         Args:
@@ -94,14 +93,9 @@ def create_trainable(
 
         assert backend in ['gloo', 'nccl']
 
-        return DistributedTrainableCreator(
-            partial(
-                self.context_aware_run,
-                backend=backend,
-            ),
-            backend=backend,
-            timeout_s=timeout_s,
-            num_workers=self.num_workers,
-            num_gpus_per_worker=self.num_gpus_per_worker,
-            num_cpus_per_worker=self.num_cpus_per_worker,
-        )
+        return TorchTrainer(
+            partial(self.context_aware_run, backend=backend),
+            scaling_config=ScalingConfig(
+                resources_per_worker=dict(
+                    CPU=self.num_cpus_per_worker,
+                    GPU=self.num_gpus_per_worker)))
diff --git a/siatune/ray/__init__.py b/siatune/ray/__init__.py
index 061afde0..cb03c07b 100644
--- a/siatune/ray/__init__.py
+++ b/siatune/ray/__init__.py
@@ -1,4 +1,9 @@
 # Copyright (c) SI-Analytics. All rights reserved.
+from .callbacks import *  # noqa F403
 from .schedulers import *  # noqa F403
+from .searchers import *  # noqa F403
 from .spaces import *  # noqa F403
 from .stoppers import *  # noqa F403
+from .tuner import Tuner
+
+__all__ = ['Tuner']
diff --git a/siatune/ray/tuner.py b/siatune/ray/tuner.py
new file mode 100644
index 00000000..c470c5c8
--- /dev/null
+++ b/siatune/ray/tuner.py
@@ -0,0 +1,97 @@
+# Copyright (c) SI-Analytics. All rights reserved.
+import copy
+import os.path as osp
+
+from ray.air.config import RunConfig
+from ray.tune.tune_config import TuneConfig
+from ray.tune.tuner import Tuner as RayTuner
+
+from siatune.ray import (build_callback, build_scheduler, build_searcher,
+                         build_space, build_stopper)
+
+
+class Tuner:
+    """Wrapper class of :class:`ray.tune.tuner.Tuner`.
+
+    Args:
+        trainable (Callable):
+        work_dir (str):
+        param_space (dict, optional):
+        tune_cfg (dict, optional):
+            Refer to https://github.com/ray-project/ray/blob/ray-2.1.0/python/ray/tune/tune_config.py for details.  # noqa
+        searcher (dict, optional):
+        trial_scheduler (dict, optional):
+        stopper (dict, optional):
+        callbacks (list, optional):
+    """
+
+    def __init__(
+        self,
+        trainable,
+        work_dir,
+        param_space=None,
+        tune_cfg=None,
+        searcher=None,
+        trial_scheduler=None,
+        stopper=None,
+        callbacks=None,
+    ):
+        work_dir = osp.abspath(work_dir)
+
+        if param_space is not None:
+            param_space = build_space(param_space)
+
+        tune_cfg = copy.deepcopy(tune_cfg or dict())
+
+        if searcher is not None:
+            searcher = build_searcher(searcher)
+
+        if trial_scheduler is not None:
+            trial_scheduler = build_scheduler(trial_scheduler)
+
+        if stopper is not None:
+            stopper = build_stopper(stopper)
+
+        if callbacks is not None:
+            if isinstance(callbacks, dict):
+                callbacks = [callbacks]
+            callbacks = [build_callback(callback) for callback in callbacks]
+
+        self.tuner = RayTuner(
+            trainable,
+            param_space=param_space,
+            tune_config=TuneConfig(
+                searcher=searcher, trial_scheduler=trial_scheduler,
+                **tune_cfg),
+            run_config=RunConfig(
+                local_dir=work_dir,
+                stop=stopper,
+                callbacks=callbacks,
+                failure_config=None,  # todo
+                sync_config=None,  # todo
+                checkpoint_config=None,  # todo
+            ),
+        )
+
+    @classmethod
+    def from_cfg(cls, cfg, trainable):
+        cfg = copy.deepcopy(cfg)
+        tuner = cls(
+            trainable,
+            work_dir=cfg['work_dir'],
+            param_space=cfg.get('space', None),
+            tune_cfg=cfg.get('tune_cfg', None),
+            searcher=cfg.get('searcher', None),
+            trial_scheduler=cfg.get('trial_scheduler', None),
+            stopper=cfg.get('stopper', None),
+            callbacks=cfg.get('callbacks', None),
+        )
+
+        return tuner
+
+    @classmethod
+    def resume(cls, path, **kwargs):
+        return RayTuner.restore(path, **kwargs)
+
+    def fit(self):
+        return self.tuner.fit()

From cac34ea2b46df005618c0de472b4aab13164a808 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Thu, 1 Dec 2022 14:34:26 +0900
Subject: [PATCH 03/28] Update mmseg config

---
 configs/_base_/scheduler/asynchb.py          |  2 +-
 configs/mmseg/mmseg_asynchb_nevergrad_pso.py | 16 +++++-----------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/configs/_base_/scheduler/asynchb.py b/configs/_base_/scheduler/asynchb.py
index 99426e01..a73b6e0d 100644
--- a/configs/_base_/scheduler/asynchb.py
+++ b/configs/_base_/scheduler/asynchb.py
@@ -1,4 +1,4 @@
-scheduler = dict(
+trial_scheduler = dict(
     type='AsyncHyperBandScheduler',
     time_attr='training_iteration',
     max_t=20,
diff --git a/configs/mmseg/mmseg_asynchb_nevergrad_pso.py b/configs/mmseg/mmseg_asynchb_nevergrad_pso.py
index 923122f0..3d81ecd6 100644
--- a/configs/mmseg/mmseg_asynchb_nevergrad_pso.py
+++ b/configs/mmseg/mmseg_asynchb_nevergrad_pso.py
@@ -4,16 +4,10 @@
     '../_base_/space/optimizer.py', '../_base_/space/batch_size.py'
 ]
 
-space = {
-    'model': {{_base_.model}},
-    'optimizer': {{_base_.optimizer}},
-    'data.samples_per_gpu': {{_base_.batch_size}},
-    'model.decode_head.num_classes': 21,
-    'model.auxiliary_head.num_classes': 21,
-}
+space = dict(
+    data=dict(samples_per_gpu={{_base_.batch_size}}),
+    model={{_base_.model}},
+    optimizer={{_base_.optimizer}})
 
 task = dict(type='MMSegmentation')
-metric = 'val/mIoU'
-mode = 'max'
-raise_on_failed_trial = False
-num_samples = 256
+tune_cfg = dict(num_samples=8, metric='val/mIoU', mode='max')

From 756147b2ab91bb16f784ee5d30b4a1141e6d984b Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Thu, 1 Dec 2022 15:22:01 +0900
Subject: [PATCH 04/28] Fix deprecated warning

---
 siatune/mm/tasks/base.py           | 4 ++--
 siatune/mm/tasks/mmtrainbase.py    | 10 +++++-----
 siatune/ray/callbacks/mlflow.py    | 4 ++--
 siatune/ray/schedulers/pbt.py      | 4 ++--
 siatune/ray/searchers/builder.py   | 4 ++--
 siatune/ray/searchers/flaml.py     | 4 ++--
 siatune/ray/searchers/hyperopt.py  | 2 +-
 siatune/ray/searchers/nevergrad.py | 2 +-
 siatune/ray/spaces/base.py         | 3 ++-
 siatune/ray/spaces/choice.py       | 3 ++-
 siatune/ray/spaces/sample_from.py  | 3 ++-
 11 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/siatune/mm/tasks/base.py b/siatune/mm/tasks/base.py
index 23bb98e1..1e419b36 100644
--- a/siatune/mm/tasks/base.py
+++ b/siatune/mm/tasks/base.py
@@ -4,7 +4,7 @@
 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Sequence
 
-import ray
+from ray.tune import Trainable
 
 from siatune.mm.context import ContextManager
 from siatune.utils import ImmutableContainer
@@ -140,7 +140,7 @@ def run(self, *, args: argparse.Namespace, **kwargs) -> None:
         pass
 
     @abstractmethod
-    def create_trainable(self, *args, **kwargs) -> ray.tune.Trainable:
+    def create_trainable(self, *args, **kwargs) -> Trainable:
         """Get ray trainable task.
 
         Args:
diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index 227ea0b1..16e6dc7d 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -82,13 +82,11 @@ def create_trainable(
         """Get ray trainable task.
 
         Args:
-            backend (str):
-                The backend for dist training. Defaults to 'nccl'.
-            timeout_s (int):
-                Seconds before the torch process group times out.
+            backend (str): The backend for distributed training.
+                Defaults to 'nccl'.
 
         Returns:
-            ray.tune.trainable: The trainable task.
+            TorchTrainer: The trainable task.
         """
 
         assert backend in ['gloo', 'nccl']
@@ -96,6 +94,8 @@ def create_trainable(
         return TorchTrainer(
             partial(self.context_aware_run, backend=backend),
             scaling_config=ScalingConfig(
+                num_workers=2,
+                use_gpu=True,
                 resources_per_worker=dict(
                     CPU=self.num_cpus_per_worker,
                     GPU=self.num_gpus_per_worker)))
diff --git a/siatune/ray/callbacks/mlflow.py b/siatune/ray/callbacks/mlflow.py
index bfadb2c6..c57f1af8 100644
--- a/siatune/ray/callbacks/mlflow.py
+++ b/siatune/ray/callbacks/mlflow.py
@@ -1,10 +1,10 @@
 # Copyright (c) SI-Analytics. All rights reserved.
 from typing import List
 
+from ray.tune.experiment import Trial
 from ray.tune.integration.mlflow import \
     MLflowLoggerCallback as _MLflowLoggerCallback
 from ray.tune.integration.mlflow import logger
-from ray.tune.trial import Trial
 from ray.tune.utils.util import is_nan_or_inf
 
 from .builder import CALLBACKS
@@ -73,7 +73,7 @@ def log_trial_start(self, trial: 'Trial'):
         set the parent run ID.
 
         Args:
-            trial (Trial): `ray.tune.trial.Trial`
+            trial (Trial): :class:`ray.tune.experiment.trial.Trial`
         """
         # Create run if not already exists.
         if trial not in self._trial_runs:
diff --git a/siatune/ray/schedulers/pbt.py b/siatune/ray/schedulers/pbt.py
index ddfa982b..9280e9bc 100644
--- a/siatune/ray/schedulers/pbt.py
+++ b/siatune/ray/schedulers/pbt.py
@@ -3,10 +3,10 @@
 import random
 from typing import Callable, Dict, Optional
 
-from ray.tune.sample import Domain
+from ray.tune.experiment import Trial
 from ray.tune.schedulers.pbt import \
     PopulationBasedTraining as _PopulationBasedTraining
-from ray.tune.trial import Trial
+from ray.tune.search.sample import Domain
 
 from siatune.ray.schedulers import SCHEDULERS
 from siatune.ray.spaces import build_space
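Note: the import rewrites in this patch track Ray 2.x's package reorganization — search algorithms moved from `ray.tune.suggest` to `ray.tune.search`, `Trial` moved from `ray.tune.trial` to `ray.tune.experiment`, and `Domain` moved from `ray.tune.sample` to `ray.tune.search.sample`. A minimal sketch of the two layouts side by side (the try/except fallback is illustrative and not part of this series):

    try:  # Ray >= 2.0
        from ray.tune.search import Searcher
        from ray.tune.search.sample import Domain
        from ray.tune.experiment import Trial
    except ImportError:  # Ray 1.x layout, shown for comparison only
        from ray.tune.suggest import Searcher
        from ray.tune.sample import Domain
        from ray.tune.trial import Trial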
diff --git a/siatune/ray/searchers/builder.py b/siatune/ray/searchers/builder.py
index 6112c8c1..2d81a137 100644
--- a/siatune/ray/searchers/builder.py
+++ b/siatune/ray/searchers/builder.py
@@ -1,11 +1,11 @@
 # Copyright (c) SI-Analytics. All rights reserved.
 from mmcv.utils import Config, Registry
-from ray import tune
+from ray.tune.search import Searcher
 
 SEARCHERS = Registry('searchers')
 
 
-def build_searcher(cfg: Config) -> tune.suggest.Searcher:
+def build_searcher(cfg: Config) -> Searcher:
     """Build the searcher from configs.
 
     Args:
diff --git a/siatune/ray/searchers/flaml.py b/siatune/ray/searchers/flaml.py
index 800385ff..b984c1e8 100644
--- a/siatune/ray/searchers/flaml.py
+++ b/siatune/ray/searchers/flaml.py
@@ -1,6 +1,6 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-from ray.tune.suggest.flaml import CFO as _CFO
-from ray.tune.suggest.flaml import BlendSearch as _BlendSearch
+from ray.tune.search.flaml import CFO as _CFO
+from ray.tune.search.flaml import BlendSearch as _BlendSearch
 
 from .builder import SEARCHERS
 
diff --git a/siatune/ray/searchers/hyperopt.py b/siatune/ray/searchers/hyperopt.py
index d62780d4..37921cfc 100644
--- a/siatune/ray/searchers/hyperopt.py
+++ b/siatune/ray/searchers/hyperopt.py
@@ -1,5 +1,5 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-from ray.tune.suggest.hyperopt import HyperOptSearch as _HyperOptSearch
+from ray.tune.search.hyperopt import HyperOptSearch as _HyperOptSearch
 
 from .builder import SEARCHERS
 
diff --git a/siatune/ray/searchers/nevergrad.py b/siatune/ray/searchers/nevergrad.py
index 10a6e8fe..b4e401c7 100644
--- a/siatune/ray/searchers/nevergrad.py
+++ b/siatune/ray/searchers/nevergrad.py
@@ -2,7 +2,7 @@
 from typing import Dict, List, Optional, Union
 
 from ray.tune.result import DEFAULT_METRIC
-from ray.tune.suggest.nevergrad import NevergradSearch as _NevergradSearch
+from ray.tune.search.nevergrad import NevergradSearch as _NevergradSearch
 
 from .builder import SEARCHERS
 
diff --git a/siatune/ray/spaces/base.py b/siatune/ray/spaces/base.py
index f6e32336..5fd1f3cd 100644
--- a/siatune/ray/spaces/base.py
+++ b/siatune/ray/spaces/base.py
@@ -3,6 +3,7 @@
 from typing import Callable
 
 import ray.tune as tune
+from ray.tune.search.sample import Domain
 
 from .builder import SPACES
 
@@ -15,7 +16,7 @@ def __init__(self, **kwargs) -> None:
         self.kwargs = kwargs
 
     @property
-    def space(self) -> tune.sample.Domain:
+    def space(self) -> Domain:
         """Return the space."""
         return self.sample.__func__(**self.kwargs)
 
diff --git a/siatune/ray/spaces/choice.py b/siatune/ray/spaces/choice.py
index e9cc7c0f..b68ea54d 100644
--- a/siatune/ray/spaces/choice.py
+++ b/siatune/ray/spaces/choice.py
@@ -2,6 +2,7 @@
 from typing import Callable, Optional, Sequence
 
 import ray.tune as tune
+from ray.tune.search.sample import Domain
 
 from siatune.utils import ImmutableContainer
 
 from .base import BaseSpace
@@ -31,5 +32,5 @@ def __init__(self,
         self.categories = categories
 
     @property
-    def space(self) -> tune.sample.Domain:
+    def space(self) -> Domain:
         return self.sample.__func__(self.categories)
diff --git a/siatune/ray/spaces/sample_from.py b/siatune/ray/spaces/sample_from.py
index 1772e83e..314b5b18 100644
--- a/siatune/ray/spaces/sample_from.py
+++ b/siatune/ray/spaces/sample_from.py
@@ -2,6 +2,7 @@
 from typing import Callable, Union
 
 import ray.tune as tune
+from ray.tune.search.sample import Domain
 
 from .base import BaseSpace
 from .builder import SPACES
@@ -25,5 +26,5 @@ def __init__(self, func: Union[str, Callable]) -> None:
         self.func = func
 
     @property
-    def space(self) -> tune.sample.Domain:
+    def space(self) -> Domain:
         return self.sample.__func__(self.func)

From 73aa245da315cf16c3f04b04a908d9d91b934d27 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Thu, 1 Dec 2022 16:07:57 +0900
Subject: [PATCH 05/28] Fix trainable function signature

---
 siatune/mm/tasks/base.py           |  9 +++------
 siatune/mm/tasks/mmtrainbase.py    | 31 ++----------------------------
 siatune/ray/searchers/nevergrad.py |  1 -
 siatune/ray/tuner.py               |  5 ++---
 4 files changed, 7 insertions(+), 39 deletions(-)

diff --git a/siatune/mm/tasks/base.py b/siatune/mm/tasks/base.py
index 1e419b36..0558dfbe 100644
--- a/siatune/mm/tasks/base.py
+++ b/siatune/mm/tasks/base.py
@@ -106,16 +106,14 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
         """
         pass
 
-    def context_aware_run(self,
-                          searched_cfg: Dict,
-                          checkpoint_dir: Optional[str] = None,
-                          **kwargs) -> Any:
+    def context_aware_run(self, searched_cfg: Dict) -> Any:
         """Gather and refine the information received by users and Ray.tune to
         execute the objective task.
 
         Args:
             searched_cfg (Dict): The searched configuration.
             kwargs (**kwargs): The kwargs.
+
         Returns:
             Any: The result of the objective task.
         """
@@ -124,9 +122,8 @@ def context_aware_run(self,
         context = dict(
             args=deepcopy(self.args),
             searched_cfg=deepcopy(ImmutableContainer.decouple(searched_cfg)),
-            checkpoint_dir=checkpoint_dir,
+            # checkpoint_dir=checkpoint_dir,
         )
-        context.update(kwargs)
         return context_manager(self.run)(**context)
 
     @abstractmethod
diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index 16e6dc7d..619f91d4 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -1,7 +1,5 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-import os
 from abc import ABCMeta, abstractmethod
-from functools import partial
 
 import mmcv
 import torch
@@ -56,29 +54,7 @@ def train_model(
         """
         pass
 
-    def context_aware_run(self,
-                          searched_cfg,
-                          backend='nccl',
-                          **kwargs) -> None:
-        """Gather and refine the information received by users and Ray.tune to
-        execute the objective task.
-
-        Args:
-            searched_cfg (Config): The searched configs.
-            backend (str):
-                The backend for dist training. Defaults to 'nccl'.
-            kwargs (**kwargs): The kwargs.
-        """
-        # set non blocking mode on the nccl backend
-        # https://github.com/pytorch/pytorch/issues/50820
-        if backend == 'nccl' and os.getenv('NCCL_BLOCKING_WAIT') is None:
-            os.environ['NCCL_BLOCKING_WAIT'] = '0'
-        return super().context_aware_run(searched_cfg, **kwargs)
-
-    def create_trainable(
-        self,
-        backend: str = 'nccl',
-    ) -> TorchTrainer:
+    def create_trainable(self) -> TorchTrainer:
         """Get ray trainable task.
 
         Args:
@@ -88,11 +64,8 @@ def create_trainable(
         Returns:
             TorchTrainer: The trainable task.
         """
-
-        assert backend in ['gloo', 'nccl']
-
         return TorchTrainer(
-            partial(self.context_aware_run, backend=backend),
+            self.context_aware_run,
             scaling_config=ScalingConfig(
                 num_workers=2,
                 use_gpu=True,
diff --git a/siatune/ray/searchers/nevergrad.py b/siatune/ray/searchers/nevergrad.py
index b4e401c7..d4620f8b 100644
--- a/siatune/ray/searchers/nevergrad.py
+++ b/siatune/ray/searchers/nevergrad.py
@@ -68,7 +68,6 @@ def __init__(self,
             metric=metric,
             mode=mode,
             points_to_evaluate=points_to_evaluate,
-            max_concurrent=None,
             **kwargs)
 
     def _setup_nevergrad(self) -> None:
diff --git a/siatune/ray/tuner.py b/siatune/ray/tuner.py
index c470c5c8..73025de7 100644
--- a/siatune/ray/tuner.py
+++ b/siatune/ray/tuner.py
@@ -59,10 +59,9 @@ def __init__(
 
         self.tuner = RayTuner(
             trainable,
-            param_space=param_space,
+            param_space=dict(train_loop_config=param_space),
             tune_config=TuneConfig(
-                searcher=searcher, trial_scheduler=trial_scheduler,
-                **tune_cfg),
+                search_alg=searcher, scheduler=trial_scheduler, **tune_cfg),
             run_config=RunConfig(
                 local_dir=work_dir,
                 stop=stopper,

From f8fa7b262897f55962581e367f730fde297e971a Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Thu, 1 Dec 2022 18:21:40 +0900
Subject: [PATCH 06/28] Fix rewriter

---
 siatune/mm/context/rewriters/dump.py | 4 ++--
 siatune/mm/context/rewriters/path.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/siatune/mm/context/rewriters/dump.py b/siatune/mm/context/rewriters/dump.py
index 6267c48b..8bfa48db 100644
--- a/siatune/mm/context/rewriters/dump.py
+++ b/siatune/mm/context/rewriters/dump.py
@@ -3,7 +3,7 @@
 from os import path as osp
 from typing import Dict
 
-import ray
+from ray.air import session
 
 from siatune.utils import dump_cfg
 from .base import BaseRewriter
@@ -46,7 +46,7 @@ def __call__(self, context: Dict) -> Dict:
             Dict: The context after rewriting.
         """
         cfg = context.pop(self.key)
-        trial_id = ray.tune.get_trial_id()
+        trial_id = session.get_trial_id()
         tmp_path = self.get_temporary_path(f'{trial_id}.py')
         setattr(context.get('args'), self.arg_name, tmp_path)
         dump_cfg(cfg, tmp_path)
diff --git a/siatune/mm/context/rewriters/path.py b/siatune/mm/context/rewriters/path.py
index 041bb36c..49b45d86 100644
--- a/siatune/mm/context/rewriters/path.py
+++ b/siatune/mm/context/rewriters/path.py
@@ -1,7 +1,7 @@
 # Copyright (c) SI-Analytics. All rights reserved.
 from os import path as osp
 
-import ray
+from ray.air import session
 
 from .base import BaseRewriter
 from .builder import REWRITERS
@@ -31,5 +31,5 @@ def __call__(self, context: dict) -> dict:
         """
         value = getattr(context['args'], self.arg_name)
         setattr(context['args'], self.arg_name,
-                osp.join(value, ray.tune.get_trial_id()))
+                osp.join(value, session.get_trial_id()))
         return context

From 5d3ac5ba427a3eeeb7c03a8c0ad197ed3942d0bd Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Thu, 1 Dec 2022 20:03:57 +0900
Subject: [PATCH 07/28] Fix minor

---
 siatune/mm/tasks/base.py        | 1 -
 siatune/mm/tasks/mmseg.py       | 2 --
 siatune/mm/tasks/mmtrainbase.py | 9 +++++----
 siatune/ray/tuner.py            | 2 +-
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/siatune/mm/tasks/base.py b/siatune/mm/tasks/base.py
index 0558dfbe..a2352b1d 100644
--- a/siatune/mm/tasks/base.py
+++ b/siatune/mm/tasks/base.py
@@ -122,7 +122,6 @@ def context_aware_run(self, searched_cfg: Dict) -> Any:
         context = dict(
             args=deepcopy(self.args),
             searched_cfg=deepcopy(ImmutableContainer.decouple(searched_cfg)),
-            # checkpoint_dir=checkpoint_dir,
         )
         return context_manager(self.run)(**context)
 
diff --git a/siatune/mm/tasks/mmseg.py b/siatune/mm/tasks/mmseg.py
index 98b82564..205a396c 100644
--- a/siatune/mm/tasks/mmseg.py
+++ b/siatune/mm/tasks/mmseg.py
@@ -80,7 +80,6 @@ def build_model(self,
                 The train opt. Defaults to None.
             test_cfg (Optional[Config]):
                 The Test opt. Defaults to None.
-
         Returns:
             torch.nn.Module: The model.
         """
@@ -98,7 +97,6 @@ def build_dataset(
             cfg (Config): The configs.
             default_args (Optional[Config]):
                 The default args. Defaults to None.
-
         Returns:
             torch.utils.data.Dataset: The dataset.
         """
diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index 619f91d4..0c57a852 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -4,7 +4,7 @@
 import mmcv
 import torch
 from ray.air.config import ScalingConfig
-from ray.train.torch import TorchTrainer
+from ray.train.torch import TorchConfig, TorchTrainer
 
 from .base import BaseTask
 from .builder import TASKS
@@ -67,8 +67,9 @@ def create_trainable(self) -> TorchTrainer:
         return TorchTrainer(
             self.context_aware_run,
             scaling_config=ScalingConfig(
-                num_workers=2,
-                use_gpu=True,
+                num_workers=self.num_workers,
+                use_gpu=torch.cuda.is_available(),
                 resources_per_worker=dict(
                     CPU=self.num_cpus_per_worker,
-                    GPU=self.num_gpus_per_worker)))
+                    GPU=self.num_gpus_per_worker)),
+            torch_config=TorchConfig(backend='gloo'))
diff --git a/siatune/ray/tuner.py b/siatune/ray/tuner.py
index 73025de7..e1aaf079 100644
--- a/siatune/ray/tuner.py
+++ b/siatune/ray/tuner.py
@@ -90,7 +90,7 @@ def from_cfg(cls, cfg, trainable):
 
     @classmethod
     def resume(cls, path, **kwargs):
-        return RayTuner.restore(path, **kwargs)
+        return cls.restore(path, **kwargs)
 
     def fit(self):
         return self.tuner.fit()

From 04a5250934b36b1fda7e63bd9662deae9eaf1d14 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 15:10:53 +0900
Subject: [PATCH 08/28] Fix reporter

---
 siatune/mm/hooks/reporter.py    | 4 ++--
 siatune/mm/tasks/mmtrainbase.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/siatune/mm/hooks/reporter.py b/siatune/mm/hooks/reporter.py
index 47ec9880..e769843f 100644
--- a/siatune/mm/hooks/reporter.py
+++ b/siatune/mm/hooks/reporter.py
@@ -1,8 +1,8 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-import ray
 from mmcv.runner import HOOKS, BaseRunner
 from mmcv.runner.dist_utils import get_dist_info
 from mmcv.runner.hooks.logger import LoggerHook
+from ray.air import session
 from torch import distributed as dist
 
 
@@ -90,4 +90,4 @@ def log(self, runner: BaseRunner) -> None:
                 filter(lambda elem: self.filtering_key in elem, tags.keys())):
             return
         tags['global_step'] = self.get_iter(runner)
-        ray.tune.report(**tags)
+        session.report(tags)
diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index 0c57a852..c4103a1d 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -67,9 +67,9 @@ def create_trainable(self) -> TorchTrainer:
         return TorchTrainer(
             self.context_aware_run,
             scaling_config=ScalingConfig(
-                num_workers=self.num_workers,
-                use_gpu=torch.cuda.is_available(),
-                resources_per_worker=dict(
+                trainer_resources=dict(
                     CPU=self.num_cpus_per_worker,
-                    GPU=self.num_gpus_per_worker)),
+                    GPU=self.num_gpus_per_worker),
+                num_workers=self.num_workers,
+                use_gpu=torch.cuda.is_available()),
             torch_config=TorchConfig(backend='gloo'))

From 59a86da980e52286685bd752fc52397f15618325 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 15:49:15 +0900
Subject: [PATCH 09/28] Fix apis

---
 siatune/apis/analysis.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/siatune/apis/analysis.py b/siatune/apis/analysis.py
index 48b5c9c0..31e25d56 100644
--- a/siatune/apis/analysis.py
+++ b/siatune/apis/analysis.py
@@ -5,19 +5,19 @@
 from typing import Optional
 
 from mmcv.utils import Config, get_logger
-from ray import tune
+from ray.tune import ResultGrid
 
 from siatune.utils import ImmutableContainer, dump_cfg
 
 
-def log_analysis(analysis: tune.ExperimentAnalysis,
+def log_analysis(results: ResultGrid,
                  tune_config: Config,
                  task_config: Optional[Config] = None,
                  log_dir: Optional[str] = None) -> None:
     """Log the analysis of the experiment.
 
     Args:
-        analysis (tune.ExperimentAnalysis): The analysis of the experiment.
+        results (ResultGrid): Result of `Tuner.fit()`.
         tune_config (Config): The tune config.
         task_config (Optional[Config]): The task config. Defaults to None.
         log_dir (Optional[str]): The log dir. Defaults to None.
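Note: in Ray 2.x, `Tuner.fit()` returns a `ResultGrid` rather than an `ExperimentAnalysis`, so the `best_config`/`best_result`/`best_logdir` accessors rewritten in the next hunk hang off a single best `Result` instead. A short sketch of the new result API (assuming metric and mode were set in `TuneConfig`):

    results = tuner.fit()             # ray.tune.ResultGrid
    best = results.get_best_result()  # best ray.air.Result
    print(best.config)                # best hyperparameters
    print(best.log_dir)               # best trial's log directory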
@@ -33,10 +33,9 @@ def log_analysis(analysis: tune.ExperimentAnalysis,
     logger = get_logger(
         'siatune', log_file=osp.join(log_dir, f'{timestamp}.log'))
 
-    logger.info(
-        f'Best Hyperparam: \n'
-        f'{pformat(ImmutableContainer.decouple(analysis.best_config))}')
-    logger.info(
-        f'Best Results: \n'
-        f'{pformat(ImmutableContainer.decouple(analysis.best_result))}')
-    logger.info(f'Best Logdir: {analysis.best_logdir}')
+    result = results.get_best_result()
+    logger.info(f'Best Result: \n'
+                f'{pformat(ImmutableContainer.decouple(result))}')
+    logger.info(f'Best Hyperparam: \n'
+                f'{pformat(ImmutableContainer.decouple(result.config))}')
+    logger.info(f'Best Logdir: {result.log_dir}')

From 4fb42dd8c14ffb7344044c670f925532ca1387bc Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 20:28:13 +0900
Subject: [PATCH 10/28] Fix RayCheckpointHook

---
 siatune/mm/hooks/checkpoint.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/siatune/mm/hooks/checkpoint.py b/siatune/mm/hooks/checkpoint.py
index faada1dc..f0a1e135 100644
--- a/siatune/mm/hooks/checkpoint.py
+++ b/siatune/mm/hooks/checkpoint.py
@@ -10,7 +10,7 @@
 from mmcv.runner.checkpoint import get_state_dict, weights_to_cpu
 from mmcv.runner.dist_utils import master_only
 from mmcv.runner.hooks import CheckpointHook as _CheckpointHook
-from ray.tune.integration.torch import distributed_checkpoint_dir
+from ray.air import session
 from torch.optim import Optimizer
 
 
@@ -100,9 +100,7 @@ def _save_checkpoint(self, runner: BaseRunner) -> None:
             for name, optim in optimizer.items():
                 checkpoint['optimizer'][name] = optim.state_dict()
 
-        with distributed_checkpoint_dir(
-                step=(runner.epoch + 1) //
-                self.interval if self.by_epoch else (runner.iter + 1) //
-                self.interval) as checkpoint_dir:
+        ckpt = session.get_checkpoint()
+        with ckpt.as_directory() as checkpoint_dir:
             path = os.path.join(checkpoint_dir, 'ray_ckpt.pth')
             torch.save(checkpoint, path)

From 2c1215c8f2574248e24b7166dac2576dc2109863 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 20:58:37 +0900
Subject: [PATCH 11/28] Fix requirements

---
 requirements/optional.txt | 2 +-
 requirements/runtime.txt  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements/optional.txt b/requirements/optional.txt
index a5b95cc0..8ef39eb3 100644
--- a/requirements/optional.txt
+++ b/requirements/optional.txt
@@ -4,5 +4,5 @@ hyperopt==0.2.5
 mlflow==1.21.0
 nevergrad==0.4.3.post7
 optuna==2.10.0
+scikit-learn
 scikit-optimize==0.9.0
-sklearn
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 7cf3a24b..d9ef98e1 100644
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1,5 +1,6 @@
 mmcv-full>=1.4.7
 pandas
 protobuf<=3.20
+pyarrow
 ray[default]==2.1.0
 tabulate

From 709bb9cc4f6b3d551374b302efd854862d903ed5 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 21:36:43 +0900
Subject: [PATCH 12/28] Fix test code for rewriters

---
 tests/test_mm/test_rewriters.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_mm/test_rewriters.py b/tests/test_mm/test_rewriters.py
index 282c29d7..5cdd9329 100644
--- a/tests/test_mm/test_rewriters.py
+++ b/tests/test_mm/test_rewriters.py
@@ -33,7 +33,7 @@ def __call__(self, context: Dict) -> Dict:
         build_rewriter(dict(type='DummyRewriter')), DummyRewriter)
 
 
-@patch('ray.tune.get_trial_id')
+@patch('ray.air.session.get_trial_id')
 def test_dump(mock_get_trial_id):
     mock_get_trial_id.return_value = 'test'
     dump = Dump(key='cfg', arg_name='config')
@@ -99,7 +99,7 @@ def test_patch():
     })._cfg_dict
 
 
-@patch('ray.tune.get_trial_id')
+@patch('ray.air.session.get_trial_id')
 def test_append_trial_id_to_path(mock_get_trial_id):
     mock_get_trial_id.return_value = 'test'
     args = MagicMock()

From 940320bf76b4108b25078592a318e0b27ca4c204 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 21:54:21 +0900
Subject: [PATCH 13/28] Fix test code for hooks

---
 siatune/mm/hooks/checkpoint.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/siatune/mm/hooks/checkpoint.py b/siatune/mm/hooks/checkpoint.py
index f0a1e135..4c2109e4 100644
--- a/siatune/mm/hooks/checkpoint.py
+++ b/siatune/mm/hooks/checkpoint.py
@@ -4,13 +4,13 @@
 from typing import Optional
 
 import mmcv
+import ray.tune as tune
 import torch
 from mmcv.parallel import is_module_wrapper
 from mmcv.runner import HOOKS, BaseRunner
 from mmcv.runner.checkpoint import get_state_dict, weights_to_cpu
 from mmcv.runner.dist_utils import master_only
 from mmcv.runner.hooks import CheckpointHook as _CheckpointHook
-from ray.air import session
 from torch.optim import Optimizer
 
 
@@ -100,7 +100,10 @@ def _save_checkpoint(self, runner: BaseRunner) -> None:
             for name, optim in optimizer.items():
                 checkpoint['optimizer'][name] = optim.state_dict()
 
-        ckpt = session.get_checkpoint()
-        with ckpt.as_directory() as checkpoint_dir:
+        step = (runner.epoch + 1) // self.interval
+        if not self.by_epoch:
+            step //= runner.iter + 1
+
+        with tune.checkpoint_dir(step=step) as checkpoint_dir:
             path = os.path.join(checkpoint_dir, 'ray_ckpt.pth')
             torch.save(checkpoint, path)

From cddfc3c7b3a29f00fadaea5c919fa42f9ea08511 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Fri, 2 Dec 2022 23:38:38 +0900
Subject: [PATCH 14/28] Fix test code for tasks

---
 tests/test_mm/test_tasks.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test_mm/test_tasks.py b/tests/test_mm/test_tasks.py
index f0d6d0c2..72a7df38 100644
--- a/tests/test_mm/test_tasks.py
+++ b/tests/test_mm/test_tasks.py
@@ -6,6 +6,7 @@
 import torch
 from mmcv.utils import Config
 from ray import tune
+from ray.air import session
 
 from siatune.mm.tasks import (TASKS, BaseTask, BlackBoxTask,
                               ContinuousTestFunction, DiscreteTestFunction,
@@ -194,7 +195,7 @@ def test_mmcls(*not_used):
     task.run(args=task.args)
 
 
-@patch('ray.tune.report', side_effect=report_to_session)
+@patch('ray.air.session.report', side_effect=report_to_session)
 def test_mm_train_based_task(mock_report):
     with pytest.raises(TypeError):
         MMTrainBasedTask()
@@ -250,7 +251,7 @@ def train_model(self, model, dataset, cfg):
                 loss.backward()
                 optimizer.step()
                 total_loss += loss.item()
-            tune.report(loss=total_loss / (batch_idx + 1))
+            session.report(loss=total_loss / (batch_idx + 1))
 
     def run(self, *, searched_cfg, **kwargs):
         cfg = searched_cfg.get('cfg')
@@ -275,4 +276,6 @@ def run(self, *, searched_cfg, **kwargs):
     task.set_resource(1, 0, 1)
     task.context_aware_run(searched_cfg=dict(cfg=cfg))
     assert 'loss' in get_session()
-    tune.run(task.create_trainable(backend='gloo'), config=dict(cfg=cfg))
+
+    trainable = task.create_trainable()
+    tune.Tuner(trainable).fit()

From b47f3c00f7266d20c446e391c948ea6d6addf542 Mon Sep 17 00:00:00 2001
From: KKIEEK
Date: Sat, 3 Dec 2022 04:48:48 +0900
Subject: [PATCH 15/28] Fix test code for apis

---
 tests/test_apis/test_apis.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/test_apis/test_apis.py b/tests/test_apis/test_apis.py
index 16260ade..40a4b64c 100644
--- a/tests/test_apis/test_apis.py
+++ b/tests/test_apis/test_apis.py
@@ -1,9 +1,10 @@
 import os
+import random
 import tempfile
 from unittest.mock import MagicMock
 
 import mmcv
-from ray.tune.trainable import Trainable
+import ray
 
 from siatune.apis import log_analysis, tune
 
@@ -37,14 +38,11 @@ def test_log_analysis():
 
 
 def test_tune():
-
-    class TestTrainable(Trainable):
-
-        def step(self):
-            result = {'name': self.trial_name, 'trial_id': self.trial_id}
-            return result
+    def trainable(config):
+        ray.tune.report({'metric': random.random()})
 
     mock_task_processor = MagicMock()
-    mock_task_processor.create_trainable.return_value = TestTrainable
+    mock_task_processor.create_trainable.return_value = trainable
     with tempfile.TemporaryDirectory() as tmpdir:
         tune_config = mmcv.Config(
             dict(

From ca42bfc6749f72530a956d90c779db12a5f40211 Mon Sep 17 00:00:00 2001
From: Younghwan Na <100389977+yhna940@users.noreply.github.com>
Date: Thu, 15 Dec 2022 10:30:03 +0900
Subject: [PATCH 16/28] :memo: Del checkpoint for base task proc

---
 siatune/mm/tasks/base.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/siatune/mm/tasks/base.py b/siatune/mm/tasks/base.py
index a2352b1d..364c4b2f 100644
--- a/siatune/mm/tasks/base.py
+++ b/siatune/mm/tasks/base.py
@@ -29,9 +29,7 @@ class BaseTask(metaclass=ABCMeta):
         1. args (argparse.Namespace): The low level CLI arguments.
         2. searched_cfg (Dict):
             The configuration searched by the algorithm.
-        3. checkpoint_dir (Optional[str]):
-            The directory of checkpoints that contains the states.
-    Inputs: searched_cfg (Dict), checkpoint_dir (Optional[str])
+    Inputs: searched_cfg (Dict)
     Outputs: None
     """

From 411f3077a9704cb5458204da9857987e987f5e3b Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Thu, 15 Dec 2022 10:55:54 +0900
Subject: [PATCH 17/28] Update siatune/apis/analysis.py

Co-authored-by: Hakjin Lee
Signed-off-by: Junhwa Song
---
 siatune/apis/analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/siatune/apis/analysis.py b/siatune/apis/analysis.py
index 31e25d56..19b1f936 100644
--- a/siatune/apis/analysis.py
+++ b/siatune/apis/analysis.py
@@ -17,7 +17,7 @@ def log_analysis(results: ResultGrid,
     """Log the analysis of the experiment.
 
     Args:
-        results (ResultGrid): Result of `Tuner.fit()`.
+        results (ResultGrid): Experiment results of `Tuner.fit()`.
         tune_config (Config): The tune config.
         task_config (Optional[Config]): The task config. Defaults to None.
        log_dir (Optional[str]): The log dir. Defaults to None.

From 791111d7b3c8d2d096e826d91c0dba9f87b4aa26 Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Thu, 15 Dec 2022 11:40:48 +0900
Subject: [PATCH 18/28] Update siatune/mm/tasks/mmtrainbase.py

Co-authored-by: Hakjin Lee
Signed-off-by: Junhwa Song
---
 siatune/mm/tasks/mmtrainbase.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index c4103a1d..7aed4ae4 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -58,8 +58,6 @@ def create_trainable(self) -> TorchTrainer:
         """Get ray trainable task.
 
         Args:
-            backend (str): The backend for distributed training.
-                Defaults to 'nccl'.
 
         Returns:
             TorchTrainer: The trainable task.

From 703d5a18f87d0990b313ab0151f67958b604ed0c Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Thu, 15 Dec 2022 12:19:24 +0900
Subject: [PATCH 19/28] Update siatune/mm/tasks/mmtrainbase.py

Co-authored-by: Hakjin Lee
Signed-off-by: Junhwa Song
---
 siatune/mm/tasks/mmtrainbase.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index 7aed4ae4..d9213f96 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -57,8 +57,6 @@ def train_model(
     def create_trainable(self) -> TorchTrainer:
         """Get ray trainable task.
 
-        Args:
-
         Returns:
             TorchTrainer: The trainable task.
         """

From 9eda02d45ef6d250c63834b7f1caf82920a76701 Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Thu, 15 Dec 2022 13:09:09 +0900
Subject: [PATCH 20/28] Support custom trainer and backend (#91)

* Support custom trainer and backend

* Add comment

* Add assertion

* Fix typo

* Update siatune/ray/config.py

* Apply lint

* Fix test code

Co-authored-by: Hakjin Lee
---
 siatune/mm/tasks/mmtrainbase.py | 14 +++++---
 siatune/ray/config.py           | 57 +++++++++++++++++++++++++++++++++
 tests/test_mm/test_tasks.py     |  2 +-
 3 files changed, 67 insertions(+), 6 deletions(-)
 create mode 100644 siatune/ray/config.py

diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index d9213f96..b3c94937 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -4,8 +4,9 @@
 import mmcv
 import torch
 from ray.air.config import ScalingConfig
-from ray.train.torch import TorchConfig, TorchTrainer
+from ray.train.data_parallel_trainer import DataParallelTrainer
 
+from siatune.ray.config import CustomBackendConfig
 from .base import BaseTask
 from .builder import TASKS
 
@@ -54,18 +55,21 @@ def train_model(
         """
         pass
 
-    def create_trainable(self) -> TorchTrainer:
+    def create_trainable(self) -> DataParallelTrainer:
         """Get ray trainable task.
 
         Returns:
             TorchTrainer: The trainable task.
         """
-        return TorchTrainer(
+        assert self.num_workers == self.num_gpus_per_worker, (
+            '`num_workers` must be equal to `num_gpus_per_worker`.')
+
+        return DataParallelTrainer(
             self.context_aware_run,
+            backend_config=CustomBackendConfig(),
             scaling_config=ScalingConfig(
                 trainer_resources=dict(
                     CPU=self.num_cpus_per_worker,
                     GPU=self.num_gpus_per_worker),
                 num_workers=self.num_workers,
-                use_gpu=torch.cuda.is_available()),
-            torch_config=TorchConfig(backend='gloo'))
+                use_gpu=torch.cuda.is_available()))
diff --git a/siatune/ray/config.py b/siatune/ray/config.py
new file mode 100644
index 00000000..75794798
--- /dev/null
+++ b/siatune/ray/config.py
@@ -0,0 +1,57 @@
+# Copyright (c) SI-Analytics. All rights reserved.
+# Modified from https://github.com/ray-project/ray/blob/ray-2.1.0/python/ray/train/torch/config.py  # noqa
+
+import logging
+import os
+from dataclasses import dataclass
+
+import ray
+import torch.distributed as dist
+from ray.train._internal.utils import get_address_and_port
+from ray.train._internal.worker_group import WorkerGroup
+from ray.train.backend import BackendConfig
+from ray.train.torch.config import _set_nccl_network_interface, _TorchBackend
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class CustomBackendConfig(BackendConfig):
+    """Configuration for torch process group setup."""
+
+    @property
+    def backend_cls(self):
+        return _CustomTorchBackend
+
+
+class _CustomTorchBackend(_TorchBackend):
+
+    def on_start(self, worker_group: WorkerGroup,
+                 backend_config: BackendConfig):
+        if not dist.is_available():
+            raise RuntimeError('Distributed torch is not available.')
+
+        worker_group.execute(_set_nccl_network_interface)
+
+        master_addr, master_port = worker_group.execute_single(
+            0, get_address_and_port)
+
+        def set_env_vars(addr, port, rank, world_size):
+            os.environ['MASTER_ADDR'] = addr
+            os.environ['MASTER_PORT'] = str(port)
+            os.environ['RANK'] = str(rank)
+            os.environ['LOCAL_RANK'] = str(rank)
+            os.environ['WORLD_SIZE'] = str(world_size)
+
+        setup_futures = []
+        for i in range(len(worker_group)):
+            setup_futures.append(
+                worker_group.execute_single_async(
+                    i,
+                    set_env_vars,
+                    addr=master_addr,
+                    port=master_port,
+                    rank=i,
+                    world_size=len(worker_group),
+                ))
+        ray.get(setup_futures)
diff --git a/tests/test_mm/test_tasks.py b/tests/test_mm/test_tasks.py
index 72a7df38..a779dc86 100644
--- a/tests/test_mm/test_tasks.py
+++ b/tests/test_mm/test_tasks.py
@@ -273,7 +273,7 @@ def run(self, *, searched_cfg, **kwargs):
         ))
 
     task = TestTask()
-    task.set_resource(1, 0, 1)
+    task.set_resource()
     task.context_aware_run(searched_cfg=dict(cfg=cfg))
     assert 'loss' in get_session()
 

From 082ea7b4c47e2a02352565a4c1d9789800928c90 Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Thu, 15 Dec 2022 18:24:28 +0900
Subject: [PATCH 21/28] Update siatune/mm/tasks/mmtrainbase.py

Co-authored-by: Hakjin Lee
Signed-off-by: Junhwa Song
---
 siatune/mm/tasks/mmtrainbase.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index b3c94937..e897a71a 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -59,7 +59,7 @@ def create_trainable(self) -> DataParallelTrainer:
         """Get ray trainable task.
 
         Returns:
-            TorchTrainer: The trainable task.
+            DataParallelTrainer: The trainable task.
         """
         assert self.num_workers == self.num_gpus_per_worker, (
             '`num_workers` must be equal to `num_gpus_per_worker`.')

From 789ca62c2daccb5eca8aa924258fa612a5ef3cfb Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Fri, 16 Dec 2022 10:34:56 +0900
Subject: [PATCH 22/28] Upgrade MMTask (#97)

* Update class signature

* Update mmseg

* Update mmdet

* Update mmcls

* Update configs

* Fix test code
---
 .../mmcls_cifar_100_asynchb_nevergrad_pso.py |   7 +-
 configs/mmdet/mmdet_asynchb_nevergrad_pso.py |   7 +-
 configs/mmseg/mmseg_asynchb_nevergrad_pso.py |  11 +-
 setup.cfg                                    |   4 +
 siatune/mm/tasks/mmcls.py                    | 242 ++++++++---------
 siatune/mm/tasks/mmdet.py                    | 254 +++++++++---------
 siatune/mm/tasks/mmseg.py                    | 225 ++++++++--------
 siatune/mm/tasks/mmtrainbase.py              |  48 +---
 tests/test_mm/test_tasks.py                  |  45 ++--
 9 files changed, 399 insertions(+), 444 deletions(-)

diff --git a/configs/mmcls/mmcls_cifar_100_asynchb_nevergrad_pso.py b/configs/mmcls/mmcls_cifar_100_asynchb_nevergrad_pso.py
index eda56e16..39cf8842 100644
--- a/configs/mmcls/mmcls_cifar_100_asynchb_nevergrad_pso.py
+++ b/configs/mmcls/mmcls_cifar_100_asynchb_nevergrad_pso.py
@@ -5,14 +5,11 @@
 ]
 
 space = {
+    'data.samples_per_gpu': {{_base_.batch_size}},
     'model': {{_base_.model}},
     'model.head.num_classes': 100,
     'optimizer': {{_base_.optimizer}},
-    'data.samples_per_gpu': {{_base_.batch_size}},
 }
 
 task = dict(type='MMClassification')
-metric = 'val/accuracy_top-1'
-mode = 'max'
-raise_on_failed_trial = False
-num_samples = 256
+tune_cfg = dict(num_samples=8, metric='val/accuracy_top-1', mode='max')
diff --git a/configs/mmdet/mmdet_asynchb_nevergrad_pso.py b/configs/mmdet/mmdet_asynchb_nevergrad_pso.py
index d06450ab..e780f291 100644
--- a/configs/mmdet/mmdet_asynchb_nevergrad_pso.py
+++ b/configs/mmdet/mmdet_asynchb_nevergrad_pso.py
@@ -5,13 +5,10 @@
 ]
 
 space = {
+    'data.samples_per_gpu': {{_base_.batch_size}},
     'model': {{_base_.model}},
     'optimizer': {{_base_.optimizer}},
-    'data.samples_per_gpu': {{_base_.batch_size}},
 }
 
 task = dict(type='MMDetection')
-metric = 'val/AP'
-mode = 'max'
-raise_on_failed_trial = False
-num_samples = 256
+tune_cfg = dict(num_samples=8, metric='val/AP', mode='max')
diff --git a/configs/mmseg/mmseg_asynchb_nevergrad_pso.py b/configs/mmseg/mmseg_asynchb_nevergrad_pso.py
index 3d81ecd6..a2e13b36 100644
--- a/configs/mmseg/mmseg_asynchb_nevergrad_pso.py
+++ b/configs/mmseg/mmseg_asynchb_nevergrad_pso.py
@@ -4,10 +4,13 @@
     '../_base_/space/optimizer.py', '../_base_/space/batch_size.py'
 ]
 
-space = dict(
-    data=dict(samples_per_gpu={{_base_.batch_size}}),
-    model={{_base_.model}},
-    optimizer={{_base_.optimizer}})
+space = {
+    'data.samples_per_gpu': {{_base_.batch_size}},
+    'model': {{_base_.model}},
+    'model.decode_head.num_classes': 21,
+    'model.auxiliary_head.num_classes': 21,
+    'optimizer': {{_base_.optimizer}},
+}
 
 task = dict(type='MMSegmentation')
 tune_cfg = dict(num_samples=8, metric='val/mIoU', mode='max')
diff --git a/setup.cfg b/setup.cfg
index bdbd251f..7852e3b4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,7 @@
+[flake8]
+per-file-ignores =
+    siatune/mm/tasks/mm*.py: E251,E501
+
 [isort]
 line_length = 79
 multi_line_output = 0
diff --git a/siatune/mm/tasks/mmcls.py b/siatune/mm/tasks/mmcls.py
index 0e5803b7..fdd8dc7a 100644
--- a/siatune/mm/tasks/mmcls.py
+++ b/siatune/mm/tasks/mmcls.py
@@ -3,14 +3,9 @@
 import copy
 import os
 import time
+import warnings
 from os import path as osp
-from typing import Optional, Sequence
-
-import mmcv
-import torch
-import torch.distributed as dist
-from mmcv.runner import get_dist_info
-from mmcv.utils import Config, DictAction, get_git_hash
+from typing import Sequence
 
 from .builder import TASKS
 from .mmtrainbase import MMTrainBasedTask
@@ -18,16 +13,18 @@
 
 @TASKS.register_module()
 class MMClassification(MMTrainBasedTask):
-    """MMClassification Wrapping class for ray tune."""
+    """MMClassification wrapper class for `ray.tune`.
 
-    def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
-        """Define and parse the necessary arguments for the task.
+    It is modified from https://github.com/open-mmlab/mmclassification/blob/v0.23.2/tools/train.py
 
-        Args:
-            args (Sequence[str]): The args.
-        Returns:
-            argparse.Namespace: The parsed args.
-        """
+    Attributes:
+        args (Sequence[str]):
+    """
+
+    VERSION = 'v0.23.2'
+
+    def parse_args(self, task_args: Sequence[str]):
+        from mmcv import DictAction
 
         parser = argparse.ArgumentParser(description='Train a model')
         parser.add_argument('config', help='train config file path')
@@ -39,6 +36,31 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
             '--no-validate',
             action='store_true',
             help='whether not to evaluate the checkpoint during training')
+        group_gpus = parser.add_mutually_exclusive_group()
+        group_gpus.add_argument(
+            '--device', help='device used for training. (Deprecated)')
+        group_gpus.add_argument(
+            '--gpus',
+            type=int,
+            help='(Deprecated, please use --gpu-id) number of gpus to use '
+            '(only applicable to non-distributed training)')
+        group_gpus.add_argument(
+            '--gpu-ids',
+            type=int,
+            nargs='+',
+            help='(Deprecated, please use --gpu-id) ids of gpus to use '
+            '(only applicable to non-distributed training)')
+        group_gpus.add_argument(
+            '--gpu-id',
+            type=int,
+            default=0,
+            help='id of gpu to use '
+            '(only applicable to non-distributed training)')
+        parser.add_argument(
+            '--ipu-replicas',
+            type=int,
+            default=None,
+            help='num of ipu replicas to use')
         parser.add_argument(
             '--seed', type=int, default=None, help='random seed')
         parser.add_argument(
@@ -53,108 +75,55 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
             '--cfg-options',
             nargs='+',
             action=DictAction,
-            help='override some settings in the used config, the key-value '
-            'pair in xxx=yyy format will be merged into config file. If the '
-            'value to be overwritten is a list, it should be like key="[a,b]" '
-            'or key=a,b It also allows nested list/tuple values, e.g. '
-            'key="[(a,b),(c,d)]" Note that the quotation marks are necessary '
-            'and that no white space is allowed.')
-        args = parser.parse_args(args)
-        return args
-
-    def build_model(self,
-                    cfg: Config,
-                    train_cfg: Optional[Config] = None,
-                    test_cfg: Optional[Config] = None) -> torch.nn.Module:
-        """Build the model from configs.
-
-        Args:
-            cfg (Config): The configs.
-            train_cfg (Optional[Config]):
-                The train opt. Defaults to None.
-            test_cfg (Optional[Config]):
-                The Test opt. Defaults to None.
-
-        Returns:
-            torch.nn.Module: The model.
-        """
-
-        from mmcls.models import build_classifier
-        return build_classifier(cfg)
-
-    def build_dataset(
-            self,
-            cfg: Config,
-            default_args: Optional[Config] = None) -> torch.utils.data.Dataset:
-        """Build the dataset from configs.
-
-        Args:
-            cfg (Config): The configs.
-            default_args (Optional[Config]):
-                The default args. Defaults to None.
-
-        Returns:
-            torch.utils.data.Dataset: The dataset.
-        """
-
-        from mmcls.datasets.builder import build_dataset
-        return build_dataset(cfg, default_args)
-
-    def train_model(self,
-                    model: torch.nn.Module,
-                    dataset: torch.utils.data.Dataset,
-                    cfg: Config,
-                    distributed: bool = True,
-                    validate: bool = False,
-                    timestamp: Optional[str] = None,
-                    meta: Optional[dict] = None) -> None:
-        from mmcls.apis.train import train_model
-        """Train the model.
-
-        Args:
-            model (torch.nn.Module): The model.
-            dataset (torch.utils.data.Dataset): The dataset.
-            cfg (Config): The configs.
-            distributed (bool):
-                Whether or not distributed. Defaults to True.
-            validate (bool):
-                Whether or not validate. Defaults to False.
-            timestamp (Optional[str]):
-                The timestamp. Defaults to None.
-            meta (Optional[dict]):
-                The meta. Defaults to None.
-        """
-
-        train_model(
-            model, dataset, cfg, distributed, validate, timestamp, meta=meta)
-        return
-
-    def run(self, *, args: argparse.Namespace, **kwargs) -> None:
+            help='override some settings in the used config, the key-value pair '
+            'in xxx=yyy format will be merged into config file. If the value to '
+            'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+            'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+            'Note that the quotation marks are necessary and that no white space '
+            'is allowed.')
+        parser.add_argument(
+            '--launcher',
+            choices=['none', 'pytorch', 'slurm', 'mpi'],
+            default='none',
+            help='job launcher')
+        parser.add_argument('--local_rank', type=int, default=0)
+        args = parser.parse_args(task_args)
+        if 'LOCAL_RANK' not in os.environ:
+            os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+        return args
+
+    def run(self, args: argparse.Namespace):
         """Run the task.
 
         Args:
             args (argparse.Namespace):
                 The args that received from context manager.
         """
-
+        import mmcv
+        import torch
+        import torch.distributed as dist
         from mmcls import __version__
-        from mmcls.apis import init_random_seed, set_random_seed
-        from mmcls.utils import (collect_env, get_root_logger,
-                                 setup_multi_processes)
-
-        if 'LOCAL_RANK' not in os.environ:
-            os.environ['LOCAL_RANK'] = str(dist.get_rank())
+        from mmcls.apis import init_random_seed, set_random_seed, train_model
+        from mmcls.datasets import build_dataset
+        from mmcls.models import build_classifier
+        from mmcls.utils import (auto_select_device, collect_env,
+                                 get_root_logger, setup_multi_processes)
+        from mmcv import Config
+        from mmcv.runner import get_dist_info, init_dist
 
         cfg = Config.fromfile(args.config)
         if args.cfg_options is not None:
             cfg.merge_from_dict(args.cfg_options)
 
+        # set multi-process settings
+        setup_multi_processes(cfg)
+
         # set cudnn_benchmark
         if cfg.get('cudnn_benchmark', False):
             torch.backends.cudnn.benchmark = True
 
-        # work_dir is determined in this priority:
-        # CLI > segment in file > filename
+        # work_dir is determined in this priority: CLI > segment in file > filename
         if args.work_dir is not None:
             # update configs according to CLI args if args.work_dir is not None
             cfg.work_dir = args.work_dir
         elif cfg.get('work_dir', None) is None:
             # use config filename as default work_dir if cfg.work_dir is None
             cfg.work_dir = osp.join('./work_dirs',
                                     osp.splitext(osp.basename(args.config))[0])
         if args.resume_from is not None:
             cfg.resume_from = args.resume_from
+        if args.gpus is not None:
+            cfg.gpu_ids = range(1)
+            warnings.warn('`--gpus` is deprecated because we only support '
+                          'single GPU mode in non-distributed training. '
+                          'Use `gpus=1` now.')
+        if args.gpu_ids is not None:
+            cfg.gpu_ids = args.gpu_ids[0:1]
+            warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
+                          'Because we only support single GPU mode in '
+                          'non-distributed training. Use the first GPU '
+                          'in `gpu_ids` now.')
+        if args.gpus is None and args.gpu_ids is None:
+            cfg.gpu_ids = [args.gpu_id]
+
+        if args.ipu_replicas is not None:
+            cfg.ipu_replicas = args.ipu_replicas
+            args.device = 'ipu'
 
         # init distributed env first, since logger depends on the dist info.
-        distributed = True
-        # gpu_ids is used to calculate iter when resuming checkpoint
-        _, world_size = get_dist_info()
-        cfg.gpu_ids = range(world_size)
+        if args.launcher == 'none':
+            distributed = False
+        else:
+            distributed = True
+            init_dist(args.launcher, **cfg.dist_params)
+            _, world_size = get_dist_info()
+            cfg.gpu_ids = range(world_size)
 
         # create work_dir
         mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
         # dump config
         cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
         # init the logger before other steps
         timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
         log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
         logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
 
-        # set multi-process settings
-        setup_multi_processes(cfg)
-
         # init the meta dict to record some important information such as
         # environment info and seed, which will be logged
         meta = dict()
         # log env info
         env_info_dict = collect_env()
-        env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()])
+        env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
         dash_line = '-' * 60 + '\n'
-        logger.info('Environment info:\n' + dash_line + env_info +  # noqa W504
-                    '\n' + dash_line)
+        logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                    dash_line)
         meta['env_info'] = env_info
 
         # log some basic info
         logger.info(f'Distributed training: {distributed}')
         logger.info(f'Config:\n{cfg.pretty_text}')
 
         # set random seeds
-        seed = init_random_seed(args.seed)
+        cfg.device = args.device or auto_select_device()
+        seed = init_random_seed(args.seed, device=cfg.device)
         seed = seed + dist.get_rank() if args.diff_seed else seed
         logger.info(f'Set random seed to {seed}, '
                     f'deterministic: {args.deterministic}')
         set_random_seed(seed, deterministic=args.deterministic)
         cfg.seed = seed
         meta['seed'] = seed
         meta['exp_name'] = osp.basename(args.config)
 
-        model = self.build_model(
-            cfg.model,
-            train_cfg=cfg.get('train_cfg'),
-            test_cfg=cfg.get('test_cfg'))
+        model = build_classifier(cfg.model)
         model.init_weights()
-        # SyncBN is not support for DP
-        logger.info(model)
 
-        datasets = [self.build_dataset(cfg.data.train)]
+        datasets = [build_dataset(cfg.data.train)]
         if len(cfg.workflow) == 2:
             val_dataset = copy.deepcopy(cfg.data.val)
             val_dataset.pipeline = cfg.data.train.pipeline
-            datasets.append(self.build_dataset(val_dataset))
-        if cfg.checkpoint_config is not None:
-            # save mmcls version, config file content and class names in
-            # checkpoints as meta data
-            cfg.checkpoint_config.meta = dict(
-                mmcls_version=f'{__version__}+{get_git_hash()[:7]}',
+            datasets.append(build_dataset(val_dataset))
+
+        # save mmcls version, config file content and class names in
+        # runner as meta data
+        meta.update(
+            dict(
+                mmcls_version=__version__,
                 config=cfg.pretty_text,
-                CLASSES=datasets[0].CLASSES)
+                CLASSES=datasets[0].CLASSES))
+
         # add an attribute for visualization convenience
         model.CLASSES = datasets[0].CLASSES
-        # passing checkpoint meta for saving best checkpoint
-        meta.update(cfg.checkpoint_config.meta)
-        self.train_model(
+
+        train_model(
             model,
             datasets,
             cfg,
-            distributed=True,
+            distributed=distributed,
             validate=(not args.no_validate),
             timestamp=timestamp,
+            device=cfg.device,
             meta=meta)
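Note: the rewritten `MMClassification` above essentially inlines mmclassification v0.23.2's `tools/train.py` into `run()`, deferring the framework imports into the method body, while `parse_args()` mirrors the upstream CLI. Judging only from the pieces shown in this series, a task wrapper is driven roughly like this (configuration contents are illustrative):

    task = MMClassification()            # registered through @TASKS.register_module()
    trainable = task.create_trainable()  # DataParallelTrainer, as of patch 20
    Tuner.from_cfg(tune_config, trainable).fit()  # as in siatune/apis/tune.py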
diff --git a/siatune/mm/tasks/mmdet.py b/siatune/mm/tasks/mmdet.py
index d84d182e..6504d339 100644
--- a/siatune/mm/tasks/mmdet.py
+++ b/siatune/mm/tasks/mmdet.py
@@ -3,14 +3,9 @@
 import copy
 import os
 import time
+import warnings
 from os import path as osp
-from typing import Optional, Sequence
-
-import mmcv
-import torch
-import torch.distributed as dist
-from mmcv.runner import get_dist_info
-from mmcv.utils import Config, DictAction, get_git_hash
+from typing import Sequence
 
 from .builder import TASKS
 from .mmtrainbase import MMTrainBasedTask
@@ -18,16 +13,18 @@
 @TASKS.register_module()
 class MMDetection(MMTrainBasedTask):
-    """MMDetection Wrapping class for ray tune."""
+    """MMDetection wrapper class for `ray.tune`.
 
-    def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
-        """Define and parse the necessary arguments for the task.
+    It is modified from https://github.com/open-mmlab/mmdetection/blob/v2.25.2/tools/train.py
 
-        Args:
-            args (Sequence[str]): The args.
-        Returns:
-            argparse.Namespace: The parsed args.
-        """
+    Attributes:
+        args (Sequence[str]):
+    """
+
+    VERSION = 'v2.25.2'
+
+    def parse_args(self, task_args: Sequence[str]):
+        from mmcv import DictAction
 
         parser = argparse.ArgumentParser(description='Train a detector')
         parser.add_argument('config', help='train config file path')
@@ -43,6 +40,24 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
             '--no-validate',
             action='store_true',
             help='whether not to evaluate the checkpoint during training')
+        group_gpus = parser.add_mutually_exclusive_group()
+        group_gpus.add_argument(
+            '--gpus',
+            type=int,
+            help='(Deprecated, please use --gpu-id) number of gpus to use '
+            '(only applicable to non-distributed training)')
+        group_gpus.add_argument(
+            '--gpu-ids',
+            type=int,
+            nargs='+',
+            help='(Deprecated, please use --gpu-id) ids of gpus to use '
+            '(only applicable to non-distributed training)')
+        group_gpus.add_argument(
+            '--gpu-id',
+            type=int,
+            default=0,
+            help='id of gpu to use '
+            '(only applicable to non-distributed training)')
         parser.add_argument(
             '--seed', type=int, default=None, help='random seed')
         parser.add_argument(
@@ -57,93 +72,44 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
             '--options',
             nargs='+',
             action=DictAction,
-            help='override some settings in the used config, the '
-            'key-value pair in xxx=yyy format will be merged into config file'
-            ' (deprecate), change to --cfg-options instead.')
+            help='override some settings in the used config, the key-value pair '
+            'in xxx=yyy format will be merged into config file (deprecate), '
+            'change to --cfg-options instead.')
         parser.add_argument(
             '--cfg-options',
             nargs='+',
             action=DictAction,
-            help='override some settings in the used config, the '
-            'key-value pair in xxx=yyy format will be merged into config '
-            'file. If the value to be overwritten is a list, it should be '
-            'like key="[a,b]" or key=a,b It also allows nested list/tuple '
-            'values, e.g. key="[(a,b),(c,d)]" Note that the quotation marks '
-            'are necessary and that no white space is allowed.')
+            help='override some settings in the used config, the key-value pair '
+            'in xxx=yyy format will be merged into config file. If the value to '
+            'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+            'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+            'Note that the quotation marks are necessary and that no white space '
+            'is allowed.')
+        parser.add_argument(
+            '--launcher',
+            choices=['none', 'pytorch', 'slurm', 'mpi'],
+            default='none',
+            help='job launcher')
+        parser.add_argument('--local_rank', type=int, default=0)
         parser.add_argument(
             '--auto-scale-lr',
             action='store_true',
             help='enable automatically scaling LR.')
-        args = parser.parse_args(args)
-        return args
-
-    def build_model(self,
-                    cfg: Config,
-                    train_cfg: Optional[Config] = None,
-                    test_cfg: Optional[Config] = None) -> torch.nn.Module:
-        """Build the model from configs.
-
-        Args:
-            cfg (Config): The configs.
-            train_cfg (Optional[Config]):
-                The train opt. Defaults to None.
-            test_cfg (Optional[Config]):
-                The Test opt. Defaults to None.
-
-        Returns:
-            torch.nn.Module: The model.
-        """
-
-        from mmdet.models.builder import build_detector
-        return build_detector(cfg, train_cfg, test_cfg)
-
-    def build_dataset(
-            self,
-            cfg: Config,
-            default_args: Optional[Config] = None) -> torch.utils.data.Dataset:
-        """Build the dataset from configs.
-
-        Args:
-            cfg (Config): The configs.
-            default_args (Optional[Config]):
-                The default args. Defaults to None.
-
-        Returns:
-            torch.utils.data.Dataset: The dataset.
-        """
-
-        from mmdet.datasets.builder import build_dataset
-        return build_dataset(cfg, default_args)
-
-    def train_model(self,
-                    model: torch.nn.Module,
-                    dataset: torch.utils.data.Dataset,
-                    cfg: Config,
-                    distributed: bool = True,
-                    validate: bool = False,
-                    timestamp: Optional[str] = None,
-                    meta: Optional[dict] = None) -> None:
-        """Train the model.
+        args = parser.parse_args(task_args)
+        if 'LOCAL_RANK' not in os.environ:
+            os.environ['LOCAL_RANK'] = str(args.local_rank)
 
-        Args:
-            model (torch.nn.Module): The model.
-            dataset (torch.utils.data.Dataset): The dataset.
-            cfg (Config): The configs.
-            distributed (bool):
-                Whether or not distributed. Defaults to True.
-            validate (bool):
-                Whether or not validate. Defaults to False.
-            timestamp (Optional[str]):
-                The timestamp. Defaults to None.
-            meta (Optional[dict]):
-                The meta. Defaults to None.
-        """
+        if args.options and args.cfg_options:
+            raise ValueError(
+                '--options and --cfg-options cannot be both '
+                'specified, --options is deprecated in favor of --cfg-options')
+        if args.options:
+            warnings.warn('--options is deprecated in favor of --cfg-options')
+            args.cfg_options = args.options
 
-        from mmdet.apis.train import train_detector
-        train_detector(model, dataset, cfg, distributed, validate, timestamp,
-                       meta)
+        return args
 
-    def run(self, *, args: argparse.Namespace, **kwargs) -> None:
+    def run(self, args: argparse.Namespace):
         """Run the task.
 
         Args:
@@ -151,24 +117,52 @@ def run(self, *, args: argparse.Namespace, **kwargs) -> None:
                The args that received from context manager.
""" + import mmcv + import torch + import torch.distributed as dist + from mmcv import Config + from mmcv.runner import get_dist_info, init_dist + from mmcv.utils import get_git_hash from mmdet import __version__ - from mmdet.apis import init_random_seed, set_random_seed + from mmdet.apis import (init_random_seed, set_random_seed, + train_detector) + from mmdet.datasets import build_dataset + from mmdet.models import build_detector from mmdet.utils import (collect_env, get_device, get_root_logger, - setup_multi_processes) - - if 'LOCAL_RANK' not in os.environ: - os.environ['LOCAL_RANK'] = str(dist.get_rank()) + replace_cfg_vals, setup_multi_processes, + update_data_root) cfg = Config.fromfile(args.config) + + # replace the ${key} with the value of cfg.key + cfg = replace_cfg_vals(cfg) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) + if args.auto_scale_lr: + if 'auto_scale_lr' in cfg and \ + 'enable' in cfg.auto_scale_lr and \ + 'base_batch_size' in cfg.auto_scale_lr: + cfg.auto_scale_lr.enable = True + else: + warnings.warn('Can not find "auto_scale_lr" or ' + '"auto_scale_lr.enable" or ' + '"auto_scale_lr.base_batch_size" in your' + ' configuration file. Please update all the ' + 'configuration files to mmdet >= 2.24.1.') + + # set multi-process settings + setup_multi_processes(cfg) + # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True - # work_dir is determined in this priority: - # CLI > segment in file > filename + # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir @@ -176,16 +170,33 @@ def run(self, *, args: argparse.Namespace, **kwargs) -> None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) + if args.resume_from is not None: cfg.resume_from = args.resume_from - cfg.auto_resume = args.auto_resume + if args.gpus is not None: + cfg.gpu_ids = range(1) + warnings.warn('`--gpus` is deprecated because we only support ' + 'single GPU mode in non-distributed training. ' + 'Use `gpus=1` now.') + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids[0:1] + warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' + 'Because we only support single GPU mode in ' + 'non-distributed training. Use the first GPU ' + 'in `gpu_ids` now.') + if args.gpus is None and args.gpu_ids is None: + cfg.gpu_ids = [args.gpu_id] # init distributed env first, since logger depends on the dist info. 
-        distributed = True
-        # gpu_ids is used to calculate iter when resuming checkpoint
-        _, world_size = get_dist_info()
-        cfg.gpu_ids = range(world_size)
+        if args.launcher == 'none':
+            distributed = False
+        else:
+            distributed = True
+            init_dist(args.launcher, **cfg.dist_params)
+            # re-set gpu_ids with distributed training mode
+            _, world_size = get_dist_info()
+            cfg.gpu_ids = range(world_size)
 
         # create work_dir
         mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
@@ -194,23 +205,19 @@ def run(self, *, args: argparse.Namespace, **kwargs) -> None:
         # init the logger before other steps
         timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
         log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
-        print(cfg)
         logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
 
-        # set multi-process settings
-        setup_multi_processes(cfg)
-
         # init the meta dict to record some important information such as
         # environment info and seed, which will be logged
         meta = dict()
         # log env info
         env_info_dict = collect_env()
-        env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()])
+        env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
         dash_line = '-' * 60 + '\n'
-        logger.info('Environment info:\n' + dash_line + env_info + # noqa W504
-                    '\n' + dash_line)
+        logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                    dash_line)
         meta['env_info'] = env_info
-
+        meta['config'] = cfg.pretty_text
         # log some basic info
         logger.info(f'Distributed training: {distributed}')
         logger.info(f'Config:\n{cfg.pretty_text}')
@@ -226,37 +233,32 @@ def run(self, *, args: argparse.Namespace, **kwargs) -> None:
         meta['seed'] = seed
         meta['exp_name'] = osp.basename(args.config)
 
-        model = self.build_model(
+        model = build_detector(
             cfg.model,
             train_cfg=cfg.get('train_cfg'),
             test_cfg=cfg.get('test_cfg'))
         model.init_weights()
-        # SyncBN is not support for DP
-        logger.info(model)
 
-        datasets = [self.build_dataset(cfg.data.train)]
+        datasets = [build_dataset(cfg.data.train)]
         if len(cfg.workflow) == 2:
+            assert 'val' in [mode for (mode, _) in cfg.workflow]
             val_dataset = copy.deepcopy(cfg.data.val)
-            val_dataset.pipeline = cfg.data.train.pipeline
-            datasets.append(self.build_dataset(val_dataset))
+            val_dataset.pipeline = cfg.data.train.get(
+                'pipeline', cfg.data.train.dataset.get('pipeline'))
+            datasets.append(build_dataset(val_dataset))
         if cfg.checkpoint_config is not None:
             # save mmdet version, config file content and class names in
             # checkpoints as meta data
             cfg.checkpoint_config.meta = dict(
-                mmdet_version=f'{__version__}+{get_git_hash()[:7]}',
-                config=cfg.pretty_text,
-                CLASSES=datasets[0].CLASSES,
-                PALETTE=datasets[0].PALETTE)
+                mmdet_version=__version__ + get_git_hash()[:7],
+                CLASSES=datasets[0].CLASSES)
         # add an attribute for visualization convenience
         model.CLASSES = datasets[0].CLASSES
-        # passing checkpoint meta for saving best checkpoint
-        meta.update(cfg.checkpoint_config.meta)
-        self.train_model(
+        train_detector(
             model,
             datasets,
             cfg,
-            distributed=True,
+            distributed=distributed,
             validate=(not args.no_validate),
             timestamp=timestamp,
             meta=meta)
diff --git a/siatune/mm/tasks/mmseg.py b/siatune/mm/tasks/mmseg.py
index 205a396c..14d2b929 100644
--- a/siatune/mm/tasks/mmseg.py
+++ b/siatune/mm/tasks/mmseg.py
@@ -3,14 +3,9 @@
 import copy
 import os
 import time
+import warnings
 from os import path as osp
-from typing import Optional, Sequence
-
-import mmcv
-import torch
-import torch.distributed as dist
-from mmcv.runner import get_dist_info
-from mmcv.utils import Config, DictAction, get_git_hash
+from typing import Sequence
 
 from .builder import TASKS
 from .mmtrainbase import MMTrainBasedTask
@@ -18,16 +13,18 @@
 @TASKS.register_module()
 class MMSegmentation(MMTrainBasedTask):
-    """MMSegmentation Wrapping class for ray tune."""
+    """MMSegmentation wrapper class for `ray.tune`.
 
-    def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
-        """Define and parse the necessary arguments for the task.
+    It is modified from https://github.com/open-mmlab/mmsegmentation/blob/v0.25.0/tools/train.py
 
-        Args:
-            args (Sequence[str]): The args.
-        Returns:
-            argparse.Namespace: The parsed args.
-        """
+    Attributes:
+        args (Sequence[str]):
+    """
+
+    VERSION = 'v0.25.0'
+
+    def parse_args(self, task_args: Sequence[str]):
+        from mmcv.utils import DictAction
 
         parser = argparse.ArgumentParser(description='Train a segmentor')
         parser.add_argument('config', help='train config file path')
@@ -41,6 +38,24 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
             '--no-validate',
             action='store_true',
             help='whether not to evaluate the checkpoint during training')
+        group_gpus = parser.add_mutually_exclusive_group()
+        group_gpus.add_argument(
+            '--gpus',
+            type=int,
+            help='(Deprecated, please use --gpu-id) number of gpus to use '
+            '(only applicable to non-distributed training)')
+        group_gpus.add_argument(
+            '--gpu-ids',
+            type=int,
+            nargs='+',
+            help='(Deprecated, please use --gpu-id) ids of gpus to use '
+            '(only applicable to non-distributed training)')
+        group_gpus.add_argument(
+            '--gpu-id',
+            type=int,
+            default=0,
+            help='id of gpu to use '
+            '(only applicable to non-distributed training)')
         parser.add_argument(
             '--seed', type=int, default=None, help='random seed')
         parser.add_argument(
@@ -51,102 +66,76 @@ def parse_args(self, args: Sequence[str]) -> argparse.Namespace:
             '--deterministic',
             action='store_true',
             help='whether to set deterministic options for CUDNN backend.')
+        parser.add_argument(
+            '--options',
+            nargs='+',
+            action=DictAction,
+            help=
+            "--options is deprecated in favor of --cfg_options' and it will "
+            'not be supported in version v0.22.0. Override some settings in the '
+            'used config, the key-value pair in xxx=yyy format will be merged '
+            'into config file. If the value to be overwritten is a list, it '
+            'should be like key="[a,b]" or key=a,b It also allows nested '
+            'list/tuple values, e.g. key="[(a,b),(c,d)]" Note that the quotation '
+            'marks are necessary and that no white space is allowed.')
         parser.add_argument(
             '--cfg-options',
             nargs='+',
             action=DictAction,
-            help='override some settings in the used config, the key-value '
-            'pair in xxx=yyy format will be merged into config file. If the '
-            'value to be overwritten is a list, it should be like key="[a,b]" '
-            'or key=a,b It also allows nested list/tuple values, e.g. '
-            'key="[(a,b),(c,d)]" Note that the quotation marks are necessary '
-            'and that no white space is allowed.')
+            help='override some settings in the used config, the key-value pair '
+            'in xxx=yyy format will be merged into config file. If the value to '
+            'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+            'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+            'Note that the quotation marks are necessary and that no white space '
+            'is allowed.')
+        parser.add_argument(
+            '--launcher',
+            choices=['none', 'pytorch', 'slurm', 'mpi'],
+            default='none',
+            help='job launcher')
+        parser.add_argument('--local_rank', type=int, default=0)
         parser.add_argument(
             '--auto-resume',
             action='store_true',
             help='resume from the latest checkpoint automatically.')
-        args = parser.parse_args(args)
-        return args
-
-    def build_model(self,
-                    cfg: Config,
-                    train_cfg: Optional[Config] = None,
-                    test_cfg: Optional[Config] = None) -> torch.nn.Module:
-        """Build the model from configs.
-
-        Args:
-            cfg (Config): The configs.
-            train_cfg (Optional[Config]):
-                The train opt. Defaults to None.
-            test_cfg (Optional[Config]):
-                The Test opt. Defaults to None.
-        Returns:
-            torch.nn.Module: The model.
-        """
-
-        from mmseg.models.builder import build_segmentor
-        return build_segmentor(cfg, train_cfg, test_cfg)
-
-    def build_dataset(
-            self,
-            cfg: Config,
-            default_args: Optional[Config] = None) -> torch.utils.data.Dataset:
-        """Build the dataset from configs.
-
-        Args:
-            cfg (Config): The configs.
-            default_args (Optional[Config]):
-                The default args. Defaults to None.
-        Returns:
-            torch.utils.data.Dataset: The dataset.
-        """
-
-        from mmseg.datasets.builder import build_dataset
-        return build_dataset(cfg, default_args)
-
-    def train_model(self,
-                    model: torch.nn.Module,
-                    dataset: torch.utils.data.Dataset,
-                    cfg: Config,
-                    distributed: bool = True,
-                    validate: bool = False,
-                    timestamp: Optional[str] = None,
-                    meta: Optional[dict] = None) -> None:
-        """Train the model.
-
-        Args:
-            model (torch.nn.Module): The model.
-            dataset (torch.utils.data.Dataset): The dataset.
-            cfg (Config): The configs.
-            distributed (bool):
-                Whether or not distributed. Defaults to True.
-            validate (bool):
-                Whether or not validate. Defaults to False.
-            timestamp (Optional[str]):
-                The timestamp. Defaults to None.
-            meta (Optional[dict]): The meta. Defaults to None.
-        """
+        args = parser.parse_args(task_args)
+        if 'LOCAL_RANK' not in os.environ:
+            os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+        if args.options and args.cfg_options:
+            raise ValueError(
+                '--options and --cfg-options cannot be both '
+                'specified, --options is deprecated in favor of --cfg-options. '
+                '--options will not be supported in version v0.22.0.')
+        if args.options:
+            warnings.warn(
+                '--options is deprecated in favor of --cfg-options. '
+                '--options will not be supported in version v0.22.0.')
+            args.cfg_options = args.options
 
-        from mmseg.apis.train import train_segmentor
-        return train_segmentor(model, dataset, cfg, distributed, validate,
-                               timestamp, meta)
+        return args
 
-    def run(self, *, args, **kwargs) -> None:
+    def run(self, args: argparse.Namespace):
         """Run the task.
 
         Args:
             args (argparse.Namespace):
                 The args that received from context manager.
""" - + import mmcv + import torch + import torch.distributed as dist + from mmcv.cnn.utils import revert_sync_batchnorm + from mmcv.runner import get_dist_info, init_dist + from mmcv.utils import Config, get_git_hash from mmseg import __version__ - from mmseg.apis import init_random_seed, set_random_seed - from mmseg.utils import (collect_env, get_root_logger, + from mmseg.apis import (init_random_seed, set_random_seed, + train_segmentor) + from mmseg.datasets import build_dataset + from mmseg.models import build_segmentor + from mmseg.utils import (collect_env, get_device, get_root_logger, setup_multi_processes) - if 'LOCAL_RANK' not in os.environ: - os.environ['LOCAL_RANK'] = str(dist.get_rank()) - cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) @@ -155,8 +144,7 @@ def run(self, *, args, **kwargs) -> None: if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True - # work_dir is determined in this priority: CLI > - # segment in file > filename + # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir @@ -168,14 +156,31 @@ def run(self, *, args, **kwargs) -> None: cfg.load_from = args.load_from if args.resume_from is not None: cfg.resume_from = args.resume_from + if args.gpus is not None: + cfg.gpu_ids = range(1) + warnings.warn('`--gpus` is deprecated because we only support ' + 'single GPU mode in non-distributed training. ' + 'Use `gpus=1` now.') + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids[0:1] + warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' + 'Because we only support single GPU mode in ' + 'non-distributed training. Use the first GPU ' + 'in `gpu_ids` now.') + if args.gpus is None and args.gpu_ids is None: + cfg.gpu_ids = [args.gpu_id] cfg.auto_resume = args.auto_resume # init distributed env first, since logger depends on the dist info. 
-        distributed = True
-        # gpu_ids is used to calculate iter when resuming checkpoint
-        _, world_size = get_dist_info()
-        cfg.gpu_ids = range(world_size)
+        if args.launcher == 'none':
+            distributed = False
+        else:
+            distributed = True
+            init_dist(args.launcher, **cfg.dist_params)
+            # gpu_ids is used to calculate iter when resuming checkpoint
+            _, world_size = get_dist_info()
+            cfg.gpu_ids = range(world_size)
 
         # create work_dir
         mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
@@ -196,8 +201,8 @@ def run(self, *, args, **kwargs) -> None:
         env_info_dict = collect_env()
         env_info = '\n'.join([f'{k}: {v}' for k, v in env_info_dict.items()])
         dash_line = '-' * 60 + '\n'
-        logger.info('Environment info:\n' + dash_line + env_info + # noqa W504
-                    '\n' + dash_line)
+        logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                    dash_line)
         meta['env_info'] = env_info
 
         # log some basic info
@@ -205,7 +210,8 @@ def run(self, *, args, **kwargs) -> None:
         logger.info(f'Config:\n{cfg.pretty_text}')
 
         # set random seeds
-        seed = init_random_seed(args.seed)
+        cfg.device = get_device()
+        seed = init_random_seed(args.seed, device=cfg.device)
         seed = seed + dist.get_rank() if args.diff_seed else seed
         logger.info(f'Set random seed to {seed}, '
                     f'deterministic: {args.deterministic}')
@@ -214,20 +220,27 @@ def run(self, *, args, **kwargs) -> None:
         meta['seed'] = seed
         meta['exp_name'] = osp.basename(args.config)
 
-        model = self.build_model(
+        model = build_segmentor(
             cfg.model,
             train_cfg=cfg.get('train_cfg'),
             test_cfg=cfg.get('test_cfg'))
         model.init_weights()
 
         # SyncBN is not support for DP
+        if not distributed:
+            warnings.warn(
+                'SyncBN is only supported with DDP. To be compatible with DP, '
+                'we convert SyncBN to BN. Please use dist_train.sh which can '
+                'avoid this error.')
+            model = revert_sync_batchnorm(model)
+
         logger.info(model)
 
-        datasets = [self.build_dataset(cfg.data.train)]
+        datasets = [build_dataset(cfg.data.train)]
         if len(cfg.workflow) == 2:
             val_dataset = copy.deepcopy(cfg.data.val)
             val_dataset.pipeline = cfg.data.train.pipeline
-            datasets.append(self.build_dataset(val_dataset))
+            datasets.append(build_dataset(val_dataset))
         if cfg.checkpoint_config is not None:
             # save mmseg version, config file content and class names in
             # checkpoints as meta data
@@ -240,11 +253,11 @@ def run(self, *, args, **kwargs) -> None:
         model.CLASSES = datasets[0].CLASSES
         # passing checkpoint meta for saving best checkpoint
         meta.update(cfg.checkpoint_config.meta)
-        self.train_model(
+        train_segmentor(
             model,
             datasets,
             cfg,
-            distributed=True,
+            distributed=distributed,
             validate=(not args.no_validate),
             timestamp=timestamp,
             meta=meta)
diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index e897a71a..e1f71928 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -1,7 +1,6 @@
 # Copyright (c) SI-Analytics. All rights reserved.
-from abc import ABCMeta, abstractmethod
+from abc import ABCMeta
 
-import mmcv
 import torch
 from ray.air.config import ScalingConfig
 from ray.train.data_parallel_trainer import DataParallelTrainer
@@ -15,52 +14,13 @@
 class MMTrainBasedTask(BaseTask, metaclass=ABCMeta):
     """Wrap the apis of open mm train-based projects."""
 
-    @abstractmethod
-    def build_model(self, cfg: mmcv.Config, **kwargs) -> torch.nn.Module:
-        """Build the model from configs.
-
-        Args:
-            cfg (Config): The configs.
-        Returns:
-            torch.nn.Module: The model.
- """ - pass - - @abstractmethod - def build_dataset(self, cfg: mmcv.Config, - **kwargs) -> torch.utils.data.Dataset: - """Build the dataset from configs. - - Args: - cfg (Config): The configs. - Returns: - torch.utils.data.Dataset: The dataset. - """ - pass - - @abstractmethod - def train_model( - self, - model: torch.nn.Module, - dataset: torch.utils.data.Dataset, - cfg: mmcv.Config, - **kwargs, - ) -> None: - """Train the model. - - Args: - model (torch.nn.Module): The model. - dataset (torch.utils.data.Dataset): The dataset. - cfg (Config): The configs. - """ - pass - def create_trainable(self) -> DataParallelTrainer: - """Get ray trainable task. + """Get a :class:`DataParallelTrainer` instance. Returns: - DataParallelTrainer: The trainable task. + DataParallelTrainer: Trainer to optimize hyperparameter. """ + assert self.num_workers == self.num_gpus_per_worker, ( '`num_workers` must be equal to `num_gpus_per_worker`.') diff --git a/tests/test_mm/test_tasks.py b/tests/test_mm/test_tasks.py index a779dc86..a602f9ad 100644 --- a/tests/test_mm/test_tasks.py +++ b/tests/test_mm/test_tasks.py @@ -1,7 +1,9 @@ import argparse -import os from unittest.mock import patch +import mmcls # noqa: F401 +import mmdet # noqa: F401 +import mmseg # noqa: F401 import pytest import torch from mmcv.utils import Config @@ -162,36 +164,33 @@ def test_discrete_test_function(mock_report): assert isinstance(get_session().get('result'), float) -@patch.object(MMSegmentation, 'train_model') -@patch.object(MMSegmentation, 'build_model') -@patch.object(MMSegmentation, 'build_dataset') -def test_mmseg(*not_used): - os.environ['LOCAL_RANK'] = '0' - - task = MMSegmentation() - task.set_args(['tests/data/config.py']) +@patch('mmcls.apis.train_model') +@patch('mmcls.datasets.build_dataset') +@patch('mmcls.models.build_classifier') +def test_mmcls(*not_used): + task = MMClassification() + task_args = ['tests/data/config.py'] + task.set_args(task_args) task.run(args=task.args) -@patch.object(MMDetection, 'train_model') -@patch.object(MMDetection, 'build_model') -@patch.object(MMDetection, 'build_dataset') +@patch('mmdet.apis.train_detector') +@patch('mmdet.datasets.build_dataset') +@patch('mmdet.models.build_detector') def test_mmdet(*not_used): - os.environ['LOCAL_RANK'] = '0' - task = MMDetection() - task.set_args(['tests/data/config.py']) + task_args = ['tests/data/config.py'] + task.set_args(task_args) task.run(args=task.args) -@patch.object(MMClassification, 'train_model') -@patch.object(MMClassification, 'build_model') -@patch.object(MMClassification, 'build_dataset') -def test_mmcls(*not_used): - os.environ['LOCAL_RANK'] = '0' - - task = MMClassification() - task.set_args(['tests/data/config.py']) +@patch('mmseg.apis.train_segmentor') +@patch('mmseg.datasets.build_dataset') +@patch('mmseg.models.build_segmentor') +def test_mmseg(*not_used): + task = MMSegmentation() + task_args = ['tests/data/config.py'] + task.set_args(task_args) task.run(args=task.args) From d2ff007ccd80b1eabafa7c0c64bf4812cf5440f4 Mon Sep 17 00:00:00 2001 From: Junhwa Song Date: Fri, 16 Dec 2022 19:21:09 +0900 Subject: [PATCH 23/28] Fix minor (#100) * Fix blocking issue at test_tasks.py * Support single GPU tuning * Bump FLAML to v1.0.14 to avoid deprecated warning --- requirements/optional.txt | 2 +- siatune/mm/tasks/mmtrainbase.py | 4 +- tests/test_mm/test_tasks.py | 280 ---------------------- tests/test_mm/test_tasks/test_base.py | 66 +++++ tests/test_mm/test_tasks/test_blackbox.py | 80 +++++++ tests/test_mm/test_tasks/test_mmtask.py | 37 +++ 6 
files changed, 185 insertions(+), 284 deletions(-) delete mode 100644 tests/test_mm/test_tasks.py create mode 100644 tests/test_mm/test_tasks/test_base.py create mode 100644 tests/test_mm/test_tasks/test_blackbox.py create mode 100644 tests/test_mm/test_tasks/test_mmtask.py diff --git a/requirements/optional.txt b/requirements/optional.txt index 5bddb703..4ce60537 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1,5 +1,5 @@ bayesian-optimization==1.2.0 -flaml==0.9.7 +flaml==1.0.14 hyperopt==0.2.5 mlflow==1.23.1 nevergrad==0.4.3.post7 diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py index e1f71928..3f2607f3 100644 --- a/siatune/mm/tasks/mmtrainbase.py +++ b/siatune/mm/tasks/mmtrainbase.py @@ -28,8 +28,6 @@ def create_trainable(self) -> DataParallelTrainer: self.context_aware_run, backend_config=CustomBackendConfig(), scaling_config=ScalingConfig( - trainer_resources=dict( - CPU=self.num_cpus_per_worker, - GPU=self.num_gpus_per_worker), + trainer_resources=dict(CPU=self.num_cpus_per_worker), num_workers=self.num_workers, use_gpu=torch.cuda.is_available())) diff --git a/tests/test_mm/test_tasks.py b/tests/test_mm/test_tasks.py deleted file mode 100644 index a602f9ad..00000000 --- a/tests/test_mm/test_tasks.py +++ /dev/null @@ -1,280 +0,0 @@ -import argparse -from unittest.mock import patch - -import mmcls # noqa: F401 -import mmdet # noqa: F401 -import mmseg # noqa: F401 -import pytest -import torch -from mmcv.utils import Config -from ray import tune -from ray.air import session - -from siatune.mm.tasks import (TASKS, BaseTask, BlackBoxTask, - ContinuousTestFunction, DiscreteTestFunction, - MMClassification, MMDetection, MMSegmentation, - MMTrainBasedTask, build_task_processor) -from siatune.utils.config import dump_cfg - -_session = dict() - - -def report_to_session(*args, **kwargs): - _session = get_session() - _session.update(kwargs) - for arg in args: - if isinstance(arg, dict): - _session.update(arg) - - -def get_session(): - global _session - return _session - - -@patch('ray.tune.report', side_effect=report_to_session) -def test_base_task(mock_report): - with pytest.raises(TypeError): - BaseTask() - - class TestRewriter: - - def __call__(self, context): - context.get('args').test = -1 - return context - - class TestTask(BaseTask): - - def parse_args(self, *args, **kwargs): - return argparse.Namespace(test=1) - - def run(self, *, args, **kwargs): - tune.report(test=args.test) - return args.test - - def create_trainable(self): - return self.context_aware_run - - task = TestTask([TestRewriter()]) - task.set_args('') - assert task.args == argparse.Namespace(test=1) - assert isinstance(task.rewriters, list) - task.context_aware_run({}) - assert get_session().get('test') == -1 - - tune.run(task.create_trainable(), config={}) - - -def test_black_box_task(): - with pytest.raises(TypeError): - BlackBoxTask() - - class TestTask(BlackBoxTask): - - def run(self, *args, **kwargs): - tune.report(test=1) - - task = TestTask() - task.set_args('') - assert task.args == argparse.Namespace() - tune.run(task.create_trainable(), config={}) - - -def test_build_task_processor(): - - class TestTaks(BaseTask): - - def parse_args(self, *args, **kwargs): - pass - - def run(self, *args, **kwargs): - pass - - def create_trainable(self, *args, **kwargs): - pass - - TASKS.register_module(TestTaks) - assert isinstance(build_task_processor(dict(type='TestTaks')), TestTaks) - - -@patch('ray.tune.report', side_effect=report_to_session) -def 
test_continuous_test_function(mock_report): - func = ContinuousTestFunction() - predefined_cont_funcs = [ - 'delayedsphere', - 'sphere', - 'sphere1', - 'sphere2', - 'sphere4', - 'maxdeceptive', - 'sumdeceptive', - 'altcigar', - 'discus', - 'cigar', - 'bentcigar', - 'multipeak', - 'altellipsoid', - 'stepellipsoid', - 'ellipsoid', - 'rastrigin', - 'bucherastrigin', - 'doublelinearslope', - 'stepdoublelinearslope', - 'hm', - 'rosenbrock', - 'ackley', - 'schwefel_1_2', - 'griewank', - 'deceptiveillcond', - 'deceptivepath', - 'deceptivemultimodal', - 'lunacek', - 'genzcornerpeak', - 'minusgenzcornerpeak', - 'genzgaussianpeakintegral', - 'minusgenzgaussianpeakintegral', - 'slope', - 'linear', - 'st0', - 'st1', - 'st10', - 'st100', - ] - - for func_name in predefined_cont_funcs: - dump_cfg( - Config(dict(func=func_name, _variable0=0.0, _variable1=0.0)), - 'test.py') - args = argparse.Namespace(config='test.py') - func.run(args=args) - assert isinstance(get_session().get('result'), float) - - -@patch('ray.tune.report', side_effect=report_to_session) -def test_discrete_test_function(mock_report): - func = DiscreteTestFunction() - - predefined_discrete_funcs = ['onemax', 'leadingones', 'jump'] - for func_name in predefined_discrete_funcs: - dump_cfg( - Config(dict(func=func_name, _variable0=0.0, _variable1=0.0)), - 'test.py') - args = argparse.Namespace(config='test.py') - func.run(args=args) - assert isinstance(get_session().get('result'), float) - - -@patch('mmcls.apis.train_model') -@patch('mmcls.datasets.build_dataset') -@patch('mmcls.models.build_classifier') -def test_mmcls(*not_used): - task = MMClassification() - task_args = ['tests/data/config.py'] - task.set_args(task_args) - task.run(args=task.args) - - -@patch('mmdet.apis.train_detector') -@patch('mmdet.datasets.build_dataset') -@patch('mmdet.models.build_detector') -def test_mmdet(*not_used): - task = MMDetection() - task_args = ['tests/data/config.py'] - task.set_args(task_args) - task.run(args=task.args) - - -@patch('mmseg.apis.train_segmentor') -@patch('mmseg.datasets.build_dataset') -@patch('mmseg.models.build_segmentor') -def test_mmseg(*not_used): - task = MMSegmentation() - task_args = ['tests/data/config.py'] - task.set_args(task_args) - task.run(args=task.args) - - -@patch('ray.air.session.report', side_effect=report_to_session) -def test_mm_train_based_task(mock_report): - with pytest.raises(TypeError): - MMTrainBasedTask() - - class TestTask(MMTrainBasedTask): - - def parse_args(self, args): - parser = argparse.ArgumentParser() - return parser.parse_args(args) - - def build_model(self, cfg): - - class Regression(torch.nn.Module): - - def __init__(self, input_dim, output_dim): - super().__init__() - self.linear = torch.nn.Linear(input_dim, output_dim) - - def forward(self, x): - return self.linear(x) - - return Regression(cfg.input_dim, cfg.output_dim) - - def build_dataset(self, cfg): - - class Dataset(torch.utils.data.Dataset): - - def __init__(self, num_points): - torch.manual_seed(0) - self._x = torch.randn(num_points, 1) - self._y = 2 * self._x + 1 - self.num_points = num_points - - def __getitem__(self, index): - return self._x[index], self._y[index] - - def __len__(self): - return self.num_points - - return Dataset(cfg.num_points) - - def train_model(self, model, dataset, cfg): - criterion = torch.nn.MSELoss() - optimizer = torch.optim.SGD(model.parameters(), lr=cfg.lr) - data_loader = torch.utils.data.DataLoader( - dataset, batch_size=cfg.batch_size) - for _ in range(cfg.num_epochs): - total_loss = 0. 
-                for batch_idx, (data, target) in enumerate(data_loader):
-                    optimizer.zero_grad()
-                    output = model(data)
-                    loss = criterion(output, target)
-                    loss.backward()
-                    optimizer.step()
-                    total_loss += loss.item()
-                session.report(loss=total_loss / (batch_idx + 1))
-
-        def run(self, *, searched_cfg, **kwargs):
-            cfg = searched_cfg.get('cfg')
-            model = self.build_model(cfg.model)
-            dataset = self.build_dataset(cfg.data)
-            self.train_model(model, dataset, cfg.train)
-
-    cfg = Config(
-        dict(
-            model=dict(
-                input_dim=1,
-                output_dim=1,
-            ),
-            data=dict(num_points=128, ),
-            train=dict(
-                lr=0.1,
-                batch_size=32,
-                num_epochs=4,
-            )))
-
-    task = TestTask()
-    task.set_resource()
-    task.context_aware_run(searched_cfg=dict(cfg=cfg))
-    assert 'loss' in get_session()
-
-    trainable = task.create_trainable()
-    tune.Tuner(trainable).fit()
diff --git a/tests/test_mm/test_tasks/test_base.py b/tests/test_mm/test_tasks/test_base.py
new file mode 100644
index 00000000..ee3f19ef
--- /dev/null
+++ b/tests/test_mm/test_tasks/test_base.py
@@ -0,0 +1,66 @@
+import argparse
+
+import pytest
+import ray
+from ray import tune
+from ray.tune.result_grid import ResultGrid
+
+from siatune.mm.tasks import TASKS, BaseTask, build_task_processor
+
+
+@pytest.fixture
+def init_ray():
+    if ray.is_initialized():
+        ray.shutdown()
+    return ray.init(num_cpus=1)
+
+
+def test_base_task(init_ray):
+    with pytest.raises(TypeError):
+        BaseTask()
+
+    class TestTask(BaseTask):
+
+        def parse_args(self, args):
+            parser = argparse.ArgumentParser()
+            parser.add_argument('test')
+            return parser.parse_args(args)
+
+        def run(self, args):
+            tune.report(test=args.test)
+
+        def create_trainable(self):
+            return self.context_aware_run
+
+    class TestRewriter:
+
+        def __call__(self, context):
+            args = context.pop('args')
+            args.test = 'success'
+            return dict(args=args)
+
+    task = TestTask(rewriters=[TestRewriter()])
+    task.set_args(['default'])
+    assert task.args == argparse.Namespace(test='default')
+
+    trainable = task.create_trainable()
+    results = ResultGrid(tune.run(trainable, config={}))
+    assert results[0].metrics['test'] == 'success'
+
+
+def test_build_task_processor():
+
+    @TASKS.register_module()
+    class TestTask(BaseTask):
+
+        def parse_args(self, args):
+            pass
+
+        def run(self, args):
+            pass
+
+        def create_trainable(self):
+            pass
+
+    task = build_task_processor(dict(type='TestTask', rewriters=[]))
+    assert isinstance(task, (BaseTask, TestTask))
diff --git a/tests/test_mm/test_tasks/test_blackbox.py b/tests/test_mm/test_tasks/test_blackbox.py
new file mode 100644
index 00000000..41bdfdad
--- /dev/null
+++ b/tests/test_mm/test_tasks/test_blackbox.py
@@ -0,0 +1,80 @@
+import argparse
+from unittest.mock import patch
+
+from mmcv.utils import Config
+
+from siatune.mm.tasks import ContinuousTestFunction, DiscreteTestFunction
+from siatune.utils.config import dump_cfg
+
+session = dict()
+
+
+def report_to_session(**kwargs):
+    session.update(kwargs)
+
+
+@patch('ray.tune.report', side_effect=report_to_session)
+def test_continuous_test_function(init_ray):
+    func = ContinuousTestFunction()
+    predefined_cont_funcs = [
+        'delayedsphere',
+        'sphere',
+        'sphere1',
+        'sphere2',
+        'sphere4',
+        'maxdeceptive',
+        'sumdeceptive',
+        'altcigar',
+        'discus',
+        'cigar',
+        'bentcigar',
+        'multipeak',
+        'altellipsoid',
+        'stepellipsoid',
+        'ellipsoid',
+        'rastrigin',
+        'bucherastrigin',
+        'doublelinearslope',
+        'stepdoublelinearslope',
+        'hm',
+        'rosenbrock',
+        'ackley',
+        'schwefel_1_2',
+        'griewank',
+        'deceptiveillcond',
+        'deceptivepath',
+        'deceptivemultimodal',
+        'lunacek',
+        'genzcornerpeak',
+        'minusgenzcornerpeak',
+        'genzgaussianpeakintegral',
+        'minusgenzgaussianpeakintegral',
+        'slope',
+        'linear',
+        'st0',
+        'st1',
+        'st10',
+        'st100',
+    ]
+
+    for func_name in predefined_cont_funcs:
+        dump_cfg(
+            Config(dict(func=func_name, _variable0=0.0, _variable1=0.0)),
+            'test.py')
+        args = argparse.Namespace(config='test.py')
+        func.run(args=args)
+        assert isinstance(session['result'], float)
+
+
+@patch('ray.tune.report', side_effect=report_to_session)
+def test_discrete_test_function(init_ray):
+    func = DiscreteTestFunction()
+
+    predefined_discrete_funcs = ['onemax', 'leadingones', 'jump']
+    for func_name in predefined_discrete_funcs:
+        dump_cfg(
+            Config(dict(func=func_name, _variable0=0.0, _variable1=0.0)),
+            'test.py')
+        args = argparse.Namespace(config='test.py')
+        func.run(args=args)
+        assert isinstance(session['result'], float)
diff --git a/tests/test_mm/test_tasks/test_mmtask.py b/tests/test_mm/test_tasks/test_mmtask.py
new file mode 100644
index 00000000..01dda0fa
--- /dev/null
+++ b/tests/test_mm/test_tasks/test_mmtask.py
@@ -0,0 +1,37 @@
+from unittest.mock import patch
+
+import mmcls  # noqa: F401
+import mmdet  # noqa: F401
+import mmseg  # noqa: F401
+
+from siatune.mm.tasks import MMClassification, MMDetection, MMSegmentation
+
+
+@patch('mmcls.apis.train_model')
+@patch('mmcls.datasets.build_dataset')
+@patch('mmcls.models.build_classifier')
+def test_mmcls(*not_used):
+    task = MMClassification()
+    task_args = ['tests/data/config.py']
+    task.set_args(task_args)
+    task.run(args=task.args)
+
+
+@patch('mmdet.apis.train_detector')
+@patch('mmdet.datasets.build_dataset')
+@patch('mmdet.models.build_detector')
+def test_mmdet(*not_used):
+    task = MMDetection()
+    task_args = ['tests/data/config.py']
+    task.set_args(task_args)
+    task.run(args=task.args)
+
+
+@patch('mmseg.apis.train_segmentor')
+@patch('mmseg.datasets.build_dataset')
+@patch('mmseg.models.build_segmentor')
+def test_mmseg(*not_used):
+    task = MMSegmentation()
+    task_args = ['tests/data/config.py']
+    task.set_args(task_args)
+    task.run(args=task.args)

From e63911dde4d3f871f09531d2f920fe651f93cebb Mon Sep 17 00:00:00 2001
From: Junhwa Song
Date: Sat, 17 Dec 2022 00:01:54 +0900
Subject: [PATCH 24/28] Update siatune/mm/tasks/mmtrainbase.py

Signed-off-by: Junhwa Song
---
 siatune/mm/tasks/mmtrainbase.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/siatune/mm/tasks/mmtrainbase.py b/siatune/mm/tasks/mmtrainbase.py
index 3f2607f3..e9c4bb1e 100644
--- a/siatune/mm/tasks/mmtrainbase.py
+++ b/siatune/mm/tasks/mmtrainbase.py
@@ -21,9 +21,6 @@ def create_trainable(self) -> DataParallelTrainer:
             DataParallelTrainer: Trainer to optimize hyperparameter.
""" - assert self.num_workers == self.num_gpus_per_worker, ( - '`num_workers` must be equal to `num_gpus_per_worker`.') - return DataParallelTrainer( self.context_aware_run, backend_config=CustomBackendConfig(), From 9d4f5e68af307453c8f457705711a2870192b3b3 Mon Sep 17 00:00:00 2001 From: KKIEEK Date: Sat, 17 Dec 2022 01:58:25 +0900 Subject: [PATCH 25/28] Fix typo --- tests/test_mm/test_tasks/test_blackbox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_mm/test_tasks/test_blackbox.py b/tests/test_mm/test_tasks/test_blackbox.py index 41bdfdad..84947d7a 100644 --- a/tests/test_mm/test_tasks/test_blackbox.py +++ b/tests/test_mm/test_tasks/test_blackbox.py @@ -14,7 +14,7 @@ def report_to_session(**kwargs): @patch('ray.tune.report', side_effect=report_to_session) -def test_continuous_test_function(init_ray): +def test_continuous_test_function(*not_used): func = ContinuousTestFunction() predefined_cont_funcs = [ 'delayedsphere', @@ -67,7 +67,7 @@ def test_continuous_test_function(init_ray): @patch('ray.tune.report', side_effect=report_to_session) -def test_discrete_test_function(init_ray): +def test_discrete_test_function(*not_used): func = DiscreteTestFunction() predefined_discrete_funcs = ['onemax', 'leadingones', 'jump'] From cf5a79b8b45fc546b7648c33589b8342581337a9 Mon Sep 17 00:00:00 2001 From: Junhwa Song Date: Sat, 17 Dec 2022 15:39:45 +0900 Subject: [PATCH 26/28] Supplement documentations (#102) --- siatune/mm/tasks/mmcls.py | 9 +++++++- siatune/mm/tasks/mmdet.py | 9 +++++++- siatune/mm/tasks/mmseg.py | 9 +++++++- siatune/ray/tuner.py | 47 +++++++++++++++++++++++---------------- 4 files changed, 52 insertions(+), 22 deletions(-) diff --git a/siatune/mm/tasks/mmcls.py b/siatune/mm/tasks/mmcls.py index fdd8dc7a..c77485a1 100644 --- a/siatune/mm/tasks/mmcls.py +++ b/siatune/mm/tasks/mmcls.py @@ -18,7 +18,14 @@ class MMClassification(MMTrainBasedTask): It is modified from https://github.com/open-mmlab/mmclassification/blob/v0.23.2/tools/train.py Attributes: - args (Sequence[str]): + args (argparse.Namespace): The arguments for `tools/train.py` + script file. It is parsed by :method:`parse_args`. + num_workers (int): The number of workers to launch. + num_cpus_per_worker (int): The number of CPUs per worker. + Default to 1. + num_gpus_per_worker (int): The number of GPUs per worker. + Since it must be equal `num_workers` attribute, it is + not used in MMClassification. """ VERSION = 'v0.23.2' diff --git a/siatune/mm/tasks/mmdet.py b/siatune/mm/tasks/mmdet.py index 6504d339..e19448f5 100644 --- a/siatune/mm/tasks/mmdet.py +++ b/siatune/mm/tasks/mmdet.py @@ -18,7 +18,14 @@ class MMDetection(MMTrainBasedTask): It is modified from https://github.com/open-mmlab/mmdetection/blob/v2.25.2/tools/train.py Attributes: - args (Sequence[str]): + args (argparse.Namespace): The arguments for `tools/train.py` + script file. It is parsed by :method:`parse_args`. + num_workers (int): The number of workers to launch. + num_cpus_per_worker (int): The number of CPUs per worker. + Default to 1. + num_gpus_per_worker (int): The number of GPUs per worker. + Since it must be equal `num_workers` attribute, it is + not used in MMDetection. 
""" VERSION = 'v2.25.2' diff --git a/siatune/mm/tasks/mmseg.py b/siatune/mm/tasks/mmseg.py index 14d2b929..db17d687 100644 --- a/siatune/mm/tasks/mmseg.py +++ b/siatune/mm/tasks/mmseg.py @@ -18,7 +18,14 @@ class MMSegmentation(MMTrainBasedTask): It is modified from https://github.com/open-mmlab/mmsegmentation/blob/v0.25.0/tools/train.py Attributes: - args (Sequence[str]): + args (argparse.Namespace): The arguments for `tools/train.py` + script file. It is parsed by :method:`parse_args`. + num_workers (int): The number of workers to launch. + num_cpus_per_worker (int): The number of CPUs per worker. + Default to 1. + num_gpus_per_worker (int): The number of GPUs per worker. + Since it must be equal `num_workers` attribute, it is + not used in MMSegmentation. """ VERSION = 'v0.25.0' diff --git a/siatune/ray/tuner.py b/siatune/ray/tuner.py index e1aaf079..045bbf5b 100644 --- a/siatune/ray/tuner.py +++ b/siatune/ray/tuner.py @@ -1,6 +1,7 @@ # Copyright (c) SI-Analytics. All rights reserved. import copy import os.path as osp +from typing import Any, Callable, Optional, Union from ray.air.config import RunConfig from ray.tune.tune_config import TuneConfig @@ -14,27 +15,35 @@ class Tuner: """Wrapper class of :class:`ray.tune.tuner.Tuner`. Args: - trainable (Callable): - work_dir (str): - param_space (dict, optional): - tune_cfg (dict, optional): - Refer to https://github.com/ray-project/ray/blob/ray-2.1.0/python/ray/tune/tune_config.py for details. # noqa - searcher (dict, optional): - trial_scheduler (dict, optional): - stopper (dict, optional): - callbacks (list, optional): + trainable (Callable): The trainable to be tuned. + work_dir (str): The working directory to save checkpoints. The logs + will be saved in the subdirectory of `work_dir`. + param_space (dict, optional): Search space of the tuning task. + tune_cfg (dict, optional): Tuning algorithm specific configs + except for `search_alg` and `scheduler`. + Refer to :class:`ray.tune.tune_config.TuneConfig` for more info. + searcher (dict, optional): Search algorithm for optimization. + Default to random search. + Refer to :module:`ray.tune.search` for more options. + trial_scheduler (dict, optional): Scheduler for executing the trial. + Default to FIFO scheduler. + Refer to :module:`ray.tune.schedulers` for more options. + stopper (dict, optional): Stop conditions to consider. + Refer to :class:`ray.tune.stopper.Stopper` for more info. + callbacks (dict | list, optional): Callbacks to invoke. + Refer to :class:``ray.tune.callback.Callback` for more info. 
""" def __init__( self, - trainable, - work_dir, - param_space=None, - tune_cfg=None, - searcher=None, - trial_scheduler=None, - stopper=None, - callbacks=None, + trainable: Callable[[dict], Any], + work_dir: str, + param_space: Optional[dict] = None, + tune_cfg: Optional[dict] = None, + searcher: Optional[dict] = None, + trial_scheduler: Optional[dict] = None, + stopper: Optional[dict] = None, + callbacks: Optional[Union[dict, list]] = None, ): work_dir = osp.abspath(work_dir) @@ -73,7 +82,7 @@ def __init__( ) @classmethod - def from_cfg(cls, cfg, trainable): + def from_cfg(cls, cfg: dict, trainable: Callable[[dict], Any]): cfg = copy.deepcopy(cfg) tuner = cls( trainable, @@ -89,7 +98,7 @@ def from_cfg(cls, cfg, trainable): return tuner @classmethod - def resume(cls, path, **kwargs): + def resume(cls, path: str, **kwargs): return cls.restore(path, **kwargs) def fit(self): From ab040696b9d2dbb6bd7a955d7245ec15463915fb Mon Sep 17 00:00:00 2001 From: Junhwa Song Date: Sat, 17 Dec 2022 15:43:07 +0900 Subject: [PATCH 27/28] Update siatune/ray/tuner.py Signed-off-by: Junhwa Song --- siatune/ray/tuner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/siatune/ray/tuner.py b/siatune/ray/tuner.py index 045bbf5b..08fe9a58 100644 --- a/siatune/ray/tuner.py +++ b/siatune/ray/tuner.py @@ -31,7 +31,7 @@ class Tuner: stopper (dict, optional): Stop conditions to consider. Refer to :class:`ray.tune.stopper.Stopper` for more info. callbacks (dict | list, optional): Callbacks to invoke. - Refer to :class:``ray.tune.callback.Callback` for more info. + Refer to :class:`ray.tune.callback.Callback` for more info. """ def __init__( From 9ac7b32d9f52aef10b89f9c2cf99ac78a1cb77c8 Mon Sep 17 00:00:00 2001 From: Junhwa Song Date: Mon, 19 Dec 2022 11:21:51 +0900 Subject: [PATCH 28/28] Support resume (#104) --- siatune/ray/tuner.py | 13 +++++++++---- siatune/run.py | 5 +++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/siatune/ray/tuner.py b/siatune/ray/tuner.py index 08fe9a58..695078b2 100644 --- a/siatune/ray/tuner.py +++ b/siatune/ray/tuner.py @@ -32,6 +32,8 @@ class Tuner: Refer to :class:`ray.tune.stopper.Stopper` for more info. callbacks (dict | list, optional): Callbacks to invoke. Refer to :class:`ray.tune.callback.Callback` for more info. + resume (str, optional): The experiment path to resume. + Default to None. 
""" def __init__( @@ -44,6 +46,7 @@ def __init__( trial_scheduler: Optional[dict] = None, stopper: Optional[dict] = None, callbacks: Optional[Union[dict, list]] = None, + resume: Optional[str] = None, ): work_dir = osp.abspath(work_dir) @@ -66,6 +69,8 @@ def __init__( callbacks = [callbacks] callbacks = [build_callback(callback) for callback in callbacks] + self.resume = resume + self.tuner = RayTuner( trainable, param_space=dict(train_loop_config=param_space), @@ -93,13 +98,13 @@ def from_cfg(cls, cfg: dict, trainable: Callable[[dict], Any]): trial_scheduler=cfg.get('trial_scheduler', None), stopper=cfg.get('stopper', None), callbacks=cfg.get('callbacks', None), + resume=cfg.get('resume', None), ) return tuner - @classmethod - def resume(cls, path: str, **kwargs): - return cls.restore(path, **kwargs) - def fit(self): + if self.resume is not None: + self.tuner = RayTuner.restore(self.resume) + return self.tuner.fit() diff --git a/siatune/run.py b/siatune/run.py index 50ed7708..21647776 100644 --- a/siatune/run.py +++ b/siatune/run.py @@ -21,6 +21,8 @@ def parse_args() -> Namespace: parser.add_argument('tune_config', help='tune config file path') parser.add_argument( '--work-dir', default=None, help='the dir to save logs and models') + parser.add_argument( + '--resume', default=None, help='the experiment path to resume') parser.add_argument( '--address', default=None, @@ -89,6 +91,9 @@ def main() -> None: if hasattr(task_processor.args, 'work_dir'): task_processor.args.work_dir = tune_config.work_dir + if args.resume is not None: + tune_config.resume = args.resume + ray.init( address=args.address, num_cpus=args.num_cpus, num_gpus=args.num_gpus) assert ray.is_initialized()