From a6f099531a3ebef429ba59e997b84f4388def880 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 1 Mar 2021 19:10:27 +0100
Subject: [PATCH 1/9] cleaning SWA (#6259)

* rename
* if
* test
* chlog
---
 CHANGELOG.md                                        |  3 +++
 pytorch_lightning/callbacks/__init__.py             |  2 +-
 .../callbacks/{swa.py => stochastic_weight_avg.py}  | 10 ++++------
 .../trainer/connectors/callback_connector.py        |  2 +-
 .../{test_swa.py => test_stochastic_weight_avg.py}  |  0
 5 files changed, 9 insertions(+), 8 deletions(-)
 rename pytorch_lightning/callbacks/{swa.py => stochastic_weight_avg.py} (97%)
 rename tests/callbacks/{test_swa.py => test_stochastic_weight_avg.py} (100%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f31000b0c302..9cf9b731c27fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147))
 
+- Renamed `pytorch_lightning.callbacks.swa` to `pytorch_lightning.callbacks.stochastic_weight_avg` ([#6259](https://github.com/PyTorchLightning/pytorch-lightning/pull/6259))
+
+
 ### Deprecated
 
diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py
index f3787c1cb2f7f..fb61ad81aee28 100644
--- a/pytorch_lightning/callbacks/__init__.py
+++ b/pytorch_lightning/callbacks/__init__.py
@@ -22,7 +22,7 @@
 from pytorch_lightning.callbacks.progress import ProgressBar, ProgressBarBase
 from pytorch_lightning.callbacks.pruning import ModelPruning
 from pytorch_lightning.callbacks.quantization import QuantizationAwareTraining
-from pytorch_lightning.callbacks.swa import StochasticWeightAveraging
+from pytorch_lightning.callbacks.stochastic_weight_avg import StochasticWeightAveraging
 
 __all__ = [
     'BackboneFinetuning',
diff --git a/pytorch_lightning/callbacks/swa.py b/pytorch_lightning/callbacks/stochastic_weight_avg.py
similarity index 97%
rename from pytorch_lightning/callbacks/swa.py
rename to pytorch_lightning/callbacks/stochastic_weight_avg.py
index c8cf367cb4d5e..bece2ffe9f1b2 100644
--- a/pytorch_lightning/callbacks/swa.py
+++ b/pytorch_lightning/callbacks/stochastic_weight_avg.py
@@ -102,12 +102,10 @@ def __init__(
         if isinstance(swa_epoch_start, float) and not (0 <= swa_epoch_start <= 1):
             raise MisconfigurationException(err_msg)
 
-        if (
-            swa_lrs is not None and (
-                not isinstance(swa_lrs, (float, list)) or isinstance(swa_lrs, float) and swa_lrs <= 0
-                or isinstance(swa_lrs, list) and not all(lr > 0 and isinstance(lr, float) for lr in swa_lrs)
-            )
-        ):
+        wrong_type = not isinstance(swa_lrs, (float, list))
+        wrong_float = isinstance(swa_lrs, float) and swa_lrs <= 0
+        wrong_list = isinstance(swa_lrs, list) and not all(lr > 0 and isinstance(lr, float) for lr in swa_lrs)
+        if (swa_lrs is not None and (wrong_type or wrong_float or wrong_list)):
             raise MisconfigurationException("The `swa_lrs` should be a positive float or a list of positive float.")
 
         if avg_fn is not None and not isinstance(avg_fn, Callable):
diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py
index 40ac8f3e69870..8a5289e608c94 100644
--- a/pytorch_lightning/trainer/connectors/callback_connector.py
+++ b/pytorch_lightning/trainer/connectors/callback_connector.py
@@ -76,7 +76,7 @@ def _configure_swa_callbacks(self):
         if not self.trainer._stochastic_weight_avg:
             return
 
-        from pytorch_lightning.callbacks.swa import StochasticWeightAveraging
+        from pytorch_lightning.callbacks.stochastic_weight_avg import StochasticWeightAveraging
         existing_swa = [cb for cb in self.trainer.callbacks if isinstance(cb, StochasticWeightAveraging)]
         if not existing_swa:
             self.trainer.callbacks = [StochasticWeightAveraging()] + self.trainer.callbacks
diff --git a/tests/callbacks/test_swa.py b/tests/callbacks/test_stochastic_weight_avg.py
similarity index 100%
rename from tests/callbacks/test_swa.py
rename to tests/callbacks/test_stochastic_weight_avg.py
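For context, a minimal sketch of how the renamed callback is exercised. The import path is the new module name introduced by this patch; the `swa_lrs` value and the `Trainer` wiring are illustrative only, not taken from the diff:

```python
# Sketch only: uses the renamed import and the swa_lrs validation shown above.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.stochastic_weight_avg import StochasticWeightAveraging

# A positive float (or a list of positive floats) is accepted.
swa = StochasticWeightAveraging(swa_lrs=0.05)
trainer = Trainer(callbacks=[swa])

# Any other value trips the refactored check and raises MisconfigurationException,
# e.g. StochasticWeightAveraging(swa_lrs=-0.05) or StochasticWeightAveraging(swa_lrs='0.05').
```

The refactor itself is behavior-preserving: the one large boolean expression is split into three named predicates (`wrong_type`, `wrong_float`, `wrong_list`), which reads more clearly than the nested `and`/`or` chain it replaces.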
From 69903a07ebc05739059b7fd1459fc5ddeac62b52 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Tue, 2 Mar 2021 03:15:43 +0900
Subject: [PATCH 2/9] Remove opt from manual_backward in docs (#6267)

---
 docs/source/common/lightning_module.rst | 8 ++++----
 pytorch_lightning/core/lightning.py     | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst
index e9c138a2c12cc..c02f23ac60d09 100644
--- a/docs/source/common/lightning_module.rst
+++ b/docs/source/common/lightning_module.rst
@@ -946,7 +946,7 @@ When set to ``False``, Lightning does not automate the optimization process. Thi
         opt = self.optimizers(use_pl_optimizer=True)
 
         loss = ...
-        self.manual_backward(loss, opt)
+        self.manual_backward(loss)
         opt.step()
         opt.zero_grad()
 
@@ -961,16 +961,16 @@ In the multi-optimizer case, ignore the ``optimizer_idx`` argument and use the o
     def training_step(self, batch, batch_idx, optimizer_idx):
         # access your optimizers with use_pl_optimizer=False. Default is True
-        (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True)
+        opt_a, opt_b = self.optimizers(use_pl_optimizer=True)
 
         gen_loss = ...
         opt_a.zero_grad()
-        self.manual_backward(gen_loss, opt_a)
+        self.manual_backward(gen_loss)
         opt_a.step()
 
         disc_loss = ...
         opt_b.zero_grad()
-        self.manual_backward(disc_loss, opt_b)
+        self.manual_backward(disc_loss)
         opt_b.step()
 
 --------------
 
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 5dce0caa59720..52bcc213692ac 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -1211,10 +1211,10 @@ def manual_backward(self, loss: Tensor, optimizer: Optional[Optimizer] = None, *
         Example::
 
             def training_step(...):
-                (opt_a, opt_b) = self.optimizers()
+                opt_a, opt_b = self.optimizers()
                 loss = ...
                 # automatically applies scaling, etc...
-                self.manual_backward(loss, opt_a)
+                self.manual_backward(loss)
                 opt_a.step()
         """
         if optimizer is not None:
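A self-contained sketch of the calling convention this patch documents: `manual_backward` now takes only the loss, and the optimizer is stepped explicitly. The module below is made up for illustration; the assumption that manual optimization is enabled by overriding the `automatic_optimization` property matches the docs section the diff edits, but the model and loss are placeholders:

```python
import torch
import pytorch_lightning as pl


class ManualOptModel(pl.LightningModule):
    """Made-up minimal module; only the optimization wiring matters here."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 1)

    @property
    def automatic_optimization(self) -> bool:
        # Assumption: disabling automatic optimization via this property,
        # as described in the docs section this patch edits.
        return False

    def training_step(self, batch, batch_idx):
        opt = self.optimizers(use_pl_optimizer=True)
        loss = self.layer(batch).sum()  # placeholder loss
        opt.zero_grad()
        self.manual_backward(loss)  # no optimizer argument, per this patch
        opt.step()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)
```

Passing the optimizer to `manual_backward` still works at this point (the signature keeps `optimizer: Optional[Optimizer] = None`), but the docs now show the loss-only form.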
From 3f9ed7687105dc3da3c7a6f7e6858a64b44a253f Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 1 Mar 2021 22:14:55 +0100
Subject: [PATCH 3/9] switch agents pool (#6270)

---
 azure-pipelines.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 4d84253473bbc..7b48121311e71 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -23,7 +23,7 @@ jobs:
   # how much time to give 'run always even if cancelled tasks' before stopping them
   cancelTimeoutInMinutes: 2
 
-  pool: dsvm-spot-pool
+  pool: gridai-spot-pool
 
   #strategy:
   #  matrix:
@@ -58,7 +58,7 @@ jobs:
       export GIT_TERMINAL_PROMPT=1
       #sudo apt-get install -y cmake
       # python -m pip install "pip==20.1"
-      pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+      pip install --requirement requirements.txt
      python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)"
      python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
      pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed

From b5d33763b33b50d9c93e82f1e5ca6c1d4ee76a4e Mon Sep 17 00:00:00 2001
From: Max Frei
Date: Mon, 1 Mar 2021 20:59:28 +0100
Subject: [PATCH 4/9] Allow user to disable the automatic formatting of
 checkpoint file names.

---
 .../callbacks/model_checkpoint.py            | 31 +++++++++++++++++--
 tests/checkpointing/test_model_checkpoint.py |  8 +++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 54ad16f7b686f..e525269355c70 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -131,6 +131,15 @@ class ModelCheckpoint(Callback):
     ...     filename='sample-mnist-{epoch:02d}-{val_loss:.2f}'
     ... )
 
+    # save epoch and val_loss in name, but specify the formatting yourself (e.g. to avoid problems with Tensorboard
+    # or Neptune, due to the presence of characters like = or /)
+    # saves a file like: my/path/sample-mnist-epoch02-val_loss0.32.ckpt
+    >>> checkpoint_callback = ModelCheckpoint(
+    ...     monitor='val/loss',
+    ...     dirpath='my/path/',
+    ...     filename='sample-mnist-epoch{epoch:02d}-val_loss{val/loss:.2f}'
+    ... )
+
     # retrieve the best checkpoint after training
     checkpoint_callback = ModelCheckpoint(dirpath='my/path/')
     trainer = Trainer(callbacks=[checkpoint_callback])
@@ -156,6 +165,7 @@ def __init__(
         save_weights_only: bool = False,
         mode: str = "min",
         period: int = 1,
+        auto_insert_metric_name: bool = True
     ):
         super().__init__()
         self.monitor = monitor
@@ -164,6 +174,7 @@ def __init__(
         self.save_top_k = save_top_k
         self.save_weights_only = save_weights_only
         self.period = period
+        self.auto_insert_metric_name = auto_insert_metric_name
         self._last_global_step_saved = -1
         self.current_score = None
         self.best_k_models = {}
@@ -336,6 +347,7 @@ def _format_checkpoint_name(
         step: int,
         metrics: Dict[str, Any],
         prefix: str = "",
+        auto_insert_metric_name: bool = True
     ) -> str:
         if not filename:
             # filename is not set, use default name
@@ -347,7 +359,10 @@ def _format_checkpoint_name(
             metrics.update({"epoch": epoch, 'step': step})
             for group in groups:
                 name = group[1:]
-                filename = filename.replace(group, name + "={" + name)
+
+                if auto_insert_metric_name:
+                    filename = filename.replace(group, name + "={" + name)
+
                 if name not in metrics:
                     metrics[name] = 0
             filename = filename.format(**metrics)
@@ -372,6 +387,12 @@ def format_checkpoint_name(self, epoch: int, step: int, metrics: Dict[str, Any],
         >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}-{val_loss:.2f}')
         >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456)))
         'epoch=2-val_loss=0.12.ckpt'
+        >>> ckpt = ModelCheckpoint(
+        >>>     dirpath=tmpdir,
+        >>>     filename='epoch={epoch}-validation_loss={val_loss:.2f}',
+        >>>     auto_insert_metric_name=False)
+        >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456)))
+        'epoch=2-validation_loss=0.12.ckpt'
         >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{missing:d}')
         >>> os.path.basename(ckpt.format_checkpoint_name(0, 4, metrics={}))
         'missing=0.ckpt'
@@ -380,7 +401,13 @@ def format_checkpoint_name(self, epoch: int, step: int, metrics: Dict[str, Any],
         'step=0.ckpt'
 
         """
-        filename = self._format_checkpoint_name(self.filename, epoch, step, metrics)
+        filename = self._format_checkpoint_name(
+            self.filename,
+            epoch,
+            step,
+            metrics,
+            auto_insert_metric_name=self.auto_insert_metric_name)
+
         if ver is not None:
             filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}"))
 
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index a6ec2bc56656f..d8b6809651606 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -426,6 +426,14 @@ def test_model_checkpoint_format_checkpoint_name(tmpdir):
     ckpt_name = ckpt.format_checkpoint_name(4, 3, {'val/loss': 0.03})
     assert ckpt_name == 'epoch=4_val/loss=0.03000.ckpt'
 
+    # auto_insert_metric_name=False
+    ckpt_name = ModelCheckpoint._format_checkpoint_name(
+        'epoch={epoch:03d}-val_acc={val/acc}',
+        3,
+        2,
+        {'val/acc': 0.03},
+        auto_insert_metric_name=False)
+    assert ckpt_name == 'epoch=003-val_acc=0.03'
 
 class ModelCheckpointExtensionTest(ModelCheckpoint):
     FILE_EXTENSION = '.tpkc'
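To illustrate the new flag outside the doctests, a small sketch; the file-name outcomes are taken from the docstring examples above, while the paths are stand-ins:

```python
from pytorch_lightning.callbacks import ModelCheckpoint

# Default behaviour: metric names are inserted automatically into the
# template, yielding names like 'epoch=2-val_loss=0.12.ckpt'.
default_ckpt = ModelCheckpoint(dirpath='my/path/', filename='{epoch}-{val_loss:.2f}')

# With auto_insert_metric_name=False the template is taken literally, so the
# user controls the layout and can avoid characters like '=' that confuse
# some loggers, e.g. 'epoch=2-validation_loss=0.12.ckpt' here.
custom_ckpt = ModelCheckpoint(
    dirpath='my/path/',
    filename='epoch={epoch}-validation_loss={val_loss:.2f}',
    auto_insert_metric_name=False,
)
```

Note the design: the flag is stored on the callback in `__init__` and threaded through to the static `_format_checkpoint_name`, so both the instance path (`format_checkpoint_name`) and direct static calls (as in the new test) honor it.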
From e400c35f98c740f22170c237cdf2d4f0762b1f19 Mon Sep 17 00:00:00 2001
From: Max Frei
Date: Mon, 1 Mar 2021 21:23:03 +0100
Subject: [PATCH 5/9] Added changelog entry.

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9cf9b731c27fd..4cf0bb22fe642 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072))
 
+- Added `auto_insert_metric_name` parameter to `ModelCheckpoint` ([#6277](https://github.com/PyTorchLightning/pytorch-lightning/pull/6277))
+
+
 ### Changed
 
 - Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147))

From 48ff9a3422f74f4406ef2d654bdff2e9a42b2328 Mon Sep 17 00:00:00 2001
From: Max Frei
Date: Mon, 1 Mar 2021 21:28:08 +0100
Subject: [PATCH 6/9] Made flake8 happy.

---
 pytorch_lightning/callbacks/model_checkpoint.py | 7 +++----
 tests/checkpointing/test_model_checkpoint.py    | 1 +
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index e525269355c70..88b3dd81f1ab8 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -387,10 +387,9 @@ def format_checkpoint_name(self, epoch: int, step: int, metrics: Dict[str, Any],
         >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}-{val_loss:.2f}')
         >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456)))
         'epoch=2-val_loss=0.12.ckpt'
-        >>> ckpt = ModelCheckpoint(
-        >>>     dirpath=tmpdir,
-        >>>     filename='epoch={epoch}-validation_loss={val_loss:.2f}',
-        >>>     auto_insert_metric_name=False)
+        >>> ckpt = ModelCheckpoint(dirpath=tmpdir,
+        ...                        filename='epoch={epoch}-validation_loss={val_loss:.2f}',
+        ...                        auto_insert_metric_name=False)
         >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456)))
         'epoch=2-validation_loss=0.12.ckpt'
         >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{missing:d}')
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index d8b6809651606..af8dacb60160a 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -435,6 +435,7 @@ def test_model_checkpoint_format_checkpoint_name(tmpdir):
         auto_insert_metric_name=False)
     assert ckpt_name == 'epoch=003-val_acc=0.03'
 
+
 class ModelCheckpointExtensionTest(ModelCheckpoint):
     FILE_EXTENSION = '.tpkc'
 
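For reference, the doctest convention behind this fix: a statement starts with `>>>` and its continuation lines must use `...`; repeating `>>>` on a continuation line makes doctest parse each line as a new, syntactically broken statement. A tiny self-contained illustration (made up, not from the diff):

```python
def add(a, b):
    """Add two numbers.

    >>> add(
    ...     1,
    ...     2,
    ... )
    3
    """
    return a + b


if __name__ == "__main__":
    import doctest
    doctest.testmod()  # passes; with '>>>' on the continuation lines it would fail
```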
From db849c356e16060d7f98d95169e8c775689ccac6 Mon Sep 17 00:00:00 2001
From: Max Frei <36265931+maxfrei750@users.noreply.github.com>
Date: Mon, 8 Mar 2021 20:30:46 +0100
Subject: [PATCH 7/9] Applied review suggestion: quotes for special characters
 in docstring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index a3ae09cb52b19..a498661237f21 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -132,7 +132,7 @@ class ModelCheckpoint(Callback):
     ... )
 
     # save epoch and val_loss in name, but specify the formatting yourself (e.g. to avoid problems with Tensorboard
-    # or Neptune, due to the presence of characters like = or /)
+    # or Neptune, due to the presence of characters like '=' or '/')
     # saves a file like: my/path/sample-mnist-epoch02-val_loss0.32.ckpt
     >>> checkpoint_callback = ModelCheckpoint(
     ...     monitor='val/loss',

From 278cef7f8191bf614441ff2faab84cc4a34d6170 Mon Sep 17 00:00:00 2001
From: Max Frei
Date: Mon, 8 Mar 2021 20:33:04 +0100
Subject: [PATCH 8/9] Fixed example in docstring.

---
 pytorch_lightning/callbacks/model_checkpoint.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index a498661237f21..99b60d9fe06c1 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -138,6 +138,7 @@ class ModelCheckpoint(Callback):
     ...     monitor='val/loss',
     ...     dirpath='my/path/',
     ...     filename='sample-mnist-epoch{epoch:02d}-val_loss{val/loss:.2f}'
+    ...     auto_insert_metric_name: False
     ... )
 
     # retrieve the best checkpoint after training

From 34f5b04bdf5a126ea80e6d01d5f8e83af785137b Mon Sep 17 00:00:00 2001
From: Max Frei
Date: Mon, 8 Mar 2021 20:35:01 +0100
Subject: [PATCH 9/9] Fixed syntax error in docstring.

---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 99b60d9fe06c1..f05a10a41996b 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -137,8 +137,8 @@ class ModelCheckpoint(Callback):
     >>> checkpoint_callback = ModelCheckpoint(
     ...     monitor='val/loss',
     ...     dirpath='my/path/',
-    ...     filename='sample-mnist-epoch{epoch:02d}-val_loss{val/loss:.2f}'
-    ...     auto_insert_metric_name: False
+    ...     filename='sample-mnist-epoch{epoch:02d}-val_loss{val/loss:.2f}',
+    ...     auto_insert_metric_name=False
     ... )
 
     # retrieve the best checkpoint after training
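Pulling the series together, a hypothetical end-to-end configuration wiring up both features touched by these patches; the model and fit call are placeholders, and the file-name template is the final form of the docstring example from patch 9:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, StochasticWeightAveraging

# Custom formatting: '{val/loss:.2f}' is substituted by the metric value, so the
# resulting name contains no '=' or '/', e.g. sample-mnist-epoch02-val_loss0.32.ckpt
checkpoint_callback = ModelCheckpoint(
    monitor='val/loss',
    dirpath='my/path/',
    filename='sample-mnist-epoch{epoch:02d}-val_loss{val/loss:.2f}',
    auto_insert_metric_name=False,
)

trainer = Trainer(callbacks=[checkpoint_callback, StochasticWeightAveraging()])
# trainer.fit(model, datamodule=dm)  # model/dm are placeholders
```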