fixing TensorBoard #687

Merged · 9 commits · Jan 16, 2020 · changes from 6 commits shown

3 changes: 2 additions & 1 deletion pytorch_lightning/core/hooks.py
@@ -1,5 +1,6 @@
"""
# Hooks
Hooks
=====

There are cases when you might want to do something different at different parts of the training/validation loop.
To enable a hook, simply override the method in your LightningModule and the trainer will call it at the correct time.
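
For instance, a minimal sketch (not part of this diff, assuming the usual top-level import and that `on_epoch_end` is one of the provided hooks):

import pytorch_lightning as pl


class CoolModel(pl.LightningModule):
    # training_step, configure_optimizers, dataloaders, etc. omitted for brevity

    def on_epoch_end(self):
        # the Trainer invokes this automatically at the end of every epoch
        print('finished an epoch')
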
15 changes: 8 additions & 7 deletions pytorch_lightning/core/lightning.py
@@ -135,7 +135,7 @@ def training_step(self, batch, batch_idx):

logger_logs = {'training_loss': loss} # optional (MUST ALL BE TENSORS)

# if using TestTubeLogger or TensorboardLogger you can nest scalars
# if using TestTubeLogger or TensorBoardLogger you can nest scalars
logger_logs = {'losses': logger_logs} # optional (MUST ALL BE TENSORS)

output = {
@@ -665,9 +665,10 @@ def configure_optimizers(self):

.. note:: If you use multiple optimizers, training_step will have an additional `optimizer_idx` parameter.

.. note:: If you use LBFGS lightning handles the closure function automatically for you.

.. note:: If you use multiple optimizers, gradients will be calculated only for the parameters of current optimizer at each training step.
.. note:: If you use LBFGS lightning handles the closure function automatically for you

.. note:: If you use multiple optimizers, gradients will be calculated only
for the parameters of current optimizer at each training step.

Example
-------
@@ -939,9 +940,9 @@ def load_from_metrics(cls, weights_path, tags_csv, map_location=None):
for mapping storage {'cuda:1':'cuda:0'}
:return: The pretrained LightningModule

If you're using test tube, there is an alternate method which uses the meta_tags.csv
file from test-tube to rebuild the model. The meta_tags.csv file can be found in the
test-tube experiment save_dir.
If you're using `test-tube`, there is an alternate method which uses the meta_tags.csv
file from test-tube to rebuild the model. The `meta_tags.csv` file can be found in the
`test-tube` experiment save_dir.

.. code-block:: python

2 changes: 1 addition & 1 deletion pytorch_lightning/logging/__init__.py
@@ -166,8 +166,8 @@ def __init__(self, hparams):
"""

from os import environ
from .base import LightningLoggerBase, rank_zero_only

from .base import LightningLoggerBase, rank_zero_only
from .tensorboard import TensorBoardLogger

try:
39 changes: 28 additions & 11 deletions pytorch_lightning/logging/tensorboard.py
@@ -1,8 +1,10 @@
import os
from warnings import warn
from argparse import Namespace
from pkg_resources import parse_version

import torch
from pkg_resources import parse_version
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

from .base import LightningLoggerBase, rank_zero_only
@@ -28,8 +30,8 @@ class TensorBoardLogger(LightningLoggerBase):
directory for existing versions, then automatically assigns the next available version.
:param \**kwargs: Other arguments are passed directly to the :class:`SummaryWriter` constructor.


"""
NAME_CSV_TAGS = 'meta_tags.csv'

def __init__(self, save_dir, name="default", version=None, **kwargs):
super().__init__()
@@ -38,6 +40,7 @@ def __init__(self, save_dir, name="default", version=None, **kwargs):
self._version = version

self._experiment = None
self.tags = {}
self.kwargs = kwargs

@property
@@ -57,22 +60,25 @@ def experiment(self):

@rank_zero_only
def log_hyperparams(self, params):
if params is None:
return

# in case converting from namespace
if isinstance(params, Namespace):
params = vars(params)
params = dict(params)

if parse_version(torch.__version__) < parse_version("1.3.0"):
warn(
f"Hyperparameter logging is not available for Torch version {torch.__version__}."
" Skipping log_hyperparams. Upgrade to Torch 1.3.0 or above to enable"
" hyperparameter logging."
)
# TODO: some alternative should be added
return
try:
# in case converting from namespace, todo: rather test if it is namespace
params = vars(params)
except TypeError:
pass
if params is not None:
else:
# `add_hparams` requires both - hparams and metric
self.experiment.add_hparams(hparam_dict=dict(params), metric_dict={})
self.experiment.add_hparams(hparam_dict=params, metric_dict={})
# some alternative should be added
self.tags.update(params)

@rank_zero_only
def log_metrics(self, metrics, step=None):
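
Read together, the added lines above suggest the updated `log_hyperparams` reduces to roughly the following (a sketch assembled from the diff, relying on the imports shown at the top of the file; the verbatim merged method may differ slightly):

@rank_zero_only
def log_hyperparams(self, params):
    if params is None:
        return

    # in case converting from namespace
    if isinstance(params, Namespace):
        params = vars(params)
    params = dict(params)

    if parse_version(torch.__version__) < parse_version("1.3.0"):
        warn(
            f"Hyperparameter logging is not available for Torch version {torch.__version__}."
            " Skipping log_hyperparams. Upgrade to Torch 1.3.0 or above to enable"
            " hyperparameter logging."
        )
    else:
        # `add_hparams` requires both - hparams and metric
        self.experiment.add_hparams(hparam_dict=params, metric_dict={})
    # keep the params around so save() can write them to meta_tags.csv
    self.tags.update(params)
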
@@ -89,6 +95,17 @@ def save(self):
# you are using PT version (<v1.2) which does not have implemented flush
self.experiment._get_file_writer().flush()

# create a pseudo standard path ala test-tube
dir_path = os.path.join(self.save_dir, self.name, 'version_%s' % self.version)
if not os.path.isdir(dir_path):
dir_path = self.save_dir
# prepare the file path
meta_tags_path = os.path.join(dir_path, self.NAME_CSV_TAGS)
# save the metatags file
df = pd.DataFrame({'key': list(self.tags.keys()),
'value': list(self.tags.values())})
df.to_csv(meta_tags_path, index=False)

Review comments on this change:

Contributor: Shouldn't all this tag saving stuff go into the log_hyperparams function, not log_metrics?

Member Author: good question, what do you propose? @williamFalcon

Contributor: I think you can just move all this new code up into where the warning is in log_hyperparams.

Contributor: agreed with @neggert

Member Author: so you want to do a save always when a new parameter is added? it makes more sense for me to do it while saving...

Contributor: log_metrics gets called all the time. we shouldn’t save the csv over and over when that happens.

Member Author: so saving it in save() is fine? or am I missing something...

Contributor: i haven’t looked at this in a while. i would just put it where test tube does it

Member Author: test-tube does it while saving too, that's how I "copy-paste" it...
https://github.com/williamFalcon/test-tube/blob/master/test_tube/log.py#L366

Contributor: log_hyperparams is intended to be called once at the beginning of training. I think that's what we want.


@rank_zero_only
def finalize(self, status):
self.save()
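
To illustrate the behaviour the thread converges on (a usage sketch, not code from this PR): hyperparameters are only accumulated into `self.tags` when `log_hyperparams` is called, and `meta_tags.csv` is only written when `save()` or `finalize()` runs, so repeated `log_metrics` calls never rewrite the file.

from argparse import Namespace
from pytorch_lightning.logging import TensorBoardLogger

logger = TensorBoardLogger(save_dir='lightning_logs', name='example')

# typically called once at the start of training: tags are collected in memory
logger.log_hyperparams(Namespace(batch_size=32, hidden_dim=1000))

# metrics may be logged many times; this never touches meta_tags.csv
logger.log_metrics({'train_loss': 0.5}, step=0)

# save()/finalize() is where meta_tags.csv is actually written out
logger.save()
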
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/__init__.py
@@ -1,5 +1,6 @@
"""
# Trainer
Trainer
=======

The lightning trainer abstracts best practices for running a training, val, test routine.
It calls parts of your model when it wants to hand over full control and otherwise makes
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/callback_config.py
@@ -2,7 +2,7 @@
from abc import ABC

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.logging import TensorboardLogger
from pytorch_lightning.logging import TensorBoardLogger


class TrainerCallbackConfigMixin(ABC):
@@ -69,7 +69,7 @@ def configure_early_stopping(self, early_stop_callback, logger):
# configure logger
if logger is True:
# default logger
self.logger = TensorboardLogger(
self.logger = TensorBoardLogger(
save_dir=self.default_save_path,
version=self.slurm_job_id,
name='lightning_logs'
3 changes: 2 additions & 1 deletion pytorch_lightning/trainer/evaluation_loop.py
@@ -1,5 +1,6 @@
"""
# Validation loop
Validation loop
===============

The lightning validation loop handles everything except the actual computations of your model.
To decide what will happen in your validation loop, define the `validation_step` function.
10 changes: 5 additions & 5 deletions pytorch_lightning/trainer/training_io.py
@@ -96,7 +96,9 @@
from subprocess import call
import logging
from abc import ABC
from argparse import Namespace

import pandas as pd
import torch
import torch.distributed as dist

@@ -268,7 +270,6 @@ def save_checkpoint(self, filepath):
torch.save(checkpoint, filepath)

def restore(self, checkpoint_path, on_gpu):

# if on_gpu:
# checkpoint = torch.load(checkpoint_path)
# else:
@@ -461,14 +462,13 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'):


def load_hparams_from_tags_csv(tags_csv):
from argparse import Namespace
import pandas as pd
if not os.path.isfile(tags_csv):
logging.warning(f'Missing Tags: {tags_csv}.')
return Namespace()

tags_df = pd.read_csv(tags_csv)
dic = tags_df.to_dict(orient='records')

ns_dict = {row['key']: convert(row['value']) for row in dic}

ns = Namespace(**ns_dict)
return ns
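
A minimal usage sketch of the helper above (the csv path below is hypothetical; in practice it lives under <save_dir>/<name>/version_<n>/):

from argparse import Namespace
from pytorch_lightning.trainer import training_io

hparams = training_io.load_hparams_from_tags_csv('lightning_logs/example/version_0/meta_tags.csv')
assert isinstance(hparams, Namespace)
# attributes mirror the keys stored in the csv, e.g. hparams.batch_size if it was logged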

2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -458,7 +458,7 @@ def run_training_batch(self, batch, batch_idx):

# call training_step once per optimizer
for opt_idx, optimizer in enumerate(self.optimizers):
# make sure only the gradients of the current optimizer's paramaters are calculated
# make sure only the gradients of the current optimizer's paramaters are calculated
# in the training step to prevent dangling gradients in multiple-optimizer setup.
for param in self.get_model().parameters():
param.requires_grad = False
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,5 +4,5 @@ numpy>=1.16.4
torch>=1.1
torchvision>=0.4.0
pandas>=0.24 # lower version do not support py3.7
test-tube>=0.7.5
tensorboard>=1.14
future>=0.17.1 # required for builtins in setup.py
2 changes: 1 addition & 1 deletion tests/requirements.txt
@@ -5,7 +5,7 @@ pytest>=3.0.5
pytest-cov
flake8
check-manifest
# test_tube # already installed in main req.
test-tube>=0.7.5
mlflow
comet_ml
wandb
6 changes: 3 additions & 3 deletions tests/test_cpu_models.py
@@ -29,7 +29,7 @@ def test_early_stopping_cpu_model(tmpdir):
show_progress_bar=True,
logger=tutils.get_test_tube_logger(tmpdir),
train_percent_check=0.1,
val_percent_check=0.1
val_percent_check=0.1,
)

model, hparams = tutils.get_model()
@@ -51,7 +51,7 @@ def test_lbfgs_cpu_model(tmpdir):
show_progress_bar=False,
weights_summary='top',
train_percent_check=1.0,
val_percent_check=0.2
val_percent_check=0.2,
)

model, hparams = tutils.get_model(use_test_model=True, lbfgs=True)
@@ -70,7 +70,7 @@ def test_default_logger_callbacks_cpu_model(tmpdir):
print_nan_grads=True,
show_progress_bar=False,
train_percent_check=0.01,
val_percent_check=0.01
val_percent_check=0.01,
)

model, hparams = tutils.get_model()
5 changes: 5 additions & 0 deletions tests/test_logging.py
@@ -192,6 +192,7 @@ def test_comet_pickle(tmpdir, monkeypatch):
trainer2 = pickle.loads(pkl_bytes)
trainer2.logger.log_metrics({"acc": 1.0})


def test_wandb_logger(tmpdir):
"""Verify that basic functionality of wandb logger works."""
tutils.reset_seed()
@@ -201,6 +202,7 @@ def test_wandb_logger(tmpdir):
wandb_dir = os.path.join(tmpdir, "wandb")
logger = WandbLogger(save_dir=wandb_dir, anonymous=True)


def test_neptune_logger(tmpdir):
"""Verify that basic functionality of neptune logger works."""
tutils.reset_seed()
@@ -223,13 +225,16 @@ def test_neptune_logger(tmpdir):
print('result finished')
assert result == 1, "Training failed"


def test_wandb_pickle(tmpdir):
"""Verify that pickling trainer with wandb logger works."""
tutils.reset_seed()

from pytorch_lightning.logging import WandbLogger
wandb_dir = str(tmpdir)
logger = WandbLogger(save_dir=wandb_dir, anonymous=True)
assert logger is not None


def test_neptune_pickle(tmpdir):
"""Verify that pickling trainer with neptune logger works."""
11 changes: 5 additions & 6 deletions tests/test_restore_models.py
@@ -41,12 +41,11 @@ def test_running_test_pretrained_model_ddp(tmpdir):
trainer = Trainer(**trainer_options)
result = trainer.fit(model)

exp = logger.experiment
logging.info(os.listdir(exp.get_data_path(exp.name, exp.version)))
logging.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir)))

# correct result and ok accuracy
assert result == 1, 'training failed to complete'
pretrained_model = tutils.load_model(logger.experiment,
pretrained_model = tutils.load_model(logger,
trainer.checkpoint_callback.filepath,
module_class=LightningTestModel)

@@ -87,7 +86,7 @@ def test_running_test_pretrained_model(tmpdir):
# correct result and ok accuracy
assert result == 1, 'training failed to complete'
pretrained_model = tutils.load_model(
logger.experiment, trainer.checkpoint_callback.filepath, module_class=LightningTestModel
logger, trainer.checkpoint_callback.filepath, module_class=LightningTestModel
)

new_trainer = Trainer(**trainer_options)
@@ -171,7 +170,7 @@ def test_running_test_pretrained_model_dp(tmpdir):

# correct result and ok accuracy
assert result == 1, 'training failed to complete'
pretrained_model = tutils.load_model(logger.experiment,
pretrained_model = tutils.load_model(logger,
trainer.checkpoint_callback.filepath,
module_class=LightningTestModel)

@@ -361,7 +360,7 @@ def test_model_saving_loading(tmpdir):
trainer.save_checkpoint(new_weights_path)

# load new model
tags_path = logger.experiment.get_data_path(logger.experiment.name, logger.experiment.version)
tags_path = tutils.get_data_path(logger, path_dir=tmpdir)
tags_path = os.path.join(tags_path, 'meta_tags.csv')
model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
tags_csv=tags_path)
9 changes: 4 additions & 5 deletions tests/test_trainer.py
@@ -51,7 +51,7 @@ class CurrentTestModel(LightningTestModelBase):
trainer.save_checkpoint(new_weights_path)

# load new model
tags_path = logger.experiment.get_data_path(logger.experiment.name, logger.experiment.version)
tags_path = tutils.get_data_path(logger, path_dir=tmpdir)
tags_path = os.path.join(tags_path, 'meta_tags.csv')
model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
tags_csv=tags_path)
@@ -89,7 +89,7 @@ class CurrentTestModel(LightningValidationStepMixin, LightningTestModelBase):
trainer.save_checkpoint(new_weights_path)

# load new model
tags_path = logger.experiment.get_data_path(logger.experiment.name, logger.experiment.version)
tags_path = tutils.get_data_path(logger, path_dir=tmpdir)
tags_path = os.path.join(tags_path, 'meta_tags.csv')
model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path,
tags_csv=tags_path)
@@ -184,9 +184,8 @@ def test_loading_meta_tags(tmpdir):
logger.save()

# load tags
tags_path = logger.experiment.get_data_path(
logger.experiment.name, logger.experiment.version
) + '/meta_tags.csv'
path_expt_dir = tutils.get_data_path(logger, path_dir=tmpdir)
tags_path = os.path.join(path_expt_dir, 'meta_tags.csv')
tags = training_io.load_hparams_from_tags_csv(tags_path)

assert tags.batch_size == 32 and tags.hidden_dim == 1000