Fix for incorrect usage of detach(), cpu(), to() #6216

Merged 12 commits on Mar 1, 2021
@@ -281,11 +281,11 @@ def cache_result(self) -> None:
 # attach capture batch_size
 Result.attach_batch_size(self._batch_size, hook_result)

-hook_result.detach()
+hook_result = hook_result.detach()
 if self.trainer.move_metrics_to_cpu:
-    hook_result.cpu()
+    hook_result = hook_result.cpu()
 elif self.trainer._distrib_type == DistributedType.DP:
-    hook_result.to(torch.device("cuda", self.trainer.root_gpu))
+    hook_result = hook_result.to(torch.device("cuda", self.trainer.root_gpu))

 self._internals[fx_name].append(hook_result, info)
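The root cause is the same in every hunk: `Tensor.detach()`, `Tensor.cpu()`, and `Tensor.to()` (and the matching `Result` methods) are out-of-place, so calling them without rebinding the name silently does nothing. A minimal sketch of the behavior with a plain tensor (hypothetical variable names, not the Lightning code):

```python
import torch

# A non-leaf tensor that carries gradient history.
x = torch.randn(3, requires_grad=True) * 2

# Calling detach() without using the return value is a no-op for x:
# the method returns a new tensor and leaves the original untouched.
x.detach()
assert x.requires_grad

# Rebinding the name to the returned tensor is what actually drops the
# autograd graph (and, for cpu()/to(), what actually moves the data).
x = x.detach()
assert not x.requires_grad
x = x.cpu()
assert x.device.type == "cpu"
```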
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/trainer.py
@@ -736,9 +736,9 @@ def run_evaluation(self, max_batches=None, on_epoch=False):
 def track_output_for_epoch_end(self, outputs, output):
     if output is not None:
         if isinstance(output, Result):
-            output.detach()
+            output = output.detach()
             if self.move_metrics_to_cpu:
-                output.cpu()
+                output = output.cpu()
         elif isinstance(output, dict):
             output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu)
         elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu:
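For plain dict outputs, the loop delegates to `recursive_detach(output, to_cpu=self.move_metrics_to_cpu)`, which walks the nested structure, detaches each tensor leaf, and optionally moves it to CPU. A rough sketch of that idea (illustrative only, not the Lightning implementation):

```python
from typing import Any

import torch


def recursive_detach_sketch(obj: Any, to_cpu: bool = False) -> Any:
    """Detach tensor leaves in a nested dict, optionally moving them to CPU."""
    if isinstance(obj, torch.Tensor):
        detached = obj.detach()
        return detached.cpu() if to_cpu else detached
    if isinstance(obj, dict):
        return {key: recursive_detach_sketch(val, to_cpu) for key, val in obj.items()}
    return obj
```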
6 changes: 3 additions & 3 deletions pytorch_lightning/trainer/training_loop.py
@@ -261,7 +261,7 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss):
 is_result_obj = isinstance(training_step_output, Result)

 if is_result_obj:
-    training_step_output.detach()
+    training_step_output = training_step_output.detach()
 else:
     training_step_output.batch_loss = training_step_output.batch_loss.detach()

@@ -395,9 +395,9 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch):

 # track metrics without grads for epoch reduction
 training_step_output_for_epoch_end = copy(result)
-training_step_output_for_epoch_end.detach()
+training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
 if self.trainer.move_metrics_to_cpu:
-    training_step_output_for_epoch_end.cpu()
+    training_step_output_for_epoch_end = training_step_output_for_epoch_end.cpu()

 # what flows back into the system
 training_step_output = result
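The last hunk shows why the reassignment matters for the training loop's bookkeeping: the epoch-end copy only loses its graph (and optionally moves to CPU) if the returned object is bound back to the name, while the original `result` keeps its graph so backpropagation still works. A small illustration with a plain loss tensor (assumed names, not the Lightning code):

```python
import torch

params = torch.randn(4, requires_grad=True)
loss = (params ** 2).mean()

# Detached, CPU-resident value for epoch-end aggregation: rebinding to the
# returned tensor is what severs the graph reference.
loss_for_epoch_end = loss.detach().cpu()
assert not loss_for_epoch_end.requires_grad

# The original loss still carries its graph and can backpropagate.
loss.backward()
assert params.grad is not None
```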