Remove legacy Result parameters #6016

Merged (20 commits) on Mar 28, 2021
Changes from 1 commit
Resolve hiddens
carmocca committed Mar 26, 2021
commit 82df1254fe37e31fa0a3194a712e12eb57001867
8 changes: 1 addition & 7 deletions docs/source/common/trainer.rst
@@ -1478,15 +1478,9 @@ with the hidden
def training_step(self, batch, batch_idx, hiddens):
# hiddens are the hiddens from the previous truncated backprop step
out, hiddens = self.lstm(data, hiddens)

# remember to detach() hiddens.
# If you don't, you will get a RuntimeError: Trying to backward through
# the graph a second time...
# Using hiddens.detach() allows each split to be disconnected.

return {
"loss": ...,
"hiddens": hiddens # remember to detach() this
"hiddens": hiddens
}

To modify how the batch is split,
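For orientation, here is a minimal sketch of a module written against the documented contract after this change: the trainer now detaches the returned hiddens itself, so the step no longer needs to. The class name, layer sizes, and loss below are illustrative assumptions, not code from this PR.

    import torch
    import pytorch_lightning as pl

    class TBPTTExample(pl.LightningModule):
        """Hypothetical module; TBPTT is enabled via Trainer(truncated_bptt_steps=...)."""

        def __init__(self):
            super().__init__()
            self.lstm = torch.nn.LSTM(input_size=8, hidden_size=8, batch_first=True)

        def training_step(self, batch, batch_idx, hiddens):
            # hiddens come from the previous truncated backprop split (None on the first split)
            x, y = batch
            out, hiddens = self.lstm(x, hiddens)
            loss = torch.nn.functional.mse_loss(out, y)
            # no hiddens.detach() here anymore: the trainer detaches them for us
            return {"loss": loss, "hiddens": hiddens}

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)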
14 changes: 10 additions & 4 deletions pytorch_lightning/trainer/logging.py
@@ -14,6 +14,7 @@

import inspect
from abc import ABC
from collections.abc import Mapping

import torch

@@ -75,9 +76,7 @@ def process_dict_result(self, output, train=False):
# --------------------------
# single scalar returned from a xx_step
if isinstance(output, torch.Tensor):
progress_bar_metrics = {}
log_metrics = {}
return output, progress_bar_metrics, log_metrics
return output, {}, {}, None

# ---------------
# EXTRACT PROGRESS BAR KEYS
@@ -134,12 +133,19 @@ def process_dict_result(self, output, train=False):
if self._distrib_type in (DistributedType.DP, DistributedType.DDP2):
loss = self.reduce_distributed_output(loss, self.num_gpus)

# ---------------
# EXTRACT HIDDEN
# ---------------
hiddens = output.get('hiddens', None) if isinstance(output, Mapping) else None
if hiddens is not None:
hiddens = hiddens.detach()

# detach all metrics for callbacks to prevent memory leaks
# no .item() because it will slow things down
progress_bar_metrics = recursive_detach(progress_bar_metrics)
log_metrics = recursive_detach(log_metrics)

return loss, progress_bar_metrics, log_metrics
return loss, progress_bar_metrics, log_metrics, hiddens

def reduce_distributed_output(self, output, num_gpus):
if num_gpus <= 1:
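To make the new hiddens handling in process_dict_result concrete, here is a self-contained sketch of the same extract-and-detach pattern (the helper name is made up). Detaching is what prevents the next TBPTT split from trying to backpropagate through the previous split's graph a second time.

    from collections.abc import Mapping

    import torch

    def extract_hiddens(output):
        # Pull 'hiddens' out of a dict-like step output; anything else has none.
        hiddens = output.get("hiddens", None) if isinstance(output, Mapping) else None
        # Detach so the tensor is disconnected from the previous split's graph.
        return hiddens.detach() if hiddens is not None else None

    h = torch.zeros(1, 4, requires_grad=True)
    step_output = {"loss": h.sum(), "hiddens": h * 2}
    detached = extract_hiddens(step_output)
    assert detached is not None and not detached.requires_grad
    assert extract_hiddens(torch.tensor(1.0)) is None  # scalar returns carry no hiddens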
5 changes: 4 additions & 1 deletion pytorch_lightning/trainer/training_loop.py
@@ -354,19 +354,22 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch):
result = self.trainer.lightning_module._results

loss = None
hiddens = None
result["extra"] = {}

# handle dict return
if isinstance(training_step_output, dict):
loss = training_step_output.pop("loss", None)
hiddens = training_step_output.pop("hiddens", None)
result["extra"] = training_step_output

# handle scalar return
elif isinstance(training_step_output, torch.Tensor):
loss = training_step_output
result["extra"] = {}

# map to results under the hood
result.minimize = loss
self.trainer.hiddens = hiddens

# track batch for manual reduction with result
result.track_batch_size(len(split_batch))
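A reduced sketch of the dict-versus-scalar branching that _process_training_step_output_1_0 performs above, pulled out of the trainer internals for clarity; the function name and example values are hypothetical.

    import torch

    def split_step_output(training_step_output):
        # Returns (loss, hiddens, extra) the same way the loop code above does.
        loss, hiddens, extra = None, None, {}
        if isinstance(training_step_output, dict):
            training_step_output = dict(training_step_output)  # avoid mutating the caller's dict
            loss = training_step_output.pop("loss", None)
            hiddens = training_step_output.pop("hiddens", None)
            extra = training_step_output
        elif isinstance(training_step_output, torch.Tensor):
            loss = training_step_output
        return loss, hiddens, extra

    loss, hiddens, extra = split_step_output(
        {"loss": torch.tensor(0.5), "hiddens": torch.zeros(2), "my_metric": 3}
    )
    assert extra == {"my_metric": 3} and hiddens is not None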
3 changes: 2 additions & 1 deletion tests/trainer/logging_/test_logger_connector.py
@@ -111,7 +111,7 @@ def training_step_end(self, *_):
assert generated == excepted


def test__logger_connector__epoch_result_store__train__ttbt(tmpdir):
def test__logger_connector__epoch_result_store__train__tbptt(tmpdir):
"""
Tests that LoggerConnector will properly capture logged information with tbptt
and reduce it
@@ -142,6 +142,7 @@ def __init__(self):

@decorator_with_arguments(fx_name="training_step")
def training_step(self, batch, batch_idx, hiddens):
assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps"
self.test_hidden = torch.rand(1)

x_tensor, y_list = batch
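The assertion added above relies on the trainer handing the same (detached) hiddens back on the next split. A standalone sketch of that round trip, with a made-up stand-in for the split loop:

    import torch

    class FakeSplitLoop:
        # Hypothetical stand-in for the trainer's TBPTT split loop, only to show the pattern.
        def __init__(self, n_splits):
            self.n_splits = n_splits
            self.hiddens = None      # what the "trainer" carries between splits
            self.test_hidden = None  # what the "module" expects to receive back

        def training_step(self, hiddens):
            assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps"
            self.test_hidden = torch.rand(1)
            return {"loss": torch.rand(1), "hiddens": self.test_hidden}

        def run(self):
            for _ in range(self.n_splits):
                out = self.training_step(self.hiddens)
                self.hiddens = out["hiddens"].detach()  # mirrors what the trainer now does

    FakeSplitLoop(n_splits=3).run()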
7 changes: 1 addition & 6 deletions tests/trainer/logging_/test_train_loop_logging_1_0.py
@@ -318,12 +318,7 @@ def __init__(self):
self.layer = torch.nn.Linear(2, 2)

def training_step(self, batch, batch_idx, hiddens):
try:
assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps"
# todo: specify the possible exception
except Exception as ex:
print(ex)

assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps"
self.test_hidden = torch.rand(1)

x_tensor, y_list = batch
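The diff above drops a try/except that only printed the assertion error, which meant a broken hidden-state chain could never actually fail the test. A tiny illustration of the difference (the tensors here are made up):

    import torch

    expected = torch.zeros(1)
    received = torch.ones(1)  # deliberately wrong to trigger the assertion

    # Old pattern: the failure is printed and swallowed, so pytest still reports a pass.
    try:
        assert received == expected, "Hidden state not persistent between tbptt steps"
    except AssertionError as ex:
        print(ex)

    # New pattern (left commented so this sketch runs to completion): the failure
    # propagates and the test fails as intended.
    # assert received == expected, "Hidden state not persistent between tbptt steps"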