This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[1.x] Backport of LSTM and GRU fix (#17898) and RNN op (#17632) (#18317)
* [v1.x] [Large Tensor] Backport of Fixed RNN op (#17632)

* Changed relevant function args to index_t

* Added nightly test for RNN

* Added fix for LSTM, GRU, RNN-ReLU, RNN-tanh

* Using const instead of literals

* Added nightly test for RNN ReLU & tanh, LSTM, GRU

* Type assertion to force evaluation of output NDArray

* Incorporated latest round of comments

* [v1.x] Backport of Fix LSTM and GRU layers gradient calculations (#18203)

* Fix input gradient calculation for bidirectional LSTM

For bidirectional LSTM with more than 2 layers, the input gradient calculation was incorrect.
The wrong results were caused by overwriting the y derivative (dy) tensor with the
calculated x derivative (dx) tensor before the right-to-left layer could use dy for its own
gradient calculation.
The proposed fix uses additional workspace to avoid the overwrite.
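
A minimal sketch of the idea, with hypothetical buffer names, sizes, and backward
helpers (not the actual MXNet kernels): dy is stashed in the extra workspace before
any sublayer can overwrite its storage, so the right-to-left pass still reads the
original values.

    #include <cstring>
    #include <cstddef>

    // Sketch only: extra_ws corresponds to the additional space now accounted
    // for in GetRNNWorkspaceSize; the Backward calls below are hypothetical.
    void BidirectionalBackwardSketch(float* dy, float* dx, float* extra_ws,
                                     std::size_t n_elems) {
      float* tmp_dy = extra_ws;
      std::memcpy(tmp_dy, dy, n_elems * sizeof(float));  // preserve dy before dx is written
      // LeftToRightBackward(tmp_dy, dx);  // may reuse dy's old storage for dx
      // RightToLeftBackward(tmp_dy, dx);  // still sees the original dy values
      (void)dx;
    }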

* Fix gradient calculation for GRU

For GRU with more than 2 layers, the i2h_weight gradient for
the middle layers (all except the first and last) was incorrect.
The wrong calculations were caused by assigning the output pointer to the
input instead of computing a new input pointer.
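
A sketch of the pointer handling the fix describes, with hypothetical names, layout,
and gradient helper: each middle layer must compute a fresh input pointer into the
saved activations of the previous layer rather than aliasing its own output pointer.

    #include <cstddef>

    // Sketch only: saved_outputs stands for the per-layer outputs kept in the
    // reserved space; AccumulateI2hWeightGrad is a hypothetical helper.
    void GruI2hGradSketch(const float* x, const float* saved_outputs,
                          int num_layers, std::size_t layer_stride) {
      for (int l = 0; l < num_layers; ++l) {
        // Fixed: layer 0 reads the network input, layer l reads layer l-1's output.
        const float* layer_input =
            (l == 0) ? x : saved_outputs + (l - 1) * layer_stride;
        // Buggy version aliased layer_input to the current layer's output pointer,
        // so middle layers accumulated i2h_weight gradients against the wrong data.
        // AccumulateI2hWeightGrad(layer_input /* , dw_i2h, ... */);
        (void)layer_input;
      }
    }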

* Enable tests for GRU and LSTM gradients

* Fix comments

* Change loop iteration deduction

* Add more test cases for fused rnn layers

Co-authored-by: Connor Goggins <[email protected]>
bgawrych and connorgoggins authored Jun 3, 2020
1 parent 36bd144 commit 8986e3f
Showing 4 changed files with 291 additions and 280 deletions.
44 changes: 23 additions & 21 deletions src/operator/rnn-inl.h
@@ -64,7 +64,7 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
bool bidirectional, state_outputs;
int mode;
float p;
-int seq_length_, batch_size_, input_size_;
+index_t seq_length_, batch_size_, input_size_;

bool use_sequence_length;
dmlc::optional<int> projection_size;
@@ -123,8 +123,8 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
}
};

-inline int GetRnnParamSize(int num_layer,
-int input_size,
+inline index_t GetRnnParamSize(int num_layer,
+index_t input_size,
int state_size,
int direction,
int mode,
@@ -141,14 +141,14 @@ inline int GetRnnParamSize(int num_layer,
size *= 3;
break;
}
-int size1 = (input_size + state_size + 2) * size; // first layer size
-int size2 = (state_size * direction + state_size + 2) * size; // other layers size
+index_t size1 = (input_size + state_size + 2) * size; // first layer size
+index_t size2 = (state_size * direction + state_size + 2) * size; // other layers size
if (projection_size.has_value()) {
-int proj_size = projection_size.value();
+index_t proj_size = projection_size.value();
size1 = (input_size + proj_size + 2) * size;
size2 = (proj_size * direction + proj_size + 2) * size;
}
-int param_size = size1 + (num_layer - 1) * size2;
+index_t param_size = size1 + (num_layer - 1) * size2;
if (projection_size.has_value()) {
param_size += projection_size.value() * state_size * num_layer * direction;
}
@@ -183,8 +183,8 @@ inline int GetRnnBiasSize(int num_layer,
* - output -> h[t](, c[t] additionally with Lstm) time by time(sz: NxH(x2))
* - intermediate y[1...T] as next layer's inputs(sz: TxNxHxD)
*/
-inline size_t GetRNNWorkspaceSize(int seq_length,
-int batch_size,
+inline size_t GetRNNWorkspaceSize(index_t seq_length,
+index_t batch_size,
int hidden_size,
int projection_size,
int direction,
@@ -194,7 +194,9 @@ inline size_t GetRNNWorkspaceSize(int seq_length,
case rnn_enum::kLstm:
size = seq_length * batch_size * hidden_size * (4 + direction) + // wx*x + inter-y
batch_size * hidden_size * 6 + // wh*h + h + c
-seq_length * hidden_size * 8; // Used in Backward, Δbx, Δbh
+seq_length * hidden_size * 8 + // Used in Backward, Δbx, Δbh
+// temporary dy in backward computation for bidirectional layers
+seq_length * batch_size * hidden_size * (direction - 1 ? direction : 0);
break;
case rnn_enum::kGru:
// Differs with Lstm, the outputs of three gates are also held in memory
@@ -215,8 +217,8 @@ inline size_t GetRNNWorkspaceSize(int seq_length,

inline size_t GetRNNReserveSpaceSize(int num_layer,
int direction,
-int seq_length,
-int batch_size,
+index_t seq_length,
+index_t batch_size,
int hidden_size,
int mode) {
size_t size = 0;
@@ -280,9 +282,9 @@ void RNNForwardTraining(DType* ws,
bool state_outputs,
const int num_layers,
const int direction,
-const int seq_length,
-const int batch_size,
-const int input_size,
+const index_t seq_length,
+const index_t batch_size,
+const index_t input_size,
const int state_size,
DType* x_ptr,
DType* hx_ptr,
@@ -323,9 +325,9 @@ void RNNForwardInference(DType* ws,
bool state_outputs,
const int num_layers,
const int direction,
-const int seq_length,
-const int batch_size,
-const int input_size,
+const index_t seq_length,
+const index_t batch_size,
+const index_t input_size,
const int state_size,
const int projection_size,
DType* x_ptr,
@@ -365,9 +367,9 @@ void RNNBackward(DType* ws,
DType* rs,
const int num_layers,
const int direction,
-const int seq_length,
-const int batch_size,
-const int input_size,
+const index_t seq_length,
+const index_t batch_size,
+const index_t input_size,
const int state_size,
DType* x_ptr,
DType* hx_ptr,
