This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[1.x] Backport of LSTM and GRU fix (#17898) and RNN op (#17632) (#18317)
* [v1.x] [Large Tensor] Backport of Fixed RNN op (#17632)

* Changed relevant function args to index_t

* Added nightly test for RNN

* Added fix for LSTM, GRU, RNN-ReLU, RNN-tanh

* Using const instead of literals

* Added nightly test for RNN ReLU & tanh, LSTM, GRU

* Type assertion to force evaluation of output NDArray

* Incorporated latest round of comments

* [v1.x] Backport of Fix LSTM and GRU layers gradient calculations (#18203)

* Fix input gradient calculation for bidirectional LSTM

For bidirectional LSTM with more than 2 layers, the input gradient calculation was incorrect.
The wrong results were caused by overwriting the y derivative (dy) tensor with the
calculated x derivative (dx) tensor before the right-to-left layer could use dy for its own
gradient calculation.
The proposed fix uses additional workspace to avoid the overwrite.
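
A minimal sketch of the idea, with hypothetical buffer names, sizes, and backward
helpers (not the actual MXNet kernels): dy is stashed in the extra workspace before
any sublayer can overwrite its storage, so the right-to-left pass still reads the
original values.

    #include <cstring>
    #include <cstddef>

    // Sketch only: extra_ws corresponds to the additional space now accounted
    // for in GetRNNWorkspaceSize; the Backward calls below are hypothetical.
    void BidirectionalBackwardSketch(float* dy, float* dx, float* extra_ws,
                                     std::size_t n_elems) {
      float* tmp_dy = extra_ws;
      std::memcpy(tmp_dy, dy, n_elems * sizeof(float));  // preserve dy before dx is written
      // LeftToRightBackward(tmp_dy, dx);  // may reuse dy's old storage for dx
      // RightToLeftBackward(tmp_dy, dx);  // still sees the original dy values
      (void)dx;
    }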

* Fix gradient calculation for GRU

For GRU with more than 2 layers, the i2h_weight gradient for
the middle layers (all except the first and last) was incorrect.
The wrong calculations were caused by assigning the output pointer to the
input instead of computing a new input pointer.
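
A sketch of the pointer handling the fix describes, with hypothetical names, layout,
and gradient helper: each middle layer must compute a fresh input pointer into the
saved activations of the previous layer rather than aliasing its own output pointer.

    #include <cstddef>

    // Sketch only: saved_outputs stands for the per-layer outputs kept in the
    // reserved space; AccumulateI2hWeightGrad is a hypothetical helper.
    void GruI2hGradSketch(const float* x, const float* saved_outputs,
                          int num_layers, std::size_t layer_stride) {
      for (int l = 0; l < num_layers; ++l) {
        // Fixed: layer 0 reads the network input, layer l reads layer l-1's output.
        const float* layer_input =
            (l == 0) ? x : saved_outputs + (l - 1) * layer_stride;
        // Buggy version aliased layer_input to the current layer's output pointer,
        // so middle layers accumulated i2h_weight gradients against the wrong data.
        // AccumulateI2hWeightGrad(layer_input /* , dw_i2h, ... */);
        (void)layer_input;
      }
    }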

* Enable tests for GRU and LSTM gradients

* Fix comments

* Change loop iteration deduction

* Add more test cases for fused rnn layers

Co-authored-by: Connor Goggins <[email protected]>
bgawrych and connorgoggins authored Jun 3, 2020
1 parent 36bd144 commit 8986e3f
Showing 4 changed files with 291 additions and 280 deletions.
44 changes: 23 additions & 21 deletions src/operator/rnn-inl.h
@@ -64,7 +64,7 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
bool bidirectional, state_outputs;
int mode;
float p;
-int seq_length_, batch_size_, input_size_;
+index_t seq_length_, batch_size_, input_size_;

bool use_sequence_length;
dmlc::optional<int> projection_size;
@@ -123,8 +123,8 @@ struct RNNParam : public dmlc::Parameter<RNNParam> {
}
};

-inline int GetRnnParamSize(int num_layer,
-int input_size,
+inline index_t GetRnnParamSize(int num_layer,
+index_t input_size,
int state_size,
int direction,
int mode,
@@ -141,14 +141,14 @@ inline int GetRnnParamSize(int num_layer,
size *= 3;
break;
}
-int size1 = (input_size + state_size + 2) * size; // first layer size
-int size2 = (state_size * direction + state_size + 2) * size; // other layers size
+index_t size1 = (input_size + state_size + 2) * size; // first layer size
+index_t size2 = (state_size * direction + state_size + 2) * size; // other layers size
if (projection_size.has_value()) {
-int proj_size = projection_size.value();
+index_t proj_size = projection_size.value();
size1 = (input_size + proj_size + 2) * size;
size2 = (proj_size * direction + proj_size + 2) * size;
}
-int param_size = size1 + (num_layer - 1) * size2;
+index_t param_size = size1 + (num_layer - 1) * size2;
if (projection_size.has_value()) {
param_size += projection_size.value() * state_size * num_layer * direction;
}
@@ -183,8 +183,8 @@ inline int GetRnnBiasSize(int num_layer,
* - output -> h[t](, c[t] additionally with Lstm) time by time(sz: NxH(x2))
* - intermediate y[1...T] as next layer's inputs(sz: TxNxHxD)
*/
-inline size_t GetRNNWorkspaceSize(int seq_length,
-int batch_size,
+inline size_t GetRNNWorkspaceSize(index_t seq_length,
+index_t batch_size,
int hidden_size,
int projection_size,
int direction,
@@ -194,7 +194,9 @@ inline size_t GetRNNWorkspaceSize(int seq_length,
case rnn_enum::kLstm:
size = seq_length * batch_size * hidden_size * (4 + direction) + // wx*x + inter-y
batch_size * hidden_size * 6 + // wh*h + h + c
-seq_length * hidden_size * 8; // Used in Backward, Δbx, Δbh
+seq_length * hidden_size * 8 + // Used in Backward, Δbx, Δbh
+// temporary dy in backward computation for bidirectional layers
+seq_length * batch_size * hidden_size * (direction - 1 ? direction : 0);
break;
case rnn_enum::kGru:
// Differs with Lstm, the outputs of three gates are also held in memory
@@ -215,8 +217,8 @@ inline size_t GetRNNWorkspaceSize(int seq_length,

inline size_t GetRNNReserveSpaceSize(int num_layer,
int direction,
-int seq_length,
-int batch_size,
+index_t seq_length,
+index_t batch_size,
int hidden_size,
int mode) {
size_t size = 0;
@@ -280,9 +282,9 @@ void RNNForwardTraining(DType* ws,
bool state_outputs,
const int num_layers,
const int direction,
-const int seq_length,
-const int batch_size,
-const int input_size,
+const index_t seq_length,
+const index_t batch_size,
+const index_t input_size,
const int state_size,
DType* x_ptr,
DType* hx_ptr,
@@ -323,9 +325,9 @@ void RNNForwardInference(DType* ws,
bool state_outputs,
const int num_layers,
const int direction,
-const int seq_length,
-const int batch_size,
-const int input_size,
+const index_t seq_length,
+const index_t batch_size,
+const index_t input_size,
const int state_size,
const int projection_size,
DType* x_ptr,
@@ -365,9 +367,9 @@ void RNNBackward(DType* ws,
DType* rs,
const int num_layers,
const int direction,
-const int seq_length,
-const int batch_size,
-const int input_size,
+const index_t seq_length,
+const index_t batch_size,
+const index_t input_size,
const int state_size,
DType* x_ptr,
DType* hx_ptr,
