diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 5eca5c6c2ecd..1d6f2eb5c36a 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -64,7 +64,7 @@ struct RNNParam : public dmlc::Parameter { bool bidirectional, state_outputs; int mode; float p; - int seq_length_, batch_size_, input_size_; + index_t seq_length_, batch_size_, input_size_; bool use_sequence_length; dmlc::optional projection_size; @@ -123,8 +123,8 @@ struct RNNParam : public dmlc::Parameter { } }; -inline int GetRnnParamSize(int num_layer, - int input_size, +inline index_t GetRnnParamSize(int num_layer, + index_t input_size, int state_size, int direction, int mode, @@ -141,14 +141,14 @@ inline int GetRnnParamSize(int num_layer, size *= 3; break; } - int size1 = (input_size + state_size + 2) * size; // first layer size - int size2 = (state_size * direction + state_size + 2) * size; // other layers size + index_t size1 = (input_size + state_size + 2) * size; // first layer size + index_t size2 = (state_size * direction + state_size + 2) * size; // other layers size if (projection_size.has_value()) { - int proj_size = projection_size.value(); + index_t proj_size = projection_size.value(); size1 = (input_size + proj_size + 2) * size; size2 = (proj_size * direction + proj_size + 2) * size; } - int param_size = size1 + (num_layer - 1) * size2; + index_t param_size = size1 + (num_layer - 1) * size2; if (projection_size.has_value()) { param_size += projection_size.value() * state_size * num_layer * direction; } @@ -183,8 +183,8 @@ inline int GetRnnBiasSize(int num_layer, * - output -> h[t](, c[t] additionally with Lstm) time by time(sz: NxH(x2)) * - intermediate y[1...T] as next layer's inputs(sz: TxNxHxD) */ -inline size_t GetRNNWorkspaceSize(int seq_length, - int batch_size, +inline size_t GetRNNWorkspaceSize(index_t seq_length, + index_t batch_size, int hidden_size, int projection_size, int direction, @@ -194,7 +194,9 @@ inline size_t GetRNNWorkspaceSize(int seq_length, case rnn_enum::kLstm: size = seq_length * batch_size * hidden_size * (4 + direction) + // wx*x + inter-y batch_size * hidden_size * 6 + // wh*h + h + c - seq_length * hidden_size * 8; // Used in Backward, Δbx, Δbh + seq_length * hidden_size * 8 + // Used in Backward, Δbx, Δbh + // temporary dy in backward computation for bidirectional layers + seq_length * batch_size * hidden_size * (direction - 1 ? 
direction : 0); break; case rnn_enum::kGru: // Differs with Lstm, the outputs of three gates are also held in memory @@ -215,8 +217,8 @@ inline size_t GetRNNWorkspaceSize(int seq_length, inline size_t GetRNNReserveSpaceSize(int num_layer, int direction, - int seq_length, - int batch_size, + index_t seq_length, + index_t batch_size, int hidden_size, int mode) { size_t size = 0; @@ -280,9 +282,9 @@ void RNNForwardTraining(DType* ws, bool state_outputs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, @@ -323,9 +325,9 @@ void RNNForwardInference(DType* ws, bool state_outputs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, const int projection_size, DType* x_ptr, @@ -365,9 +367,9 @@ void RNNBackward(DType* ws, DType* rs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 08d069801079..06ac1232cdb5 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -60,9 +60,9 @@ void LstmForwardTrainingSingleLayer(DType* ws, DType* rs, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -89,17 +89,17 @@ void LstmForwardTrainingSingleLayer(DType* ws, const int offset = bid ? H : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); DType ft = sigmoid(yx[t][j][1][k] + yh[j][1][k] + bx[1][k] + bh[1][k]); DType gt = tanh(yx[t][j][2][k] + yh[j][2][k] + bx[2][k] + bh[2][k]); @@ -128,9 +128,9 @@ void LstmForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -154,8 +154,8 @@ void LstmForwardTraining(DType* ws, int idx = 0; // state & cell state's idx; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < L; ++i) { - const int input_size = i ? H * D : I; - const int w_size = (input_size + H) * H * 4; + const index_t input_size = i ? 
H * D : I; + const index_t w_size = (input_size + H) * H * 4; Tensor x(x_ptr, Shape2(T * N, input_size)); Tensor y(rs2 + y_offset, Shape3(T, N, H * D)); LstmForwardTrainingSingleLayer(ws, rs2, state_outputs, false, T, N, input_size, H, x, @@ -196,7 +196,7 @@ void LstmForwardTraining(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = (rs2 + y_offset)[i]; } } @@ -205,9 +205,9 @@ template void LstmForwardInferenceSingleLayer(DType* ws, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, const Tensor &x, @@ -237,19 +237,19 @@ void LstmForwardInferenceSingleLayer(DType* ws, const int proj_offset = bid ? P : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; if (P > 0) { linalg_gemm(i ? r : hx, wh, yh_flat, alpha, beta, false, true); } else { linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); } #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { + for (index_t jk = 0; jk < cell_size; ++jk) { int j = jk / H; int k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); @@ -282,9 +282,9 @@ void LstmForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, DType* x_ptr, @@ -298,16 +298,16 @@ void LstmForwardInference(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, P ? P : H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int cell_size = N * H; - const int projection_size = (P ? P : H) * N; + const index_t b_size = 2 * H * 4; + const index_t cell_size = N * H; + const index_t projection_size = (P ? P : H) * N; DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2; DType* y_cur_ptr = y_ptr; int idx = 0; // state & cell state's idx; bool flag = L % 2 ? false : true; for (int i = 0; i < L; ++i) { - const int input_size = i ? (P ? P : H) * D : I; - int w_size = (input_size + (P ? P : H)) * H * 4; + const index_t input_size = i ? (P ? P : H) * D : I; + index_t w_size = (input_size + (P ? 
P : H)) * H * 4; if (P > 0) { w_size += P * H; } @@ -351,9 +351,9 @@ void LstmBackwardSingleLayer(DType* ws, DType* rs, DType* tmp_buf, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -403,41 +403,41 @@ void LstmBackwardSingleLayer(DType* ws, const DType beta0 = 0.0; const DType beta1 = 1.0; const DType beta2 = 2.0; - const int cell_size = N * H; + const index_t cell_size = N * H; if (dhy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = dhy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = 0; } } if (dcy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = dcy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = 0; } } - for (int i = T - 1; i >= 0; --i) { - int t = bid ? T - 1 - i : i; - int tnext = bid ? t + 1 : t - 1; + for (index_t i = T - 1; i >= 0; --i) { + index_t t = bid ? T - 1 - i : i; + index_t tnext = bid ? t + 1 : t - 1; const Tensor& dhnext = i ? dh : dhx; const Tensor& dcnext = i ? dc : dcx; const Tensor& hnext = i ? htmp : hx; const Tensor& cnext = i ? c[i - 1] : cx; #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType tc = tanh(c[i][j][k]); DType it = ifgo[i][j][k][0]; DType ft = ifgo[i][j][k][1]; @@ -480,13 +480,13 @@ void LstmBackwardSingleLayer(DType* ws, if (req_params != kNullOp && req_params != kAddTo) { linalg_gemm(dyx, x, dwx, alpha, beta0, true, false); } - const int row = T * N; - const int col = H * 4; + const index_t row = T * N; + const index_t col = H * 4; if (req_params != kNullOp) { if (req_params != kAddTo) { - for (int i = 0; i < row; ++i) { + for (index_t i = 0; i < row; ++i) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += dyx[i][j]; dbh[j] = dbx[j]; } @@ -495,20 +495,20 @@ void LstmBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf, Shape2(col, T)); const Tensor tmp_dbh(tmp_buf + col * T, Shape2(col, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < col * T; ++i) { + for (index_t i = 0; i < col * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { - for (int i = 0; i < N; ++i) { + for (index_t j = 0; j < col; ++j) { + for (index_t i = 0; i < N; ++i) { tmp_dbx[j][t] += dyx[t * N + i][j]; tmp_dbh[j][t] = tmp_dbx[j][t]; } } #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += tmp_dbx[j][t] + dbx[j]; dbh[j] += tmp_dbh[j][t] + dbh[j]; } @@ -522,9 +522,9 @@ void LstmBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ 
-553,16 +553,17 @@ void LstmBackward(DType* ws, Tensor cx(cx_ptr, Shape3(total_layers, N, H)); Tensor dhx(dhx_ptr, Shape3(total_layers, N, H)); Tensor dcx(dcx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int w_size1 = (I + H) * H * 4; // first layer - const int w_size2 = (D * H + H) * H * 4; // other layers - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t w_size1 = (I + H) * H * 4; // first layer + const index_t w_size2 = (D * H + H) * H * 4; // other layers + const index_t cell_size = N * H; + const index_t y_size = T * N * H * D; DType* dy_tmp_ptr = ws2 + T * cell_size * 4 + cell_size * 3; for (int i = L - 1; i >= 0; --i) { - const int input_size = i ? H * D : I; - const int w_size = i ? w_size2 : w_size1; + const index_t input_size = i ? H * D : I; + const index_t w_size = i ? w_size2 : w_size1; int idx = i * D; DType* w_cur_ptr = i ? w_ptr + (w_size1 + (i - 1) * w_size2) * D : w_ptr; DType* dw_cur_ptr = i ? dw_ptr + (w_size1 + (i - 1) * w_size2) * D : dw_ptr; @@ -589,12 +590,16 @@ void LstmBackward(DType* ws, x, hx[idx], cx[idx], y, dy, dx, dhx[idx], dcx[idx], dhy_cur_ptr, dcy_cur_ptr, w_cur_ptr, dw_cur_ptr, db_cur_ptr, req_data, req_params, req_state, req_statecell); + + // Prevent overwriting dy while calculating dx in the left-to-right layer + const int loop_iteration = (L - 1) - i; + dy_tmp_ptr = loop_iteration % 2 ? dy_tmp_ptr - y_size : dy_tmp_ptr + y_size; } if (dropout > 0.0f && i > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * D * H; j++) { + for (index_t j = 0; j < T * N * D * H; j++) { if (dropout_random[j] == 0) { dx.dptr_[j] = 0; } else { @@ -611,9 +616,9 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -650,13 +655,13 @@ void GruForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -674,7 +679,7 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -688,11 +693,11 @@ void GruForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t 
ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + gemmC2[ztb + j] @@ -715,11 +720,11 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + @@ -739,7 +744,7 @@ void GruForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -747,7 +752,7 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -761,9 +766,9 @@ void GruForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -814,9 +819,9 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -862,13 +867,13 @@ void GruForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -887,7 +892,7 @@ void GruForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -905,11 +910,11 @@ void GruForwardTrainingSingleLayer(DType* ws, gemmC1_t = gemmC1 + t * N * 3 * H; DType* Mnht = Mnh + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; Mnht[i * H + j] = gemmC2[ntb + j] + bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); @@ -937,11 +942,11 @@ void 
GruForwardTrainingSingleLayer(DType* ws, DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; back_Mnht[i * H + j] = gemmC2[ntb + j] + back_bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); @@ -963,7 +968,7 @@ void GruForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -971,7 +976,7 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -986,9 +991,9 @@ void GruForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1055,7 +1060,7 @@ void GruForwardTraining(DType* ws, } const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1064,9 +1069,9 @@ template void GruBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1132,7 +1137,7 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -1141,7 +1146,7 @@ void GruBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -1149,7 +1154,7 @@ void GruBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -1157,13 +1162,13 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -1173,7 +1178,7 @@ void GruBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -1186,7 +1191,7 @@ void GruBackwardSingleLayer(DType* ws, dat = da + t * N * 3 * H; 
dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { int nid = i * 3 * H + 2 * H + j; int zid = i * 3 * H + H + j; @@ -1232,7 +1237,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += da[j * 3 * H + i]; dbh[i] += dar[j * 3 * H + i]; } @@ -1241,15 +1246,15 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1279,7 +1284,7 @@ void GruBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -1289,7 +1294,7 @@ void GruBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -1303,12 +1308,12 @@ void GruBackwardSingleLayer(DType* ws, dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int nid = i * 3 * H + 2 * H + j; - int zid = i * 3 * H + H + j; - int rid = i * 3 * H + j; - int id = i * H + j; + index_t nid = i * 3 * H + 2 * H + j; + index_t zid = i * 3 * H + H + j; + index_t rid = i * 3 * H + j; + index_t id = i * H + j; dat[nid] = back_dht1[id] * (1 - zt[id]) * (1 - nt[id] * nt[id]); dart[zid] = dat[zid] = back_dht1[id] * (back_ht1[i * D * H + H + j] - nt[id]) * zt[id] * (1 - zt[id]); @@ -1350,7 +1355,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += da[j * 3 * H + i]; back_dbh[i] += dar[j * 3 * H + i]; } @@ -1359,14 +1364,14 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1397,7 +1402,7 @@ void GruBackwardSingleLayer(DType* ws, } if (req_state 
!= kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -1408,9 +1413,9 @@ void GruBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1462,7 +1467,7 @@ void GruBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -1481,7 +1486,7 @@ void GruBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -1491,7 +1496,7 @@ void GruBackward(DType* ws, } if (l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateR_l = gateR_l - T * D * N * H; @@ -1502,7 +1507,7 @@ void GruBackward(DType* ws, if (dhy_l) dhy_l = dhy_l - D * N * H; y_l = y_l - T * N * H * D; - y_tmp = y_l; + y_tmp = y_tmp - T * N * H * D; if (l == 1) { wx_l = wx_l - (inputsize + H) * H * 3 * D; wh_l = wx_l + inputsize * 3 * H; @@ -1525,9 +1530,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1562,13 +1567,13 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1586,7 +1591,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1600,9 +1605,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1624,9 +1629,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode 
== 1) { back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1645,7 +1650,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1653,7 +1658,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1667,9 +1672,9 @@ void VanillaRNNForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1722,9 +1727,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1763,13 +1768,13 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1788,7 +1793,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1803,9 +1808,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, nt = gateN + t * N * H; gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1827,9 +1832,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N)); linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1849,7 +1854,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1857,7 
+1862,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1872,9 +1877,9 @@ void VanillaRNNForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1936,7 +1941,7 @@ void VanillaRNNForwardTraining(DType* ws, wh_l = wx_l + I * H; } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1945,9 +1950,9 @@ template void VanillaRNNBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -2005,7 +2010,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -2014,7 +2019,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -2022,7 +2027,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -2030,13 +2035,13 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -2046,7 +2051,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -2055,9 +2060,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, nt = gateN + t * N * H; dart = dar + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2096,7 +2101,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += dar[j * H + i]; dbh[i] = dbx[i]; } @@ -2105,15 +2110,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for 
(index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2143,7 +2148,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -2153,7 +2158,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -2163,9 +2168,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dart = dar + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = back_dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2205,7 +2210,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += dar[j * H + i]; back_dbh[i] = back_dbx[i]; } @@ -2214,15 +2219,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2253,7 +2258,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -2264,9 +2269,9 @@ void VanillaRNNBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -2316,7 +2321,7 @@ void VanillaRNNBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -2335,7 +2340,7 @@ void VanillaRNNBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -2345,7 +2350,7 @@ void VanillaRNNBackward(DType* ws, } if (l > 0) { #pragma omp parallel for 
num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateN_l = gateN_l - T * D * N * H; diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index ee57f172c1c9..d50f3ff7e539 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -39,6 +39,7 @@ SMALL_Y = 50 LARGE_SIZE = LARGE_X * SMALL_Y LARGE_TENSOR_SHAPE = 2**32 +RNN_LARGE_TENSOR = 2**28 def test_nn(): @@ -479,7 +480,6 @@ def check_embedding(): assert out.shape[0] == LARGE_TENSOR_SHAPE assert out.shape[1] == 1 - assert out.shape[2] == 1 def check_spatial_transformer(): data = nd.random_normal(shape=(2, 2**29, 1, 6)) @@ -504,6 +504,39 @@ def check_ravel(): assert out.shape[0] == LARGE_TENSOR_SHAPE + def check_rnn(): + data = nd.random_normal(shape=(RNN_LARGE_TENSOR, 4, 4)) + parameters_relu_tanh = nd.random_normal(shape=(7,)) + parameters_lstm = nd.random_normal(shape=(28,)) + parameters_gru = nd.random_normal(shape=(21,)) + state = nd.random_normal(shape=(1, 4, 1)) + state_cell = nd.random_normal(shape=(1, 4, 1)) + mode_relu = 'rnn_relu' + mode_tanh = 'rnn_tanh' + mode_lstm = 'lstm' + mode_gru = 'gru' + state_size = 1 + num_layers = 1 + + out_relu = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_relu, + state_size=state_size, num_layers=num_layers) + + out_tanh = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_tanh, + state_size=state_size, num_layers=num_layers) + + out_lstm = nd.RNN(data=data, parameters=parameters_lstm, state=state, mode=mode_lstm, + state_cell=state_cell, state_size=state_size, num_layers=num_layers) + + out_gru = nd.RNN(data=data, parameters=parameters_gru, state=state, mode=mode_gru, + state_size=state_size, num_layers=num_layers) + + for out in [out_relu, out_tanh, out_lstm, out_gru]: + assert out.shape[0] == RNN_LARGE_TENSOR + assert out.shape[1] == 4 + assert out.shape[2] == 1 + + assert type(out[0, 0, 0].asscalar()).__name__ == 'float32' + check_gluon_embedding() check_fully_connected() check_dense() @@ -527,6 +560,7 @@ def check_ravel(): check_embedding() check_spatial_transformer() check_ravel() + check_rnn() def test_tensor(): diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index f2a220bbe719..6f9308b12cea 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -685,15 +685,10 @@ def check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_siz stack_input_grad = sx.grad.asnumpy() assert_allclose(fused_out.asnumpy(), stack_out.asnumpy(), rtol=rtol, atol=atol) - if mx.context.current_context().device_type == 'cpu' and \ - not mx.runtime.Features().is_enabled('MKLDNN') and \ - 'rnn' not in fused_layer.prefix: - print("LSTM and GRU on native CPU give wrong gradients. 
" - "Tracking issue: https://github.com/apache/incubator-mxnet/issues/17898.") - else: - assert_allclose(fused_input_grad, stack_input_grad, rtol=rtol, atol=atol) - for key, value in fused_grads.items(): - assert_allclose(value.asnumpy(), stack_grads[key].asnumpy(), rtol=rtol, atol=atol) + assert_allclose(fused_input_grad, stack_input_grad, rtol=rtol, atol=atol) + for key, value in fused_grads.items(): + assert_allclose(value.asnumpy(), stack_grads[key].asnumpy(), rtol=rtol, atol=atol) + num_layers = fused_begin_state[0].shape[0] // (2 if bidirectional else 1) check_rnn_states(fused_states, stack_states, num_layers, bidirectional, len(fused_begin_state) == 2) @@ -719,61 +714,32 @@ def create_op_by_mode(mode): return fused_op, stack_op, recurrent_block_prefix -def check_rnn_unidir_layer_gradients(mode, input_size, hidden_size, loss): +def check_rnn_unidir_layer_gradients(mode, input_size, hidden_size, num_layers, loss): fused_op, stack_op, recurrent_block_prefix = create_op_by_mode(mode) - # ==== Single layer ==== - fused_layer = fused_op(hidden_size, num_layers=1, layout='NTC', bidirectional=False, prefix=recurrent_block_prefix) - fused_layer.initialize() - - stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix=recurrent_block_prefix) - with stack_layer.name_scope(): - stack_layer.add(stack_op(hidden_size, prefix='l0_')) - stack_layer.initialize() - check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_size) - - # ==== Multiple layer ==== - fused_layer = fused_op(hidden_size, num_layers=3, layout='NTC', bidirectional=False, prefix=recurrent_block_prefix) + fused_layer = fused_op(hidden_size, num_layers=num_layers, layout='NTC', bidirectional=False, prefix=recurrent_block_prefix) fused_layer.initialize() stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix=recurrent_block_prefix) with stack_layer.name_scope(): - stack_layer.add(stack_op(hidden_size, prefix='l0_')) - stack_layer.add(stack_op(hidden_size, prefix='l1_')) - stack_layer.add(stack_op(hidden_size, prefix='l2_')) + for n in range(num_layers): + stack_layer.add(stack_op(hidden_size, prefix="l{}_".format(n))) stack_layer.initialize() - check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_size) -def check_rnn_bidir_layer_gradients(mode, input_size, hidden_size, loss): +def check_rnn_bidir_layer_gradients(mode, input_size, hidden_size, num_layers, loss): fused_op, stack_op, recurrent_block_prefix = create_op_by_mode(mode) - # ==== Single layer ==== - fused_layer = fused_op(hidden_size, num_layers=1, layout='NTC', bidirectional=True, prefix=recurrent_block_prefix) - fused_layer.initialize() - - stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix=recurrent_block_prefix) - with stack_layer.name_scope(): - stack_layer.add(gluon.rnn.BidirectionalCell(stack_op(hidden_size, prefix='l0_'), - stack_op(hidden_size, prefix='r0_'))) - stack_layer.initialize() - check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_size, bidirectional=True) - - # ==== Multiple layer ==== - fused_layer = fused_op(hidden_size, num_layers=3, layout='NTC', bidirectional=True, prefix=recurrent_block_prefix) + fused_layer = fused_op(hidden_size, num_layers=num_layers, layout='NTC', bidirectional=True, prefix=recurrent_block_prefix) fused_layer.initialize() stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix=recurrent_block_prefix) with stack_layer.name_scope(): - stack_layer.add(gluon.rnn.BidirectionalCell(stack_op(hidden_size, prefix='l0_'), - stack_op(hidden_size, prefix='r0_'))) - 
stack_layer.add(gluon.rnn.BidirectionalCell(stack_op(hidden_size, prefix='l1_'), - stack_op(hidden_size, prefix='r1_'))) - stack_layer.add(gluon.rnn.BidirectionalCell(stack_op(hidden_size, prefix='l2_'), - stack_op(hidden_size, prefix='r2_'))) - stack_layer.initialize() - + for n in range(num_layers): + stack_layer.add(gluon.rnn.BidirectionalCell(stack_op(hidden_size, prefix="l{}_".format(n)), + stack_op(hidden_size, prefix="r{}_".format(n)))) + stack_layer.initialize() check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_size, bidirectional=True) @@ -782,10 +748,11 @@ def check_rnn_bidir_layer_gradients(mode, input_size, hidden_size, loss): def test_fused_lstm_layer(): input_sizes = [8] hidden_sizes = [8, 16] - for input_size, hidden_size in product(input_sizes, hidden_sizes): + num_layers = [1, 2, 3, 4] + for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('lstm', input_size, hidden_size, loss) - check_rnn_bidir_layer_gradients('lstm', input_size, hidden_size, loss) + check_rnn_unidir_layer_gradients('lstm', input_size, hidden_size, num_layers, loss) + check_rnn_bidir_layer_gradients('lstm', input_size, hidden_size, num_layers, loss) @with_seed() @@ -793,10 +760,11 @@ def test_fused_lstm_layer(): def test_fused_gru_layer(): input_sizes = [8] hidden_sizes = [8, 16] - for input_size, hidden_size in product(input_sizes, hidden_sizes): + num_layers = [1, 2, 3, 4] + for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('gru', input_size, hidden_size, loss) - check_rnn_bidir_layer_gradients('gru', input_size, hidden_size, loss) + check_rnn_unidir_layer_gradients('gru', input_size, hidden_size, num_layers, loss) + check_rnn_bidir_layer_gradients('gru', input_size, hidden_size, num_layers, loss) @with_seed() @@ -804,10 +772,11 @@ def test_fused_gru_layer(): def test_fused_rnnrelu_layer(): input_sizes = [8] hidden_sizes = [8, 16] - for input_size, hidden_size in product(input_sizes, hidden_sizes): + num_layers = [1, 2, 3, 4] + for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('rnn_relu', input_size, hidden_size, loss) - check_rnn_bidir_layer_gradients('rnn_relu', input_size, hidden_size, loss) + check_rnn_unidir_layer_gradients('rnn_relu', input_size, hidden_size, num_layers, loss) + check_rnn_bidir_layer_gradients('rnn_relu', input_size, hidden_size, num_layers, loss) @with_seed() @@ -815,10 +784,11 @@ def test_fused_rnnrelu_layer(): def test_fused_rnntanh_layer(): input_sizes = [8] hidden_sizes = [8, 16] - for input_size, hidden_size in product(input_sizes, hidden_sizes): + num_layers = [1, 2, 3, 4] + for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('rnn_tanh', input_size, hidden_size, loss) - check_rnn_bidir_layer_gradients('rnn_tanh', input_size, hidden_size, loss) + check_rnn_unidir_layer_gradients('rnn_tanh', input_size, hidden_size, num_layers, loss) + check_rnn_bidir_layer_gradients('rnn_tanh', input_size, hidden_size, num_layers, loss) def test_rnn_unroll_variant_length():
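
Note (not part of the patch): the int-to-index_t promotion above matters because the per-layer workspace and reserve-space sizes are products of seq_length, batch_size and hidden_size, and for the shapes exercised in check_rnn (first dimension 2**28) those products exceed the 32-bit integer range. The following standalone sketch, with hypothetical local names, illustrates the arithmetic; it assumes index_t behaves like a 64-bit signed integer, as in MXNet builds with large-tensor (int64) support.

    // Standalone sketch, not part of the patch: why the RNN size computations
    // need a 64-bit index type for the shapes used in check_rnn above.
    // Assumption: index_t is a 64-bit signed integer (large-tensor builds).
    #include <cstdint>
    #include <iostream>
    #include <limits>

    using index_t = std::int64_t;  // stand-in for mxnet::index_t in int64 builds

    int main() {
      // Shapes from check_rnn: data (2^28, 4, 4), state_size = 1,
      // one layer, unidirectional.
      const index_t seq_length  = index_t(1) << 28;
      const index_t batch_size  = 4;
      const int     hidden_size = 1;
      const int     direction   = 1;

      // LSTM branch of GetRNNWorkspaceSize: the wx*x + inter-y term alone.
      const index_t ws_elems =
          seq_length * batch_size * hidden_size * (4 + direction);

      std::cout << "workspace elements: " << ws_elems << "\n";  // 5368709120
      std::cout << "fits in 32-bit int: "
                << (ws_elems <= std::numeric_limits<std::int32_t>::max())
                << "\n";  // 0
      return 0;
    }

With the previous 32-bit counters this product would wrap around, which is why GetRNNWorkspaceSize, GetRNNReserveSpaceSize and the loop indices in rnn_impl.h are promoted in this change.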