diff --git a/.github/workflows/os_x_staticbuild.yml b/.github/workflows/os_x_staticbuild.yml index eabe88f46053..6e313a025f51 100644 --- a/.github/workflows/os_x_staticbuild.yml +++ b/.github/workflows/os_x_staticbuild.yml @@ -10,7 +10,7 @@ jobs: uses: actions/checkout@v2 - name: Install Dependencies run: | - brew install nasm automake ninja libtool + brew install nasm automake ninja libtool cmake pkgconfig protobuf - name: Build project run: | git --version diff --git a/3rdparty/mshadow/CMakeLists.txt b/3rdparty/mshadow/CMakeLists.txt index 3b898a4772b2..3a347fd51de6 100644 --- a/3rdparty/mshadow/CMakeLists.txt +++ b/3rdparty/mshadow/CMakeLists.txt @@ -13,6 +13,12 @@ add_library(mshadow INTERFACE) file(GLOB_RECURSE MSHADOWSOURCE "mshadow/*.h") target_include_directories(mshadow INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}") target_sources(mshadow INTERFACE ${MSHADOWSOURCE}) +if(UNIX) + target_compile_options(mshadow INTERFACE + "$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-parameter>" + "$<$<COMPILE_LANGUAGE:CXX>:-Wno-unknown-pragmas>" + "$<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-local-typedefs>") +endif() if(USE_CUDA) enable_language(CUDA) @@ -67,7 +73,8 @@ else() endif() set(mshadow_LINT_DIRS mshadow mshadow-ps) +find_package(Python3) add_custom_target(mshadow_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} - -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${mshadow_LINT_DIRS} + -DPYTHON_EXECUTABLE=${Python3_EXECUTABLE} -DLINT_DIRS=${mshadow_LINT_DIRS} -DPROJECT_SOURCE_DIR=${PROJECT_SOURCE_DIR} -DPROJECT_NAME=mshadow -P ${PROJECT_SOURCE_DIR}/../dmlc-core/cmake/lint.cmake) diff --git a/3rdparty/mshadow/cmake/AutoDetectF16C.cmake b/3rdparty/mshadow/cmake/AutoDetectF16C.cmake index 04331c3ac710..1ad85ba3bcde 100644 --- a/3rdparty/mshadow/cmake/AutoDetectF16C.cmake +++ b/3rdparty/mshadow/cmake/AutoDetectF16C.cmake @@ -25,8 +25,11 @@ if(AUTO_DETECT_F16_CMAKE_INCLUDED) return() endif() set(AUTO_DETECT_F16_CMAKE_INCLUDED True) - set(SUPPORT_F16C False) +if(ANDROID) + message("F16C instruction set is not yet supported for Android") + return() +endif() if(MSVC) message("F16C instruction set is not yet supported for MSVC") return() diff --git a/3rdparty/mshadow/mshadow/base.h b/3rdparty/mshadow/mshadow/base.h index 28fbd868d8c8..a99838422348 100755 --- a/3rdparty/mshadow/mshadow/base.h +++ b/3rdparty/mshadow/mshadow/base.h @@ -18,12 +18,13 @@ #define NOMINMAX #endif #endif -#include -#include +#include #include #include -#include +#include +#include #include +#include #include #include @@ -839,7 +840,7 @@ MSHADOW_XINLINE bool MaxValue<bool>(void) { /*! \brief maximum value of uint32_t */ template<> MSHADOW_XINLINE uint32_t MaxValue<uint32_t>(void) { - return -1; + return std::numeric_limits<uint32_t>::max(); } /*! 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 4731663281d4..526a1da09895 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,7 @@ else() endif() option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) option(USE_JEMALLOC "Build with Jemalloc support" OFF) +option(USE_LIBJPEG_TURBO "Use libjpeg-turbo" OFF) option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) @@ -381,6 +382,16 @@ if(USE_JEMALLOC) endif() endif() +if(USE_LIBJPEG_TURBO) + find_package(PkgConfig REQUIRED) + pkg_search_module(TURBOJPEG REQUIRED libturbojpeg) + include_directories(SYSTEM ${TURBOJPEG_INCLUDE_DIRS}) + list(APPEND mxnet_LINKER_LIBS ${TURBOJPEG_LINK_LIBRARIES}) + add_definitions(-DMXNET_USE_LIBJPEG_TURBO=1) +else() + add_definitions(-DMXNET_USE_LIBJPEG_TURBO=0) +endif() + # ---[ OpenCV if(USE_OPENCV) find_package(OpenCV COMPONENTS core highgui imgproc imgcodecs) @@ -661,7 +672,7 @@ add_subdirectory("3rdparty/mshadow") set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) - string(APPEND CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACES}") + string(APPEND CMAKE_CUDA_FLAGS " ${CUDA_ARCH_FLAGS_SPACES}") # Create dummy file since we want an empty shared library before linking set(DUMMY_SOURCE ${CMAKE_BINARY_DIR}/dummy.c) file(WRITE ${DUMMY_SOURCE} "") @@ -673,6 +684,15 @@ if(UNIX) target_link_libraries(mxnet PRIVATE mxnet_static) target_link_libraries(mxnet_static PUBLIC ${CMAKE_DL_LIBS}) set_target_properties(mxnet_static PROPERTIES OUTPUT_NAME mxnet) + if(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + target_compile_options(mxnet_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Werror>") + # Ignore erroneous compiler warnings: + # 1) variables used in '#pragma omp parallel' are considered unused + target_compile_options(mxnet_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=unused-variable>") + if(USE_CUDA) + string(APPEND CMAKE_CUDA_FLAGS " -Werror cross-execution-space-call") + endif() + endif() elseif(MSVC) if(USE_CUDA) if(MSVC) @@ -708,7 +728,7 @@ elseif(MSVC) COMMAND gen_warp $ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/ DEPENDS $) else(USE_SPLIT_ARCH_DLL) string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}") - set(CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACES}") + set(CMAKE_CUDA_FLAGS " ${CUDA_ARCH_FLAGS_SPACES}") add_library(mxnet SHARED ${SOURCE}) target_link_libraries(mxnet PUBLIC mshadow) target_compile_options( @@ -778,14 +798,7 @@ endfunction() if(USE_TVM_OP) list(APPEND mxnet_LINKER_LIBS ${CMAKE_CURRENT_BINARY_DIR}/3rdparty/tvm/libtvm_runtime.so) BuildTVMOP() - if(NOT Python3_EXECUTABLE) - find_package(PythonInterp 3 REQUIRED) - set(Python3_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "Path to the python3 executable") - if(NOT Python3_EXECUTABLE) - message(FATAL_ERROR "No python3 interpreter found to build TVM operators") - endif() - endif() - + find_package(Python3 REQUIRED) set(TVM_OP_COMPILE_OPTIONS "-o${CMAKE_CURRENT_BINARY_DIR}" "--config" "${CMAKE_CURRENT_BINARY_DIR}/tvmop.conf" "-L" "${CMAKE_CURRENT_BINARY_DIR}/3rdparty/tvm") if(USE_CUDA) set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS}" "--cuda-arch" "\"${CUDA_ARCH_FLAGS}\"") @@ -904,13 +917,10 @@ endif() add_subdirectory(tests) # ---[ Linter target -if(MSVC) - find_package(PythonInterp) - set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "Path to the python executable") -endif() +find_package(Python3) set(LINT_DIRS "include src plugin cpp-package tests") set(EXCLUDE_PATH "src/operator/contrib/ctc_include") -add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} 
-DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -DPROJECT_NAME=mxnet -DEXCLUDE_PATH=${EXCLUDE_PATH} -P ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/cmake/lint.cmake) +add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${Python3_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -DPROJECT_NAME=mxnet -DEXCLUDE_PATH=${EXCLUDE_PATH} -P ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/cmake/lint.cmake) if(BUILD_CYTHON_MODULES) include(cmake/BuildCythonModules.cmake) diff --git a/Makefile b/Makefile index 8c478d61d11a..90303ae154a2 100644 --- a/Makefile +++ b/Makefile @@ -223,6 +223,8 @@ ifeq (,$(wildcard /lib/liblapack.a)) ifeq (,$(wildcard /lib/liblapack.so)) ifeq (,$(wildcard /usr/lib/liblapack.a)) ifeq (,$(wildcard /usr/lib/liblapack.so)) +ifeq (,$(wildcard /usr/lib/x86_64-linux-gnu/liblapack.a)) +ifeq (,$(wildcard /usr/lib/x86_64-linux-gnu/liblapack.so)) ifeq (,$(wildcard /usr/lib/liblapack.dylib)) ifeq (,$(wildcard /usr/lib64/liblapack.a)) ifeq (,$(wildcard /usr/lib64/liblapack.so)) @@ -240,6 +242,8 @@ endif endif endif endif +endif +endif # lapack settings. ifeq ($(USE_LAPACK), 1) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 06a9bb03b6ae..7dadb484650b 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: mxnet Type: Package Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems -Version: 1.6.0 +Version: 1.7.0 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He, Anirudh Acharya Maintainer: Qiang Kou diff --git a/R-package/Makefile b/R-package/Makefile index 68249033f595..5a8bc42aca93 100644 --- a/R-package/Makefile +++ b/R-package/Makefile @@ -1,5 +1,5 @@ rcpplint: - 3rdparty/dmlc-core/scripts/lint.py mxnet-rcpp ${LINT_LANG} R-package/src + ./3rdparty/dmlc-core/scripts/lint.py mxnet-rcpp all R-package/src rpkg: mkdir -p R-package/inst/libs diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R index 6f13f7b26ddb..be8d977b2a98 100644 --- a/R-package/R/optimizer.R +++ b/R-package/R/optimizer.R @@ -109,9 +109,9 @@ mx.opt.sgd <- function(learning.rate = 0.01, #' #' @param learning.rate float, default=0.002 #' The initial learning rate. -#' @param gamma1 float, default=0.95 +#' @param rho float, default=0.95 #' decay factor of moving average for gradient, gradient^2. -#' @param gamma2 float, default=0.9 +#' @param momentum float, default=0.9 #' "momentum" factor. 
#' @param epsilon float, default=1e-4 #' @param wd float, default=0.0 @@ -125,8 +125,8 @@ mx.opt.sgd <- function(learning.rate = 0.01, #' mx.opt.rmsprop <- function(learning.rate = 0.002, centered = TRUE, - gamma1 = 0.95, - gamma2 = 0.9, + rho = 0.95, + momentum = 0.9, epsilon = 1e-4, wd = 0, rescale.grad = 1, @@ -158,8 +158,8 @@ mx.opt.rmsprop <- function(learning.rate = 0.002, g, delta, lr = lr, - gamma1 = gamma1, - gamma2 = gamma2, + rho = rho, + momentum = momentum, epsilon = epsilon, wd = wd, rescale_grad = rescale.grad, @@ -174,7 +174,7 @@ mx.opt.rmsprop <- function(learning.rate = 0.002, grad, n, lr = lr, - gamma1 = gamma1, + rho = rho, epsilon = epsilon, wd = wd, rescale_grad = rescale.grad, diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R index 1eec83f2d46e..cbe9575c90ca 100644 --- a/R-package/tests/testthat/test_optimizer.R +++ b/R-package/tests/testthat/test_optimizer.R @@ -73,8 +73,8 @@ test_that("rmsprop", { fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) - optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, gamma1 = 0.95, - gamma2 = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1) + optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, rho = 0.95, + momentum = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1) updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) diff --git a/benchmark/opperf/nd_operations/array_rearrange.py b/benchmark/opperf/nd_operations/array_rearrange.py index 12af8345543e..631d0bb997bc 100644 --- a/benchmark/opperf/nd_operations/array_rearrange.py +++ b/benchmark/opperf/nd_operations/array_rearrange.py @@ -29,8 +29,8 @@ """ -def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the +def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the rearrange operators in MXNet. 
Parameters @@ -41,6 +41,8 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -55,5 +57,5 @@ def run_rearrange_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' mx_rearrange_ops = get_all_rearrange_operators() # Run benchmarks - mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, warmup, runs) + mx_rearrange_op_results = run_op_benchmarks(mx_rearrange_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_rearrange_op_results diff --git a/benchmark/opperf/nd_operations/binary_operators.py b/benchmark/opperf/nd_operations/binary_operators.py index 5d95360a73db..4444219e6054 100644 --- a/benchmark/opperf/nd_operations/binary_operators.py +++ b/benchmark/opperf/nd_operations/binary_operators.py @@ -38,8 +38,8 @@ get_all_elemen_wise_binary_operators, get_all_misc_binary_operators -def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous +def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the miscellaneous binary operators in MXNet. Parameters @@ -48,6 +48,10 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi Context to run benchmarks dtype: str, default 'float32' Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -61,12 +65,12 @@ def run_mx_binary_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profi # Fetch all Miscellaneous Binary Operators mx_binary_misc_ops = get_all_misc_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results -def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary +def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary broadcast operators in MXNet. 
Parameters @@ -77,6 +81,8 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -90,12 +96,12 @@ def run_mx_binary_broadcast_operators_benchmarks(ctx=mx.cpu(), dtype='float32', # Fetch all Binary Broadcast Operators mx_binary_broadcast_ops = get_all_broadcast_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results -def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the binary +def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the binary element_wise operators in MXNet. Parameters @@ -106,6 +112,8 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 10 Number of times to run for warmup runs: int, default 50 @@ -119,5 +127,5 @@ def run_mx_binary_element_wise_operators_benchmarks(ctx=mx.cpu(), dtype='float32 # Fetch all Binary Element_wise Operators mx_binary_element_wise_ops = get_all_elemen_wise_binary_operators() # Run benchmarks - mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, warmup, runs) + mx_binary_op_results = run_op_benchmarks(mx_binary_element_wise_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_binary_op_results diff --git a/benchmark/opperf/nd_operations/gemm_operators.py b/benchmark/opperf/nd_operations/gemm_operators.py index f1028123b421..55b3435a8f24 100644 --- a/benchmark/opperf/nd_operations/gemm_operators.py +++ b/benchmark/opperf/nd_operations/gemm_operators.py @@ -35,8 +35,8 @@ """ -def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the GEMM +def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the GEMM operators (dot, batch_dot, khatri_rao) in MXNet. Parameters @@ -47,6 +47,8 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -57,43 +59,75 @@ def run_gemm_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Dictionary of results. 
Key -> Name of the operator, Value -> Benchmark results. """ - # Benchmark tests for dot operator + standard_inputs_dot = [{"lhs": (1024, 1024), + "rhs": (1024, 1024)}, + {"lhs": (1000, 10), + "rhs": (1000, 10), + "transpose_b": True}, + {"lhs": (1000, 1), + "rhs": (100, 1000), + "transpose_a": True, + "transpose_b": True}] + int64_tensor_inputs_dot = [{"lhs": (2**16, 2**16), + "rhs": (2**16, 2**16)}, + {"lhs": (4, 2**30), + "rhs": (4, 2**30), + "transpose_b": True}, + {"lhs": (2**28, 16), + "rhs": (16, 2**28), + "transpose_a": True, + "transpose_b": True}] + standard_inputs_batch_dot = [{"lhs": (32, 1024, 1024), + "rhs": (32, 1024, 1024)}, + {"lhs": (32, 1000, 10), + "rhs": (32, 1000, 10), + "transpose_b": True}, + {"lhs": (32, 1000, 1), + "rhs": (32, 100, 1000), + "transpose_a": True, + "transpose_b": True}] + int64_tensor_inputs_batch_dot = [{"lhs": (1, 2**16, 2**16), + "rhs": (1, 2**16, 2**16)}, + {"lhs": (1, 4, 2**30), + "rhs": (1, 4, 2**30), + "transpose_b": True}, + {"lhs": (1, 2**28, 16), + "rhs": (1, 16, 2**28), + "transpose_a": True, + "transpose_b": True}] + standard_inputs_khatri_rao = [{"args": [(32, 32), (32, 32)]}, + {"args": [(64, 64), (64, 64)]}] + int64_tensor_inputs_khatri_rao = [{"args": [(2**32, 1), (2**32, 1)]}] + + if int64_tensor == 'on': + inputs_dot = int64_tensor_inputs_dot + inputs_batch_dot = int64_tensor_inputs_batch_dot + inputs_khatri_rao = int64_tensor_inputs_khatri_rao + else: + inputs_dot = standard_inputs_dot + inputs_batch_dot = standard_inputs_batch_dot + inputs_khatri_rao = standard_inputs_khatri_rao + + # Benchmark tests for dot and batch_dot operators dot_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "dot")], run_backward=True, dtype=dtype, ctx=ctx, - inputs=[{"lhs": (1024, 1024), - "rhs": (1024, 1024)}, - {"lhs": (1000, 10), - "rhs": (1000, 10), - "transpose_b": True}, - {"lhs": (1000, 1), - "rhs": (100, 1000), - "transpose_a": True, - "transpose_b": True}], + inputs=inputs_dot, warmup=warmup, runs=runs, profiler=profiler) - # Benchmark tests for batch_dot operator + batch_dot_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "batch_dot")], run_backward=True, dtype=dtype, ctx=ctx, - inputs=[{"lhs": (32, 1024, 1024), - "rhs": (32, 1024, 1024)}, - {"lhs": (32, 1000, 10), - "rhs": (32, 1000, 10), - "transpose_b": True}, - {"lhs": (32, 1000, 1), - "rhs": (32, 100, 1000), - "transpose_a": True, - "transpose_b": True}], + inputs=inputs_batch_dot, warmup=warmup, runs=runs, profiler=profiler) - # Operator khatri_rao is not yet implemented for GPU + # Operator khatri_rao is not yet implemented for GPU khatri_rao_benchmark_res = [] if ctx != mx.gpu(): # Benchmark tests for khatri_rao operator khatri_rao_benchmark_res = run_performance_test( [getattr(MX_OP_MODULE, "khatri_rao")], run_backward=False, dtype=dtype, ctx=ctx, - inputs=[{"args": [(32, 32), (32, 32)]}, - {"args": [(64, 64), (64, 64)]}], + inputs=inputs_khatri_rao, warmup=warmup, runs=runs, profiler=profiler) # Prepare combined results for GEMM operators diff --git a/benchmark/opperf/nd_operations/indexing_routines.py b/benchmark/opperf/nd_operations/indexing_routines.py index a957785940a5..ee99de2b57bf 100644 --- a/benchmark/opperf/nd_operations/indexing_routines.py +++ b/benchmark/opperf/nd_operations/indexing_routines.py @@ -35,8 +35,8 @@ """ -def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the indexing routines +def 
run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the indexing routines in MXNet. Parameters @@ -47,6 +47,8 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -61,5 +63,5 @@ def run_indexing_routines_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na mx_indexing_ops = get_all_indexing_routines() # Run benchmarks - mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, warmup, runs) + mx_indexing_op_results = run_op_benchmarks(mx_indexing_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_indexing_op_results diff --git a/benchmark/opperf/nd_operations/linalg_operators.py b/benchmark/opperf/nd_operations/linalg_operators.py index d2c1cee0a307..1d35ef1fc951 100644 --- a/benchmark/opperf/nd_operations/linalg_operators.py +++ b/benchmark/opperf/nd_operations/linalg_operators.py @@ -34,8 +34,8 @@ from benchmark.opperf.utils.common_utils import merge_map_list from benchmark.opperf.rules.default_params import MX_OP_MODULE -def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the linear algebra +def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the linear algebra operators in MXNet. Parameters @@ -46,6 +46,8 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -74,5 +76,5 @@ def run_linalg_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nat # Fetch all Linear Algebra Operators mx_linalg_ops = get_all_linalg_operators() # Run benchmarks - mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, warmup, runs) + mx_linalg_op_results = run_op_benchmarks(mx_linalg_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(linalg_potrf_benchmark + [mx_linalg_op_results]) diff --git a/benchmark/opperf/nd_operations/misc_operators.py b/benchmark/opperf/nd_operations/misc_operators.py index 5a0efc57de0d..fb8535a959a0 100644 --- a/benchmark/opperf/nd_operations/misc_operators.py +++ b/benchmark/opperf/nd_operations/misc_operators.py @@ -37,7 +37,7 @@ from benchmark.opperf.custom_operations.custom_operations import CustomAddOneProp -def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous operators in MXNet. 
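The signature change above is applied uniformly across the opperf runners in this patch: each gains an int64_tensor switch that, when set to 'on', swaps the default input shapes for large-tensor shapes whose element counts reach 2**32. A minimal usage sketch (not part of the patch), assuming an MXNet build with large-tensor (int64) support and enough host memory; the reduced warmup/run counts for the large case are illustrative only:

import mxnet as mx
from benchmark.opperf.nd_operations.misc_operators import run_mx_misc_operators_benchmarks

# Default, standard-sized inputs.
small_results = run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                 profiler='native', int64_tensor='off',
                                                 warmup=25, runs=100)

# Large-tensor inputs (element counts around 2**32); assumes int64 tensor
# support was enabled at build time and sufficient host memory is available.
large_results = run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                                 profiler='native', int64_tensor='on',
                                                 warmup=1, runs=5)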
@@ -49,6 +49,8 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -59,6 +61,48 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. """ + + standard_inputs_array_ops = [{"args": [(1024, 1024)], + "num_arrays": 1}, + {"args": [(10000, 1)], + "num_arrays": 1}, + {"args": [(10000, 10)], + "num_arrays": 1}] + int64_tensor_inputs_array_ops = [{"args": [(2**32, 1)], + "num_arrays":1}] + standard_inputs_add_n = [{"args": [(1024, 1024)]}, + {"args": [(10000, 1)]}, + {"args": [(10000, 10)]}] + int64_tensor_inputs_add_n = [{"args": [(2**16, 2**16)]}] + standard_inputs_upsampling = [{"args": (32, 3, 256, 256), + "scale": 2, + "sample_type": "nearest"}, + {"args": (32, 3, 10000, 1), + "scale": 4, + "sample_type": "nearest"}] + int64_tensor_inputs_upsampling = [{"args": (2**32 + 1, 1, 1, 1), + "scale": 2, + "sample_type": "nearest"}] + standard_inputs_custom = [{"args": [(1024, 1024)], + "op_type": "CustomAddOne"}, + {"args": [(10000, 1)], + "op_type": "CustomAddOne"}, + {"args": [(10000, 10)], + "op_type": "CustomAddOne"}] + int64_tensor_inputs_custom = [{"args": [(2**32 + 1, 1)], + "op_type": "CustomAddOne"}] + + if int64_tensor == 'on': + inputs_array_ops = int64_tensor_inputs_array_ops + inputs_add_n = int64_tensor_inputs_add_n + inputs_upsampling = int64_tensor_inputs_upsampling + inputs_custom = int64_tensor_inputs_custom + else: + inputs_array_ops = standard_inputs_array_ops + inputs_add_n = standard_inputs_add_n + inputs_upsampling = standard_inputs_upsampling + inputs_custom = standard_inputs_custom + # Individual tests for ops with positional args array_ops_benchmark = run_performance_test([getattr(MX_OP_MODULE, "reset_arrays"), getattr(MX_OP_MODULE, "multi_all_finite"), @@ -67,12 +111,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "num_arrays": 1}, - {"args": [(10000, 1)], - "num_arrays": 1}, - {"args": [(10000, 10)], - "num_arrays": 1}], + inputs=inputs_array_ops, warmup=warmup, runs=runs) add_n_benchmark = run_performance_test([getattr(MX_OP_MODULE, "add_n")], @@ -80,9 +119,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)]}, - {"args": [(10000, 1)]}, - {"args": [(10000, 10)]}], + inputs=inputs_add_n, warmup=warmup, runs=runs) # There are currently issus with UpSampling with bilinear interpolation. 
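The large-tensor shapes introduced above (for example (2**32, 1) or (2**16, 2**16)) are sized so the flattened element count crosses 2**32, beyond what 32-bit indexing can address. A quick arithmetic sketch of what that means for memory, independent of any MXNet API:

# One float32 tensor with 2**32 elements, e.g. shape (2**32, 1):
elements = 2 ** 32
bytes_per_float32 = 4
print(elements * bytes_per_float32 / 2 ** 30)  # 16.0 -> roughly 16 GiB per tensor

Running the int64_tensor variants therefore needs far more memory than the standard shapes, which is why they sit behind an explicit switch rather than being enabled by default.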
@@ -92,12 +129,7 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": (32, 3, 256, 256), - "scale": 2, - "sample_type": "nearest"}, - {"args": (32, 3, 10000, 1), - "scale": 4, - "sample_type": "nearest"}], + inputs=inputs_upsampling, warmup=warmup, runs=runs) # Create and register CustomAddOne operator for use in Custom op testing @@ -108,17 +140,12 @@ def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "op_type": "CustomAddOne"}, - {"args": [(10000, 1)], - "op_type": "CustomAddOne"}, - {"args": [(10000, 10)], - "op_type": "CustomAddOne"}], + inputs=inputs_custom, warmup=warmup, runs=runs) # Fetch remaining Miscellaneous Operators mx_misc_ops = get_remaining_miscellaneous_operators() # Run benchmarks - mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, warmup, runs) + mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(array_ops_benchmark + add_n_benchmark + upsampling_benchmark + custom_benchmark + [mx_misc_op_results]) diff --git a/benchmark/opperf/nd_operations/nn_activation_operators.py b/benchmark/opperf/nd_operations/nn_activation_operators.py index b77777cc04dd..161dfe72123e 100644 --- a/benchmark/opperf/nd_operations/nn_activation_operators.py +++ b/benchmark/opperf/nd_operations/nn_activation_operators.py @@ -43,9 +43,9 @@ """ -def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the activation - operators in MXNet. +def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the activation + operators (relu, sigmoid, softmax) in MXNet. 
Parameters ---------- @@ -55,6 +55,8 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= Precision to use for benchmarks profiler: str, default 'native' Module to use for tracking benchmark excecution time + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -70,6 +72,6 @@ def run_activation_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler= mx_activation_ops = get_all_nn_activation_operators() # Run benchmarks - mx_activation_op_results = run_op_benchmarks(mx_activation_ops, dtype, ctx, profiler, warmup, runs) + mx_activation_op_results = run_op_benchmarks(mx_activation_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_activation_op_results \ No newline at end of file diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py index a8273d4105dc..f3007bac188c 100644 --- a/benchmark/opperf/nd_operations/nn_basic_operators.py +++ b/benchmark/opperf/nd_operations/nn_basic_operators.py @@ -20,6 +20,10 @@ from benchmark.opperf.utils.op_registry_utils import get_all_nn_basic_operators from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks +from benchmark.opperf.utils.benchmark_utils import run_performance_test +from benchmark.opperf.utils.common_utils import merge_map_list +from benchmark.opperf.rules.default_params import MX_OP_MODULE + """Performance benchmark tests for MXNet NDArray basic NN Operators. 1. FullyConnected @@ -45,8 +49,8 @@ """ -def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the NN basic +def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the basic neural network operators in MXNet. 
Parameters @@ -56,7 +60,9 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n dtype: str, default 'float32' Precision to use for benchmarks profiler: str, default 'native' - Module to use for tracking benchmark excecution time + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -68,9 +74,71 @@ def run_nn_basic_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n """ + standard_data_list = [(1024, 4, 4)] + int64_tensor_data_list = [(2**28, 4, 4)] + + if int64_tensor == 'on': + data_list = int64_tensor_data_list + else: + data_list = standard_data_list + + for data in data_list: + rnn_relu_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (7,), + "state": (1, 4, 1), + "mode": "rnn_relu", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_tanh_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (7,), + "state": (1, 4, 1), + "mode": "rnn_tanh", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_lstm_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (28,), + "state": (1, 4, 1), + "state_cell": (1, 4, 1), + "mode": "lstm", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) + rnn_gru_benchmark = run_performance_test([getattr(MX_OP_MODULE, "RNN")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": data, + "parameters": (21,), + "state": (1, 4, 1), + "mode": "gru", + "state_size": 1, + "num_layers": 1}], + warmup=warmup, + runs=runs) # Fetch all NN Basic Operators mx_nn_basic_ops = get_all_nn_basic_operators() # Run benchmarks - mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, warmup, runs) - return mx_nn_basic_op_results + mx_nn_basic_op_results = run_op_benchmarks(mx_nn_basic_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) + return merge_map_list(rnn_relu_benchmark + rnn_tanh_benchmark + rnn_lstm_benchmark + rnn_gru_benchmark + [mx_nn_basic_op_results]) diff --git a/benchmark/opperf/nd_operations/nn_conv_operators.py b/benchmark/opperf/nd_operations/nn_conv_operators.py index 9c80f00c354b..d44b89117511 100644 --- a/benchmark/opperf/nd_operations/nn_conv_operators.py +++ b/benchmark/opperf/nd_operations/nn_conv_operators.py @@ -52,16 +52,55 @@ """ -def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the pooling + operators in MXNet. 
+ + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ pool_types = ['avg', 'max', 'sum'] global_pool_types = [0, 1] + standard_data_list_pool1d = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_pool1d = [(1, 1, 2**32)] + standard_data_list_pool2d = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_pool2d = [(2**28, 1, 4, 4)] + standard_data_list_roipool = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_roipool = [(32, 3, 2**13, 2**13)] + + if int64_tensor == 'on': + data_list_pool1d = int64_tensor_data_list_pool1d + data_list_pool2d = int64_tensor_data_list_pool2d + data_list_roipool = int64_tensor_data_list_roipool + else: + data_list_pool1d = standard_data_list_pool1d + data_list_pool2d = standard_data_list_pool2d + data_list_roipool = standard_data_list_roipool + # Run 1D and 2D Pooling performance runs pool1d_benchmark_res = [] pool2d_benchmark_res = [] for pool_type in pool_types: for global_pool in global_pool_types: - for pool1d_data in [(32, 3, 256), (32, 3, 64)]: + for pool1d_data in data_list_pool1d: pool1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")], run_backward=True, dtype=dtype, @@ -73,10 +112,10 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na "global_pool": global_pool, "stride": 1, "pad": 1} - ], + ], warmup=warmup, runs=runs) - for pool2d_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for pool2d_data in data_list_pool2d: pool2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Pooling")], run_backward=True, dtype=dtype, @@ -88,68 +127,118 @@ def run_pooling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='na "global_pool": global_pool, "stride": (1, 1), "pad": (0, 0)} - ], + ], warmup=warmup, runs=runs) - # Run ROI Pooling performance runs - roipool_benchmark_res = [] - for roipool_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: - roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": roipool_data, - "rois": (32, 5), - "pooled_size": (2, 2), - "spatial_scale": .5} - ], - warmup=warmup, - runs=runs) + # Run ROI Pooling performance runs + roipool_benchmark_res = [] + for roipool_data in data_list_roipool: + roipool_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "ROIPooling")], + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": roipool_data, + "rois": (32, 5), + "pooled_size": (2, 2), + "spatial_scale": .5} + ], + warmup=warmup, + runs=runs) # Prepare combined results mx_pooling_op_results = merge_map_list(pool1d_benchmark_res + pool2d_benchmark_res + roipool_benchmark_res) return mx_pooling_op_results -def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - # Conv1D Benchmarks +def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the 
given context, precision (dtype), and input data size (int64_tensor) for all the convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + + standard_data_list_conv1d = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_conv1d = [(2**30, 1, 4)] + standard_weight_conv1d = (1, 3, 3) + int64_tensor_weight_conv1d = (1, 1, 1) + standard_kernel_conv1d = (3,) + int64_tensor_kernel_conv1d = (1,) + standard_data_list_conv2d = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_conv2d = [(2**28, 1, 4, 4)] + standard_weight_conv2d = (1, 3, 3, 3) + int64_tensor_weight_conv2d = (1, 1, 1, 1) + standard_kernel_conv2d = (3, 3) + int64_tensor_kernel_conv2d = (1, 1) + + if int64_tensor == 'on': + data_list_conv1d = int64_tensor_data_list_conv1d + weight_conv1d = int64_tensor_weight_conv1d + kernel_conv1d = int64_tensor_kernel_conv1d + data_list_conv2d = int64_tensor_data_list_conv2d + weight_conv2d = int64_tensor_weight_conv2d + kernel_conv2d = int64_tensor_kernel_conv2d + else: + data_list_conv1d = standard_data_list_conv1d + weight_conv1d = standard_weight_conv1d + kernel_conv1d = standard_kernel_conv1d + data_list_conv2d = standard_data_list_conv2d + weight_conv2d = standard_weight_conv2d + kernel_conv2d = standard_kernel_conv2d + conv1d_benchmark_res = [] - for conv_data in [(32, 3, 256), (32, 3, 64)]: + conv2d_benchmark_res = [] + # Conv1D Benchmarks + for conv_data in data_list_conv1d: conv1d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, inputs=[{"data": conv_data, - "weight": (64, 3, 3), - "bias": (64,), - "kernel": (3,), + "weight": weight_conv1d, + "bias": (1,), + "kernel": kernel_conv1d, "stride": (1,), "dilate": (1,), "pad": (0,), - "num_filter": 64, - "layout": 'NCW'} - ], + "num_filter": 1, + "layout": 'NCW'}], warmup=warmup, runs=runs) # Conv2D Benchmarks - conv2d_benchmark_res = [] - for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for conv_data in data_list_conv2d: conv2d_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Convolution")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, inputs=[{"data": conv_data, - "weight": (64, 3, 3, 3), - "bias": (64,), - "kernel": (3, 3), + "weight": weight_conv2d, + "bias": (1,), + "kernel": kernel_conv2d, "stride": (1, 1), "dilate": (1, 1), "pad": (0, 0), - "num_filter": 64, - "layout": 'NCHW'} - ], + "num_filter": 1, + "layout": 'NCHW'}], warmup=warmup, runs=runs) # Prepare combined results @@ -157,50 +246,98 @@ def run_convolution_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler return mx_conv_op_results -def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', dtype='float32', warmup=10, runs=50): +def run_transpose_convolution_operators_benchmarks(ctx=mx.cpu(), profiler='native', int64_tensor='off', dtype='float32', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the 
transpose convolution + operators in MXNet. + + Parameters + ---------- + ctx: mx.ctx + Context to run benchmarks + dtype: str, default 'float32' + Precision to use for benchmarks + profiler: str, default 'native' + Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) + warmup: int, default 25 + Number of times to run for warmup + runs: int, default 100 + Number of runs to capture benchmark results + + Returns + ------- + Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. + + """ + + standard_data_list_conv1d_transpose = [(32, 3, 256), (32, 3, 64)] + int64_tensor_data_list_conv1d_transpose = [(2**30, 1, 4)] + standard_weight_conv1d_transpose = (3, 1, 3) + int64_tensor_weight_conv1d_transpose = (1, 1, 1) + standard_kernel_conv1d_transpose = (3,) + int64_tensor_kernel_conv1d_transpose = (1,) + standard_data_list_conv2d_transpose = [(32, 3, 256, 256), (32, 3, 64, 64)] + int64_tensor_data_list_conv2d_transpose = [(2**28, 1, 4, 4)] + standard_weight_conv2d_transpose = (3, 1, 3, 3) + int64_tensor_weight_conv2d_transpose = (1, 1, 1, 1) + standard_kernel_conv2d_transpose = (3, 3) + int64_tensor_kernel_conv2d_transpose = (1, 1) + + if int64_tensor == 'on': + data_list_conv1d_transpose = int64_tensor_data_list_conv1d_transpose + weight_conv1d_transpose = int64_tensor_weight_conv1d_transpose + kernel_conv1d_transpose = int64_tensor_kernel_conv1d_transpose + data_list_conv2d_transpose = int64_tensor_data_list_conv2d_transpose + weight_conv2d_transpose = int64_tensor_weight_conv2d_transpose + kernel_conv2d_transpose = int64_tensor_kernel_conv2d_transpose + else: + data_list_conv1d_transpose = standard_data_list_conv1d_transpose + weight_conv1d_transpose = standard_weight_conv1d_transpose + kernel_conv1d_transpose = standard_kernel_conv1d_transpose + data_list_conv2d_transpose = standard_data_list_conv2d_transpose + weight_conv2d_transpose = standard_weight_conv2d_transpose + kernel_conv2d_transpose = standard_kernel_conv2d_transpose + # Conv1DTranspose Benchmarks conv1d_transpose_benchmark_res = [] - for conv_data in [(32, 3, 256), (32, 3, 64)]: + for conv_data in data_list_conv1d_transpose: conv1d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": conv_data, - "weight": (3, 64, 3), - "bias": (64,), - "kernel": (3,), - "stride": (1,), - "dilate": (1,), - "pad": (0,), - "adj": (0,), - "num_filter": 64, - "no_bias": False, - "layout": 'NCW'} - ], - warmup=warmup, - runs=runs) + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": conv_data, + "weight": weight_conv1d_transpose, + "bias": (1,), + "kernel": kernel_conv1d_transpose, + "stride": (1,), + "dilate": (1,), + "pad": (0,), + "num_filter": 1, + "no_bias": False, + "layout": 'NCW'}], + warmup=warmup, + runs=runs) # Conv2DTranspose Benchmarks conv2d_transpose_benchmark_res = [] - for conv_data in [(32, 3, 256, 256), (32, 3, 64, 64)]: + for conv_data in data_list_conv2d_transpose: conv2d_transpose_benchmark_res += run_performance_test([getattr(MX_OP_MODULE, "Deconvolution")], - run_backward=True, - dtype=dtype, - ctx=ctx, - profiler=profiler, - inputs=[{"data": conv_data, - "weight": (3, 64, 3, 3), - "bias": (64,), - "kernel": (3, 3), - "stride": (1, 1), - "dilate": (1, 1), - "pad": (0, 0), - "num_filter": 64, - "no_bias": False, - "layout": 'NCHW'} - ], - warmup=warmup, - 
runs=runs) + run_backward=True, + dtype=dtype, + ctx=ctx, + profiler=profiler, + inputs=[{"data": conv_data, + "weight": weight_conv2d_transpose, + "bias": (1,), + "kernel": kernel_conv2d_transpose, + "stride": (1, 1), + "pad": (0, 0), + "num_filter": 1, + "no_bias": False, + "layout": 'NCHW'}], + warmup=warmup, + runs=runs) # Prepare combined results mx_transpose_conv_op_results = merge_map_list(conv1d_transpose_benchmark_res + conv2d_transpose_benchmark_res) return mx_transpose_conv_op_results diff --git a/benchmark/opperf/nd_operations/nn_loss_operators.py b/benchmark/opperf/nd_operations/nn_loss_operators.py index 9d894087343b..dea19f14f1af 100644 --- a/benchmark/opperf/nd_operations/nn_loss_operators.py +++ b/benchmark/opperf/nd_operations/nn_loss_operators.py @@ -28,8 +28,8 @@ """ -def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the +def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and data size (int64_tensor) for all the Neural Network loss operators in MXNet. Parameters @@ -40,6 +40,8 @@ def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -54,5 +56,5 @@ def run_loss_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='nativ mx_loss_ops = get_all_loss_operators() # Run benchmarks - mx_loss_op_results = run_op_benchmarks(mx_loss_ops, dtype, ctx, profiler, warmup, runs) + mx_loss_op_results = run_op_benchmarks(mx_loss_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_loss_op_results diff --git a/benchmark/opperf/nd_operations/nn_optimizer_operators.py b/benchmark/opperf/nd_operations/nn_optimizer_operators.py index ac380655d136..db18b30081d4 100644 --- a/benchmark/opperf/nd_operations/nn_optimizer_operators.py +++ b/benchmark/opperf/nd_operations/nn_optimizer_operators.py @@ -54,8 +54,8 @@ """ -def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype) for all the neural network +def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the neural network optimizer update operators in MXNet. Parameters @@ -66,6 +66,8 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -76,60 +78,68 @@ def run_optimizer_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler=' Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" + standard_shape = (5, 5) + int64_tensor_shape = (2**16, 2**16) + + if int64_tensor == 'on': + arg_shape = int64_tensor_shape + else: + arg_shape = standard_shape + # Run independent tests for ops that need specific input data multi_mp_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), - "args3": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2, - "out": nd.random_normal(shape=(5,5))}],run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), + "args3": nd.random_normal(shape=arg_shape), "lrs": 0.1, "wds": 0.2, + "out": nd.random_normal(shape=arg_shape)}],run_backward=False) multi_sgd_mom_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)), - "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape),"args2": nd.random_normal(shape=arg_shape), + "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=arg_shape)}], run_backward=False) multi_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "lrs": 0.1, "wds": 0.2, - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "lrs": 0.1, "wds": 0.2, + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) multi_mp_sgd_res = run_performance_test([getattr(MX_OP_MODULE, "multi_mp_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)),"args2": nd.random_normal(shape=(5,5)), - "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=(5,5))}], run_backward=False) + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape),"args2": nd.random_normal(shape=arg_shape), + "lrs": 0.1, "wds": 0.2, "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_mp_sgd_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_sgd_mom_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), - "args1": nd.random_normal(shape=(5,5)), "args2": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), + "args1": nd.random_normal(shape=arg_shape), "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=(1)), "args4": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": 
nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_sgd_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_sgd_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), "args1": nd.random_normal(shape=arg_shape), "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) preloaded_multi_mp_sgd_mom_res = run_performance_test( [getattr(MX_OP_MODULE, "preloaded_multi_mp_sgd_mom_update")], - inputs=[{"args0": nd.random_normal(shape=(5,5)), "args1": nd.random_normal(shape=(5,5)), - "args2": nd.random_normal(shape=(5,5)), "args3": nd.random_normal(shape=(5,5)), + inputs=[{"args0": nd.random_normal(shape=arg_shape), "args1": nd.random_normal(shape=arg_shape), + "args2": nd.random_normal(shape=arg_shape), "args3": nd.random_normal(shape=arg_shape), "args4": nd.random_normal(shape=(1)), "args5": nd.random_normal(shape=(1)), - "out": nd.random_normal(shape=(5,5))}], run_backward=False) + "out": nd.random_normal(shape=arg_shape)}], run_backward=False) # Fetch remaining optimizer operators mx_optimizer_ops = get_all_optimizer_operators() # Run benchmarks - mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, warmup, runs) + mx_optimizer_op_results = run_op_benchmarks(mx_optimizer_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(multi_sgd_mom_res + multi_sgd_mom_res + multi_sgd_res + multi_mp_sgd_res + preloaded_multi_mp_sgd_res +\ preloaded_multi_sgd_mom_res + preloaded_multi_mp_sgd_res + preloaded_multi_mp_sgd_mom_res +\ - [mx_optimizer_op_results]) + multi_mp_sgd_mom_res + preloaded_multi_sgd_res + [mx_optimizer_op_results]) diff --git a/benchmark/opperf/nd_operations/random_sampling_operators.py b/benchmark/opperf/nd_operations/random_sampling_operators.py index b6a1f44dba25..777f26af317c 100644 --- a/benchmark/opperf/nd_operations/random_sampling_operators.py +++ b/benchmark/opperf/nd_operations/random_sampling_operators.py @@ -34,8 +34,8 @@ from benchmark.opperf.utils.op_registry_utils import get_all_random_sampling_operators -def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the random sampling +def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the random sampling operators in MXNet. 
Parameters @@ -46,6 +46,8 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -59,5 +61,5 @@ def run_mx_random_sampling_operators_benchmarks(ctx=mx.cpu(), dtype='float32', p # Fetch all Random Sampling Operators mx_random_sample_ops = get_all_random_sampling_operators() # Run benchmarks - mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, warmup, runs) + mx_random_sample_op_results = run_op_benchmarks(mx_random_sample_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_random_sample_op_results diff --git a/benchmark/opperf/nd_operations/reduction_operators.py b/benchmark/opperf/nd_operations/reduction_operators.py index 6cc0d49c899b..d6e4b6dd6c2d 100644 --- a/benchmark/opperf/nd_operations/reduction_operators.py +++ b/benchmark/opperf/nd_operations/reduction_operators.py @@ -31,8 +31,8 @@ from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks -def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the reduction +def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the reduction operators in MXNet. Parameters @@ -43,6 +43,8 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -56,5 +58,5 @@ def run_mx_reduction_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profile # Fetch all Reduction Operators mx_reduction_broadcast_ops = get_all_reduction_operators() # Run benchmarks - mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_reduction_op_results = run_op_benchmarks(mx_reduction_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_reduction_op_results diff --git a/benchmark/opperf/nd_operations/sorting_searching_operators.py b/benchmark/opperf/nd_operations/sorting_searching_operators.py index 2d936cdc48ca..d0d9fc064888 100644 --- a/benchmark/opperf/nd_operations/sorting_searching_operators.py +++ b/benchmark/opperf/nd_operations/sorting_searching_operators.py @@ -29,8 +29,8 @@ """ -def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the sorting and searching +def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the sorting and searching operators in MXNet. 
Parameters @@ -41,6 +41,8 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -54,5 +56,5 @@ def run_sorting_searching_operators_benchmarks(ctx=mx.cpu(), dtype='float32', pr # Fetch all Random Sampling Operators mx_sort_search_ops = get_all_sorting_searching_operators() # Run benchmarks - mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, warmup, runs) + mx_sort_search_op_results = run_op_benchmarks(mx_sort_search_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return mx_sort_search_op_results diff --git a/benchmark/opperf/nd_operations/unary_operators.py b/benchmark/opperf/nd_operations/unary_operators.py index 08075906fae5..53cab57cfe15 100644 --- a/benchmark/opperf/nd_operations/unary_operators.py +++ b/benchmark/opperf/nd_operations/unary_operators.py @@ -38,8 +38,8 @@ from benchmark.opperf.utils.common_utils import merge_map_list from benchmark.opperf.rules.default_params import MX_OP_MODULE -def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): - """Runs benchmarks with the given context and precision (dtype)for all the unary +def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): + """Runs benchmarks with the given context, precision (dtype), and input data size (int64_tensor) for all the unary operators in MXNet. Parameters @@ -50,6 +50,8 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Precision to use for benchmarks profiler: str, default 'native' Type of Profiler to use (native/python) + int64_tensor: str, default 'off' + Input tensor size to use for tests (if on, dimensions >= 2**32) warmup: int, default 25 Number of times to run for warmup runs: int, default 100 @@ -60,16 +62,26 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n Dictionary of results. Key -> Name of the operator, Value -> Benchmark results. 
""" + + standard_inputs = [{"args": [(1024, 1024)], + "num_outputs":1}, + {"args": [(10000, 1)], + "num_outputs":1}] + int64_tensor_inputs = [{"args": [(2**32, 1)], + "num_outputs":1}] + + if int64_tensor == 'on': + inputs = int64_tensor_inputs + else: + inputs = standard_inputs + # Run amp_multicast as it needs data as positional argument amp_multicast_benchmark = run_performance_test([getattr(MX_OP_MODULE, "amp_multicast")], run_backward=True, dtype=dtype, ctx=ctx, profiler=profiler, - inputs=[{"args": [(1024, 1024)], - "num_outputs":1}, - {"args": [(10000, 1)], - "num_outputs":1}], + inputs=inputs, warmup=warmup, runs=runs) @@ -77,5 +89,5 @@ def run_mx_unary_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n mx_unary_broadcast_ops = get_all_unary_operators() # Run benchmarks - mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, warmup, runs) + mx_unary_op_results = run_op_benchmarks(mx_unary_broadcast_ops, dtype, ctx, profiler, int64_tensor, warmup, runs) return merge_map_list(amp_multicast_benchmark + [mx_unary_op_results]) diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py index 5b8c43f417da..c0ac7b7dcd98 100755 --- a/benchmark/opperf/opperf.py +++ b/benchmark/opperf/opperf.py @@ -51,7 +51,7 @@ get_current_runtime_features -def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100): +def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', int64_tensor='off', warmup=25, runs=100): """Run all the MXNet operators (NDArray) benchmarks. Returns @@ -63,64 +63,66 @@ def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n # *************************MXNET TENSOR OPERATOR BENCHMARKS***************************** # Run all Unary operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_unary_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Binary Broadcast, element_wise, and miscellaneous operations benchmarks with default input values mxnet_operator_benchmark_results.append(run_mx_binary_broadcast_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) mxnet_operator_benchmark_results.append(run_mx_binary_element_wise_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) mxnet_operator_benchmark_results.append(run_mx_binary_misc_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all GEMM operations benchmarks with default input values mxnet_operator_benchmark_results.append(run_gemm_operators_benchmarks(ctx=ctx, - dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Random sampling operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + 
mxnet_operator_benchmark_results.append(run_mx_random_sampling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Reduction operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_reduction_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Sorting and Searching operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_sorting_searching_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Array Rearrange operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_rearrange_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Indexing routines benchmarks with default input values - mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_indexing_routines_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # ************************ MXNET NN OPERATOR BENCHMARKS **************************** # Run all basic NN operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_nn_basic_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Activation operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_activation_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Pooling operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_pooling_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Convolution operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Optimizer operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, 
profiler=profiler, warmup=warmup, runs=runs)) - + mxnet_operator_benchmark_results.append(run_optimizer_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) + # Run all Transpose Convolution operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_transpose_convolution_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all NN loss operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # Run all Miscellaneous operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) - # Run all Linear Algebra operations benchmarks with default input values - mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs)) + # Linear Algebra operators do not work with int64 tensor data. Issue tracked here: https://github.com/apache/incubator-mxnet/issues/17716 + if int64_tensor == 'off': + # Run all Linear Algebra operations benchmarks with default input values + mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs)) # ****************************** PREPARE FINAL RESULTS ******************************** final_benchmark_result_map = merge_map_list(mxnet_operator_benchmark_results) @@ -162,6 +164,11 @@ def main(): help='Use built-in CPP profiler (native) or Python' 'time module.' 'Valid Inputs - native, python') + + parser.add_argument('--int64-tensor', type=str, default='off', + help='Run performance tests with large tensor input' + 'data (dimension >= 2**32) or standard input data.' + 'Valid Inputs - on, off') parser.add_argument('-w', '--warmup', type=int, default=25, help='Number of times to run for warmup.' @@ -169,7 +176,7 @@ def main(): parser.add_argument('-r', '--runs', type=int, default=100, help='Number of runs to capture benchmark results.' 
- 'Valid Inputs - positive integers') + 'Valid Inputs - positive integers') args = parser.parse_args() logging.info("Running MXNet operator benchmarks with the following options: {args}".format(args=args)) @@ -180,9 +187,10 @@ def main(): ctx = _parse_mxnet_context(args.ctx) dtype = args.dtype profiler = args.profiler + int64_tensor = args.int64_tensor warmup = args.warmup runs = args.runs - benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs) + benchmark_results = run_all_mxnet_operator_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, int64_tensor=int64_tensor, warmup=warmup, runs=runs) # Sort benchmark results alphabetically by op name final_benchmark_results = dict() diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py index 15bcd72b0553..2bf80d58e264 100644 --- a/benchmark/opperf/rules/default_params.py +++ b/benchmark/opperf/rules/default_params.py @@ -35,15 +35,22 @@ DEFAULT_DTYPE_INT = ['int32', 'int64', 'int32'] # randint works for int* types only DEFAULT_DTYPE_FLOAT = ['float16', 'float32', 'float64'] # random_exp works for float* types only +DEFAULT_DATA_LARGE_TENSOR = [(2**16, 2**16)] + # For Binary miscellaneous operators like choose_element0_index # argument data must be indexed via an NDArray. # NOTE: Data used is DEFAULT_DATA DEFAULT_INDEX = [(1, 1024), (1, 1), (1, 100)] +DEFAULT_INDEX_LARGE_TENSOR = [(1, 2**16)] + # For Binary broadcast operators like - broadcast_add/sub/mod/logical_and etc.. DEFAULT_LHS = [(1024, 1024), (10000, 10), (10000, 1)] DEFAULT_RHS = [(1024, 1024), (10000, 10), (10000, 1)] +DEFAULT_LHS_LARGE_TENSOR = [(2**16, 2**16), (2**28, 2**4), (2**32, 1)] +DEFAULT_RHS_LARGE_TENSOR = [(2**16, 2**16), (2**28, 2**4), (2**32, 1)] + # For operators like - random_uniform, random_normal etc.. DEFAULT_SHAPE = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_SAMPLE = [(2,)] @@ -52,6 +59,15 @@ DEFAULT_K = [1] DEFAULT_P = [1] +DEFAULT_SHAPE_LARGE_TENSOR = [(2**16, 2**16)]#, (2**32, 1), (2**25, 2**7)] +DEFAULT_SAMPLE_LARGE_TENSOR = [(2**32,)] +DEFAULT_DATA_RPD_LARGE_TENSOR = [(2**32 + 1, 5)] +DEFAULT_ALPHA_RPD_LARGE_TENSOR = [(2**32,)] +DEFAULT_SAMPLE_RPE_LARGE_TENSOR = [(1, 2**32)] +DEFAULT_LAM_RPE_LARGE_TENSOR = [(1,)] +DEFAULT_SAMPLE_RPG_LARGE_TENSOR = [(1, 2**32 + 1)] +DEFAULT_ALPHA_RPG_LARGE_TENSOR = [(1,)] + # For operators like - sample_uniform, sample_normal etc.. # NOTE: There are many overlapping operators in random_* and sample_*, # Ex: random_uniform, sample_uniform. 
Parameter names are same, but, for @@ -73,6 +89,24 @@ DEFAULT_TARGET_SHAPE = [(256, 6)] DEFAULT_DATA_SM = [(32, 32), (64, 64)] +DEFAULT_LOW_ND_LARGE_TENSOR = [[0.0] * 2**16 + [2.5] * 2**16] +DEFAULT_HIGH_ND_LARGE_TENSOR = [[1.0] * 2**16 + [3.7] * 2**16] +DEFAULT_MU_ND_LARGE_TENSOR = [[2.0] * 2**16 + [2.5] * 2**16] +DEFAULT_SIGMA_LARGE_TENSOR = [[1.0] * 2**16 + [3.7] * 2**16] +DEFAULT_ALPHA_ND_LARGE_TENSOR = [[0.0] * 2**16 + [2.5] * 2**16] +DEFAULT_BETA_ND_LARGE_TENSOR = [[1.0] * 2**16 + [0.7] * 2**16] +DEFAULT_LAM_ND_LARGE_TENSOR = [[1.0] * 2**16 + [8.5] * 2**16] +DEFAULT_K_ND_LARGE_TENSOR = [[20] * 2**16 + [49] * 2**16] +DEFAULT_P_ND_LARGE_TENSOR = [[0.4] * 2**16 + [0.77] * 2**16] +DEFAULT_DATA_BILINEAR_LARGE_TENSOR = [(2**32, 1, 1, 1)] +DEFAULT_GRID_LARGE_TENSOR = [(2**32, 2, 1, 1)] +DEFAULT_DATA_GRIDGEN_LARGE_TENSOR = [(2**31, 2, 1, 1), (1, 6)] +DEFAULT_TARGET_SHAPE_LARGE_TENSOR = [(1, 6)] +DEFAULT_DATA_SM_LARGE_TENSOR = [(2**32,)] +DEFAULT_SHAPE_SE_LARGE_TENSOR = [(1,)] +DEFAULT_LAM_SE_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_SHAPE_SU_LARGE_TENSOR = [(2**32,)] + # For reduction operators # NOTE: Data used is DEFAULT_DATA DEFAULT_AXIS_SHAPE = [(), 0, (0, 1)] @@ -107,7 +141,6 @@ DEFAULT_NSIZE = [3] DEFAULT_PARAMETERS = [(7,), (104,)] DEFAULT_STATE = [(1, 4, 1), (2, 10000, 4)] -DEFAULT_MODE = ["rnn_relu", "rnn_tanh"] DEFAULT_STATE_SIZE = [1, 4] DEFAULT_NUM_LAYERS = [1, 2] DEFAULT_NUM_GROUPS = [1, 10] @@ -119,6 +152,30 @@ DEFAULT_KERNEL = [(1, 1, 1), (1, 1, 1)] DEFAULT_STRIDE = [(2, 2, 2), (1, 1, 1)] +DEFAULT_DATA_NN_BASIC_LARGE_TENSOR = [(2**32 + 1, 1)] +DEFAULT_NUM_HIDDEN_LARGE_TENSOR = [(1,)] +DEFAULT_BIAS_LARGE_TENSOR = [(1,)] +DEFAULT_FLATTEN_LARGE_TENSOR = [False] +DEFAULT_GAMMA_LARGE_TENSOR = [(1,)] +DEFAULT_BETA_LARGE_TENSOR = [(1,)] +DEFAULT_MOVING_MEAN_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_MOVING_VAR_LARGE_TENSOR = [(2**32 + 1,)] +DEFAULT_INPUT_DIM_LARGE_TENSOR = [2**32] +DEFAULT_OUTPUT_DIM_LARGE_TENSOR = [1] +DEFAULT_KERNEL_SIZE_LARGE_TENSOR = [1] +DEFAULT_MAX_DISPLACEMENT_LARGE_TENSOR = [1] +DEFAULT_STRIDE_1_LARGE_TENSOR = [1] +DEFAULT_STRIDE_2_LARGE_TENSOR = [1] +DEFAULT_DILATE_LARGE_TENSOR = [[]] +DEFAULT_PAD_LARGE_TENSOR = [[]] +DEFAULT_OUTPUT_SIZE_LARGE_TENSOR = [(2, 2, 1)] +DEFAULT_KERNEL_LARGE_TENSOR = [(1, 1, 1)] +DEFAULT_STRIDE_LARGE_TENSOR = [[]] +DEFAULT_PARAMETERS_LARGE_TENSOR = [(7,)] +DEFAULT_STATE_LARGE_TENSOR = [(1, 4, 1)] +DEFAULT_STATE_SIZE_LARGE_TENSOR = [1] +DEFAULT_NUM_LAYERS_LARGE_TENSOR = [1] + # BatchNorm DEFAULT_AXIS_BN = [1] @@ -132,41 +189,81 @@ # SVMOutput DEFAULT_LABEL_SVM = [(32, 3, 256), (32, 3, 10000)] +DEFAULT_DATA_SVM_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_SVM_LARGE_TENSOR = [(2**29, 2, 2)] + # SoftmaxOutput DEFAULT_LABEL_SM = [(32, 3, 256), (32, 3, 10000)] +DEFAULT_DATA_SO_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_SO_LARGE_TENSOR = [(2**29, 2, 2)] + # FullyConnected DEFAULT_WEIGHT_FC = [(64, 3 * 256 * 256), (64, 10)] +DEFAULT_DATA_FC_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_WEIGHT_FC_LARGE_TENSOR = [(1, 1)] +DEFAULT_NUM_HIDDEN_FC_LARGE_TENSOR = [1] + # Embedding DEFAULT_WEIGHT_EMBEDDING = [(3, 4), (16, 9)] +DEFAULT_WEIGHT_EMBEDDING_LARGE_TENSOR = [(2**32, 1)] + # GroupNorm DEFAULT_DATA_GN = [(32, 3, 256, 256), (32, 10, 10000, 10)] DEFAULT_BETA_GAMMA_GN = [(1,), (10,)] +DEFAULT_DATA_GN_LARGE_TENSOR = [(2**27, 4, 4, 2)] +DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR = [(1,)] + # Dropout DEFAULT_DATA_DROPOUT = [(32, 3, 256, 256), (10000, 10)] DEFAULT_MODE_DROPOUT = ["always"] +DEFAULT_DATA_DROPOUT_LARGE_TENSOR = [(2**32 + 1,)] 
+DEFAULT_P_DROPOUT_LARGE_TENSOR = [.5] +DEFAULT_AXES_DROPOUT_LARGE_TENSOR = [[]] + # SpatialTransformer DEFAULT_DATA_ST = [(32, 3, 256, 6), (256, 3, 10000, 6)] DEFAULT_LOC_TAR_ST = [(32, 6), (256, 6)] +DEFAULT_DATA_ST_LARGE_TENSOR = [(2, 2**29, 1, 6)] +DEFAULT_LOC_TAR_ST_LARGE_TENSOR = [(2, 6)] + # im2col DEFAULT_KERNEL_I2C = [(3,), (3, 3)] DEFAULT_STRIDE_I2C = [(1,), (1, 1)] +DEFAULT_DATA_I2C_LARGE_TENSOR = [(2**29, 2, 2, 6)] +DEFAULT_KERNEL_I2C_LARGE_TENSOR = [(1,)] +DEFAULT_STRIDE_I2C_LARGE_TENSOR = [[]] + # col2im DEFAULT_DATA_C2I = [(32, 64, 256), (32, 64, 256)] -# RNN -DEFAULT_DATA_RNN = [(32, 4, 4), (512, 10000, 10)] -DEFAULT_P_RNN = [.5] +DEFAULT_DATA_C2I_LARGE_TENSOR = [(1, 2**30, 4)] # LRN DEFAULT_BETA_LRN = [.2] +DEFAULT_DATA_LRN_LARGE_TENSOR = [(2**27, 4, 4, 2)] + +# Correlation +DEFAULT_DATA1_LARGE_TENSOR = [(2**23, 8, 8, 8)] +DEFAULT_DATA2_LARGE_TENSOR = [(2**23, 8, 8, 8)] + +# For regression operators +DEFAULT_DATA_REG_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_LABEL_REG_LARGE_TENSOR = [(2**29, 2, 2, 2)] + +# For normalization operators +DEFAULT_DATA_NORM_LARGE_TENSOR = [(2**29, 2, 2, 2)] +DEFAULT_GAMMA_NORM_LARGE_TENSOR = [(2,)] +DEFAULT_BETA_NORM_LARGE_TENSOR = [(2,)] +DEFAULT_AXIS_LARGE_TENSOR = [-1] + # For optimizer operators DEFAULT_WEIGHT = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_GRAD = [(1024, 1024), (10000, 1), (10000, 100)] @@ -181,10 +278,10 @@ DEFAULT_R1 = [(1, 1024), (1, 1), (1, 100)] DEFAULT_R2 = [(1, 1024), (1, 1), (1, 100)] DEFAULT_DELTA = [(1024, 1024), (10000, 1), (10000, 100)] -DEFAULT_LRS = [(0.1, 0.1)] -DEFAULT_LR = [0.1, 0.5, 0.9] -DEFAULT_GAMMA_1 = [0.1, 0.5, 0.9] -DEFAULT_GAMMA_2 = [0.1, 0.5, 0.9] +DEFAULT_LRS = [(0.1,0.1)] +DEFAULT_LR = [0.1,0.5,0.9] +DEFAULT_RHO = [0.1,0.5,0.9] +DEFAULT_MOMENTUM = [0.1,0.5,0.9] DEFAULT_EPSILON = [1e-08] DEFAULT_BETA_1 = [0.1, 0.5, 0.9] DEFAULT_BETA_2 = [0.1, 0.5, 0.9] @@ -194,6 +291,20 @@ DEFAULT_CLIP_WEIGHTS = [-1.0, 0.8] DEFAULT_LAZY_UPDATE = [0, 1] +DEFAULT_WEIGHT_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_GRAD_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_MOM_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_MEAN_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_VAR_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_N_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_D_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_V_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_Z_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_G_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] +DEFAULT_R1_LARGE_TENSOR = [(1,)] +DEFAULT_R2_LARGE_TENSOR = [(1,)] +DEFAULT_DELTA_LARGE_TENSOR = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)] + # For rearrange operators # NOTE: Data needs to be a 4D tensor for operators like space_to_depth and depth_to_space # Hence below we append 4d to mark the difference. 
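The optimizer-related *_LARGE_TENSOR defaults above (weight, grad, mom, mean, var, n, d, v, z, g, delta) all reuse the shapes (2**16, 2**16), (2**32, 1) and (2**25, 2**7). Each of these holds at least 2**32 elements; the (2**16, 2**16) case reaches that total without any single dimension crossing 2**32, which suggests the element count, not the individual dimension, is the size regime the new int64_tensor mode is meant to exercise. A minimal, self-contained sketch of that size check (plain Python, no MXNet required; the shape list simply mirrors DEFAULT_WEIGHT_LARGE_TENSOR as added above):

from functools import reduce
from operator import mul

# Mirrors DEFAULT_WEIGHT_LARGE_TENSOR / DEFAULT_GRAD_LARGE_TENSOR defined above.
large_tensor_shapes = [(2**16, 2**16), (2**32, 1), (2**25, 2**7)]

for shape in large_tensor_shapes:
    n_elements = reduce(mul, shape, 1)
    # Every shape reaches the 2**32-element mark, so the large-tensor code path is hit.
    assert n_elements >= 2**32, (shape, n_elements)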
@@ -201,6 +312,9 @@ DEFAULT_DATA_4d = [(1, 4, 2, 4), (10, 25, 10, 100)] DEFAULT_BLOCK_SIZE = [2, 5] +DEFAULT_DATA_4d_LARGE_TENSOR = [(1, 4, 2, 2**29), (1,2**4,2**4,2**24)] +DEFAULT_BLOCK_SIZE_LARGE_TENSOR = [2, 4] + # For miscellaneous operators DEFAULT_DATA_SQUEEZE = [(1, 1024, 1024), (32, 1, 256, 256)] DEFAULT_AXIS_SQUEEZE = [0, 1] @@ -217,6 +331,15 @@ DEFAULT_MHS = [(1024,), (10000,), (10000,)] DEFAULT_RHS_FEI = [(1024,), (10000,), (10000,)] +DEFAULT_DATA_SQUEEZE_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_AXIS_SQUEEZE_LARGE_TENSOR = [1] +DEFAULT_WSS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_GSS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_WDS_LARGE_TENSOR = [(2**32, 1)] +DEFAULT_LHS_FEI_LARGE_TENSOR = [(2, 2**32 + 1)] +DEFAULT_RHS_FEI_LARGE_TENSOR = [(2,)] +DEFAULT_MHS_LARGE_TENSOR = [(2,)] + # For swapaxis operator DEFAULT_DIM_1 = [0] DEFAULT_DIM_2 = [1] @@ -231,21 +354,33 @@ DEFAULT_Y = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_COND = [(1024,), (10000,), (10000,)] DEFAULT_DEPTH = [0] + # For ravel_multi_index op, ndim(shape) = 2; hence data NDArray's first dim = 2 # First dimension of input of ravel operator should match shape parameter dimension # DEFAULT_SHAPE is reused for ravel_multi_index op RAVEL_DATA = [(2, 1024)] +RAVEL_DATA_LARGE_TENSOR = [(2, 2**32)] +DEFAULT_X_LARGE_TENSOR = [(2**32, 1)] + # For loss operators DEFAULT_DATA_3d = [(1024, 100, 100)] DEFAULT_LABEL = [(100,100)] DEFAULT_DATA_SMCE = [(1024, 1024)] DEFAULT_LABEL_SMCE = [(1024,)] + +DEFAULT_LABEL_LARGE_TENSOR = [(1, 1)] +DEFAULT_DATA_CTCLOSS = [(2**32, 1, 1)] +DEFAULT_DATA_SMCE_LARGE_TENSOR = [(2**32 + 1, 1)] +DEFAULT_LABEL_SMCE_LARGE_TENSOR = [(2**32 + 1,)] + # For NN operators DEFAULT_ACT_TYPE_LR = ['leaky', 'elu', 'selu', 'gelu'] DEFAULT_ACT_TYPE_ACTIVATION = ['relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'] DEFAULT_LABEL_SOFTMAX = [(1024, 1024), (10000, 1), (10000, 100)] +DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR = [(2**32, 1)] + # For linalg operators DEFAULT_A = [(1024, 1024)] DEFAULT_B = [(1024, 1024)] @@ -253,6 +388,11 @@ DEFAULT_A_MT = [(1024, 1035)] DEFAULT_AXES = [[0, 1]] +DEFAULT_A_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_B_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_C_LARGE_TENSOR = [(2**16, 2**16)] +DEFAULT_A_MT_LARGE_TENSOR = [(2**32 + 1, 1)] + # Default Inputs. 
MXNet Op Param Name to Default Input mapping DEFAULTS_INPUTS = {"data": DEFAULT_DATA, "dtype": DEFAULT_DTYPE, @@ -277,33 +417,30 @@ "p_nd": DEFAULT_P_ND, "axis_shape": DEFAULT_AXIS_SHAPE, "axis": DEFAULT_AXIS, - "weight": DEFAULT_WEIGHT, - "weight32": DEFAULT_WEIGHT, - "grad": DEFAULT_GRAD, - "mean": DEFAULT_MEAN, - "var": DEFAULT_VAR, - "mom": DEFAULT_MOM, - "r1": DEFAULT_R1, - "r2": DEFAULT_R2, - "n": DEFAULT_N, - "d": DEFAULT_D, - "v": DEFAULT_V, - "z": DEFAULT_Z, - "g": DEFAULT_G, - "delta": DEFAULT_DELTA, - "lr": DEFAULT_LR, - "lrs": DEFAULT_LRS, - "wds": DEFAULT_LRS, - "wd": DEFAULT_LR, - "gamma1": DEFAULT_GAMMA_1, - "gamma2": DEFAULT_GAMMA_2, - "epsilon": DEFAULT_EPSILON, - "beta1": DEFAULT_BETA_1, - "beta2": DEFAULT_BETA_2, - "t": DEFAULT_T, - "rescale_grad": DEFAULT_RESCALE_GRAD, - "clip_grad": DEFAULT_CLIP_GRADIENT, - "lazy_update": DEFAULT_LAZY_UPDATE, + "weight" : DEFAULT_WEIGHT, + "weight32" : DEFAULT_WEIGHT, + "grad" : DEFAULT_GRAD, + "mean" : DEFAULT_MEAN, + "var" : DEFAULT_VAR, + "mom" : DEFAULT_MOM, + "n" : DEFAULT_N, + "d" : DEFAULT_D, + "v" : DEFAULT_V, + "z" : DEFAULT_Z, + "g" : DEFAULT_G, + "delta" : DEFAULT_DELTA, + "lr" : DEFAULT_LR, + "lrs" : DEFAULT_LRS, + "wds" : DEFAULT_LRS, + "rho" : DEFAULT_RHO, + "momentum" : DEFAULT_MOMENTUM, + "epsilon" : DEFAULT_EPSILON, + "beta1" : DEFAULT_BETA_1, + "beta2" : DEFAULT_BETA_2, + "t" : DEFAULT_T, + "rescale_grad" : DEFAULT_RESCALE_GRAD, + "clip_grad" : DEFAULT_CLIP_GRADIENT, + "lazy_update" : DEFAULT_LAZY_UPDATE, "data_4d": DEFAULT_DATA_4d, "dim1": DEFAULT_DIM_1, "dim2": DEFAULT_DIM_2, @@ -363,13 +500,10 @@ "output_size": DEFAULT_OUTPUT_SIZE, "kernel_col2im": DEFAULT_KERNEL, "stride_col2im": DEFAULT_STRIDE, - "data_rnn": DEFAULT_DATA_RNN, - "p_rnn": DEFAULT_P_RNN, "parameters": DEFAULT_PARAMETERS, "state": DEFAULT_STATE, "state_size": DEFAULT_STATE_SIZE, "num_layers": DEFAULT_NUM_LAYERS, - "mode_rnn": DEFAULT_MODE, "data_groupnorm": DEFAULT_DATA_GN, "gamma_groupnorm": DEFAULT_BETA_GAMMA_GN, "beta_groupnorm": DEFAULT_BETA_GAMMA_GN, @@ -433,6 +567,222 @@ "data_layernorm": DEFAULT_DATA_NN_BASIC, "axis_layernorm": DEFAULT_AXIS} +# Default Inputs for Large Tensor. 
MXNet Op Param Name to Default Input mapping +DEFAULTS_INPUTS_LARGE_TENSOR = {"data": DEFAULT_DATA_LARGE_TENSOR, + "dtype": DEFAULT_DTYPE, + "dtype_int": DEFAULT_DTYPE_INT, + "dtype_float": DEFAULT_DTYPE_FLOAT, + "sample": DEFAULT_SAMPLE_LARGE_TENSOR, + "lhs": DEFAULT_LHS_LARGE_TENSOR, + "rhs": DEFAULT_RHS_LARGE_TENSOR, + "shape": DEFAULT_SHAPE_LARGE_TENSOR, + "low": DEFAULT_LOW, + "high": DEFAULT_HIGH, + "low_nd": DEFAULT_LOW_ND_LARGE_TENSOR, + "high_nd": DEFAULT_HIGH_ND_LARGE_TENSOR, + "mu_nd": DEFAULT_MU_ND_LARGE_TENSOR, + "sigma": DEFAULT_SIGMA_LARGE_TENSOR, + "alpha_nd": DEFAULT_ALPHA_ND_LARGE_TENSOR, + "beta_nd": DEFAULT_BETA_ND_LARGE_TENSOR, + "lam_nd": DEFAULT_LAM_ND_LARGE_TENSOR, + "lam_random_pdf_exponential": DEFAULT_LAM_RPE_LARGE_TENSOR, + "sample_random_pdf_exponential": DEFAULT_SAMPLE_RPE_LARGE_TENSOR, + "k": DEFAULT_K, + "p": DEFAULT_P, + "k_nd": DEFAULT_K_ND_LARGE_TENSOR, + "p_nd": DEFAULT_P_ND_LARGE_TENSOR, + "axis_shape": DEFAULT_AXIS_SHAPE, + "axis": DEFAULT_AXIS, + "weight" : DEFAULT_WEIGHT_LARGE_TENSOR, + "weight32" : DEFAULT_WEIGHT_LARGE_TENSOR, + "grad" : DEFAULT_GRAD_LARGE_TENSOR, + "mean" : DEFAULT_MEAN_LARGE_TENSOR, + "var" : DEFAULT_VAR_LARGE_TENSOR, + "mom" : DEFAULT_MOM_LARGE_TENSOR, + "r1": DEFAULT_R1_LARGE_TENSOR, + "r2": DEFAULT_R2_LARGE_TENSOR, + "n" : DEFAULT_N_LARGE_TENSOR, + "d" : DEFAULT_D_LARGE_TENSOR, + "v" : DEFAULT_V_LARGE_TENSOR, + "z" : DEFAULT_Z_LARGE_TENSOR, + "g" : DEFAULT_G_LARGE_TENSOR, + "delta" : DEFAULT_DELTA_LARGE_TENSOR, + "lr" : DEFAULT_LR, + "lrs" : DEFAULT_LRS, + "wds" : DEFAULT_LRS, + "wd": DEFAULT_LR, + "gamma1" : DEFAULT_GAMMA_1, + "gamma2" : DEFAULT_GAMMA_2, + "epsilon" : DEFAULT_EPSILON, + "beta1" : DEFAULT_BETA_1, + "beta2" : DEFAULT_BETA_2, + "t" : DEFAULT_T, + "rescale_grad" : DEFAULT_RESCALE_GRAD, + "clip_grad" : DEFAULT_CLIP_GRADIENT, + "lazy_update" : DEFAULT_LAZY_UPDATE, + "data_4d": DEFAULT_DATA_4d_LARGE_TENSOR, + "dim1": DEFAULT_DIM_1, + "dim2": DEFAULT_DIM_2, + "block_size": DEFAULT_BLOCK_SIZE_LARGE_TENSOR, + "args": DEFAULT_ARGS, + "index": DEFAULT_INDEX_LARGE_TENSOR, + "data_smce": DEFAULT_DATA_SMCE_LARGE_TENSOR, + "label_smce": DEFAULT_LABEL_SMCE_LARGE_TENSOR, + "grid": DEFAULT_GRID_LARGE_TENSOR, + "data_bilinearsampler": DEFAULT_DATA_BILINEAR_LARGE_TENSOR, + "transform_type": DEFAULT_TRANSFORM_TYPE, + "data_gridgenerator": DEFAULT_DATA_GRIDGEN_LARGE_TENSOR, + "target_shape_gridgenerator": DEFAULT_TARGET_SHAPE_LARGE_TENSOR, + "data_sample_multinomial": DEFAULT_DATA_SM_LARGE_TENSOR, + "data_random_pdf_dirichlet": DEFAULT_DATA_RPD_LARGE_TENSOR, + "alpha_random_pdf_dirichlet": DEFAULT_ALPHA_RPD_LARGE_TENSOR, + "sample_random_pdf_gamma": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "alpha_random_pdf_gamma": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "beta_random_pdf_gamma": DEFAULT_BETA_LARGE_TENSOR, + "sample_random_pdf_generalized_negative_binomial": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "mu_random_pdf_generalized_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "alpha_random_pdf_generalized_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_negative_binomial": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "k_random_pdf_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "p_random_pdf_negative_binomial": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_normal": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "mu_random_pdf_normal": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sigma_random_pdf_normal": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_poisson": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "lam_random_pdf_poisson": 
DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "sample_random_pdf_uniform": DEFAULT_SAMPLE_RPG_LARGE_TENSOR, + "low_random_pdf_uniform": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "high_random_pdf_uniform": DEFAULT_ALPHA_RPG_LARGE_TENSOR, + "shape_sample_exponential": DEFAULT_SHAPE_SE_LARGE_TENSOR, + "lam_sample_exponential": DEFAULT_LAM_SE_LARGE_TENSOR, + "mu_sample_normal": DEFAULT_LAM_SE_LARGE_TENSOR, + "sigma_sample_normal": DEFAULT_LAM_SE_LARGE_TENSOR, + "shape_sample_poisson": DEFAULT_LAM_SE_LARGE_TENSOR, + "lam_sample_poisson": DEFAULT_SHAPE_SE_LARGE_TENSOR, + "shape_sample_uniform": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "low_sample_uniform": DEFAULT_LAM_SE_LARGE_TENSOR, + "high_sample_uniform": DEFAULT_LAM_SE_LARGE_TENSOR, + "alpha_sample_gamma": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "beta_sample_gamma": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "mu_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "shape_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "alpha_sample_generalized_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "shape_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "k_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "p_sample_negative_binomial": DEFAULT_SHAPE_SU_LARGE_TENSOR, + "A": DEFAULT_A_LARGE_TENSOR, + "B": DEFAULT_B_LARGE_TENSOR, + "C": DEFAULT_C_LARGE_TENSOR, + "A_linalg_maketrian": DEFAULT_A_MT_LARGE_TENSOR, + "axes": DEFAULT_AXES, + "act_type_leakyrelu": DEFAULT_ACT_TYPE_LR, + "label_softmax": DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR, + "act_type_activation": DEFAULT_ACT_TYPE_ACTIVATION, + "data_squeeze": DEFAULT_DATA_SQUEEZE_LARGE_TENSOR, + "axis_squeeze": DEFAULT_AXIS_SQUEEZE_LARGE_TENSOR, + "a_min": DEFAULT_A_MIN, + "a_max": DEFAULT_A_MAX, + "weights_sum_sq": DEFAULT_WSS_LARGE_TENSOR, + "grads_sum_sq": DEFAULT_GSS_LARGE_TENSOR, + "wds": DEFAULT_WDS_LARGE_TENSOR, + "eta": DEFAULT_ETA, + "eps": DEFAULT_EPSILON, + "stype": DEFAULT_STYPE, + "indices": DEFAULT_INDICES, + "begin": DEFAULT_BEGIN, + "end": DEFAULT_END, + "shape_like": DEFAULT_DATA_LARGE_TENSOR, + "depth": DEFAULT_DEPTH, + "condition": DEFAULT_X_LARGE_TENSOR, + "x": DEFAULT_X_LARGE_TENSOR, + "y": DEFAULT_X_LARGE_TENSOR, + "ravel_data": RAVEL_DATA_LARGE_TENSOR, + "a": DEFAULT_A_LARGE_TENSOR, + "lhs_fill_element_0index": DEFAULT_LHS_FEI_LARGE_TENSOR, + "rhs_fill_element_0index": DEFAULT_RHS_FEI_LARGE_TENSOR, + "mhs": DEFAULT_MHS_LARGE_TENSOR, + "lrs_multi_lars": DEFAULT_WSS_LARGE_TENSOR, + "data_softmax": DEFAULT_LABEL_SOFTMAX_LARGE_TENSOR, + "data_spatialtransformer": DEFAULT_DATA_ST_LARGE_TENSOR, + "loc_spatialtransformer": DEFAULT_LOC_TAR_ST_LARGE_TENSOR, + "target_shape": DEFAULT_LOC_TAR_ST_LARGE_TENSOR, + "transform_type_spatialtransformer": DEFAULT_TRANSFORM, + "sampler_type": DEFAULT_SAMPLER, + "data_col2im": DEFAULT_DATA_C2I_LARGE_TENSOR, + "output_size": DEFAULT_OUTPUT_SIZE_LARGE_TENSOR, + "kernel_col2im": DEFAULT_KERNEL_LARGE_TENSOR, + "stride_col2im": DEFAULT_STRIDE_LARGE_TENSOR, + "data_ctcloss": DEFAULT_DATA_CTCLOSS, + "label_ctcloss": DEFAULT_LABEL_LARGE_TENSOR, + "data_ctc_loss": DEFAULT_DATA_CTCLOSS, + "label_ctc_loss": DEFAULT_LABEL_LARGE_TENSOR, + "parameters": DEFAULT_PARAMETERS_LARGE_TENSOR, + "state": DEFAULT_STATE_LARGE_TENSOR, + "state_size": DEFAULT_STATE_SIZE_LARGE_TENSOR, + "num_layers": DEFAULT_NUM_LAYERS_LARGE_TENSOR, + "data_groupnorm": DEFAULT_DATA_GN_LARGE_TENSOR, + "gamma_groupnorm": DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR, + "beta_groupnorm": DEFAULT_BETA_GAMMA_GN_LARGE_TENSOR, + "eps": DEFAULT_EPSILON, + "data_dropout": 
DEFAULT_DATA_DROPOUT_LARGE_TENSOR, + "mode_dropout": DEFAULT_MODE_DROPOUT, + "p_dropout": DEFAULT_P_DROPOUT_LARGE_TENSOR, + "axes_dropout": DEFAULT_AXES_DROPOUT_LARGE_TENSOR, + "data_nn_basic": DEFAULT_DATA_NN_BASIC_LARGE_TENSOR, + "num_hidden": DEFAULT_NUM_HIDDEN_LARGE_TENSOR, + "data_fullyconnected": DEFAULT_DATA_FC_LARGE_TENSOR, + "weight_fullyconnected": DEFAULT_WEIGHT_FC_LARGE_TENSOR, + "num_hidden_fullyconnected": DEFAULT_NUM_HIDDEN_FC_LARGE_TENSOR, + "weight_embedding": DEFAULT_WEIGHT_EMBEDDING_LARGE_TENSOR, + "bias": DEFAULT_BIAS_LARGE_TENSOR, + "flatten": DEFAULT_FLATTEN_LARGE_TENSOR, + "data_batchnorm": DEFAULT_DATA_NN_BASIC_LARGE_TENSOR, + "gamma_batchnorm": DEFAULT_GAMMA_LARGE_TENSOR, + "beta_batchnorm": DEFAULT_BETA_LARGE_TENSOR, + "moving_mean_batchnorm": DEFAULT_MOVING_MEAN_LARGE_TENSOR, + "moving_var_batchnorm": DEFAULT_MOVING_VAR_LARGE_TENSOR, + "axis_batchnorm": DEFAULT_AXIS_BN, + "data_softmaxoutput": DEFAULT_DATA_SO_LARGE_TENSOR, + "label_softmaxoutput": DEFAULT_LABEL_SO_LARGE_TENSOR, + "data_maeregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_maeregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_logisticregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_logisticregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_linearregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR, + "label_linearregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR, + "data_svmoutput": DEFAULT_DATA_SVM_LARGE_TENSOR, + "label_svmoutput": DEFAULT_LABEL_SVM_LARGE_TENSOR, + "grad_scale": DEFAULT_GRAD_SCALE, + "normalization": DEFAULT_NORMALIZATION, + "margin": DEFAULT_MARGIN, + "regularization_coefficient": DEFAULT_REG_COEFF, + "data_l2normalization": DEFAULT_DATA_NORM_LARGE_TENSOR, + "mode_l2normalization": DEFAULT_MODE_L2, + "gamma_layernorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "beta_layernorm": DEFAULT_BETA_NORM_LARGE_TENSOR, + "data_instancenorm": DEFAULT_DATA_NORM_LARGE_TENSOR, + "gamma_instancenorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "beta_instancenorm": DEFAULT_GAMMA_NORM_LARGE_TENSOR, + "input_dim": DEFAULT_INPUT_DIM_LARGE_TENSOR, + "output_dim": DEFAULT_OUTPUT_DIM_LARGE_TENSOR, + "sparse_grad": DEFAULT_SPARSE_GRAD, + "data1": DEFAULT_DATA1_LARGE_TENSOR, + "data2": DEFAULT_DATA2_LARGE_TENSOR, + "kernel_size": DEFAULT_KERNEL_SIZE_LARGE_TENSOR, + "max_displacement": DEFAULT_MAX_DISPLACEMENT_LARGE_TENSOR, + "stride1": DEFAULT_STRIDE_1_LARGE_TENSOR, + "stride2": DEFAULT_STRIDE_2_LARGE_TENSOR, + "data_im2col": DEFAULT_DATA_I2C_LARGE_TENSOR, + "kernel_im2col": DEFAULT_KERNEL_I2C_LARGE_TENSOR, + "stride_im2col": DEFAULT_STRIDE_I2C_LARGE_TENSOR, + "dilate_im2col": DEFAULT_DILATE_LARGE_TENSOR, + "pad_im2col": DEFAULT_PAD_LARGE_TENSOR, + "data_lrn": DEFAULT_DATA_LRN_LARGE_TENSOR, + "alpha_lrn": DEFAULT_ALPHA, + "beta_lrn": DEFAULT_BETA_LRN, + "nsize": DEFAULT_NSIZE, + "data_layernorm": DEFAULT_DATA_NORM_LARGE_TENSOR, + "axis_layernorm": DEFAULT_AXIS_LARGE_TENSOR} # These are names of MXNet operator parameters that is of type NDArray. 
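The per-operator keys in DEFAULTS_INPUTS_LARGE_TENSOR (as in the existing DEFAULTS_INPUTS) follow the "<arg-name>_<op-name-lowercased>" convention that prepare_op_inputs() in op_registry_utils.py, updated further down in this patch, uses for its lookups. A short illustrative lookup, assuming MXNet and the patched benchmark package are importable from the source tree; the Dropout/data pair is only an example:

from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS_LARGE_TENSOR

op, arg_name = 'Dropout', 'data'            # any op in the custom_data set resolves the same way
key = arg_name + "_" + op.lower()           # -> 'data_dropout'
shapes = DEFAULTS_INPUTS_LARGE_TENSOR[key]  # -> DEFAULT_DATA_DROPOUT_LARGE_TENSOR, i.e. [(2**32 + 1,)]
print(shapes)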
# We maintain this list to automatically recognize these parameters are to be @@ -446,4 +796,6 @@ "v", "z", "g", "delta", "args", "indices", "shape_like", "y", "x", "condition", "a", "index", "raveL_data", "label", "grid", "A", "B", "C", "r1", "r2", "rois", "lrs", "wds", "weights_sum_sq", - "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state"] + "grads_sum_sq", "mhs", "data1", "data2", "loc", "parameters", "state", + "state_cell"] + diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py index f6cdfe004215..f2cce0abec09 100644 --- a/benchmark/opperf/utils/benchmark_utils.py +++ b/benchmark/opperf/utils/benchmark_utils.py @@ -181,7 +181,7 @@ def run_performance_test(ops, inputs, run_backward=True, return op_benchmark_result -def run_op_benchmarks(ops, dtype, ctx, profiler, warmup, runs): +def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, runs): # Running SoftmaxOutput backwards on GPU results in errors # track issue here: https://github.com/apache/incubator-mxnet/issues/880 gpu_backwards_disabled_ops = ['SoftmaxOutput'] @@ -195,7 +195,7 @@ def run_op_benchmarks(ops, dtype, ctx, profiler, warmup, runs): for op, op_params in ops.items(): if ctx == mx.cpu() or op not in gpu_disabled_ops: # Prepare inputs for the operator - inputs = prepare_op_inputs(op, op_params) + inputs = prepare_op_inputs(op, op_params, int64_tensor) # setting backward false for ops with known issue if (ctx == mx.gpu() and op in gpu_backwards_disabled_ops) or op in no_backward: diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py index 99678b8d31a9..b27b8e4e73b5 100644 --- a/benchmark/opperf/utils/op_registry_utils.py +++ b/benchmark/opperf/utils/op_registry_utils.py @@ -20,7 +20,7 @@ from mxnet import runtime import mxnet as mx -from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS, MX_OP_MODULE +from benchmark.opperf.rules.default_params import DEFAULTS_INPUTS, DEFAULTS_INPUTS_LARGE_TENSOR, MX_OP_MODULE def _select_ops(operator_names, filters=("_contrib", "_"), merge_op_forward_backward=True): @@ -109,7 +109,7 @@ def prepare_op_inputs(arg_params, arg_values): return inputs -def prepare_op_inputs(op, arg_params): +def prepare_op_inputs(op, arg_params, int64_tensor): inputs = [] # 4d tensor is needed only by following two ops @@ -120,14 +120,27 @@ def prepare_op_inputs(op, arg_params): # For ops with args that need to change shape/value for different ops custom_data = {'Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian', - 'SpatialTransformer', 'col2im', 'RNN', 'GroupNorm', 'Dropout', 'FullyConnected', + 'SpatialTransformer', 'col2im', 'GroupNorm', 'Dropout', 'FullyConnected', 'SoftmaxOutput', 'LinearRegressionOutput', 'BatchNorm', 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'im2col', 'LRN', 'squeeze', 'fill_element_0index'} + custom_data_int64 = {'random_pdf_dirichlet', 'random_pdf_exponential', 'random_pdf_gamma', + 'random_pdf_generalized_negative_binomial', 'random_pdf_negative_binomial', + 'random_pdf_normal', 'random_pdf_poisson', 'random_pdf_uniform', 'sample_exponential', + 'sample_normal', 'sample_poisson', 'sample_uniform', 'sample_gamma', + 'sample_generalized_negative_binomial', 'sample_negative_binomial', 'CTCLoss', + 'ctc_loss', 'multi_lars'} + int_only = {'random_randint'} float_only = {'log_softmax', 'softmax', 
'softmin'} + if int64_tensor == 'on': + default_inputs = DEFAULTS_INPUTS_LARGE_TENSOR + custom_data |= custom_data_int64 + else: + default_inputs = DEFAULTS_INPUTS + # Prepare op to default input mapping arg_values = {} for arg_name, arg_type in zip(arg_params["params"]["arg_names"], @@ -137,29 +150,29 @@ def prepare_op_inputs(op, arg_params): # same for randint (which is the only op that takes only int as input) # rest all operators take int as well as float if op in int_only and arg_name == "dtype": - arg_values[arg_name] = DEFAULTS_INPUTS["dtype_int"] + arg_values[arg_name] = default_inputs["dtype_int"] elif (op.startswith(('random','sample')) or op in float_only) and arg_name == "dtype": - arg_values[arg_name] = DEFAULTS_INPUTS["dtype_float"] + arg_values[arg_name] = default_inputs["dtype_float"] elif "NDArray" in arg_type and op == "ravel_multi_index": - arg_values[arg_name] = DEFAULTS_INPUTS["ravel_data"] - elif op in custom_data and arg_name + "_" + op.lower() in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_" + op.lower()] - elif "NDArray" in arg_type and arg_name + "_nd" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_nd"] - elif "NDArray" in arg_type and op in ops_4d and arg_name + "_4d" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_4d"] - elif "NDArray" in arg_type and op in ops_3d and arg_name + "_3d" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_3d"] + arg_values[arg_name] = default_inputs["ravel_data"] + elif op in custom_data and arg_name + "_" + op.lower() in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_" + op.lower()] + elif "NDArray" in arg_type and arg_name + "_nd" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_nd"] + elif "NDArray" in arg_type and op in ops_4d and arg_name + "_4d" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_4d"] + elif "NDArray" in arg_type and op in ops_3d and arg_name + "_3d" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_3d"] elif "NDArray" in arg_type and op == 'softmax_cross_entropy': - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_smce"] - elif arg_name in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name] - elif "float" in arg_type and arg_name + "_float" in DEFAULTS_INPUTS: - arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_float"] - elif "Shape" in arg_type and arg_name + "_shape" in DEFAULTS_INPUTS: + arg_values[arg_name] = default_inputs[arg_name + "_smce"] + elif arg_name in default_inputs: + arg_values[arg_name] = default_inputs[arg_name] + elif "float" in arg_type and arg_name + "_float" in default_inputs: + arg_values[arg_name] = default_inputs[arg_name + "_float"] + elif "Shape" in arg_type and arg_name + "_shape" in default_inputs: # This is for cases where in some ops 'axis' is Int in some ops a shape tuple. # Ex: axis in sum is shape, axis in sort is int. 
- arg_values[arg_name] = DEFAULTS_INPUTS[arg_name + "_shape"] + arg_values[arg_name] = default_inputs[arg_name + "_shape"] # Number of different inputs we want to use to test # the operator @@ -340,7 +353,7 @@ def get_all_nn_basic_operators(): nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm', 'SoftmaxOutput', 'LinearRegressionOutput', 'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'SpatialTransformer', 'im2col', - 'col2im', 'GroupNorm', 'RNN', 'LRN'] + 'col2im', 'GroupNorm', 'LRN'] # Get all mxnet operators mx_operators = _get_all_mxnet_operators() diff --git a/ci/docker/install/centos7_base.sh b/ci/docker/install/centos7_base.sh index 3b84aeb57b06..c5f860e6e7a7 100755 --- a/ci/docker/install/centos7_base.sh +++ b/ci/docker/install/centos7_base.sh @@ -27,7 +27,20 @@ yum -y install epel-release yum -y install git yum -y install wget yum -y install make -yum -y install cmake yum -y install unzip yum -y install ninja-build yum -y install gcc-gfortran +yum -y install protobuf-compiler +yum -y install protobuf-devel +yum -y install zeromq-devel + +# Centos 7 only provides ninja-build +ln -s /usr/bin/ninja-build /usr/bin/ninja + +# CMake 3.13.2+ is required +mkdir /opt/cmake && cd /opt/cmake +wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh +sh cmake-3.13.5-Linux-x86_64.sh --prefix=/opt/cmake --skip-license +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +rm cmake-3.13.5-Linux-x86_64.sh +cmake --version diff --git a/ci/docker/install/centos7_core.sh b/ci/docker/install/centos7_core.sh index 577f9dba7439..fbdb239cf0c2 100755 --- a/ci/docker/install/centos7_core.sh +++ b/ci/docker/install/centos7_core.sh @@ -30,10 +30,23 @@ yum -y install atlas-devel # Provide clbas headerfiles yum -y install openblas-devel yum -y install lapack-devel yum -y install opencv-devel +yum -y install protobuf-compiler +yum -y install protobuf-devel +yum -y install zeromq-devel yum -y install openssl-devel yum -y install gcc-c++-4.8.* yum -y install make -yum -y install cmake yum -y install wget yum -y install unzip yum -y install ninja-build + +# Centos 7 only provides ninja-build +ln -s /usr/bin/ninja-build /usr/bin/ninja + +# CMake 3.13.2+ is required +mkdir /opt/cmake && cd /opt/cmake +wget -nv https://cmake.org/files/v3.13/cmake-3.13.5-Linux-x86_64.sh +sh cmake-3.13.5-Linux-x86_64.sh --prefix=/opt/cmake --skip-license +ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake +rm cmake-3.13.5-Linux-x86_64.sh +cmake --version diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh index 2773aa26246c..9ff3ac70b859 100755 --- a/ci/docker/install/ubuntu_core.sh +++ b/ci/docker/install/ubuntu_core.sh @@ -47,6 +47,8 @@ apt-get install -y \ zlib1g-dev \ libedit-dev \ libxml2-dev \ + libprotobuf-dev \ + protobuf-compiler \ ninja-build \ software-properties-common \ sudo \ diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 0c7630f24015..b4394c0b84bc 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -156,70 +156,50 @@ gather_licenses() { build_ubuntu_cpu_release() { set -ex - - build_ccache_wrappers - - make \ - DEV=0 \ - ENABLE_TESTCOVERAGE=0 \ - USE_CPP_PACKAGE=0 \ - USE_MKLDNN=0 \ - USE_BLAS=openblas \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_CUDA=OFF \ + -G Ninja /work/mxnet + ninja } build_ubuntu_cpu_mkldnn_release() { set -ex - - 
build_ccache_wrappers - - make \ - DEV=0 \ - ENABLE_TESTCOVERAGE=0 \ - USE_CPP_PACKAGE=0 \ - USE_MKLDNN=1 \ - USE_BLAS=openblas \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=OFF \ + -G Ninja /work/mxnet + ninja } build_ubuntu_gpu_release() { set -ex - # unfortunately this build has problems in 3rdparty dependencies with ccache and make - # build_ccache_wrappers - - make \ - DEV=0 \ - ENABLE_TESTCOVERAGE=0 \ - USE_BLAS=openblas \ - USE_MKLDNN=0 \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - USE_CPP_PACKAGE=0 \ - USE_DIST_KVSTORE=1 \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -G Ninja /work/mxnet + ninja } build_ubuntu_gpu_mkldnn_release() { set -ex - # unfortunately this build has problems in 3rdparty dependencies with ccache and make - # build_ccache_wrappers - - make \ - DEV=0 \ - ENABLE_TESTCOVERAGE=0 \ - USE_BLAS=openblas \ - USE_MKLDNN=1 \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - USE_CPP_PACKAGE=0 \ - USE_DIST_KVSTORE=1 \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -G Ninja /work/mxnet + ninja } # Compiles the dynamic mxnet library @@ -265,7 +245,6 @@ build_jetson() { build_armv6() { set -ex - pushd . cd /work/build # Lapack functionality will be included and statically linked to openblas. @@ -291,12 +270,10 @@ build_armv6() { ninja build_wheel - popd } build_armv7() { set -ex - pushd . cd /work/build # Lapack functionality will be included and statically linked to openblas. 
@@ -321,11 +298,11 @@ build_armv7() { ninja build_wheel - popd } build_armv8() { build_ccache_wrappers + cd /work/build cmake \ -DUSE_CUDA=OFF\ -DSUPPORT_F16C=OFF\ @@ -358,7 +335,6 @@ build_android_armv7() { -DUSE_OPENCV=OFF\ -DUSE_OPENMP=OFF\ -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=RelWithDebInfo\ -DUSE_MKL_IF_AVAILABLE=OFF\ -G Ninja /work/mxnet ninja @@ -376,13 +352,25 @@ build_android_armv8() { -DUSE_OPENCV=OFF\ -DUSE_OPENMP=OFF\ -DUSE_SIGNAL_HANDLER=ON\ - -DCMAKE_BUILD_TYPE=RelWithDebInfo\ -DUSE_MKL_IF_AVAILABLE=OFF\ -G Ninja /work/mxnet ninja } build_centos7_cpu() { + set -ex + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CUDA=OFF \ + -G Ninja /work/mxnet + ninja +} + +build_centos7_cpu_make() { set -ex cd /work/mxnet export CC="ccache gcc" @@ -400,6 +388,7 @@ build_centos7_cpu() { } build_amzn_linux_cpu() { + set -ex cd /work/build build_ccache_wrappers cmake \ @@ -417,36 +406,28 @@ build_amzn_linux_cpu() { build_centos7_mkldnn() { set -ex - cd /work/mxnet - export CC="ccache gcc" - export CXX="ccache g++" - build_ccache_wrappers - make \ - DEV=1 \ - USE_LAPACK=1 \ - USE_LAPACK_PATH=/usr/lib64/liblapack.so \ - USE_BLAS=openblas \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=OFF \ + -G Ninja /work/mxnet + ninja } build_centos7_gpu() { set -ex - cd /work/mxnet - # unfortunately this build has problems in 3rdparty dependencies with ccache and make - build_ccache_wrappers - make \ - DEV=1 \ - USE_LAPACK=1 \ - USE_LAPACK_PATH=/usr/lib64/liblapack.so \ - USE_BLAS=openblas \ - USE_MKLDNN=0 \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - USE_DIST_KVSTORE=1 \ - CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ - -j$(nproc) + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DUSE_DIST_KVSTORE=ON\ + -G Ninja /work/mxnet + ninja } build_ubuntu_cpu() { @@ -454,6 +435,22 @@ build_ubuntu_cpu() { } build_ubuntu_cpu_openblas() { + set -ex + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_TVM_OP=ON \ + -DUSE_CPP_PACKAGE=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_CUDA=OFF \ + -DUSE_DIST_KVSTORE=ON \ + -DBUILD_CYTHON_MODULES=ON \ + -G Ninja /work/mxnet + ninja +} + +build_ubuntu_cpu_openblas_make() { set -ex export CC="gcc" export CXX="g++" @@ -489,13 +486,11 @@ build_ubuntu_cpu_mkl() { build_ubuntu_cpu_cmake_debug() { set -ex - pushd . cd /work/build build_ccache_wrappers cmake \ -DUSE_CUDA=OFF \ -DUSE_TVM_OP=ON \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_OPENMP=OFF \ -DUSE_OPENCV=ON \ @@ -503,20 +498,16 @@ build_ubuntu_cpu_cmake_debug() { -DCMAKE_BUILD_TYPE=Debug \ -G Ninja \ /work/mxnet - ninja - popd } build_ubuntu_cpu_cmake_no_tvm_op() { set -ex - pushd . cd /work/build build_ccache_wrappers cmake \ -DUSE_CUDA=OFF \ -DUSE_TVM_OP=OFF \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_OPENMP=OFF \ -DUSE_OPENCV=ON \ @@ -526,13 +517,11 @@ build_ubuntu_cpu_cmake_no_tvm_op() { /work/mxnet ninja - popd } build_ubuntu_cpu_cmake_asan() { set -ex - pushd . 
cd /work/build export CXX=g++-8 export CC=gcc-8 @@ -555,99 +544,80 @@ build_ubuntu_cpu_cmake_asan() { ASAN_OPTIONS=detect_leaks=0 \ LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5 \ make -j $(nproc) mlp_cpu - popd } build_ubuntu_cpu_clang39() { set -ex - export CXX=clang++-3.9 - export CC=clang-3.9 - build_ccache_wrappers - make \ - USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ - USE_MKLDNN=0 \ - USE_OPENMP=0 \ - USE_DIST_KVSTORE=1 \ - -j$(nproc) + cd /work/build + CXX=clang++-3.9 CC=clang-3.9 cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_CUDA=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CPP_PACKAGE=ON \ + -G Ninja /work/mxnet + ninja } build_ubuntu_cpu_clang60() { set -ex - - export CXX=clang++-6.0 - export CC=clang-6.0 - - build_ccache_wrappers - - make \ - USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ - USE_MKLDNN=0 \ - USE_OPENMP=1 \ - USE_DIST_KVSTORE=1 \ - -j$(nproc) + cd /work/build + CXX=clang++-6.0 CC=clang-6.0 cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_CUDA=OFF \ + -DUSE_OPENMP=ON \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CPP_PACKAGE=ON \ + -G Ninja /work/mxnet + ninja } build_ubuntu_cpu_clang_tidy() { set -ex - - export CXX=clang++-6.0 - export CC=clang-6.0 - export CLANG_TIDY=/usr/lib/llvm-6.0/share/clang/run-clang-tidy.py - - pushd . cd /work/build - build_ccache_wrappers - cmake \ - -DUSE_CUDA=OFF \ - -DUSE_MKLDNN=OFF \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_OPENCV=ON \ - -DCMAKE_BUILD_TYPE=Debug \ - -G Ninja \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ - /work/mxnet - + export CLANG_TIDY=/usr/lib/llvm-6.0/share/clang/run-clang-tidy.py + CXX=clang++-6.0 CC=clang-6.0 cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_CUDA=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + -DUSE_DIST_KVSTORE=ON \ + -DUSE_CPP_PACKAGE=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -G Ninja /work/mxnet ninja cd /work/mxnet $CLANG_TIDY -p /work/build -j $(nproc) -clang-tidy-binary clang-tidy-6.0 /work/mxnet/src - popd } build_ubuntu_cpu_clang39_mkldnn() { set -ex - - export CXX=clang++-3.9 - export CC=clang-3.9 - - build_ccache_wrappers - - make \ - USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ - USE_OPENMP=0 \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + CXX=clang++-3.9 CC=clang-3.9 cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=OFF \ + -DUSE_CPP_PACKAGE=ON \ + -DUSE_OPENMP=OFF \ + -G Ninja /work/mxnet + ninja } build_ubuntu_cpu_clang60_mkldnn() { set -ex - - export CXX=clang++-6.0 - export CC=clang-6.0 - - build_ccache_wrappers - - make \ - USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ - USE_OPENMP=1 \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + CXX=clang++-6.0 CC=clang-6.0 cmake \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=OFF \ + -DUSE_CPP_PACKAGE=ON \ + -G Ninja /work/mxnet + ninja } -build_ubuntu_cpu_mkldnn() { +build_ubuntu_cpu_mkldnn_make() { set -ex build_ccache_wrappers @@ -661,9 +631,22 @@ build_ubuntu_cpu_mkldnn() { -j$(nproc) } -build_ubuntu_cpu_mkldnn_mkl() { +build_ubuntu_cpu_mkldnn() { set -ex + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_TVM_OP=ON \ + -DUSE_MKLDNN=ON \ + -DUSE_CUDA=OFF \ + -DUSE_CPP_PACKAGE=ON \ + -G Ninja /work/mxnet + ninja +} +build_ubuntu_cpu_mkldnn_mkl() { + set -ex build_ccache_wrappers make \ @@ -735,40 +718,51 @@ build_ubuntu_gpu_tensorrt() { build_ubuntu_gpu_mkldnn() { set -ex - - build_ccache_wrappers - - make \ - DEV=1 \ - USE_CPP_PACKAGE=1 \ - USE_BLAS=openblas \ - USE_CUDA=1 \ - 
USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - USE_TVM_OP=1 \ - CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_TVM_OP=ON \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DUSE_CPP_PACKAGE=ON \ + -G Ninja /work/mxnet + ninja } build_ubuntu_gpu_mkldnn_nocudnn() { set -ex - - build_ccache_wrappers - - make \ - DEV=1 \ - USE_BLAS=openblas \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=0 \ - USE_TVM_OP=1 \ - CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_TVM_OP=ON \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DUSE_CUDNN=OFF \ + -DUSE_CPP_PACKAGE=ON \ + -G Ninja /work/mxnet + ninja } build_ubuntu_gpu_cuda101_cudnn7() { + set -ex + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_TVM_OP=ON \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DUSE_CUDNN=ON \ + -DUSE_MKLDNN=OFF \ + -DUSE_CPP_PACKAGE=ON \ + -DBUILD_CYTHON_MODULES=ON \ + -G Ninja /work/mxnet + ninja +} + +build_ubuntu_gpu_cuda101_cudnn7_make() { set -ex build_ccache_wrappers make \ @@ -809,22 +803,19 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() { build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() { set -ex - build_ccache_wrappers - make \ - DEV=1 \ - USE_BLAS=openblas \ - USE_MKLDNN=0 \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - USE_TVM_OP=0 \ - USE_CPP_PACKAGE=1 \ - USE_DIST_KVSTORE=1 \ - CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ - USE_SIGNAL_HANDLER=1 \ - -j$(nproc) - - make cython PYTHON=python3 + cd /work/build + cmake \ + -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_TVM_OP=OFF \ + -DUSE_CUDA=ON \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ + -DUSE_CUDNN=ON \ + -DUSE_MKLDNN=OFF \ + -DBUILD_CYTHON_MODULES=ON \ + -DUSE_DIST_KVSTORE=ON \ + -G Ninja /work/mxnet + ninja } build_ubuntu_amalgamation() { @@ -846,25 +837,6 @@ build_ubuntu_amalgamation_min() { MIN=1 } -build_ubuntu_gpu_cmake_mkldnn() { - set -ex - cd /work/build - build_ccache_wrappers - cmake \ - -DUSE_SIGNAL_HANDLER=ON \ - -DUSE_CUDA=1 \ - -DUSE_CUDNN=1 \ - -DUSE_TVM_OP=1 \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ - -DUSE_MKLML_MKL=1 \ - -DCMAKE_BUILD_TYPE=Release \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -G Ninja \ - /work/mxnet - - ninja -} - build_ubuntu_gpu_cmake() { set -ex cd /work/build @@ -874,7 +846,6 @@ build_ubuntu_gpu_cmake() { -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ -DUSE_TVM_OP=ON \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ @@ -897,7 +868,6 @@ build_ubuntu_gpu_cmake_no_rtc() { -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ -DUSE_TVM_OP=ON \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=ON \ @@ -921,7 +891,6 @@ build_ubuntu_gpu_cmake_no_tvm_op() { -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ -DUSE_TVM_OP=OFF \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ @@ -961,7 +930,6 @@ build_ubuntu_gpu_large_tensor() { -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ -DUSE_TVM_OP=ON \ - -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ @@ -984,7 +952,8 @@ build_ubuntu_blc() { sanity_check() { set -ex 
tools/license_header.py check - make cpplint rcpplint jnilint + make cpplint jnilint + make -f R-package/Makefile rcpplint make pylint nosetests-3.4 tests/tutorials/test_sanity_tutorials.py } diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 3f5fb2503b56..f87a55fc4683 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -23,25 +23,27 @@ utils = load('ci/Jenkinsfile_utils.groovy') // mxnet libraries -mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -mx_lib_cython = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' +mx_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/3rdparty/openmp/runtime/src/libomp.so' +mx_lib_cython = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, python/mxnet/_cy3/*.so, build/3rdparty/openmp/runtime/src/libomp.so, python/mxnet/_ffi/_cy3/*.so' +mx_lib_make = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' // Python wheels mx_pip = 'build/*.whl' // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. -mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' -mx_cmake_lib_no_tvm_op = 'build/libmxnet.so, build/libmxnet.a, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' -mx_cmake_lib_cython = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' +mx_cmake_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' +mx_cmake_lib_no_tvm_op = 'build/libmxnet.so, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' +mx_cmake_lib_cython = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. 
-mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' -mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -mx_tensorrt_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' -mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' -mx_lib_cpp_capi = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, libsample_lib.so, lib/libmkldnn.so.1, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so, build/tests/cpp/mxnet_unit_tests' -mx_lib_cpp_examples_no_tvm_op = 'lib/libmxnet.so, lib/libmxnet.a, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' -mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/cpp-package/example/*' +mx_cmake_lib_debug = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/tests/mxnet_unit_tests' +mx_mkldnn_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/openmp/runtime/src/libomp.so, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so' +mx_mkldnn_lib_make = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_tensorrt_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/openmp/runtime/src/libomp.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' +mx_lib_cpp_examples = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/openmp/runtime/src/libomp.so, build/libcustomop_lib.so, 
build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' +mx_lib_cpp_examples_make = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' +mx_lib_cpp_capi_make = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, libsample_lib.so, lib/libmkldnn.so.1, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so, build/tests/cpp/mxnet_unit_tests' +mx_lib_cpp_examples_no_tvm_op = 'build/libmxnet.so, build/libcustomop_lib.so, build/libcustomop_gpu_lib.so, build/libsubgraph_lib.so, build/3rdparty/openmp/runtime/src/libomp.so, build/cpp-package/example/*, python/mxnet/_cy3/*.so, python/mxnet/_ffi/_cy3/*.so' +mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/openmp/runtime/src/libomp.so, build/cpp-package/example/*' // Python unittest for CPU // Python 3 @@ -102,6 +104,20 @@ def compile_unix_cpu_openblas() { }] } +def compile_unix_cpu_openblas_make() { + return ['CPU: Openblas Makefile': { + node(NODE_LINUX_CPU) { + ws('workspace/build-cpu-openblas') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas_make', false) + utils.pack_lib('cpu_make', mx_lib_make) + } + } + } + }] +} + def compile_unix_openblas_debug_cpu() { return ['CPU: Openblas, cmake, debug': { node(NODE_LINUX_CPU) { @@ -159,13 +175,13 @@ def compile_unix_int64_gpu() { } def compile_unix_mkl_cpu() { - return ['CPU: MKL': { + return ['CPU: MKL Makefile': { node(NODE_LINUX_CPU) { ws('workspace/build-cpu-mkl') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkl', false) - utils.pack_lib('cpu_mkl', mx_mkldnn_lib) + utils.pack_lib('cpu_mkl', mx_mkldnn_lib_make) } } } @@ -186,14 +202,28 @@ def compile_unix_mkldnn_cpu() { }] } +def compile_unix_mkldnn_cpu_make() { + return ['CPU: MKLDNN Makefile': { + node(NODE_LINUX_CPU) { + ws('workspace/build-mkldnn-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn_make', false) + utils.pack_lib('mkldnn_cpu_make', mx_mkldnn_lib_make) + } + } + } + }] +} + def compile_unix_mkldnn_mkl_cpu() { - return ['CPU: MKLDNN_MKL': { + return ['CPU: MKLDNN_MKL Makefile': { node(NODE_LINUX_CPU) { ws('workspace/build-mkldnn-cpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkldnn_mkl', false) - utils.pack_lib('mkldnn_mkl_cpu', mx_mkldnn_lib) + utils.pack_lib('mkldnn_mkl_cpu', mx_mkldnn_lib_make) } } } @@ -242,42 +272,42 @@ def compile_unix_full_gpu() { }] } -def compile_unix_full_gpu_mkldnn_cpp_test() { - return ['GPU: CUDA10.1+cuDNN7+MKLDNN+CPPTEST': { +def compile_unix_full_gpu_make() { + return ['GPU: CUDA10.1+cuDNN7 Makefile': { node(NODE_LINUX_CPU) { - ws('workspace/build-gpu-mkldnn-cpp') { + 
ws('workspace/build-gpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test', false) - utils.pack_lib('gpu_mkldnn_cpp_test', mx_lib_cpp_capi) + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_make', false) + utils.pack_lib('gpu_make', mx_lib_cpp_examples_make) } } } }] } -def compile_unix_full_gpu_no_tvm_op() { - return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': { +def compile_unix_full_gpu_mkldnn_cpp_test() { + return ['GPU: CUDA10.1+cuDNN7+MKLDNN+CPPTEST Makefile': { node(NODE_LINUX_CPU) { - ws('workspace/build-gpu-no-tvm-op') { + ws('workspace/build-gpu-mkldnn-cpp') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false) - utils.pack_lib('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op) + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test', false) + utils.pack_lib('gpu_mkldnn_cpp_test_make', mx_lib_cpp_capi_make) } } } }] } -def compile_unix_cmake_mkldnn_gpu() { - return ['GPU: CMake MKLDNN': { +def compile_unix_full_gpu_no_tvm_op() { + return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': { node(NODE_LINUX_CPU) { - ws('workspace/build-cmake-mkldnn-gpu') { + ws('workspace/build-gpu-no-tvm-op') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_mkldnn', false) - utils.pack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib) + utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false) + utils.pack_lib('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op) } } } @@ -352,6 +382,20 @@ def compile_centos7_cpu() { }] } +def compile_centos7_cpu_make() { + return ['CPU: CentOS 7 Makefile': { + node(NODE_LINUX_CPU) { + ws('workspace/build-centos7-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('centos7_cpu', 'build_centos7_cpu_make', false) + utils.pack_lib('centos7_cpu_make', mx_lib_make) + } + } + } + }] +} + def compile_centos7_cpu_mkldnn() { return ['CPU: CentOS 7 MKLDNN': { node(NODE_LINUX_CPU) { @@ -733,11 +777,11 @@ def test_unix_python3_cpu() { } def test_unix_python3_mkl_cpu() { - return ['Python3: MKL-CPU': { + return ['Python3: MKL-CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-python3-cpu') { try { - utils.unpack_and_init('cpu_mkl', mx_lib) + utils.unpack_and_init('cpu_mkl', mx_lib_make) python3_ut('ubuntu_cpu') utils.publish_test_coverage() } finally { @@ -849,11 +893,11 @@ def test_unix_python3_mkldnn_cpu() { } def test_unix_python3_mkldnn_mkl_cpu() { - return ['Python3: MKLDNN-MKL-CPU': { + return ['Python3: MKLDNN-MKL-CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-python3-mkldnn-mkl-cpu') { try { - utils.unpack_and_init('mkldnn_mkl_cpu', mx_mkldnn_lib) + utils.unpack_and_init('mkldnn_mkl_cpu', mx_mkldnn_lib_make) python3_ut_mkldnn('ubuntu_cpu') utils.publish_test_coverage() } finally { @@ -945,11 +989,11 @@ def test_unix_caffe_gpu() { } def test_unix_cpp_package_gpu() { - return ['cpp-package GPU': { + return ['cpp-package GPU Makefile': { node(NODE_LINUX_GPU) { ws('workspace/it-cpp-package') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib_cpp_examples) + utils.unpack_and_init('gpu_make', mx_lib_cpp_examples_make) utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_cpp_package', true) utils.publish_test_coverage() } @@ -959,11 +1003,11 @@ def 
test_unix_cpp_package_gpu() { } def test_unix_capi_cpp_package() { - return ['capi-cpp-package GPU': { + return ['capi-cpp-package GPU Makefile': { node(NODE_LINUX_GPU) { ws('workspace/it-capi-cpp-package') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu_mkldnn_cpp_test', mx_lib_cpp_capi) + utils.unpack_and_init('gpu_mkldnn_cpp_test_make', mx_lib_cpp_capi_make) utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_capi_cpp_package', true) utils.publish_test_coverage() } @@ -973,11 +1017,11 @@ def test_unix_capi_cpp_package() { } def test_unix_scala_cpu() { - return ['Scala: CPU': { + return ['Scala: CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-scala-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) + utils.unpack_and_init('cpu_make', mx_lib_make) utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_scala', false) utils.publish_test_coverage() } @@ -987,11 +1031,11 @@ def test_unix_scala_cpu() { } def test_unix_scala_mkldnn_cpu(){ - return ['Scala: MKLDNN-CPU': { + return ['Scala: MKLDNN-CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-scala-mkldnn-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib) + utils.unpack_and_init('mkldnn_cpu_make', mx_mkldnn_lib_make) utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_scala', false) utils.publish_test_coverage() } @@ -1001,11 +1045,11 @@ def test_unix_scala_mkldnn_cpu(){ } def test_unix_scala_gpu() { - return ['Scala: GPU': { + return ['Scala: GPU Makefile': { node(NODE_LINUX_GPU) { ws('workspace/ut-scala-gpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib) + utils.unpack_and_init('gpu_make', mx_lib_make) utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_scala', true) utils.publish_test_coverage() } @@ -1015,11 +1059,11 @@ def test_unix_scala_gpu() { } def test_unix_clojure_cpu() { - return ['Clojure: CPU': { + return ['Clojure: CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-clojure-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) + utils.unpack_and_init('cpu_make', mx_lib_make) utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure', false) utils.publish_test_coverage() } @@ -1029,11 +1073,11 @@ def test_unix_clojure_cpu() { } def test_unix_clojure_integration_cpu() { - return ['Clojure: CPU Integration': { + return ['Clojure: CPU Integration Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-clojure-integration-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) + utils.unpack_and_init('cpu_make', mx_lib_make) utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure_integration', false) } } @@ -1070,11 +1114,11 @@ def test_unix_r_mkldnn_cpu() { } def test_unix_perl_cpu() { - return ['Perl: CPU': { + return ['Perl: CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-perl-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) + utils.unpack_and_init('cpu_make', mx_lib_make) utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false) utils.publish_test_coverage() } @@ -1097,20 +1141,6 @@ def test_unix_cpp_gpu() { }] } -def test_unix_cpp_mkldnn_gpu() { - return ['Cpp: MKLDNN+GPU': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-cpp-mkldnn-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib) - utils.docker_run('ubuntu_gpu_cu101', 
'unittest_cpp', true) - utils.publish_test_coverage() - } - } - } - }] -} - def test_unix_cpp_cpu() { return ['Cpp: CPU': { node(NODE_LINUX_CPU) { @@ -1126,11 +1156,11 @@ def test_unix_cpp_cpu() { } def test_unix_perl_gpu() { - return ['Perl: GPU': { + return ['Perl: GPU Makefile': { node(NODE_LINUX_GPU) { ws('workspace/ut-perl-gpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('gpu', mx_lib) + utils.unpack_and_init('gpu_make', mx_lib_make) utils.docker_run('ubuntu_gpu_cu101', 'unittest_ubuntu_cpugpu_perl', true) utils.publish_test_coverage() } @@ -1180,11 +1210,11 @@ def test_unix_julia10_cpu() { } def test_unix_onnx_cpu() { - return ['Onnx CPU': { + return ['Onnx: CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/it-onnx-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('cpu', mx_lib) + utils.unpack_and_init('cpu_make', mx_lib_make) utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_onnx', false) utils.publish_test_coverage() } @@ -1259,11 +1289,11 @@ def test_centos7_python3_gpu() { } def test_centos7_scala_cpu() { - return ['Scala: CentOS CPU': { + return ['Scala: CentOS CPU Makefile': { node(NODE_LINUX_CPU) { ws('workspace/ut-scala-centos7-cpu') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('centos7_cpu', mx_lib) + utils.unpack_and_init('centos7_cpu_make', mx_lib_make) utils.docker_run('centos7_cpu', 'unittest_centos7_cpu_scala', false) utils.publish_test_coverage() } @@ -1430,7 +1460,7 @@ def docs_python() { node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('libmxnet', mx_lib, false) + utils.unpack_and_init('libmxnet', 'lib/libmxnet.so', false) utils.docker_run('ubuntu_cpu_python', 'build_python_docs', false) if (should_pack_website()) { utils.pack_lib('python-artifacts', 'docs/_build/python-artifacts.tgz', false) @@ -1466,7 +1496,7 @@ def docs_julia() { node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('libmxnet', mx_lib, false) + utils.unpack_and_init('libmxnet', 'lib/libmxnet.so', false) utils.docker_run('ubuntu_cpu_julia', 'build_julia_docs', false) if (should_pack_website()) { utils.pack_lib('julia-artifacts', 'docs/_build/julia-artifacts.tgz', false) @@ -1484,7 +1514,7 @@ def docs_r() { node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('libmxnet', mx_lib, false) + utils.unpack_and_init('libmxnet', 'lib/libmxnet.so', false) utils.docker_run('ubuntu_cpu_r', 'build_r_docs', false) if (should_pack_website()) { utils.pack_lib('r-artifacts', 'docs/_build/r-artifacts.tgz', false) @@ -1503,7 +1533,7 @@ def docs_scala() { node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('libmxnet', mx_lib, false) + utils.unpack_and_init('libmxnet', 'lib/libmxnet.so', false) utils.docker_run('ubuntu_cpu_scala', 'build_scala_docs', false) if (should_pack_website()) { utils.pack_lib('scala-artifacts', 'docs/_build/scala-artifacts.tgz', false) @@ -1522,7 +1552,7 @@ def docs_java() { node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('libmxnet', mx_lib, false) + utils.unpack_and_init('libmxnet', 'lib/libmxnet.so', false) utils.docker_run('ubuntu_cpu_scala', 'build_java_docs', false) if (should_pack_website()) { utils.pack_lib('java-artifacts', 'docs/_build/java-artifacts.tgz', false) @@ -1541,7 +1571,7 @@ def 
docs_clojure() { node(NODE_LINUX_CPU) { ws('workspace/docs') { timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init('libmxnet', mx_lib, false) + utils.unpack_and_init('libmxnet', 'lib/libmxnet.so', false) utils.docker_run('ubuntu_cpu_scala', 'build_clojure_docs', false) if (should_pack_website()) { utils.pack_lib('clojure-artifacts', 'docs/_build/clojure-artifacts.tgz', false) diff --git a/ci/jenkins/Jenkinsfile_centos_cpu b/ci/jenkins/Jenkinsfile_centos_cpu index a47ab3de7fb7..793d1f12e8d3 100644 --- a/ci/jenkins/Jenkinsfile_centos_cpu +++ b/ci/jenkins/Jenkinsfile_centos_cpu @@ -35,13 +35,14 @@ utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ custom_steps.compile_centos7_cpu(), + custom_steps.compile_centos7_cpu_make(), custom_steps.compile_centos7_cpu_mkldnn() - ]) + ]) utils.parallel_stage('Tests', [ custom_steps.test_centos7_python3_cpu(), custom_steps.test_centos7_scala_cpu() - ]) + ]) } , failure_handler: { diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu index 71917de58e82..5bfad60ef605 100644 --- a/ci/jenkins/Jenkinsfile_unix_cpu +++ b/ci/jenkins/Jenkinsfile_unix_cpu @@ -35,9 +35,11 @@ utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ custom_steps.compile_unix_cpu_openblas(), + custom_steps.compile_unix_cpu_openblas_make(), custom_steps.compile_unix_openblas_debug_cpu(), custom_steps.compile_unix_mkl_cpu(), custom_steps.compile_unix_mkldnn_cpu(), + custom_steps.compile_unix_mkldnn_cpu_make(), custom_steps.compile_unix_mkldnn_mkl_cpu(), custom_steps.compile_unix_int64_cpu(), custom_steps.compile_unix_openblas_cpu_no_tvm_op(), diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index f8c28d5d1994..66d3c1391944 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -37,7 +37,7 @@ core_logic: { custom_steps.compile_unix_mkldnn_gpu(), custom_steps.compile_unix_mkldnn_nocudnn_gpu(), custom_steps.compile_unix_full_gpu(), - custom_steps.compile_unix_cmake_mkldnn_gpu(), + custom_steps.compile_unix_full_gpu_make(), custom_steps.compile_unix_cmake_gpu(), custom_steps.compile_unix_tensorrt_gpu(), custom_steps.compile_unix_int64_gpu(), @@ -56,7 +56,6 @@ core_logic: { custom_steps.test_unix_perl_gpu(), custom_steps.test_unix_r_gpu(), custom_steps.test_unix_cpp_gpu(), - custom_steps.test_unix_cpp_mkldnn_gpu(), custom_steps.test_unix_python3_integration_gpu(), custom_steps.test_unix_cpp_package_gpu(), custom_steps.test_unix_scala_gpu(), diff --git a/cmake/BuildCythonModules.cmake b/cmake/BuildCythonModules.cmake index d2c3a46f1a71..48c8d8d1b924 100644 --- a/cmake/BuildCythonModules.cmake +++ b/cmake/BuildCythonModules.cmake @@ -16,23 +16,16 @@ # under the License. 
function(add_cython_modules python_version) - unset(PYTHON_EXECUTABLE CACHE) - set(PYTHONINTERP_FOUND FALSE) - find_package(PythonInterp ${python_version} EXACT) - if(PYTHONINTERP_FOUND) - find_program(CYTHON_EXECUTABLE NAMES cython) - if(CYTHON_EXECUTABLE) - add_custom_command(COMMAND ${CMAKE_COMMAND} POST_BUILD - -E env MXNET_LIBRARY_PATH=${CMAKE_BINARY_DIR}/libmxnet.so - ${PYTHON_EXECUTABLE} setup.py build_ext --inplace --with-cython - TARGET mxnet - WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/python") - message("-- Cython modules for python${python_version} will be built") - set(PYTHON${python_version}_FOUND 1 PARENT_SCOPE) - else() - message(FATAL_ERROR "-- Cython not found") - endif() + find_package(Python3) + find_program(CYTHON_EXECUTABLE NAMES cython cython.bat cython3) + if(CYTHON_EXECUTABLE AND Python3_EXECUTABLE) + add_custom_command(COMMAND ${CMAKE_COMMAND} POST_BUILD + -E env MXNET_LIBRARY_PATH=${CMAKE_BINARY_DIR}/libmxnet.so + ${Python3_EXECUTABLE} setup.py build_ext --inplace --with-cython + TARGET mxnet + WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/python") + message("-- Cython modules will be built") else() - set(PYTHON${python_version}_FOUND 0 PARENT_SCOPE) + message(FATAL_ERROR "-- Cython not found") endif() endfunction() diff --git a/config/distribution/darwin_cpu.cmake b/config/distribution/darwin_cpu.cmake index a0c803c059ce..790e18320157 100644 --- a/config/distribution/darwin_cpu.cmake +++ b/config/distribution/darwin_cpu.cmake @@ -30,3 +30,4 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") diff --git a/config/distribution/linux_cpu.cmake b/config/distribution/linux_cpu.cmake index cad348578454..15b4f5aa7e59 100644 --- a/config/distribution/linux_cpu.cmake +++ b/config/distribution/linux_cpu.cmake @@ -28,3 +28,4 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") diff --git a/config/distribution/linux_cu100.cmake b/config/distribution/linux_cu100.cmake index d26b4d73eee7..bdbec7e63005 100644 --- a/config/distribution/linux_cu100.cmake +++ b/config/distribution/linux_cu100.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-10.0/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.5" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu101.cmake b/config/distribution/linux_cu101.cmake index aaf76cc10df1..fd773e88193b 100644 --- a/config/distribution/linux_cu101.cmake +++ b/config/distribution/linux_cu101.cmake @@ -31,6 +31,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") 
set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-10.1/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.5" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu102.cmake b/config/distribution/linux_cu102.cmake index 6b575683e919..9f740f543ecb 100644 --- a/config/distribution/linux_cu102.cmake +++ b/config/distribution/linux_cu102.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-10.2/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.5" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu75.cmake b/config/distribution/linux_cu75.cmake index 45ba2b9de5d7..91ef97150519 100644 --- a/config/distribution/linux_cu75.cmake +++ b/config/distribution/linux_cu75.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-7.5/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;3.5;5.0;5.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu80.cmake b/config/distribution/linux_cu80.cmake index ce8e0083bcad..6b98538e6c89 100644 --- a/config/distribution/linux_cu80.cmake +++ b/config/distribution/linux_cu80.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-8.0/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;6.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu90.cmake b/config/distribution/linux_cu90.cmake index 01097cb882e4..1932a320f615 100644 --- a/config/distribution/linux_cu90.cmake +++ b/config/distribution/linux_cu90.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-9.0/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu91.cmake b/config/distribution/linux_cu91.cmake index f6301fa9f720..36e10a624e40 100644 --- a/config/distribution/linux_cu91.cmake +++ b/config/distribution/linux_cu91.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL 
"Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-9.1/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu92.cmake b/config/distribution/linux_cu92.cmake index 63ab9fce20d8..285daccdabc0 100644 --- a/config/distribution/linux_cu92.cmake +++ b/config/distribution/linux_cu92.cmake @@ -29,6 +29,7 @@ set(USE_LAPACK ON CACHE BOOL "Build with lapack support") set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") set(CUDACXX "/usr/local/cuda-9.2/bin/nvcc" CACHE STRING "Cuda compiler") set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/contrib/clojure-package/examples/bert/project.clj b/contrib/clojure-package/examples/bert/project.clj index 44ae4d536d26..3339f02ec325 100644 --- a/contrib/clojure-package/examples/bert/project.clj +++ b/contrib/clojure-package/examples/bert/project.clj @@ -23,7 +23,7 @@ ;;; so if you run into trouble please delete the `lein-juptyter` plugin [lein-jupyter "0.1.16" :exclusions [org.clojure/tools.nrepl org.clojure/clojure org.codehaus.plexus/plexus-utils org.clojure/tools.reader]]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"] [cheshire "5.8.1"] [clojure-csv/clojure-csv "2.0.1"]] :pedantic? :skip diff --git a/contrib/clojure-package/examples/captcha/project.clj b/contrib/clojure-package/examples/captcha/project.clj index 795f3fea875a..c0e00f1b70f4 100644 --- a/contrib/clojure-package/examples/captcha/project.clj +++ b/contrib/clojure-package/examples/captcha/project.clj @@ -19,7 +19,7 @@ :description "Captcha recognition via multi-label classification" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main ^:skip-aot captcha.train-ocr :profiles {:train {:main captcha.train-ocr} :infer {:main captcha.infer-ocr} diff --git a/contrib/clojure-package/examples/cnn-text-classification/project.clj b/contrib/clojure-package/examples/cnn-text-classification/project.clj index 1b8859fd732c..0eb31263c5d7 100644 --- a/contrib/clojure-package/examples/cnn-text-classification/project.clj +++ b/contrib/clojure-package/examples/cnn-text-classification/project.clj @@ -19,6 +19,6 @@ :description "CNN text classification with MXNet" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :pedantic? 
:skip :main cnn-text-classification.classifier) diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj index 4048b0c1cb2e..8911502331f9 100644 --- a/contrib/clojure-package/examples/gan/project.clj +++ b/contrib/clojure-package/examples/gan/project.clj @@ -20,7 +20,7 @@ :plugins [[lein-cljfmt "0.5.7"]] :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"] [origami "4.0.0-3"] ] - :main gan.gan-mnist) \ No newline at end of file + :main gan.gan-mnist) diff --git a/contrib/clojure-package/examples/imclassification/project.clj b/contrib/clojure-package/examples/imclassification/project.clj index 702a33d67ee8..439d14c344ef 100644 --- a/contrib/clojure-package/examples/imclassification/project.clj +++ b/contrib/clojure-package/examples/imclassification/project.clj @@ -19,6 +19,6 @@ :description "Clojure examples for image classification" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :pedantic? :skip :main imclassification.train-mnist) diff --git a/contrib/clojure-package/examples/infer/imageclassifier/project.clj b/contrib/clojure-package/examples/infer/imageclassifier/project.clj index dcca5982fd28..76cfb10d59c5 100644 --- a/contrib/clojure-package/examples/infer/imageclassifier/project.clj +++ b/contrib/clojure-package/examples/infer/imageclassifier/project.clj @@ -20,6 +20,6 @@ :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] [org.clojure/tools.cli "0.4.1"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main ^:skip-aot infer.imageclassifier-example :profiles {:uberjar {:aot :all}}) diff --git a/contrib/clojure-package/examples/infer/objectdetector/project.clj b/contrib/clojure-package/examples/infer/objectdetector/project.clj index 1d29be2d8e94..73f749a6a7b3 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/project.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/project.clj @@ -22,6 +22,6 @@ :aliases {"run-detector" ["run" "--" "-m" "models/resnet50_ssd/resnet50_ssd_model" "-i" "images/dog.jpg" "-d" "images/"]} :dependencies [[org.clojure/clojure "1.9.0"] [org.clojure/tools.cli "0.4.1"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main ^:skip-aot infer.objectdetector-example :profiles {:uberjar {:aot :all}}) diff --git a/contrib/clojure-package/examples/infer/predictor/project.clj b/contrib/clojure-package/examples/infer/predictor/project.clj index 936d9179b76e..4bbc9bfa57eb 100644 --- a/contrib/clojure-package/examples/infer/predictor/project.clj +++ b/contrib/clojure-package/examples/infer/predictor/project.clj @@ -20,6 +20,6 @@ :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] [org.clojure/tools.cli "0.4.1"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main ^:skip-aot infer.predictor-example :profiles {:uberjar {:aot :all}}) diff --git 
a/contrib/clojure-package/examples/module/project.clj b/contrib/clojure-package/examples/module/project.clj index 83519e8e6886..42e6ab084066 100644 --- a/contrib/clojure-package/examples/module/project.clj +++ b/contrib/clojure-package/examples/module/project.clj @@ -19,7 +19,7 @@ :description "Clojure examples for module" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :pedantic? :skip :main mnist-mlp) diff --git a/contrib/clojure-package/examples/multi-label/project.clj b/contrib/clojure-package/examples/multi-label/project.clj index d9ec86eb84f2..197fc3c22f38 100644 --- a/contrib/clojure-package/examples/multi-label/project.clj +++ b/contrib/clojure-package/examples/multi-label/project.clj @@ -19,5 +19,5 @@ :description "Example of multi-label classification" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main multi-label.core) diff --git a/contrib/clojure-package/examples/neural-style/project.clj b/contrib/clojure-package/examples/neural-style/project.clj index 32a640be1490..986121605275 100644 --- a/contrib/clojure-package/examples/neural-style/project.clj +++ b/contrib/clojure-package/examples/neural-style/project.clj @@ -20,6 +20,6 @@ :plugins [[lein-cljfmt "0.5.7"]] :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"] [origami "4.0.0-3"]] - :main neural-style.core) \ No newline at end of file + :main neural-style.core) diff --git a/contrib/clojure-package/examples/pre-trained-models/project.clj b/contrib/clojure-package/examples/pre-trained-models/project.clj index 07e0f77b5933..e04093ec9567 100644 --- a/contrib/clojure-package/examples/pre-trained-models/project.clj +++ b/contrib/clojure-package/examples/pre-trained-models/project.clj @@ -21,6 +21,6 @@ :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]] :aliases {"predict-image" ["run" "-m" "pre-trained-models.predict-image" ]} :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"] [origami "4.0.0-3"]] :main pre-trained-models.fine-tune) diff --git a/contrib/clojure-package/examples/profiler/project.clj b/contrib/clojure-package/examples/profiler/project.clj index b5c737b521e2..fd8b63e059d9 100644 --- a/contrib/clojure-package/examples/profiler/project.clj +++ b/contrib/clojure-package/examples/profiler/project.clj @@ -18,5 +18,5 @@ (defproject profiler "0.1.0-SNAPSHOT" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main profiler.core) diff --git a/contrib/clojure-package/examples/rnn/project.clj b/contrib/clojure-package/examples/rnn/project.clj index ffbae5da3dd4..fe1d2f2e7c7b 100644 --- a/contrib/clojure-package/examples/rnn/project.clj +++ b/contrib/clojure-package/examples/rnn/project.clj @@ -19,5 +19,5 @@ :description "RNN example" :plugins 
[[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main rnn.train-char-rnn) diff --git a/contrib/clojure-package/examples/tutorial/project.clj b/contrib/clojure-package/examples/tutorial/project.clj index 6bb77bd753d1..a48a91e17dba 100644 --- a/contrib/clojure-package/examples/tutorial/project.clj +++ b/contrib/clojure-package/examples/tutorial/project.clj @@ -19,7 +19,7 @@ :description "MXNET tutorials" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"] ;; Uncomment the one appropriate for your machine & configuration: #_[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.4.0"] diff --git a/contrib/clojure-package/examples/visualization/project.clj b/contrib/clojure-package/examples/visualization/project.clj index dae61919716e..1492b4138ca0 100644 --- a/contrib/clojure-package/examples/visualization/project.clj +++ b/contrib/clojure-package/examples/visualization/project.clj @@ -19,5 +19,5 @@ :description "Visualization example" :plugins [[lein-cljfmt "0.5.7"]] :dependencies [[org.clojure/clojure "1.9.0"] - [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.6.0-SNAPSHOT"]] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.7.0-SNAPSHOT"]] :main visualization.core) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj index 672090a899b3..e94a59879466 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj @@ -96,30 +96,30 @@ ([] (ada-delta {}))) -(s/def gamma1 number?) -(s/def gamma2 number?) -(s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::gamma1 ::gamma2 ::wd ::clip-gradient])) +(s/def rho number?) +(s/def momentum number?) +(s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::rho ::momentum ::wd ::clip-gradient])) (defn rms-prop "RMSProp optimizer as described in Tieleman & Hinton, 2012. http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. - learningRate Step size. - - gamma1 decay factor of moving average for gradient, gradient^^2. - - gamma2 momentum factor of moving average for gradient. - - rescale-gradient rescaling factor of gradient. - - wd L2 regularization coefficient add to all the weights - - clip-gradient clip gradient in range [-clip_gradient, clip_gradient] - - lr-scheduler The learning rate scheduler" - ([{:keys [learning-rate rescale-gradient gamma1 gamma2 wd lr-scheduler clip-gradient] :as opts + - rho decay factor of moving average for gradient, gradient^^2. + - momentum momentum factor of moving average for gradient. + - rescale-gradient rescaling factor of gradient. + - wd L2 regularization coefficient add to all the weights + - clip-gradient clip gradient in range [-clip_gradient, clip_gradient] + - lr-scheduler The learning rate scheduler" + ([{:keys [learning-rate rescale-gradient rho momentum wd lr-scheduler clip-gradient] :as opts :or {learning-rate 0.002 rescale-gradient 1.0 - gamma1 0.95 - gamma2 0.9 + rho 0.95 + momentum 0.9 wd 0.0 clip-gradient 0}}] (util/validate! 
::rms-prop-opts opts "Incorrect rms-prop optimizer options") - (new RMSProp (float learning-rate) (float rescale-gradient) (float gamma1) - (float gamma2) (float wd) lr-scheduler (float clip-gradient))) + (new RMSProp (float learning-rate) (float rescale-gradient) (float rho) + (float momentum) (float wd) lr-scheduler (float clip-gradient))) ([] (rms-prop {}))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj index 599a0672bea5..f2413dc91101 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj @@ -50,7 +50,7 @@ (is (thrown? Exception (optimizer/dcasgd {:lambda 'a}))) (is (thrown? Exception (optimizer/nag {:momentum 'a}))) (is (thrown? Exception (optimizer/ada-delta {:epsilon 'a}))) - (is (thrown? Exception (optimizer/rms-prop {:gamma1 'a}))) + (is (thrown? Exception (optimizer/rms-prop {:rho 'a}))) (is (thrown? Exception (optimizer/ada-grad {:rescale-gradient 'a}))) (is (thrown? Exception (optimizer/adam {:beta1 'a}))) (is (thrown? Exception (optimizer/sgld {:lr-scheduler 0.1})))) \ No newline at end of file diff --git a/cpp-package/.gitignore b/cpp-package/.gitignore new file mode 100644 index 000000000000..51453c9b8423 --- /dev/null +++ b/cpp-package/.gitignore @@ -0,0 +1,2 @@ +# Rebuildable file(s) +include/mxnet-cpp/op.h diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index 94e9455c5941..3d1b91d729e2 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -553,7 +553,7 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch } start_epoch++; - Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + Optimizer* opt = OptimizerRegistry::Find("sgd"); // opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) // ->SetParam("clip_gradient", 10); diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 54be0edccc14..3e34dbb486ab 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -136,7 +136,7 @@ class Lenet { // args_map["fc1_b"] = 0; lenet.InferArgsMap(ctx_dev, &args_map, args_map); - Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + Optimizer* opt = OptimizerRegistry::Find("sgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0) ->SetParam("clip_gradient", 10) diff --git a/cpp-package/include/mxnet-cpp/.gitignore b/cpp-package/include/mxnet-cpp/.gitignore deleted file mode 100644 index 995efdd6f07b..000000000000 --- a/cpp-package/include/mxnet-cpp/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Rebuildable file(s) -op.h diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp index 26fd00f3a162..b259c7bba61d 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.hpp +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -128,7 +128,6 @@ inline Optimizer* OptimizerRegistry::Find(const std::string& name) { if (cmap().empty()) { // Optimizers should only be registered once MXNETCPP_REGISTER_OPTIMIZER(sgd, SGDOptimizer); - MXNETCPP_REGISTER_OPTIMIZER(ccsgd, SGDOptimizer); // For backward compatibility MXNETCPP_REGISTER_OPTIMIZER(rmsprop, RMSPropOptimizer); MXNETCPP_REGISTER_OPTIMIZER(adam, AdamOptimizer); MXNETCPP_REGISTER_OPTIMIZER(adagrad, AdaGradOptimizer); @@ -271,8 +270,8 @@ inline RMSPropOptimizer::RMSPropOptimizer(unsigned begin_num_update) : 
Optimizer(begin_num_update) { update_handle_ = op_map()->GetSymbolCreator("rmsprop_update"); alex_update_handle_ = op_map()->GetSymbolCreator("rmspropalex_update"); - SetParam("gamma1", 0.9f); - SetParam("gamma2", 0.9f); + SetParam("rho", 0.9f); + SetParam("momentum", 0.9f); SetParam("epsilon", 1e-8); } diff --git a/docs/python_docs/python/tutorials/packages/optimizer/index.md b/docs/python_docs/python/tutorials/packages/optimizer/index.md index 3ae15121a9a4..b68848b8760f 100644 --- a/docs/python_docs/python/tutorials/packages/optimizer/index.md +++ b/docs/python_docs/python/tutorials/packages/optimizer/index.md @@ -181,10 +181,10 @@ Here is an example snippet creating the RMSProp optimizer in MXNet. ```python -rmsprop_optimizer = optimizer.RMSProp(learning_rate=0.001, gamma1=0.9, gamma2=0.9, epsilon=1e-07, centered=False) +rmsprop_optimizer = optimizer.RMSProp(learning_rate=0.001, rho=0.9, momentum=0.9, epsilon=1e-07, centered=False) ``` -In the code snippet above, `gamma1` is $\beta$ in the equations above and `gamma2` is $\gamma$, which is only used where `centered=True`. +In the code snippet above, `rho` is $\beta$ in the equations above and `momentum` is $\gamma$, which is only used where `centered=True`. ### [AdaDelta](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.AdaDelta) @@ -281,32 +281,6 @@ Here is how to create the signum optimizer in MXNet. signum_optimizer = optimizer.Signum(learning_rate=0.01, momentum=0.9, wd_lh=0.0) ``` -### [LBSGD](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.LBSGD) -LBSGD stands for Large Batch Stochastic Gradient Descent and implements a technique where Layer-wise Adaptive Rate Scaling (LARS) is used to maintain a separate learning rate for each layer of the neural network. LBSGD has no additional modifications to SGD and performs the same parameter update steps as the SGD optimizer described above. - -LBSGD was introduced by [You et al](https://arxiv.org/pdf/1708.03888.pdf) for distributed training with data-parallel synchronous SGD across multiple worker nodes to overcome the issue of reduced model accuracy when the number of workers, and by extension effective batch size, is increased. - -Here is how to initialize the LBSGD optimizer in MXNet. - - -```python -lbsgd_optimizer = optimizer.LBSGD(momentum=0.0, - multi_precision=False, - warmup_strategy='linear', - warmup_epochs=5, - batch_scale=1, - updates_per_epoch=32, - begin_epoch=0, - num_epochs=60) -``` - -LBSGD has a number of extra keyword arguments described below -* `multi_precision` - When True performs updates with float32 precision weights regardless of whether weights are initialized with lower precision. When False perform updates with same precision as the weights when initialized. Set to True to improve performance when training with low precision weight represenations. -* `warmup_strategy` - The warmup is period where the learning rate is increased through the first few epochs. The following strategies are supported: ['linear', 'power2', 'sqrt','lars'] -* `warmup_epochs` - How many epochs to perform warmup for -* `batch_scale` - use batch size*numworkers -* `updates_per_epoch` - How many updates to the learning rate to perform every epoch. For example during warmup the warmup strategy is applied to increase the learning rate a total of `warmup_epochs*updates_per_epoch` number of times. -* `begin_epoch` - The epoch at which to start warmup. 
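A minimal sketch of the keyword rename documented above, assuming only the argument names change (the update rule itself is untouched); the Gluon `Trainer` line is illustrative, not part of this patch:

```python
import mxnet as mx

# Pre-rename spelling (no longer documented):
#   mx.optimizer.RMSProp(learning_rate=0.001, gamma1=0.9, gamma2=0.9)
# Renamed spelling, matching the updated snippet:
opt = mx.optimizer.RMSProp(learning_rate=0.001, rho=0.9, momentum=0.9,
                           epsilon=1e-07, centered=False)

# The same keyword names apply when the optimizer is created by name,
# e.g. through a Gluon Trainer:
#   trainer = mx.gluon.Trainer(net.collect_params(), 'rmsprop',
#                              {'learning_rate': 0.001, 'rho': 0.9, 'momentum': 0.9})
```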
### [DCASGD](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.DCASGD) diff --git a/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md b/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md index 81db75ea9038..08901aef9daf 100644 --- a/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md +++ b/docs/static_site/src/_includes/get_started/linux/python/cpu/pip.md @@ -1,6 +1,6 @@ Run the following command: -
+
{% highlight bash %} $ pip install mxnet {% endhighlight %} @@ -13,7 +13,7 @@ in the MXNet tuning guide. $ pip install mxnet-mkl {% endhighlight %} -
+
@@ -103,7 +103,7 @@ $ pip install mxnet==0.11.0
{% highlight bash %} -$ pip install mxnet --pre +$ pip install --pre mxnet -f https://dist.mxnet.io/python/all {% endhighlight %} MKL-DNN enabled pip packages are optimized for Intel hardware. You can find @@ -111,11 +111,11 @@ performance numbers in the MXNet tuning guide. {% highlight bash %} -$ pip install mxnet-mkl --pre +$ pip install --pre mxnet-mkl -f https://dist.mxnet.io/python/all {% endhighlight %}

-{% include /get_started/pip_snippet.md %} \ No newline at end of file +{% include /get_started/pip_snippet.md %} diff --git a/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md b/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md index 249cd5b54052..8848edde7a7d 100644 --- a/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md +++ b/docs/static_site/src/_includes/get_started/linux/python/gpu/pip.md @@ -1,11 +1,11 @@ Run the following command: -
+
{% highlight bash %} $ pip install mxnet-cu101 {% endhighlight %} -
+
{% highlight bash %} @@ -63,7 +63,7 @@ $ pip install mxnet-cu80==0.11.0
{% highlight bash %} -$ pip install mxnet-cu101 --pre +$ pip install --pre mxnet-cu102 -f https://dist.mxnet.io/python/all {% endhighlight %}
@@ -71,4 +71,4 @@ $ pip install mxnet-cu101 --pre
{% include /get_started/pip_snippet.md %} -{% include /get_started/gpu_snippet.md %} \ No newline at end of file +{% include /get_started/gpu_snippet.md %} diff --git a/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md b/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md index beb5eb4fb797..35c3b78b08f8 100644 --- a/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md +++ b/docs/static_site/src/_includes/get_started/macos/python/cpu/pip.md @@ -1,11 +1,11 @@ Run the following command: -
+
{% highlight bash %} $ pip install mxnet {% endhighlight %} -
+
{% highlight bash %} @@ -65,9 +65,9 @@ $ pip install mxnet==0.11.0
{% highlight bash %} -$ pip install mxnet --pre +$ pip install --pre mxnet -f https://dist.mxnet.io/python/all {% endhighlight %}
-{% include /get_started/pip_snippet.md %} \ No newline at end of file +{% include /get_started/pip_snippet.md %} diff --git a/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md b/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md index d5c7f1fd08f0..7061794f1d2f 100644 --- a/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md +++ b/docs/static_site/src/_includes/get_started/windows/python/cpu/pip.md @@ -1,12 +1,12 @@ Run the following command: -
+
{% highlight bash %} $ pip install mxnet {% endhighlight %} -
+
{% highlight bash %} @@ -64,10 +64,10 @@ $ pip install mxnet==0.11.0
{% highlight bash %} -$ pip install mxnet --pre +$ pip install --pre mxnet -f https://dist.mxnet.io/python/all {% endhighlight %}
{% include /get_started/pip_snippet.md %} -{% include /get_started/gpu_snippet.md %} \ No newline at end of file +{% include /get_started/gpu_snippet.md %} diff --git a/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md b/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md index cbcd9d44d6af..194a5a3220c5 100644 --- a/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md +++ b/docs/static_site/src/_includes/get_started/windows/python/gpu/pip.md @@ -1,12 +1,12 @@ Run the following command: -
+
{% highlight bash %} $ pip install mxnet-cu101 {% endhighlight %} -
+
{% highlight bash %} @@ -64,11 +64,11 @@ $ pip install mxnet-cu80==0.11.0
{% highlight bash %} -$ pip install mxnet-cu101 --pre +$ pip install --pre mxnet-cu102 -f https://dist.mxnet.io/python/all {% endhighlight %}
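The pre-release commands above now point pip at the nightly wheel index; a quick way to confirm which build was actually installed (standard MXNet introspection, not something this patch adds; `mx.runtime.Features()` assumes a 1.5+ wheel):

```python
import mxnet as mx

# Version string of the installed wheel, e.g. '1.7.0' for a current nightly.
print(mx.__version__)

# Compile-time feature flags (CUDA, CUDNN, MKLDNN, ...) of that wheel.
print(mx.runtime.Features())
```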
{% include /get_started/pip_snippet.md %} -{% include /get_started/gpu_snippet.md %} \ No newline at end of file +{% include /get_started/gpu_snippet.md %} diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 8e8b0197960a..38ca296cf986 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -235,7 +235,7 @@ def fit(args, network, data_loader, **kwargs): 'multi_precision': True} # Only a limited number of optimizers have 'momentum' property - has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'} + has_momentum = {'sgd', 'dcasgd', 'nag', 'signum'} if args.optimizer in has_momentum: optimizer_params['momentum'] = args.mom @@ -243,7 +243,7 @@ def fit(args, network, data_loader, **kwargs): args.monitor, pattern=".*") if args.monitor > 0 else None # A limited number of optimizers have a warmup period - has_warmup = {'lbsgd', 'lbnag'} + has_warmup = {'lbnag'} if args.optimizer in has_warmup: nworkers = kv.num_workers if epoch_size < 1: diff --git a/example/profiler/profiler_executor.py b/example/profiler/profiler_executor.py index 91532535bd05..cba1515fa1a1 100644 --- a/example/profiler/profiler_executor.py +++ b/example/profiler/profiler_executor.py @@ -102,7 +102,7 @@ def get_module(ctx, sym, provide_data, provide_label, batch_size=None, is_train= mod.bind(data_shapes=provide_data, label_shapes=provide_label, for_training=False, inputs_need_grad=False) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - mod.init_optimizer(optimizer='ccsgd', + mod.init_optimizer(optimizer='sgd', optimizer_params={ 'learning_rate': 0.0001, 'momentum': 0.0, diff --git a/example/speech_recognition/deepspeech.cfg b/example/speech_recognition/deepspeech.cfg index 69894ae7d640..387d4f31eb2b 100644 --- a/example/speech_recognition/deepspeech.cfg +++ b/example/speech_recognition/deepspeech.cfg @@ -112,7 +112,7 @@ optimizer_params_dictionary={"momentum":0.9} # adagrad # optimizer_params_dictionary={"eps":1e-08} # rmsprop -# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08} +# optimizer_params_dictionary={"rho":0.9, "momentum":0.9,"epsilon":1e-08} # adadelta # optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08} # set to 0 to disable gradient clipping diff --git a/example/speech_recognition/default.cfg b/example/speech_recognition/default.cfg index b0869a9dad2e..f583da11c61a 100644 --- a/example/speech_recognition/default.cfg +++ b/example/speech_recognition/default.cfg @@ -109,7 +109,7 @@ optimizer_params_dictionary={"beta1":0.9,"beta2":0.999} # adagrad # optimizer_params_dictionary={"eps":1e-08} # rmsprop -# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08} +# optimizer_params_dictionary={"rho":0.9, "momentum":0.9,"epsilon":1e-08} # adadelta # optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08} # set to 0 to disable gradient clipping diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 4e47e03c69f8..47852953655a 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -73,7 +73,7 @@ /*! \brief major version */ #define MXNET_MAJOR 1 /*! \brief minor version */ -#define MXNET_MINOR 6 +#define MXNET_MINOR 7 /*! \brief patch version */ #define MXNET_PATCH 0 /*! 
\brief mxnet version */ diff --git a/include/mxnet/ir/expr.h b/include/mxnet/ir/expr.h index b9483c74320a..a9f4ff2bbf70 100644 --- a/include/mxnet/ir/expr.h +++ b/include/mxnet/ir/expr.h @@ -141,7 +141,7 @@ class IntImmNode : public PrimExprNode { int64_t value; static constexpr const char* _type_key = "IntImm"; - MXNET_DECLARE_FINAL_OBJECT_INFO(IntImmNode, PrimExprNode); + MXNET_DECLARE_FINAL_OBJECT_INFO(IntImmNode, PrimExprNode) }; /*! @@ -186,7 +186,7 @@ class FloatImmNode : public PrimExprNode { double value; static constexpr const char* _type_key = "FloatImm"; - MXNET_DECLARE_FINAL_OBJECT_INFO(FloatImmNode, PrimExprNode); + MXNET_DECLARE_FINAL_OBJECT_INFO(FloatImmNode, PrimExprNode) }; /*! diff --git a/include/mxnet/node/container.h b/include/mxnet/node/container.h index 27b9853a74b7..e164f64a9184 100644 --- a/include/mxnet/node/container.h +++ b/include/mxnet/node/container.h @@ -42,7 +42,7 @@ class ArrayNode : public Object { std::vector data; static constexpr const char* _type_key = "Array"; - MXNET_DECLARE_FINAL_OBJECT_INFO(ArrayNode, Object); + MXNET_DECLARE_FINAL_OBJECT_INFO(ArrayNode, Object) }; /*! diff --git a/include/mxnet/runtime/container.h b/include/mxnet/runtime/container.h index 3dd7e0fc9c79..cd719aaa51a6 100644 --- a/include/mxnet/runtime/container.h +++ b/include/mxnet/runtime/container.h @@ -173,7 +173,7 @@ class ADTObj : public Object, public InplaceArrayBase { static constexpr const uint32_t _type_index = TypeIndex::kMXNetADT; static constexpr const char* _type_key = "MXNet.ADT"; - MXNET_DECLARE_FINAL_OBJECT_INFO(ADTObj, Object); + MXNET_DECLARE_FINAL_OBJECT_INFO(ADTObj, Object) private: /*! @@ -273,7 +273,7 @@ class ADT : public ObjectRef { return ADT(0, std::forward(args)...); } - MXNET_DEFINE_OBJECT_REF_METHODS(ADT, ObjectRef, ADTObj); + MXNET_DEFINE_OBJECT_REF_METHODS(ADT, ObjectRef, ADTObj) }; } // namespace runtime diff --git a/include/mxnet/runtime/ffi_helper.h b/include/mxnet/runtime/ffi_helper.h index b539524dfd05..49134ca122a7 100644 --- a/include/mxnet/runtime/ffi_helper.h +++ b/include/mxnet/runtime/ffi_helper.h @@ -38,7 +38,7 @@ class EllipsisObj : public Object { public: static constexpr const uint32_t _type_index = TypeIndex::kEllipsis; static constexpr const char* _type_key = "MXNet.Ellipsis"; - MXNET_DECLARE_FINAL_OBJECT_INFO(EllipsisObj, Object); + MXNET_DECLARE_FINAL_OBJECT_INFO(EllipsisObj, Object) }; inline ObjectRef CreateEllipsis() { @@ -54,7 +54,7 @@ class SliceObj : public Object { static constexpr const uint32_t _type_index = TypeIndex::kSlice; static constexpr const char* _type_key = "MXNet.Slice"; - MXNET_DECLARE_FINAL_OBJECT_INFO(SliceObj, Object); + MXNET_DECLARE_FINAL_OBJECT_INFO(SliceObj, Object) }; class Slice : public ObjectRef { @@ -74,7 +74,7 @@ class Slice : public ObjectRef { // constant to represent None. 
static constexpr int64_t kNoneValue = std::numeric_limits::min(); - MXNET_DEFINE_OBJECT_REF_METHODS(Slice, ObjectRef, SliceObj); + MXNET_DEFINE_OBJECT_REF_METHODS(Slice, ObjectRef, SliceObj) }; int64_t inline SliceNoneValue() { @@ -86,7 +86,7 @@ class IntegerObj: public Object { int64_t value; static constexpr const uint32_t _type_index = TypeIndex::kInteger; static constexpr const char* _type_key = "MXNet.Integer"; - MXNET_DECLARE_FINAL_OBJECT_INFO(IntegerObj, Object); + MXNET_DECLARE_FINAL_OBJECT_INFO(IntegerObj, Object) }; class Integer: public ObjectRef { @@ -96,7 +96,7 @@ class Integer: public ObjectRef { data->value = value; data_ = std::move(data); } - MXNET_DEFINE_OBJECT_REF_METHODS(Integer, ObjectRef, IntegerObj); + MXNET_DEFINE_OBJECT_REF_METHODS(Integer, ObjectRef, IntegerObj) }; // Helper functions for fast FFI implementations diff --git a/include/mxnet/runtime/object.h b/include/mxnet/runtime/object.h index e2fb067f1067..a031a56d88ed 100644 --- a/include/mxnet/runtime/object.h +++ b/include/mxnet/runtime/object.h @@ -644,22 +644,20 @@ struct ObjectEqual { * \param TypeName The name of the current type. * \param ParentType The name of the ParentType */ -#define MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ - static const uint32_t RuntimeTypeIndex() { \ - if (TypeName::_type_index != ::mxnet::runtime::TypeIndex::kDynamic) { \ - return TypeName::_type_index; \ - } \ - return _GetOrAllocRuntimeTypeIndex(); \ - } \ - static const uint32_t _GetOrAllocRuntimeTypeIndex() { \ - static uint32_t tidx = GetOrAllocRuntimeTypeIndex( \ - TypeName::_type_key, \ - TypeName::_type_index, \ - ParentType::_GetOrAllocRuntimeTypeIndex(), \ - TypeName::_type_child_slots, \ - TypeName::_type_child_slots_can_overflow); \ - return tidx; \ - } \ +#define MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ + static uint32_t RuntimeTypeIndex() { \ + return TypeName::_type_index != ::mxnet::runtime::TypeIndex::kDynamic ? \ + TypeName::_type_index : _GetOrAllocRuntimeTypeIndex(); \ + } \ + static uint32_t _GetOrAllocRuntimeTypeIndex() { \ + static uint32_t tidx = GetOrAllocRuntimeTypeIndex( \ + TypeName::_type_key, \ + TypeName::_type_index, \ + ParentType::_GetOrAllocRuntimeTypeIndex(), \ + TypeName::_type_child_slots, \ + TypeName::_type_child_slots_can_overflow); \ + return tidx; \ + } /*! * \brief helper macro to declare type information in a final class. 
@@ -667,8 +665,8 @@ struct ObjectEqual { * \param ParentType The name of the ParentType */ #define MXNET_DECLARE_FINAL_OBJECT_INFO(TypeName, ParentType) \ - static const constexpr bool _type_final = true; \ - static const constexpr int _type_child_slots = 0; \ + static const constexpr bool _type_final = true; \ + static const constexpr int _type_child_slots = 0; \ MXNET_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ @@ -684,25 +682,25 @@ struct ObjectEqual { #define MXNET_DEFINE_OBJECT_REF_METHODS(TypeName, ParentType, ObjectName) \ - TypeName() {} \ - explicit TypeName( \ - ::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) \ - : ParentType(n) {} \ - const ObjectName* operator->() const { \ - return static_cast(data_.get()); \ - } \ - operator bool() const { return data_ != nullptr; } \ + TypeName() {} \ + explicit TypeName( \ + ::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) \ + : ParentType(n) {} \ + const ObjectName* operator->() const { \ + return static_cast(data_.get()); \ + } \ + operator bool() const { return data_ != nullptr; } \ using ContainerType = ObjectName; #define MXNET_DEFINE_OBJECT_REF_METHODS_MUT(TypeName, ParentType, ObjectName) \ - TypeName() {} \ - explicit TypeName( \ - ::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) \ - : ParentType(n) {} \ - ObjectName* operator->() { \ - return static_cast(data_.get()); \ - } \ - operator bool() const { return data_ != nullptr; } \ + TypeName() {} \ + explicit TypeName( \ + ::mxnet::runtime::ObjectPtr<::mxnet::runtime::Object> n) \ + : ParentType(n) {} \ + ObjectName* operator->() { \ + return static_cast(data_.get()); \ + } \ + operator bool() const { return data_ != nullptr; } \ using ContainerType = ObjectName; // Implementations details below diff --git a/julia/NEWS.md b/julia/NEWS.md index 8f5efbb0edd5..9b2fe8210b18 100644 --- a/julia/NEWS.md +++ b/julia/NEWS.md @@ -15,6 +15,9 @@ +# v1.7.0 + + # v1.6.0 * Add an abstract type `AbstractMXError` as the parent type for all MXNet-related diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm index 7e78cd384220..e4d8b5abde0b 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm @@ -1037,12 +1037,13 @@ method update($index, $weight, $grad, $state) } else { + $grad += $wd * $weight; my $mom = $state; $mom *= $self->momentum; - $grad += $wd * $weight; - $mom += $grad; + $mom -= $lr * $grad; + $grad *= -$lr; $grad += $self->momentum * $mom; - $weight += -$lr * $grad; + $weight += $grad; } } else @@ -1061,11 +1062,12 @@ method update($index, $weight, $grad, $state) } else { + $grad32 += $wd * $weight32; $mom *= $self->momentum; - $grad32 += $wd * $weight32; - $mom += $grad32; + $mom -= $lr * $grad32; + $grad32 *= -$lr; $grad32 += $self->momentum * $mom; - $weight32 += -$lr * $grad32; + $weight32 += $grad32; } my $tmp = $weight32->astype($weight->dtype); $tmp->copyto($weight); @@ -1276,7 +1278,7 @@ __PACKAGE__->register; rescale_grad : Num, optional rescaling factor of gradient. Normally should be 1/batch_size. - eps: Num, optional + epsilon: Num, optional A small float number to make the updating processing stable Default value is set to 1e-7. 
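The renames in this Perl binding mirror the Python-side optimizer API changes elsewhere in the PR: AdaGrad's `eps` becomes `epsilon`, and RMSProp's `gamma1`/`gamma2` become `rho`/`momentum`. A hedged usage sketch of the renamed keyword arguments on the Python side, with defaults taken from the docstrings in this diff:

```python
import mxnet as mx

# RMSProp: decay of the squared-gradient average is now `rho`,
# the centered-variant factor is now `momentum`.
rmsprop = mx.optimizer.RMSProp(learning_rate=0.001, rho=0.9,
                               momentum=0.9, epsilon=1e-8, centered=True)

# AdaGrad: the stability constant is now `epsilon` instead of `eps`.
adagrad = mx.optimizer.AdaGrad(learning_rate=0.01, epsilon=1e-6)
```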
@@ -1288,7 +1290,7 @@ use Mouse; extends 'AI::MXNet::Optimizer'; -has 'eps' => (is => "rw", isa => "Num", default => 1e-7); +has 'epsilon' => (is => "rw", isa => "Num", default => 1e-7); method create_state(Index $index, AI::MXNet::NDArray $weight) { @@ -1314,7 +1316,7 @@ method update( if($is_sparse) { my %kwargs = ( - epsilon => $self->eps, + epsilon => $self->epsilon, rescale_grad => $self->rescale_grad ); if($self->clip_gradient) @@ -1330,9 +1332,10 @@ method update( { $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; $history += $grad->square; - my $div = $grad / ($history + $self->eps)->sqrt; - $weight += ($div + $weight * $wd) * -$lr; + my $div = $grad / (($history)->sqrt + $self->epsilon); + $weight += $div * -$lr; } } @@ -1359,11 +1362,10 @@ __PACKAGE__->register; learning_rate : Num, optional Step size. Default value is set to 0.001. - gamma1: Num, optional + rho: Num, optional decay factor of moving average for gradient^2. Default value is set to 0.9. - gamma2: Num, optional - "momentum" factor. + momentum: Num, optional Default value if set to 0.9. Only used if centered=True epsilon : Num, optional @@ -1386,8 +1388,8 @@ use Mouse; extends 'AI::MXNet::Optimizer'; has '+learning_rate' => (default => 0.001); -has 'gamma1' => (is => "ro", isa => "Num", default => 0.9); -has 'gamma2' => (is => "ro", isa => "Num", default => 0.9); +has 'rho' => (is => "ro", isa => "Num", default => 0.9); +has 'momentum' => (is => "ro", isa => "Num", default => 0.9); has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); has 'centered' => (is => "ro", isa => "Bool", default => 0); has 'clip_weights' => (is => "ro", isa => "Num"); @@ -1397,12 +1399,12 @@ sub BUILD { my $self = shift; $self->kwargs({ - gamma1 => $self->gamma1, + rho => $self->rho, epsilon => $self->epsilon }); if($self->centered) { - $self->kwargs->{gamma2} = $self->gamma2; + $self->kwargs->{momentum} = $self->momentum; } if($self->clip_gradient) { @@ -1461,7 +1463,7 @@ method update( if($self->centered) { AI::MXNet::NDArray->rmspropalex_update( - $weight, $grad, $n, $g, $delta, + $weight, $grad, $g, $n, $delta, { out => $weight, lr => $lr, diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t index af3e54e554f3..26a87cdd75ba 100644 --- a/perl-package/AI-MXNet/t/test_optimizers.t +++ b/perl-package/AI-MXNet/t/test_optimizers.t @@ -76,11 +76,12 @@ method update($index, $weight, $grad, $state) my $t = $self->_index_update_count->{$index}; my ($mean, $variance) = @$state; my $wd = $self->_get_wd($index); - $grad = $grad * $self->rescale_grad + $wd * $weight; + $grad = $grad * $self->rescale_grad; if($self->clip_gradient) { mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient, out => $grad); } + $grad += $wd * $weight; $mean *= $self->beta1; $mean += $grad * (1 - $self->beta1); @@ -109,11 +110,10 @@ method update($index, $weight, $grad, $state) learning_rate : float, optional Step size. Default value is set to 0.001. - gamma1: float, optional + rho: float, optional decay factor of moving average for gradient, gradient^2. Default value is set to 0.9. - gamma2: float, optional - "momentum" factor. + momentum: float, optional Default value if set to 0.9. 
Only used if centered=True epsilon : float, optional @@ -134,8 +134,8 @@ package PerlRMSProp; use Mouse; extends 'AI::MXNet::Optimizer'; has '+learning_rate' => (default => 0.001); -has 'gamma1' => (is => "ro", isa => "Num", default => 0.9); -has 'gamma2' => (is => "ro", isa => "Num", default => 0.9); +has 'rho' => (is => "ro", isa => "Num", default => 0.9); +has 'momentum' => (is => "ro", isa => "Num", default => 0.9); has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); has 'centered' => (is => "ro", isa => "Bool", default => 0); has 'clip_weights' => (is => "ro", isa => "Num"); @@ -174,7 +174,7 @@ method update($index, $weight, $grad, $state) my $lr = $self->_get_lr($index); my $wd = $self->_get_wd($index); $self->_update_count($index); - $grad = $grad * $self->rescale_grad + $wd * $weight; + $grad = $grad * $self->rescale_grad; if(not $self->centered) { my ($n) = @$state; @@ -182,8 +182,9 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } - $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n; - $weight -= $lr * $grad/(mx->nd->sqrt($n + $self->epsilon)); + $grad += $wd * $weight; + $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; + $weight -= $lr * $grad/(mx->nd->sqrt($n) + $self->epsilon); } else { @@ -192,9 +193,10 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } - $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n; - $g .= (1 - $self->gamma1) * $grad + $self->gamma1 * $g; - $delta .= ($self->gamma2) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon)); + $grad += $wd * $weight; + $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; + $g .= (1 - $self->rho) * $grad + $self->rho * $g; + $delta .= ($self->momentum) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon)); $weight += $delta; } if($self->clip_weights) @@ -443,12 +445,13 @@ method update($index, $weight, $grad, $state) } else { + $grad += $wd * $weight; my $mom = $state; $mom *= $self->momentum; - $grad += $wd * $weight; - $mom += $grad; + $mom -= $lr * $grad; + $grad *= -$lr; $grad += $self->momentum * $mom; - $weight += -$lr * $grad; + $weight += $grad; } } else @@ -467,11 +470,12 @@ method update($index, $weight, $grad, $state) } else { - $mom *= $self->momentum; $grad32 += $wd * $weight32; - $mom += $grad32; + $mom *= $self->momentum; + $mom -= $lr * $grad32; + $grad32 *= -$lr; $grad32 += $self->momentum * $mom; - $weight32 += -$lr * $grad32; + $weight32 += $grad32; } my $tmp = $weight32->astype($weight->dtype); $tmp->copyto($weight); @@ -499,11 +503,12 @@ method update($index, $weight, $grad, $state) my $wd = $self->_get_wd($index); my $t = $self->_index_update_count->{$index}; - my $grad = $grad * $self->rescale_grad + $wd * $weight; + my $grad = $grad * $self->rescale_grad; if(defined $self->clip_gradient) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; # get previous states my ($prev_d, $prev_v, $prev_z) = @{ $state }; # compute states @@ -604,8 +609,8 @@ method update($index, $weight, $grad, $state) $n->at($row) += $grad_row * $grad_row; # update weight - $weight->at($row) .= (mx->nd->sign($dn->at($row)) * $self->lamda1 - $dn->at($row)) / - (($self->beta + mx->nd->sqrt($n->at($row))) / $lr + $wd) * (mx->nd->abs($dn->at($row)) > $self->lamda1); + $weight->at($row) .= - mx->nd->sign($dn->at($row)) * (mx->nd->abs($dn->at($row)) - 
$self->lamda1)->maximum(0) / + (($self->beta + mx->nd->sqrt($n->at($row))) / $lr + $wd); } } @@ -613,7 +618,7 @@ package PerlAdaGrad; use Mouse; extends 'AI::MXNet::Optimizer'; -has 'eps' => (is => 'rw', default => 1e-7); +has 'epsilon' => (is => 'rw', default => 1e-7); method create_state($index, $weight) { mx->nd->zeros($weight->shape, ctx => $weight->context, stype => $weight->stype); @@ -631,9 +636,10 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; $history += mx->nd->square($grad); - my $div = $grad / mx->nd->sqrt($history + $self->eps); - $weight += ($div + $weight * $wd) * -$lr; + my $div = $grad / (mx->nd->sqrt($history) + $self->epsilon); + $weight -= $lr * $div; } package main; @@ -1052,7 +1058,7 @@ sub test_adagrad my $opt1 = 'PerlAdaGrad'; my $opt2 = mx->optimizer->AdaGrad; my $shape = [3, 4, 5]; - my @eps_options= ({}, {eps => 1e-9}); + my @eps_options= ({}, {epsilon => 1e-9}); my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); my @wd_options = ({}, {wd => 0}); @@ -1072,11 +1078,11 @@ sub test_adagrad %kwarg = (%kwarg, %$rg_option); %kwarg = (%kwarg, %$wd_option); compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - if(($wd_option->{wd}//0) == 0) - { - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'default', 'row_sparse'); - } + if($wd_option->{wd} == 0) + { + compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); + compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'default', 'row_sparse'); + } } } } diff --git a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py index 0f695a1b2ff0..fba99a0434d7 100644 --- a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py +++ b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +# pylint: disable=W0223 """A `_SVRGOptimizer` encapsulates two optimizers to support SVRGModule in single machine and distributed settings. Both `_AssignmentOptimizer` and `_SVRGOptimizer` are designed to be used with SVRGModule only. """ @@ -27,24 +29,24 @@ class _AssignmentOptimizer(mx.optimizer.Optimizer): """_AssignmentOptimizer assigns gradients to weights for SVRGModule's full gradients accumulation in the KVStore. It is a helper optimizer that is designed to be used with SVRGModule only. """ - def update(self, index, weight, grad, state): + def update(self, indices, weights, grads, states): """Assign the gradients to weight for accumulating full gradients in the KVStore across all devices and workers. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state: any obj + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. 
Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj AssignmentOptimizer will not need to be associated with state. """ - - weight[:] = grad + for weight, grad in zip(weights, grads): + weight[:] = grad @mx.optimizer.register @@ -98,31 +100,32 @@ def _check_params(**kwargs): return default_params - def update(self, index, weight, grad, state): + def update(self, indices, weights, grads, states): """Updates the given parameter using the corresponding gradient and state. If key contains 'full', update with `_AssignmentOptimizer` otherwise will use default optimizer. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - name = self._check_index(index) + for index, weight, grad, state in zip(indices, weights, grads, states): + name = self._check_index(index) - if "full" in name: - self.aux_opt.update(index, weight, grad, state) - else: - # use the default optimizer - self.default_opt.update(index, weight, grad, state) + if "full" in name: + self.aux_opt.update([index], [weight], [grad], [state]) + else: + # use the default optimizer + self.default_opt.update([index], [weight], [grad], [state]) def create_state(self, index, weight): """Creates auxiliary state for a given weight. diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index fed3c440ac21..303167d8abf2 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,9 +60,11 @@ class Trainer(object): Arguments would then be {'type':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None - Whether to perform parameter updates on kvstore. If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + Whether to perform parameter updates on kvstore. If None and optimizer.aggregate_num <= 1, + then trainer will choose the more suitable option depending on the type of kvstore. + If None and optimizer.aggregate_num > 1, `update_on_kvstore` is set to False. + If the `update_on_kvstore` argument is provided, + environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. 
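The paragraph above is backed by a small piece of resolution logic added to `Trainer.__init__` (see the hunk that follows). A simplified sketch of that logic, paraphrased rather than copied:

```python
def resolve_update_on_kvstore(update_on_kvstore, aggregate_num):
    # An explicit True is rejected once gradients are aggregated locally.
    if aggregate_num > 1 and update_on_kvstore is not None and update_on_kvstore:
        raise ValueError("Cannot set update_on_kvstore=True "
                         "when optimizer.aggregate_num > 1.")
    # Unset + aggregation implies local updates.
    if update_on_kvstore is None and aggregate_num > 1:
        update_on_kvstore = False
    # None still defers to the kvstore-type heuristic later on.
    return update_on_kvstore
```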
Properties ---------- @@ -103,6 +105,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', optimizer_params = optimizer_params if optimizer_params else {} self._init_optimizer(optimizer, optimizer_params) self._scale = self._optimizer.rescale_grad + if self._optimizer.aggregate_num > 1 and update_on_kvstore is not None: + if update_on_kvstore: + raise ValueError("Cannot set update_on_kvstore=True " + "when optimizer.aggregate_num > 1.") + if update_on_kvstore is None and self._optimizer.aggregate_num > 1: + update_on_kvstore = False self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore} self._kv_initialized = False self._kvstore = None @@ -457,8 +465,8 @@ def _update(self, ignore_stale_grad=False): if not (self._kvstore and self._update_on_kvstore): for updater, upd in zip(self._updaters, updates): if upd: - i, w, g = zip(*upd) - updater(i, w, g) + i, g, w = zip(*upd) + updater(i, g, w) def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py index 4ee86c315f2e..30b0d82e810d 100644 --- a/python/mxnet/libinfo.py +++ b/python/mxnet/libinfo.py @@ -141,4 +141,4 @@ def find_conf_path(prefix='tvmop'): # current version -__version__ = "1.6.0" +__version__ = "1.7.0" diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 39a307bec9e1..82b57fb8cc1f 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -38,7 +38,7 @@ 'tensordot', 'eye', 'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'hsplit', 'vsplit', 'dsplit', 'concatenate', 'append', 'stack', 'vstack', 'row_stack', 'column_stack', 'hstack', 'dstack', - 'average', 'mean', 'maximum', 'minimum', 'around', 'round', 'round_', + 'average', 'mean', 'maximum', 'minimum', 'around', 'round', 'round_', 'flatnonzero', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'diag_indices_from', 'hanning', 'hamming', 'blackman', 'flip', 'flipud', 'fliplr', 'hypot', 'bitwise_and', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', @@ -4989,6 +4989,45 @@ def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer- raise NotImplementedError('Do not support column-major (Fortran-style) order at this moment') +def flatnonzero(a): + r""" + Return indices that are non-zero in the flattened version of a. + + This is equivalent to np.nonzero(np.ravel(a))[0]. + + Parameters + ---------- + a : array_like + Input data. + + Returns + ------- + res : ndarray + Output array, containing the indices of the elements of `a.ravel()` + that are non-zero. + + See Also + -------- + nonzero : Return the indices of the non-zero elements of the input array. + ravel : Return a 1-D array containing the elements of the input array. 
+ + Examples + -------- + >>> x = np.arange(-2, 3) + >>> x + array([-2, -1, 0, 1, 2]) + >>> np.flatnonzero(x) + array([0, 1, 3, 4]) + + Use the indices of the non-zero elements as an index array to extract + these elements: + + >>> x.ravel()[np.flatnonzero(x)] + array([-2, -1, 1, 2]) + """ + return nonzero(ravel(a))[0] + + def diag_indices_from(arr): """ This returns a tuple of indices that can be used to access the main diagonal of an array diff --git a/python/mxnet/ndarray/numpy_extension/random.py b/python/mxnet/ndarray/numpy_extension/random.py index 8bd17cf092b0..1ddd28f9e013 100644 --- a/python/mxnet/ndarray/numpy_extension/random.py +++ b/python/mxnet/ndarray/numpy_extension/random.py @@ -165,18 +165,22 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): ctx = current_context() if batch_shape == (): batch_shape = None + else: + if isinstance(batch_shape, int): + batch_shape = (batch_shape,) + batch_shape = (-2,) + batch_shape if input_type == (True, True): - return _npi.uniform_n(low, high, low=None, high=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(low, high, low=None, high=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (False, True): - return _npi.uniform_n(high, low=low, high=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(high, low=low, high=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (True, False): - return _npi.uniform_n(low, low=None, high=high, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(low, low=None, high=high, size=batch_shape, + ctx=ctx, dtype=dtype) else: - return _npi.uniform_n(low=low, high=high, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(low=low, high=high, size=batch_shape, + ctx=ctx, dtype=dtype) def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): @@ -252,15 +256,19 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): ctx = current_context() if batch_shape == (): batch_shape = None + else: + if isinstance(batch_shape, int): + batch_shape = (batch_shape,) + batch_shape = (-2,) + batch_shape if input_type == (True, True): - return _npi.normal_n(loc, scale, loc=None, scale=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(loc, scale, loc=None, scale=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (False, True): - return _npi.normal_n(scale, loc=loc, scale=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(scale, loc=loc, scale=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (True, False): - return _npi.normal_n(loc, loc=None, scale=scale, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(loc, loc=None, scale=scale, size=batch_shape, + ctx=ctx, dtype=dtype) else: - return _npi.normal_n(loc=loc, scale=scale, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(loc=loc, scale=scale, size=batch_shape, + ctx=ctx, dtype=dtype) diff --git a/python/mxnet/numpy/fallback.py b/python/mxnet/numpy/fallback.py index 1e45d8e54cc2..b98d377e4cd2 100644 --- a/python/mxnet/numpy/fallback.py +++ b/python/mxnet/numpy/fallback.py @@ -38,7 +38,6 @@ 'digitize', 'divmod', 'extract', - 'flatnonzero', 'float_power', 'frexp', 'heaviside', @@ -124,7 +123,6 @@ digitize = onp.digitize divmod = onp.divmod extract = onp.extract -flatnonzero = onp.flatnonzero float_power = onp.float_power frexp = onp.frexp heaviside = onp.heaviside diff --git a/python/mxnet/numpy/multiarray.py 
b/python/mxnet/numpy/multiarray.py index 651699460348..3354ce8eb939 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -57,7 +57,7 @@ 'degrees', 'log2', 'log1p', 'rint', 'radians', 'reciprocal', 'square', 'negative', 'histogram', 'fix', 'ceil', 'floor', 'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'append', 'argsort', 'sort', 'tensordot', 'eye', 'linspace', 'logspace', 'expand_dims', 'tile', 'arange', - 'array_split', 'split', 'hsplit', 'vsplit', 'dsplit', + 'array_split', 'split', 'hsplit', 'vsplit', 'dsplit', 'flatnonzero', 'concatenate', 'stack', 'vstack', 'row_stack', 'column_stack', 'hstack', 'dstack', 'average', 'mean', 'maximum', 'minimum', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'insert', 'indices', 'copysign', 'ravel', 'unravel_index', 'diag_indices_from', 'hanning', 'hamming', 'blackman', @@ -190,6 +190,8 @@ def _np_ndarray_cls(handle, writable=True, stype=0): _NUMPY_ARRAY_FUNCTION_DICT = {} _NUMPY_ARRAY_UFUNC_DICT = {} +_FALLBACK_ARRAY_FUNCTION_WARNED_RECORD = {} +_FALLBACK_ARRAY_UFUNC_WARNED_RECORD = {} @set_module('mxnet.numpy') # pylint: disable=invalid-name @@ -263,6 +265,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # pylint: disable= .format(name) onp_op = _get_np_op(name) new_inputs = [arg.asnumpy() if isinstance(arg, ndarray) else arg for arg in inputs] + if onp_op not in _FALLBACK_ARRAY_UFUNC_WARNED_RECORD: + import logging + logging.warning("np.%s is a fallback operator, " + "which is actually using official numpy's implementation", name) + _FALLBACK_ARRAY_UFUNC_WARNED_RECORD[onp_op] = True out = onp_op(*new_inputs, **kwargs) return _as_mx_np_array(out, ctx=inputs[0].ctx) else: @@ -277,6 +284,7 @@ def __array_function__(self, func, types, args, kwargs): # pylint: disable=bad- this function. """ mx_np_func = _NUMPY_ARRAY_FUNCTION_DICT.get(func, None) + func_name = func.__name__ if mx_np_func is None: # try to fallback to official NumPy op if is_recording(): @@ -290,6 +298,11 @@ def __array_function__(self, func, types, args, kwargs): # pylint: disable=bad- new_kwargs = {} for k, v in kwargs.items(): new_kwargs[k] = v.asnumpy() if isinstance(v, ndarray) else v + if func not in _FALLBACK_ARRAY_FUNCTION_WARNED_RECORD: + import logging + logging.warning("np.%s is a fallback operator, " + "which is actually using official numpy's implementation.", func_name) + _FALLBACK_ARRAY_FUNCTION_WARNED_RECORD[func] = True out = func(*new_args, **new_kwargs) return _as_mx_np_array(out, ctx=cur_ctx) else: @@ -6875,6 +6888,45 @@ def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer- return _mx_nd_np.unravel_index(indices, shape, order=order) +def flatnonzero(a): + r""" + Return indices that are non-zero in the flattened version of a. + + This is equivalent to np.nonzero(np.ravel(a))[0]. + + Parameters + ---------- + a : array_like + Input data. + + Returns + ------- + res : ndarray + Output array, containing the indices of the elements of `a.ravel()` + that are non-zero. + + See Also + -------- + nonzero : Return the indices of the non-zero elements of the input array. + ravel : Return a 1-D array containing the elements of the input array. 
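Returning to the fallback-dispatch change earlier in this file: both `__array_ufunc__` and `__array_function__` now warn exactly once per operator when they hand off to official NumPy, using a module-level dict as the seen-set. A minimal standalone sketch of that warn-once pattern (not the verbatim source):

```python
import logging

_FALLBACK_WARNED = {}

def warn_fallback_once(op):
    # Log a single warning the first time an operator falls back to NumPy.
    if op not in _FALLBACK_WARNED:
        logging.warning("np.%s is a fallback operator, which is actually "
                        "using official numpy's implementation", op.__name__)
        _FALLBACK_WARNED[op] = True
```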
+ + Examples + -------- + >>> x = np.arange(-2, 3) + >>> x + array([-2, -1, 0, 1, 2]) + >>> np.flatnonzero(x) + array([0, 1, 3, 4]) + + Use the indices of the non-zero elements as an index array to extract + these elements: + + >>> x.ravel()[np.flatnonzero(x)] + array([-2, -1, 1, 2]) + """ + return _mx_nd_np.flatnonzero(a) + + def diag_indices_from(arr): """ This returns a tuple of indices that can be used to access the main diagonal of an array diff --git a/python/mxnet/numpy_dispatch_protocol.py b/python/mxnet/numpy_dispatch_protocol.py index d8d7c0907bcf..781ec55b3796 100644 --- a/python/mxnet/numpy_dispatch_protocol.py +++ b/python/mxnet/numpy_dispatch_protocol.py @@ -142,6 +142,7 @@ def _run_with_array_ufunc_proto(*args, **kwargs): 'transpose', 'unique', 'unravel_index', + 'flatnonzero', 'diag_indices_from', 'delete', 'var', diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py index 72eb5a741520..89b37de1c873 100644 --- a/python/mxnet/optimizer/__init__.py +++ b/python/mxnet/optimizer/__init__.py @@ -16,9 +16,48 @@ # under the License. """Optimizer API of MXNet.""" -from . import optimizer, contrib +from . import (optimizer, contrib, updater, utils, sgd, + sgld, signum, dcasgd, nag, adagrad, + adadelta, adam, adamax, nadam, ftrl, + ftml, lars, lamb, rmsprop) # pylint: disable=wildcard-import from .optimizer import * -# pylint: enable=wildcard-import -__all__ = optimizer.__all__ + ['contrib'] +from .updater import * + +from .utils import * + +from .sgd import * + +from .sgld import * + +from .signum import * + +from .dcasgd import * + +from .nag import * + +from .adagrad import * + +from .adadelta import * + +from .adam import * + +from .adamax import * + +from .nadam import * + +from .ftrl import * + +from .ftml import * + +from .lars import * + +from .lamb import * + +from .rmsprop import * + +__all__ = optimizer.__all__ + updater.__all__ + ['contrib'] + sgd.__all__ + sgld.__all__ \ + + signum.__all__ + dcasgd.__all__ + nag.__all__ + adagrad.__all__ + adadelta.__all__ \ + + adam.__all__ + adamax.__all__ + nadam.__all__ + ftrl.__all__ + ftml.__all__ \ + + lars.__all__ + lamb.__all__ + rmsprop.__all__ diff --git a/python/mxnet/optimizer/adadelta.py b/python/mxnet/optimizer/adadelta.py new file mode 100644 index 000000000000..a8f01401e282 --- /dev/null +++ b/python/mxnet/optimizer/adadelta.py @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=W0223 +"""AdaDelta optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['AdaDelta'] + + +@register +class AdaDelta(Optimizer): + """The AdaDelta optimizer. 
+ + This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive + learning rate method*, available at https://arxiv.org/abs/1212.5701. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + acc_grad = rho * acc_grad + (1. - rho) * grad * grad + delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad + acc_delta = rho * acc_delta + (1. - rho) * delta * delta + weight -= learning_rate * delta + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 1.0 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + rho: float, default 0.9 + Decay rate for both squared gradients and delta. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=1.0, rho=0.9, epsilon=1e-6, use_fused_step=False, **kwargs): + super(AdaDelta, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.rho = rho + self.epsilon = epsilon + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context), # accumulated g + zeros(weight.shape, weight.context)) # accumulated delta + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + acc_g, acc_delta = state + + # update g, delta + acc_g[:] *= self.rho + acc_g[:] += (1. - self.rho) * square(grad) + current_delta = sqrt(acc_delta + self.epsilon) + current_delta /= sqrt(acc_g + self.epsilon) + current_delta *= grad + acc_delta[:] *= self.rho + acc_delta[:] += (1. - self.rho) * square(current_delta) + + # update weight + weight[:] -= lr * current_delta diff --git a/python/mxnet/optimizer/adagrad.py b/python/mxnet/optimizer/adagrad.py new file mode 100644 index 000000000000..aa31abf0030b --- /dev/null +++ b/python/mxnet/optimizer/adagrad.py @@ -0,0 +1,141 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
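A short usage sketch for the `AdaDelta` class defined above (assumption: the long-standing `mx.optimizer.get_updater` helper remains exported after this reorganization and keeps its `(index, grad, weight)` call order):

```python
import mxnet as mx

opt = mx.optimizer.AdaDelta(learning_rate=1.0, rho=0.9, epsilon=1e-6)
updater = mx.optimizer.get_updater(opt)

weight = mx.nd.ones((2, 2))
grad = mx.nd.full((2, 2), 0.1)
updater(0, grad, weight)      # one in-place AdaDelta step
print(weight.asnumpy())
```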
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""AdaGrad optimizer""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import sparse +from .optimizer import Optimizer, register + +__all__ = ['AdaGrad'] + + +@register +class AdaGrad(Optimizer): + """AdaGrad optimizer. + + This class implements the AdaGrad optimizer described in *Adaptive Subgradient + Methods for Online Learning and Stochastic Optimization*, and available at + http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + history += square(grad) + weight -= learning_rate * grad / (sqrt(history) + epsilon) + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + See Also + ---------- + :meth:`mxnet.ndarray.sparse.adagrad_update`. + + Parameters + ---------- + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False or grad is not sparse, step is called, + otherwise, fused_step is called. + + """ + def __init__(self, learning_rate=0.01, epsilon=1e-6, use_fused_step=True, **kwargs): + super(AdaGrad, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.epsilon = epsilon + + def create_state(self, index, weight): + return zeros(weight.shape, weight.context, stype=weight.stype) # history + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update history + history = state + history[:] += square(grad) + d = grad / (sqrt(history) + self.epsilon) + + # update weight + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + is_sparse = grad.stype == 'row_sparse' + + if is_sparse: + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + history = state + + # When grad is sparse, update weight with fused kernel + sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) + else: + # When the grad is not sparse, the func step is called to update weight and state + self.step([index], [weight], [grad], [state]) diff --git a/python/mxnet/optimizer/adam.py b/python/mxnet/optimizer/adam.py new file mode 100644 index 000000000000..24500c917433 --- /dev/null +++ b/python/mxnet/optimizer/adam.py @@ -0,0 +1,186 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Adam optimizer.""" +from __future__ import absolute_import +import math +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import adam_update +from .optimizer import Optimizer, register + +__all__ = ['Adam'] + + +@register +class Adam(Optimizer): + """The Adam optimizer. + + This class implements the optimizer described in *Adam: A Method for + Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. 
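Before moving on to Adam, note one behavioral detail of the `AdaGrad.step` above: `epsilon` is now added outside the square root (`sqrt(history) + epsilon`), and weight decay is folded into the gradient before the history update, unlike the old `sqrt(history + eps)` form. A standalone NumPy sketch of one dense step, with rescaling and clipping omitted:

```python
import numpy as np

def adagrad_step(weight, grad, history, lr=0.01, wd=0.0, epsilon=1e-6):
    g = grad + wd * weight                    # weight decay applied to the gradient
    history += np.square(g)                   # accumulate squared gradients
    weight -= lr * g / (np.sqrt(history) + epsilon)
    return weight, history
```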
+ + If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \ + **lazy updates** at step t are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) + wd * weight[row] + m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row] + v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2) + lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t) + w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon) + + The lazy update only updates the mean and var for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all indices. + Compared with the original update, it can provide large improvements in model training + throughput for some applications. However, it provides slightly different semantics than + the original update, and may lead to different empirical results. + + Otherwise, **standard updates** at step t are applied by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + m = beta1 * m + (1 - beta1) * rescaled_grad + v = beta2 * v + (1 - beta2) * (rescaled_grad**2) + lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t) + w = w - lr * m / (sqrt(v) + epsilon) + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + For details of the update algorithm, see :class:`~mxnet.ndarray.adam_update`. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, + lazy_update=False, use_fused_step=True, **kwargs): + super(Adam, self).__init__(use_fused_step=use_fused_step, + learning_rate=learning_rate, + **kwargs) + if not self.use_fused_step: + assert not lazy_update,\ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + self.lazy_update = lazy_update + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.lazy_update = lazy_update + + def create_state(self, index, weight): + stype = weight.stype if self.lazy_update else 'default' + return (zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=stype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=stype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + lr *= math.sqrt(coef2) / coef1 + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + # update weight + d = mean / (sqrt(var) + self.epsilon) + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + + lr *= math.sqrt(coef2)/coef1 + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + mean, var = state + + # update weight with fused kernel + adam_update(weight, grad, mean, var, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/adamax.py b/python/mxnet/optimizer/adamax.py new file mode 100644 index 000000000000..a2ffd9c68b2c --- /dev/null +++ b/python/mxnet/optimizer/adamax.py @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=W0223 +"""Adamax optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, maximum, abs as NDabs) +from .optimizer import Optimizer, register + +__all__ = ['Adamax'] + + +# pylint: enable=line-too-long +@register +class Adamax(Optimizer): + """The AdaMax optimizer. 
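Stepping back to the `Adam.step`/`fused_step` code above: bias correction is folded into the learning rate rather than into the moment estimates. A small illustration of the resulting effective step size, containing nothing beyond the formula in the diff:

```python
import math

def adam_effective_lr(base_lr, beta1, beta2, t):
    # lr *= sqrt(1 - beta2**t) / (1 - beta1**t), as in step()/fused_step()
    return base_lr * math.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)

# With the defaults beta1=0.9, beta2=0.999 the first step is scaled by
# sqrt(0.001) / 0.1, roughly 0.316:
print(adam_effective_lr(0.001, 0.9, 0.999, 1))   # ~0.000316
```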
+ + It is a variant of Adam based on the infinity norm + available at http://arxiv.org/abs/1412.6980 Section 7. + + The optimizer updates the weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + m = beta1 * m_t + (1 - beta1) * grad + u = maximum(beta2 * u, abs(grad)) + weight -= lr / (1 - beta1**t) * m / u + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.002 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, + use_fused_step=False, **kwargs): + super(Adamax, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + lr /= (1. - self.beta1**t) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] = maximum(self.beta2 * var, NDabs(grad)) + + # update weight + d = mean / var + weight[:] -= lr * d diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index d269aa1bd069..71cda70098be 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -15,14 +15,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -# pylint: disable=too-many-lines """Contrib optimizers.""" -from ..ndarray import (NDArray, clip, contrib, mean, sqrt, square, zeros) -from .optimizer import Optimizer - -# convenience wrapper for Optimizer.Register -register = Optimizer.register # pylint: disable=invalid-name +from ..ndarray import (clip, contrib, mean, sqrt, square, zeros) +from .optimizer import Optimizer, register __all__ = ['GroupAdaGrad'] @@ -40,8 +35,7 @@ class GroupAdaGrad(Optimizer): grad = clip(grad * rescale_grad, clip_gradient) history += mean(square(grad), axis=1, keepdims=True) - div = grad / sqrt(history + float_stable_eps) - weight -= div * lr + weight -= lr * grad / (sqrt(history) + epsilon) Weights are updated lazily if the gradient is sparse. @@ -53,14 +47,24 @@ class GroupAdaGrad(Optimizer): Parameters ---------- - eps: float, optional - Initial value of the history accumulator. Avoids division by 0. - + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False or grad is not sparse, step is called, + otherwise, fused_step is called. """ - def __init__(self, eps=1e-5, **kwargs): - super(GroupAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps + def __init__(self, learning_rate=0.01, epsilon=1e-6, use_fused_step=True, **kwargs): + super(GroupAdaGrad, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.epsilon = epsilon def create_state(self, index, weight): assert len(weight.shape) == 2 @@ -68,33 +72,81 @@ def create_state(self, index, weight): (weight.shape[0], 1), weight.context, stype=weight.stype) return history - def update(self, index, weight, grad, state): - assert (isinstance(weight, NDArray)) - assert (isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' - - is_sparse = grad.stype == 'row_sparse' - if is_sparse: - kwargs = { - 'epsilon': self.float_stable_eps, - 'rescale_grad': self.rescale_grad - } - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - contrib.group_adagrad_update( - weight, - grad, - state, - out=weight, - lr=lr, - **kwargs) - else: + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
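As with `AdaGrad` earlier, `GroupAdaGrad.fused_step` (just below) only takes the fused-kernel path when the gradient is `row_sparse` and otherwise delegates to the pure-Python `step`. An illustrative sketch of that dispatch pattern; `run_sparse_kernel` is a stand-in for the real calls (`sparse.adagrad_update`, `contrib.group_adagrad_update`):

```python
def fused_step_dispatch(opt, index, weight, grad, state, run_sparse_kernel):
    if grad.stype == 'row_sparse':
        # the fused C++ kernel handles the sparse update in one call
        run_sparse_kernel(weight, grad, state, out=weight)
    else:
        # dense gradients fall back to the NDArray-level step()
        opt.step([index], [weight], [grad], [state])
```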
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' + + # preprocess grad grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) - state[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(state + self.float_stable_eps) - weight[:] -= div + + # update history + history = state + history[:] += mean(square(grad), axis=1, keepdims=True) + + # update weight + d = grad / (sqrt(history) + self.epsilon) + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + is_sparse = grad.stype == 'row_sparse' + + if is_sparse: + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' + + kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + history = state + + # When grad is sparse, update weight with fused kernel + contrib.group_adagrad_update( + weight, + grad, + history, + out=weight, + lr=lr, + **kwargs) + else: + # When the grad is not sparse, the func step is called to update weight and state + self.step([index], [weight], [grad], [state]) diff --git a/python/mxnet/optimizer/dcasgd.py b/python/mxnet/optimizer/dcasgd.py new file mode 100644 index 000000000000..789ceeb03cd7 --- /dev/null +++ b/python/mxnet/optimizer/dcasgd.py @@ -0,0 +1,116 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=W0223 +"""DCASGD optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, square) +from .optimizer import Optimizer, register + +__all__ = ['DCASGD'] + + +@register +class DCASGD(Optimizer): + """The DCASGD optimizer. 
+ + This class implements the optimizer described in *Asynchronous Stochastic Gradient Descent + with Delay Compensation for Distributed Deep Learning*, + available at https://arxiv.org/abs/1609.08326. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, optional + The momentum value. + lamda : float, optional + Scale DC value. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.0, lamda=0.04, + use_fused_step=False, **kwargs): + super(DCASGD, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + self.weight_previous = {} + self.lamda = lamda + + def create_state(self, index, weight): + if self.momentum == 0.0: + return None, weight.copy() # previous weight + else: + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # momentum + weight.copy()) # previous weight + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom, previous_weight + mom, previous_weight = state + + d = square(grad) + d *= weight - previous_weight + d *= self.lamda + d += grad + + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * d + else: + assert (self.momentum == 0.0) + mom = d + mom *= -lr + previous_weight[:] = weight + + # update weight + weight[:] += mom diff --git a/python/mxnet/optimizer/ftml.py b/python/mxnet/optimizer/ftml.py new file mode 100644 index 000000000000..d9f1ccb5d080 --- /dev/null +++ b/python/mxnet/optimizer/ftml.py @@ -0,0 +1,158 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
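The delay compensation in DCASGD.step above augments the gradient with lamda * grad**2 * (weight - previous_weight) before the usual, optionally momentum-based, SGD move. A minimal NumPy sketch of the momentum-free case, with hypothetical names:

import numpy as np

def dcasgd_step(w, g, w_prev, lr=0.1, lamda=0.04, wd=0.0,
                rescale_grad=1.0, clip_gradient=None):
    # w_prev is the stale copy of the weight kept in the optimizer state
    g = g * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    g = g + wd * w
    d = g + lamda * np.square(g) * (w - w_prev)  # delay-compensated gradient
    w_prev[:] = w                                # remember the current weight
    w -= lr * d
    return w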
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FTML optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import ftml_update +from .optimizer import Optimizer, register + +__all__ = ['FTML'] + + +@register +class FTML(Optimizer): + """The FTML optimizer. + + This class implements the optimizer described in + *FTML - Follow the Moving Leader in Deep Learning*, + available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. + + Denote time step by t. The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + v = beta2 * v + (1 - beta2) * square(rescaled_grad) + d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon) + z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight + weight = - z / d_t + + For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.0025 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.6 + 0 < beta1 < 1. Generally close to 0.5. + beta2 : float, default 0.999 + 0 < beta2 < 1. Generally close to 1. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.0025, beta1=0.6, beta2=0.999, epsilon=1e-8, + use_fused_step=True, **kwargs): + super(FTML, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 + zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 + zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
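A minimal NumPy sketch of the FTML recurrences quoted in the docstring above; d, v and z stand in for the three state arrays created by create_state(), and all names are hypothetical:

import numpy as np

def ftml_step(w, g, d, v, z, t, lr=0.0025, beta1=0.6, beta2=0.999,
              epsilon=1e-8, wd=0.0, rescale_grad=1.0):
    g = g * rescale_grad + wd * w
    v[:] = beta2 * v + (1. - beta2) * np.square(g)
    d_prev = d.copy()
    d[:] = (1. - beta1 ** t) / lr * (np.sqrt(v / (1. - beta2 ** t)) + epsilon)
    z[:] = beta1 * z + (1. - beta1) * g - (d - beta1 * d_prev) * w
    w[:] = -z / d
    return w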
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + + # update d, v, z + d, v, z = state + + v[:] *= self.beta2 + v[:] += (1. - self.beta2) * square(grad) + sigma = - self.beta1 * d + d[:] = sqrt(v / coef2) + self.epsilon + d[:] *= coef1 / lr + sigma += d + z[:] *= self.beta1 + z[:] += (1. - self.beta1) * grad + z[:] -= sigma * weight + + # update weight + weight[:] = - z / d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad, 't': t} + if self.clip_gradient: + kwargs['clip_grad'] = self.clip_gradient + + d, v, z = state + + # update weight with fused kernel + ftml_update(weight, grad, d, v, z, out=weight, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/ftrl.py b/python/mxnet/optimizer/ftrl.py new file mode 100644 index 000000000000..a14a1a79b2ee --- /dev/null +++ b/python/mxnet/optimizer/ftrl.py @@ -0,0 +1,167 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FTRL optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square, sign, maximum, abs as NDabs) +from ..ndarray import ftrl_update +from .optimizer import Optimizer, register + +__all__ = ['Ftrl'] + + +#pylint: disable=invalid-name +#pylint: disable=line-too-long +@register +class Ftrl(Optimizer): + """The Ftrl optimizer. + + Referenced from *Ad Click Prediction: a View from the Trenches*, available at + http://dl.acm.org/citation.cfm?id=2488200. + + eta : + .. 
math:: + \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}} + + The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate + n += rescaled_grad**2 + w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) + + If the storage types of weight, state and grad are all ``row_sparse``, \ + **sparse updates** are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) + z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate + n[row] += rescaled_grad[row]**2 + w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) + + The sparse update only updates the z and n for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all + indices. Compared with the original update, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original update, and + may lead to different empirical results. + + For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + lamda1 : float, default 0.01 + L1 regularization coefficient. + beta : float, default 1.0 + Per-coordinate learning rate correlation parameter. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + + def __init__(self, learning_rate=0.1, lamda1=0.01, beta=1., + use_fused_step=True, **kwargs): + super(Ftrl, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.lamda1 = lamda1 + self.beta = beta + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, stype=weight.stype), # z + zeros(weight.shape, weight.context, stype=weight.stype)) # n + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
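A minimal NumPy sketch of the dense FTRL update spelled out above; z and n stand in for the two state arrays, and all names are hypothetical:

import numpy as np

def ftrl_step(w, g, z, n, lr=0.1, lamda1=0.01, beta=1.0, wd=0.0,
              rescale_grad=1.0, clip_gradient=None):
    g = g * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    z += g - (np.sqrt(n + np.square(g)) - np.sqrt(n)) * w / lr
    n += np.square(g)
    # closed-form weight: zero whenever |z| <= lamda1 (the L1 shrinkage)
    w[:] = ((np.sign(z) * lamda1 - z)
            / ((beta + np.sqrt(n)) / lr + wd)
            * (np.abs(z) > lamda1))
    return w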
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + + # update z, n + z, n = state + + sigma = - sqrt(n) + n[:] += square(grad) + denom = sqrt(n) + sigma += denom + sigma /= lr + z[:] += grad - sigma * weight + + # update weight + denom += self.beta + denom /= lr + denom += wd + d = sign(z) * maximum(NDabs(z) - self.lamda1, 0) + weight[:] = - d / denom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + # update weight with fused kernel + z, n = state + ftrl_update(weight, grad, z, n, out=weight, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/lamb.py b/python/mxnet/optimizer/lamb.py new file mode 100644 index 000000000000..f1f7e1347f4b --- /dev/null +++ b/python/mxnet/optimizer/lamb.py @@ -0,0 +1,261 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Lamb optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip, sqrt, where, square, ones_like, + maximum, minimum) +from ..ndarray import (lamb_update_phase1, lamb_update_phase2, + mp_lamb_update_phase1, mp_lamb_update_phase2) +from ..ndarray.contrib import (multi_lamb_update, multi_mp_lamb_update) +from .optimizer import Optimizer, register + +__all__ = ['LAMB'] + + +@register +class LAMB(Optimizer): + """LAMB Optimizer. + + Referenced from 'Large Batch Optimization for Deep Learning: Training BERT in 76 minutes' + (https://arxiv.org/pdf/1904.00962.pdf) + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. 
If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + lower_bound : float, default None + Lower limit of norm of weight + upper_bound : float, default None + Upper limit of norm of weight + bias_correction : bool, default True + Whether or not to apply bias correction + aggregate_num : int, default 4 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + In default, all the weights are aggregated. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, + lower_bound=None, upper_bound=None, bias_correction=True, + aggregate_num=4, use_fused_step=True, **kwargs): + assert aggregate_num <= 45,\ + 'When use_fused_step is True, LAMB only supports aggregate_num <= 45,' \ + ' and receives {}'.format(aggregate_num) + super(LAMB, self).__init__(learning_rate=learning_rate, + aggregate_num=aggregate_num, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.bias_correction = bias_correction + + def create_state(self, index, weight): + stype = weight.stype + return (zeros(weight.shape, weight.context, dtype=numpy.float32, stype=stype), # mean + zeros(weight.shape, weight.context, dtype=numpy.float32, stype=stype)) # var + + def step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + + # update mean, var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + r1 = weight.norm() + if self.lower_bound is not None: + r1 = maximum(r1, self.lower_bound) + if self.upper_bound is not None: + r1 = minimum(r1, self.upper_bound) + + if self.bias_correction: + # apply bias correction + coef1 = 1. - self.beta1**t + coef2 = 1. 
- self.beta2**t + mean_hat = mean / coef1 + var_hat = var / coef2 + sqrt(var_hat, out=var_hat) + var_hat += self.epsilon + mean_hat /= var_hat + mean_hat += wd * weight + else: + mean_hat = sqrt(var) + mean_hat += self.epsilon + mean_hat[:] = mean / mean_hat + mean_hat += wd * weight + + g = mean_hat + r2 = g.norm() + + # calculate lamb_trust_ratio + ratio = r1 / r2 + # becomes NaN if ratio == NaN or 0, otherwise 0 + nan_or_zero = 1 - ratio / ratio + r = where(nan_or_zero, ones_like(ratio), ratio) + lr *= r + + # update weight + g *= lr + weight[:] -= g + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + if aggregate: + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + + step_counts = [] + for index in indices: + step_counts.append(self._index_update_count[index]) + + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + + if not multi_precision: + mean, var = list(zip(*states)) + multi_lamb_update(weights, grads, mean, var, + out=weights, step_count=step_counts, + lrs=lrs, wds=wds, **kwargs) + else: + weights32, mean_var = list(zip(*states)) + mean, var = list(zip(*mean_var)) + multi_mp_lamb_update(weights, grads, + mean, var, weights32, + out=weights, step_count=step_counts, + lrs=lrs, wds=wds, **kwargs) + else: + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + 'rescale_grad': self.rescale_grad, 't': t} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + + if multi_precision: + weight32 = state[0] + mean, var = state[1] + g = mp_lamb_update_phase1(weight, grad, mean, var, weight32, wd=wd, **kwargs) + + kwargs = {} + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + r_1 = weight32.norm() + r_2 = g.norm() + mp_lamb_update_phase2(weight, g, r_1, r_2, weight32, lr=lr, + out=weight, **kwargs) + else: + mean, var = state + g = lamb_update_phase1(weight, grad, mean, var, wd=wd, **kwargs) + + kwargs = {} + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if 
self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + r_1 = weight.norm() + r_2 = g.norm() + lamb_update_phase2(weight, g, r_1, r_2, lr=lr, out=weight, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(LAMB, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/lars.py b/python/mxnet/optimizer/lars.py new file mode 100644 index 000000000000..9492a9380018 --- /dev/null +++ b/python/mxnet/optimizer/lars.py @@ -0,0 +1,279 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""LARS optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip, array, + multi_sum_sq, multi_lars, + norm as NDnorm, + where, ones_like) +from ..ndarray import (sgd_update, sgd_mom_update, + mp_sgd_update, mp_sgd_mom_update, + preloaded_multi_sgd_update, preloaded_multi_sgd_mom_update, + preloaded_multi_mp_sgd_update, preloaded_multi_mp_sgd_mom_update) +from .optimizer import Optimizer, register +from .utils import _flatten_list + +__all__ = ['LARS'] + + +@register +class LARS(Optimizer): + """the LARS optimizer from 'Large Batch Training of Convolution Networks' \ + (https://arxiv.org/abs/1708.03888) + + Behave mostly like SGD with momentum and weight decay but is scaling \ + adaptively the learning for each layer: + w_norm = L2norm(weights) + g_norm = L2norm(gradients) + if w_norm > 0 and g_norm > 0: + lr_layer = lr * w_norm / (g_norm + weight_decay * w_norm + epsilon) + else: + lr_layer = lr + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0. + The momentum value. + eta : float, default 0.001 + LARS coefficient used to scale the learning rate. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + aggregate_num : int, default 1 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. 
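The trust-ratio guard used in LAMB.step above (and echoed by the LARS scaling below) amounts to scaling the learning rate by r1 / r2 while falling back to 1 when that ratio is zero or undefined. A small sketch with plain floats; the function name and the treatment of r2 == 0 are assumptions for illustration only:

import numpy as np

def lamb_trust_ratio(r1, r2, lower_bound=None, upper_bound=None):
    # r1: norm of the weight, r2: norm of the update direction g
    if lower_bound is not None:
        r1 = max(r1, lower_bound)
    if upper_bound is not None:
        r1 = min(r1, upper_bound)
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = np.float64(r1) / np.float64(r2)
    # same effect as the nan_or_zero trick in step(): fall back to 1
    return 1.0 if (np.isnan(ratio) or ratio == 0.0) else float(ratio)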
+ """ + def __init__(self, learning_rate=0.1, momentum=0.0, eta=0.001, + epsilon=1e-8, lazy_update=False, use_fused_step=True, + aggregate_num=1, **kwargs): + super(LARS, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + aggregate_num=aggregate_num, + **kwargs) + if not self.use_fused_step: + assert not lazy_update,\ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + if lazy_update: + assert not self.multi_precision, \ + 'When lazy_update is set to True, multi_precision has be turned off.' + self.lazy_update = lazy_update + self.momentum = momentum + self.eta = eta + self.epsilon = epsilon + self.lazy_update = lazy_update + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + stype = weight.stype if self.lazy_update else 'default' + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) + return momentum + + def _l2norm(self, v, rescale=False): + """L2 Norm implementation""" + v = v.astype('float32') + if rescale: + v *= self.rescale_grad + norm = NDnorm(v) + return norm + + def _get_lars(self, index, weight, grad, wd): + """Returns a scaling factor for the learning rate for this layer""" + lars = 1.0 + name = self.idx2name[index] if index in self.idx2name else str(index) + if name.endswith('gamma') or name.endswith('beta') or name.endswith('bias'): + return lars + + w_norm = self._l2norm(weight) + g_norm = self._l2norm(grad, rescale=True) + + # calculate lars_trust_ratio + ratio = w_norm / g_norm + # becomes NaN if ratio == NaN or 0, otherwise 0 + nan_or_zero = 1 - ratio / ratio + lars = self.eta * w_norm / (g_norm + wd * w_norm + self.epsilon) + lars = where(nan_or_zero, ones_like(lars), lars) + + return lars.asscalar() + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # compute lars + # clip grad + wd * weight is performed after computing lars + lars = self._get_lars(index, weight, grad, wd) + lr *= lars + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + else: + mom = -lr * grad + + # update weight + weight[:] += mom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient is not None: + kwargs['clip_gradient'] = self.clip_gradient + + if aggregate: + nb_params = len(indices) + names = [self.idx2name[i] if i in self.idx2name else str(i) for i in indices] + lars_idx = [i for i in range(nb_params) if + not(names[i].endswith('gamma') or names[i].endswith('beta') or + names[i].endswith('bias'))] + nb_lars = len(lars_idx) + no_lars_idx = [i for i in range(nb_params) if + (names[i].endswith('gamma') or names[i].endswith('beta') or + names[i].endswith('bias'))] + cur_ctx = weights[0].context + full_idx = lars_idx + no_lars_idx + new_lrs = array([lrs[i] for i in full_idx], ctx=cur_ctx, dtype='float32') + new_wds = array([wds[i] for i in full_idx], ctx=cur_ctx, dtype='float32') + new_weights = [weights[i] for i in full_idx] + new_grads = [grads[i] for i in full_idx] + new_states = [states[i] for i in full_idx] + if nb_lars > 0: + w_sum_sq = multi_sum_sq(*new_weights[:nb_lars], num_arrays=nb_lars) + g_sum_sq = multi_sum_sq(*new_grads[:nb_lars], num_arrays=nb_lars) + multi_lars(new_lrs[:nb_lars], w_sum_sq, g_sum_sq, new_wds[:nb_lars], + eta=self.eta, eps=self.epsilon, rescale_grad=self.rescale_grad, + out=new_lrs[:nb_lars]) + # Same than usual using preloaded sgd functions + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + if self.momentum > 0: + preloaded_multi_sgd_mom_update( + *(_flatten_list(zip(new_weights, new_grads, new_states)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + preloaded_multi_sgd_update( + *(_flatten_list(zip(new_weights, new_grads)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + states = list(zip(*states)) + weights32, moms = states + if self.momentum > 0: + preloaded_multi_mp_sgd_mom_update( + *(_flatten_list(zip(new_weights, new_grads, moms, weights32)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + preloaded_multi_mp_sgd_update( + *(_flatten_list(zip(new_weights, new_grads, weights32)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + for i, (index, weight, grad, state) in enumerate(zip(indices, weights, grads, states)): + wd = wds[i] + lr = lrs[i] + lr *= self._get_lars(index, weight, grad, wd) + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + mom = state + if state is not None: + sgd_mom_update(weight, grad, mom, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + weight32, mom = state + if mom is not None: + mp_sgd_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override 
update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(LARS, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/nadam.py b/python/mxnet/optimizer/nadam.py new file mode 100644 index 000000000000..a0e298696842 --- /dev/null +++ b/python/mxnet/optimizer/nadam.py @@ -0,0 +1,124 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=W0223 +"""Nadam optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['Nadam'] + + +@register +class Nadam(Optimizer): + """The Nesterov Adam optimizer. + + Much like Adam is essentially RMSprop with momentum, + Nadam is Adam RMSprop with Nesterov momentum available + at http://cs229.stanford.edu/proj2015/054_report.pdf. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + schedule_decay : float, default 0.004 + Exponential decay rate for the momentum schedule + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, + schedule_decay=0.004, use_fused_step=False, **kwargs): + super(Nadam, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.schedule_decay = schedule_decay + self.m_schedule = 1. + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. 
+ weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef2 = 1. - self.beta2**t + + # warming momentum schedule + momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay))) + momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay))) + self.m_schedule = self.m_schedule * momentum_t + m_schedule_next = self.m_schedule * momentum_t_1 + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + grad_prime = grad / (1. - self.m_schedule) + mean_prime = mean / (1. - m_schedule_next) + var_prime = var / coef2 + mean_bar = momentum_t_1 * mean_prime + (1. - momentum_t) * grad_prime + + # update weight + d = mean_bar / (sqrt(var_prime) + self.epsilon) + weight[:] -= lr * d diff --git a/python/mxnet/optimizer/nag.py b/python/mxnet/optimizer/nag.py new file mode 100644 index 000000000000..8b816a729637 --- /dev/null +++ b/python/mxnet/optimizer/nag.py @@ -0,0 +1,162 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""NAG optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip) +from ..ndarray import (sgd_update, mp_sgd_update, nag_mom_update, mp_nag_mom_update) +from .optimizer import Optimizer, register + +__all__ = ['NAG'] + + +@register +class NAG(Optimizer): + """Nesterov accelerated gradient. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + state = momentum * state + lr * grad + weight = weight - (momentum * state + lr * grad) + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0.9 + The momentum value. + multi_precision: bool, default False + Flag to control the internal precision of the optimizer. 
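To make the warming momentum schedule in Nadam.step above easier to follow, here is a minimal NumPy sketch of one update; m_schedule is the scalar carried across steps and every name is hypothetical:

import numpy as np

def nadam_step(w, g, mean, var, t, m_schedule, lr=0.001, beta1=0.9,
               beta2=0.999, epsilon=1e-8, schedule_decay=0.004, wd=0.0):
    g = g + wd * w
    momentum_t = beta1 * (1. - 0.5 * 0.96 ** (t * schedule_decay))
    momentum_t_1 = beta1 * (1. - 0.5 * 0.96 ** ((t + 1) * schedule_decay))
    m_schedule = m_schedule * momentum_t
    m_schedule_next = m_schedule * momentum_t_1
    mean[:] = beta1 * mean + (1. - beta1) * g
    var[:] = beta2 * var + (1. - beta2) * np.square(g)
    g_prime = g / (1. - m_schedule)
    mean_prime = mean / (1. - m_schedule_next)
    var_prime = var / (1. - beta2 ** t)
    mean_bar = momentum_t_1 * mean_prime + (1. - momentum_t) * g_prime
    w -= lr * mean_bar / (np.sqrt(var_prime) + epsilon)
    return m_schedule  # caller stores this for the next step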
+ False: results in using the same precision as the weights (default), + True: makes internal 32-bit copy of the weights and applies gradients + in 32-bit precision even if actual weights used in the model have lower precision. + Turning this on can improve convergence and accuracy when training with float16. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.9, multi_precision=False, + use_fused_step=True, **kwargs): + super(NAG, self).__init__(learning_rate=learning_rate, + multi_precision=multi_precision, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + d = self.momentum * mom - lr * grad + else: + d = -lr * grad + + # update weight + weight[:] += d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
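The unfused NAG.step above applies the classic Nesterov look-ahead; a minimal NumPy sketch of the momentum branch, with hypothetical names:

import numpy as np

def nag_step(w, g, mom, lr=0.1, momentum=0.9, wd=0.0,
             rescale_grad=1.0, clip_gradient=None):
    g = g * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    g = g + wd * w
    mom[:] = momentum * mom - lr * g      # state update
    w += momentum * mom - lr * g          # look-ahead step using the new momentum
    return w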
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + + if not multi_precision: + mom = state + if mom is not None: + nag_mom_update(weight, grad, mom, out=weight, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) + else: + weight32, mom = state + if mom is not None: + mp_nag_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(NAG, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 737a947509b0..2fef62eb8319 100755 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -17,37 +17,14 @@ # under the License. # pylint: disable=too-many-lines -"""Weight updating functions.""" -import logging -import math -import pickle +"""Base Optimizer class.""" import warnings -import os import numpy -from ..base import py_str -from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply, - multi_sum_sq, multi_lars, norm as NDnorm) -from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, - mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, nag_mom_update, mp_nag_mom_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update, preloaded_multi_sgd_update, - preloaded_multi_sgd_mom_update, preloaded_multi_mp_sgd_update, - preloaded_multi_mp_sgd_mom_update, lamb_update_phase1, lamb_update_phase2, - mp_lamb_update_phase1, mp_lamb_update_phase2) -from ..ndarray.contrib import (multi_lamb_update, multi_mp_lamb_update) -from ..ndarray import sparse -from ..random import normal +from ..ndarray import (NDArray, zeros, cast) from ..util import is_np_array -__all__ = [ - 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LARS', 'LBSGD', - 'NAG', 'NDabs', 'Nadam', 'Optimizer', 'RMSProp', 'SGD', 'SGLD', 'Signum', 'LAMB', - 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' -] +__all__ = ['Optimizer', 'Test', 'create', 'register'] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -94,6 +71,17 @@ class Optimizer(object): Dictionary of parameter index to gluon.Parameter, used to lookup parameter attributes such as lr_mult, wd_mult, etc. param_dict shall not be deep copied. + aggregate_num : int, optional, default None + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + In default, only one weight is aggregated. + When `aggregate_num` is set to numpy.inf, all the weights are aggregated. + + use_fused_step : bool, optional, default None + Whether or not to use fused kernels for optimizer. 
+ When use_fused_step=False, step is called, + otherwise, fused_step is called. + Properties ---------- learning_rate : float @@ -103,7 +91,9 @@ class Optimizer(object): def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., clip_gradient=None, learning_rate=None, lr_scheduler=None, sym=None, begin_num_update=0, - multi_precision=False, param_dict=None): + multi_precision=False, param_dict=None, aggregate_num=None, + use_fused_step=None, **kwargs): + super(Optimizer, self).__init__(**kwargs) self.rescale_grad = rescale_grad self.lr_scheduler = lr_scheduler if self.lr_scheduler is None and learning_rate is None: @@ -124,7 +114,11 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = self._all_index_update_counts[0] self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 + + if aggregate_num is None: + self.aggregate_num = 1 + else: + self.aggregate_num = aggregate_num if param_idx2name is None: param_idx2name = {} @@ -134,6 +128,8 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} self.allow_np_array = is_np_array() + self.use_fused_step = use_fused_step \ + if use_fused_step is not None else False self.set_lr_mult({}) self.set_wd_mult({}) @@ -249,7 +245,6 @@ def create_state_multi_precision(self, index, weight): state : any obj The state associated with the weight. """ - weight_master_copy = None if self.multi_precision and weight.dtype == numpy.float16: weight_master_copy = weight.astype(numpy.float32) return (weight_master_copy,) + (self.create_state(index, weight_master_copy),) @@ -260,50 +255,101 @@ def create_state_multi_precision(self, index, weight): "optimizer") return self.create_state(index, weight) - def update(self, index, weight, grad, state): - """Updates the given parameter using the corresponding gradient and state. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - raise NotImplementedError() + raise NotImplementedError - def update_multi_precision(self, index, weight, grad, state): - """Updates the given parameter using the corresponding gradient and state. - Mixed precision version. + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + New operators that fuses optimizer's update should be put in this function. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. 
Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - if self.multi_precision and weight.dtype == numpy.float16: - # Wrapper for mixed precision - weight_master_copy = state[0] - original_state = state[1] - grad32 = grad.astype(numpy.float32) - self.update(index, weight_master_copy, grad32, original_state) - cast(weight_master_copy, dtype=weight.dtype, out=weight) + raise NotImplementedError + + def update(self, indices, weights, grads, states): + """Call step to perform a single optimization update if use_fused_step is False, + otherwise fused_step is called. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for weight, grad in zip(weights, grads): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + if not self.use_fused_step: + self.step(indices, weights, grads, states) else: - self.update(index, weight, grad, state) + self.fused_step(indices, weights, grads, states) + + def update_multi_precision(self, indices, weights, grads, states): + """Call step to perform a single optimization update if use_fused_step is False, + otherwise fused_step is called. Mixed precision version. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + weights_master_copy = [] + original_states = [] + grads32 = [] + for weight, grad, state in zip(weights, grads, states): + if self.multi_precision and weight.dtype == numpy.float16: + weights_master_copy.append(state[0]) + original_states.append(state[1]) + grads32.append(grad.astype(numpy.float32)) + else: + weights_master_copy.append(weight) + original_states.append(state) + grads32.append(grad) + self.update(indices, weights_master_copy, grads32, original_states) + for weight_master_copy, weight in zip(weights_master_copy, weights): + if self.multi_precision and weight.dtype == numpy.float16: + cast(weight_master_copy, dtype=weight.dtype, out=weight) def set_learning_rate(self, lr): """Sets a new learning rate of the optimizer. 
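Taken together, the refactor above means update() and update_multi_precision() now receive lists and dispatch to either step() or fused_step(). A minimal sketch of a custom optimizer written against this interface; the class is hypothetical and only meant to show the shape of the new API:

from mxnet.optimizer import Optimizer, register

@register
class MySGD(Optimizer):
    # plain SGD expressed through the list-based step() interface
    def __init__(self, learning_rate=0.01, use_fused_step=False, **kwargs):
        super(MySGD, self).__init__(learning_rate=learning_rate,
                                    use_fused_step=use_fused_step, **kwargs)

    def create_state(self, index, weight):
        return None  # stateless

    def step(self, indices, weights, grads, states):
        for index, weight, grad, _ in zip(indices, weights, grads, states):
            self._update_count(index)
            lr = self._get_lr(index)
            wd = self._get_wd(index)
            grad = grad * self.rescale_grad + wd * weight
            weight[:] -= lr * grad

A call such as opt.update([0], [weight], [grad], [state]) then routes to step() because use_fused_step is False.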
@@ -322,10 +368,6 @@ def set_learning_rate(self, lr): else: self.lr = lr - def set_lr_scale(self, args_lrscale): # pylint: disable=unused-argument - """[DEPRECATED] Sets lr scale. Use set_lr_mult instead.""" - raise DeprecationWarning - def set_lr_mult(self, args_lr_mult): """Sets an individual learning rate multiplier for each parameter. @@ -362,11 +404,6 @@ def set_lr_mult(self, args_lr_mult): def set_wd_mult(self, args_wd_mult): """Sets an individual weight decay multiplier for each parameter. - By default, if `param_idx2name` was provided in the - constructor, the weight decay multipler is set as 0 for all - parameters whose name don't end with ``_weight`` or - ``_gamma``. - .. note:: The default weight decay multiplier for a `Variable` can be set with its `wd_mult` argument in the constructor. @@ -386,9 +423,6 @@ def set_wd_mult(self, args_wd_mult): compatibility, and we recommend to use the name instead. """ self.wd_mult = {} - for n in self.idx2name.values(): - if not (n.endswith('_weight') or n.endswith('_gamma')): - self.wd_mult[n] = 0.0 if self.sym_info: attr, arg_names = self.sym_info for name in arg_names: @@ -518,1515 +552,11 @@ def __setstate__(self, state): # param_dict needs to be explicitly set by the trainer self.param_dict = {} + # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name -# pylint: disable=line-too-long -@register -class SGD(Optimizer): - """The SGD optimizer with momentum and weight decay. - - If the storage types of grad is ``row_sparse`` and ``lazy_update`` is True, \ - **lazy updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = lr * (rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]) - state[row] = momentum[row] * state[row] + rescaled_grad[row] - weight[row] = weight[row] - state[row] - - The sparse update only updates the momentum for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. - - Otherwise, **standard updates** are applied by:: - - rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) - state = momentum * state + rescaled_grad - weight = weight - state - - For details of the update algorithm see - :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. 
- False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - """ - def __init__(self, momentum=0.0, lazy_update=True, **kwargs): - super(SGD, self).__init__(**kwargs) - self.momentum = momentum - self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. " - "Consider using multi_precision=True option of the " - "SGD optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - stype = weight.stype if self.lazy_update else 'default' - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) - return momentum - - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - 
use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -@register -class Signum(Optimizer): - r"""The Signum optimizer that takes the sign of gradient or momentum. - - The optimizer updates the weight by:: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) - - References - ---------- - Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). - signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. - - See: https://arxiv.org/abs/1802.04434 - - For details of the update algorithm see - :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - wd_lh : float, optional - The amount of decoupled weight decay regularization, see details in the original paper at:\ - https://arxiv.org/abs/1711.05101 - """ - def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, **kwargs): - super(Signum, self).__init__(learning_rate=learning_rate, **kwargs) - self.momentum = momentum - self.wd_lh = wd_lh - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def _update_impl(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.wd_lh: - kwargs['wd_lh'] = self.wd_lh - - if state is not None: - signum_update(weight, grad, state, out=weight, - lr=lr, wd=wd, **kwargs) - else: - signsgd_update(weight, grad, out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state) - -@register -class FTML(Optimizer): - """The FTML optimizer. - - This class implements the optimizer described in - *FTML - Follow the Moving Leader in Deep Learning*, - available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. - - Denote time step by t. The optimizer updates the weight by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - v = beta2 * v + (1 - beta2) * square(rescaled_grad) - d_t = (1 - power(beta1, t)) / lr * square_root(v / (1 - power(beta2, t))) + epsilon) - z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight - weight = - z / d_t - - For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - 0 < beta1 < 1. Generally close to 0.5. - beta2 : float, optional - 0 < beta2 < 1. Generally close to 1. - epsilon : float, optional - Small value to avoid division by 0. 
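As a quick sanity check, the Signum rule quoted in the docstring above has a direct NumPy equivalent; gradient clipping is omitted and the hyper-parameter defaults are only illustrative:

```python
# Dense NumPy sketch of the Signum update (sign of the momentum buffer).
import numpy as np

def signum_step(weight, mom, grad, lr=0.01, momentum=0.9, wd=0.0, wd_lh=0.0,
                rescale_grad=1.0):
    """One Signum step; `weight` and `mom` are updated in place."""
    rescaled = rescale_grad * grad + wd * weight
    mom[:] = momentum * mom + (1. - momentum) * rescaled
    weight[:] = (1. - lr * wd_lh) * weight - lr * np.sign(mom)
```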
- """ - def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): - super(FTML, self).__init__(**kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 - zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 - zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad, 't': t} - if self.clip_gradient: - kwargs['clip_grad'] = self.clip_gradient - - prev_d, prev_v, prev_z = state - ftml_update(weight, grad, prev_d, prev_v, prev_z, out=weight, - lr=lr, wd=wd, **kwargs) - -@register -class LARS(Optimizer): - """the LARS optimizer from 'Large Batch Training of Convolution Networks' \ - (https://arxiv.org/abs/1708.03888) - - Behave mostly like SGD with momentum and weight decay but is scaling \ - adaptively the learning for each layer (except bias and batch norm parameters): - w_norm = L2norm(weights) - g_norm = L2norm(gradients) - if w_norm > 0 and g_norm > 0: - lr_layer = lr * lr_mult * eta * w_norm / (g_norm + weight_decay * w_norm + eps) - else: - lr_layer = lr * lr_mult - - Parameters - ---------- - momentum : float, optional - The momentum value. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - lars_eta : float, optional - LARS coefficient used to scale the learning rate. Default set to 0.001. - lars_epsilon : float, optional - Optional epsilon in case of very small gradients. Default set to 0. - momentum_correction : bool, optional - If True scale momentum w.r.t global learning rate change (with an lr_scheduler) \ - as indicated in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour` \ - (https://arxiv.org/pdf/1706.02677.pdf) - Default set to True. - """ - def __init__(self, momentum=0.0, lazy_update=True, eta=0.001, eps=0, - momentum_correction=True, **kwargs): - super(LARS, self).__init__(**kwargs) - self.momentum = momentum - self.momentum_correction = momentum_correction - self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) - self.eta = eta - self.eps = eps - self.skip = 0 - self.last_lr = None - self.cur_lr = None - - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. - - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. 
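The layer-wise scaling that LARS applies, as quoted in the class docstring above, fits in a few lines of NumPy; the `lr_mult` factor and the bias/batch-norm exemption are left out to keep the sketch short:

```python
# Standalone sketch of the LARS learning-rate scaling; the arrays at the
# bottom are made up purely for illustration.
import numpy as np

def lars_lr(weight, grad, lr, eta=0.001, wd=0.0, eps=0.0):
    """Scale lr by eta * ||w|| / (||g|| + wd * ||w|| + eps)."""
    w_norm = np.linalg.norm(weight)
    g_norm = np.linalg.norm(grad)
    if w_norm > 0.0 and g_norm > 0.0:
        return lr * eta * w_norm / (g_norm + wd * w_norm + eps)
    return lr

weight = np.random.randn(256, 128).astype(np.float32)
grad = 0.01 * np.random.randn(256, 128).astype(np.float32)
print(lars_lr(weight, grad, lr=0.1, wd=1e-4))
```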
- """ - if self.cur_lr is not None: - self.last_lr = self.cur_lr - - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - if self.cur_lr is None: - self.last_lr = lr - self.cur_lr = lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs - - def set_wd_mult(self, args_wd_mult): - self.wd_mult = {} - for n in self.idx2name.values(): - is_weight = n.endswith('_weight') - - if not is_weight: - self.wd_mult[n] = 0.0 - - if self.sym_info: - attr, arg_names = self.sym_info - for name in arg_names: - if name in attr and '__wd_mult__' in attr[name]: - self.wd_mult[name] = float(attr[name]['__wd_mult__']) - self.wd_mult.update(args_wd_mult) - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. " - "Consider using multi_precision=True option of the " - "SGD optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - stype = weight.stype if self.lazy_update else 'default' - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) - return momentum - - def _l2norm(self, v, rescale=False): - """L2 Norm implementation""" - v = v.astype('float32') - if rescale: - v *= self.rescale_grad - norm = NDnorm(v).asnumpy()[0] - return norm - - def _get_lars(self, i, weight, g, lr, wd): - """Returns a scaling factor for the learning rate for this layer""" - name = self.idx2name[i] if i in self.idx2name else str(i) - if name.endswith('gamma') or name.endswith('beta') or name.endswith('bias'): - return lr - - w_norm = self._l2norm(weight) - g_norm = self._l2norm(g, rescale=True) - - if w_norm > 0.0 and g_norm > 0.0: - lars = self.eta * w_norm/(g_norm + wd * w_norm + self.eps) - else: - lars = 1.0 - return lars * lr - - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = (self.momentum * (self.cur_lr / self.last_lr)) \ - if (self.momentum_correction and self.last_lr != 0) else \ - self.momentum - - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if aggregate: - nb_params = len(indices) - names = [self.idx2name[i] if i in self.idx2name else str(i) for i in indices] - lars_idx = [i for i in range(nb_params) if - not(names[i].endswith('gamma') or names[i].endswith('beta') or - names[i].endswith('bias'))] - nb_lars = len(lars_idx) - no_lars_idx = [i 
for i in range(nb_params) if - (names[i].endswith('gamma') or names[i].endswith('beta') or - names[i].endswith('bias'))] - cur_ctx = weights[0].context - full_idx = lars_idx + no_lars_idx - new_lrs = array([lrs[i] for i in full_idx], ctx=cur_ctx, dtype='float32') - new_wds = array([wds[i] for i in full_idx], ctx=cur_ctx, dtype='float32') - new_weights = [weights[i] for i in full_idx] - new_grads = [grads[i] for i in full_idx] - new_states = [states[i] for i in full_idx] - if nb_lars > 0: - w_sum_sq = multi_sum_sq(*new_weights[:nb_lars], num_arrays=nb_lars) - g_sum_sq = multi_sum_sq(*new_grads[:nb_lars], num_arrays=nb_lars) - multi_lars(new_lrs[:nb_lars], w_sum_sq, g_sum_sq, new_wds[:nb_lars], - eta=self.eta, eps=self.eps, rescale_grad=self.rescale_grad, - out=new_lrs[:nb_lars]) - # Same than usual using preloaded sgd functions - sidx = 0 - while sidx < len(indices): - eidx = sidx + len(new_weights[sidx:sidx+self.aggregate_num]) - if not multi_precision: - if self.momentum > 0: - preloaded_multi_sgd_mom_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - new_states[sidx:eidx])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - preloaded_multi_sgd_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - if self.momentum > 0: - preloaded_multi_mp_sgd_mom_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - *zip(*new_states[sidx:eidx]))) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - preloaded_multi_mp_sgd_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - list(zip(*new_states[sidx:eidx]))[1])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - sidx += self.aggregate_num - else: - lrs = [self._get_lars(i, w, g, lr, wd) for (i, w, g, lr, wd) in - zip(indices, weights, grads, lrs, wds)] - - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -# -@register -class LBSGD(Optimizer): - """The Large Batch SGD optimizer with momentum and weight decay. 
- - The optimizer updates the weight by:: - - state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight - weight = weight - state - - For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update` - and :class:`~mxnet.ndarray.sgd_mom_update`. - In addition to the SGD updates the LBSGD optimizer uses the LARS, Layer-wise - Adaptive Rate Scaling, algorithm to have a separate learning rate for each - layer of the network, which leads to better stability over large batch sizes. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - - warmup_strategy: string ('linear', 'power2', 'sqrt'. , 'lars' default : 'linear') - warmup_epochs: unsigned, default: 5 - batch_scale: unsigned, default: 1 (same as batch size * numworkers) - updates_per_epoch: updates_per_epoch (default: 32, Default might not reflect true number batches per epoch. Used for warmup.) - begin_epoch: unsigned, default 0, starting epoch. - """ - def __init__(self, momentum=0.0, multi_precision=False, warmup_strategy='linear', - warmup_epochs=5, batch_scale=1, updates_per_epoch=32, begin_epoch=0, num_epochs=60, - **kwargs): - super(LBSGD, self).__init__(**kwargs) - logging.info('Running Large-Batch SGD Algorithm') - logging.info('(Batch_scale=%f, warmup_epochs=%d, warmup_strategy=%s, updates_per_epoch=%d)', - batch_scale, warmup_epochs, warmup_strategy, updates_per_epoch) - self.momentum = momentum - self.multi_precision = multi_precision - # new user parameters for large batch - self.warmup_strategy = warmup_strategy - self.warmup_epochs = warmup_epochs - self.batch_scale = batch_scale - self.updates_per_epoch = updates_per_epoch - self.init_updates = begin_epoch * updates_per_epoch - self.num_epochs = num_epochs - # addl internal usage parameters and storage - self.lbmult = 1 - self.cumgrads = {} - # for adaptive lr - self.adaptive = False - self.admult = 1 # adaptation constant - - def create_state(self, index, weight): - momentum = None - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = array(weight, ctx=weight.context, dtype=numpy.float32) - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=numpy.float32, - stype=weight.stype) - return (momentum, weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. 
" - "Consider using multi_precision=True option of the " - "SGD optimizer") - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def _get_lbmult(self, nup): - """Returns lr scaling factor for large batch according to warmup schedule - (to be implemented) - """ - nwup = self.warmup_epochs * self.updates_per_epoch - strategy = self.warmup_strategy - maxmult = float(self.batch_scale) - if nup >= nwup: - mult = maxmult - elif nwup <= 1: - mult = 1.0 - else: - if (strategy == 'linear'): - mult = 1.0 + (maxmult - 1) * nup / nwup - elif (strategy == 'power2'): - mult = 1.0 + (maxmult-1) * (nup*nup)/(nwup*nwup) - elif (strategy == 'sqrt'): - mult = 1.0 + (maxmult - 1) * math.sqrt(float(nup) / nwup) - else: - mult = 1.0 - return mult - - def _get_lars(self, weight, g, wd): - """Returns a scaling factor for the learning rate for this layer - default is 1 - """ - weight2 = self._l2norm(weight) - grad2 = self._l2norm(g) - lars = math.sqrt(weight2 / (grad2 + wd * weight2 + 1e-18)) - if lars < 0.01: - lars = 0.01 - elif lars > 100: - lars = 100 - return lars - - def _l2norm(self, v): - "inner product implementation" - norm = multiply(v, v).asnumpy().sum() - return norm - - def _reset_cum_gradient(self, index): - "called every macro-batch to reset cumulated gradients to 0 for a given index" - self.cumgrads[index]['cum_grad'] = 0 - - def _get_cum_gradient(self, index): - "get the cumulated gradient for index" - if index in self.cumgrads: - return self.cumgrads[index] - else: - return {} - - def _put_cum_gradient(self, index, cgrad): - "store cumulated gradient for index" - self.cumgrads[index] = cgrad - - def _cumulate_gradient(self, grad, index): - "Cumulate gradients for large-batch emulation. 
Cumulated by index (layer)" - cgrad = self._get_cum_gradient(index) - if cgrad: - num_cums = cgrad['num_cums'] - if num_cums > 0: - cum_grad = cgrad['cum_grad'] + grad - num_cums += 1 - else: - cum_grad = grad - num_cums = self.init_updates + 1 - else: - cum_grad = grad - num_cums = self.init_updates + 1 - cgrad = {'cum_grad': cum_grad, 'num_cums': num_cums} - self._put_cum_gradient(index, cgrad) - return cgrad - - def update(self, index, weight, grad, state): - assert (isinstance(weight, NDArray)) - assert (isinstance(grad, NDArray)) - - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - - # new stuff for large batch - cgrad = self._cumulate_gradient(grad, index) - if (cgrad['num_cums'] % self.batch_scale) == 0: - grad = cgrad['cum_grad'] / self.batch_scale - if self.warmup_strategy == 'lars': - lbmult = self._get_lars(weight, grad, wd) - else: - lbmult = self._get_lbmult(cgrad['num_cums']) - lr = lr * lbmult - # do the regular sgd update flow - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - use_multi_precision = isinstance(state, (list, tuple)) - - if not use_multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, lr=lr, wd=wd, - **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, lr=lr, wd=wd, **kwargs) - # reset update count and cumulated gradient per large batch - self._reset_cum_gradient(index) - else: - lr = 0.0 - kwargs = {} - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - - -@register -class LAMB(Optimizer): - """LAMB Optimizer. 
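For reference, the warmup multiplier computed by `_get_lbmult()` above can be reproduced standalone. The sketch below follows the same 'linear', 'power2' and 'sqrt' strategies; the arguments in the final line are made up for illustration:

```python
# Standalone sketch of the LBSGD warmup schedule removed above.
import math

def warmup_mult(nup, warmup_epochs=5, updates_per_epoch=32, batch_scale=1,
                strategy='linear'):
    """Learning-rate multiplier after `nup` optimizer updates."""
    nwup = warmup_epochs * updates_per_epoch
    maxmult = float(batch_scale)
    if nup >= nwup:
        return maxmult
    if nwup <= 1:
        return 1.0
    if strategy == 'linear':
        return 1.0 + (maxmult - 1) * nup / nwup
    if strategy == 'power2':
        return 1.0 + (maxmult - 1) * (nup * nup) / (nwup * nwup)
    if strategy == 'sqrt':
        return 1.0 + (maxmult - 1) * math.sqrt(float(nup) / nwup)
    return 1.0

print([warmup_mult(n, batch_scale=8) for n in (0, 40, 80, 120, 160)])
# -> [1.0, 2.75, 4.5, 6.25, 8.0]
```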
- """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, - lower_bound=None, upper_bound=None, bias_correction=True, **kwargs): - super(LAMB, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lower_bound = lower_bound - self.upper_bound = upper_bound - self.bias_correction = bias_correction - self.aggregate_num = max(1, min(45, int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "45")))) - - def create_state(self, index, weight): - stype = weight.stype - dtype = weight.dtype - return (zeros(weight.shape, weight.context, dtype=dtype, stype=stype), - zeros(weight.shape, weight.context, dtype=dtype, stype=stype)) - - def _update_impl(self, index, weight, grad, state, multi_precision=False): - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'bias_correction': self.bias_correction, - 'rescale_grad': self.rescale_grad} - - if self.aggregate_num <= 1 or not isinstance(index, (tuple, list)): - if isinstance(index, (tuple, list)): - assert(len(index) == self.aggregate_num) - index, weight, grad, state = index[0], weight[0], grad[0], state[0] - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - weight_ptr = weight - grad_ptr = grad - if multi_precision: - mean, var = state[1] - weight32 = state[0] - else: - mean, var = state - kwargs['t'] = t - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if multi_precision: - g = mp_lamb_update_phase1(weight_ptr, grad_ptr, mean, var, weight32, wd=wd, **kwargs) - kwargs = {} - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - r_1 = weight32.norm() - r_2 = g.norm() - mp_lamb_update_phase2(weight_ptr, g, r_1, r_2, weight32, lr=lr, out=weight_ptr, **kwargs) - else: - g = lamb_update_phase1(weight_ptr, grad_ptr, mean, var, wd=wd, **kwargs) - kwargs = {} - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - r_1 = weight_ptr.norm() - r_2 = g.norm() - lamb_update_phase2(weight_ptr, g, r_1, r_2, lr=lr, out=weight_ptr, **kwargs) - else: - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - - step_count, lrs, wds = [], [], [] - for i, w_i, g_i in zip(index, weight, grad): - assert(isinstance(w_i, NDArray)) - assert(isinstance(g_i, NDArray)) - self._update_count(i) - step_count.append(self._index_update_count[i]) - lrs.append(self._get_lr(i)) - wds.append(self._get_wd(i)) - - updated_tensors = 0 - while updated_tensors < len(weight): - sidx = updated_tensors - eidx = min(updated_tensors + self.aggregate_num, len(weight)) - if not multi_precision: - mean, var = list(zip(*state[sidx:eidx])) - multi_lamb_update(weight[sidx:eidx], - grad[sidx:eidx], - mean, var, - out=weight[sidx:eidx], - step_count=step_count[sidx:eidx], - lrs=lrs[sidx:eidx], - wds=wds[sidx:eidx], - **kwargs) - else: - mean_var = list(zip(*state[sidx:eidx]))[1] - temp = list(zip(*mean_var)) - mean = temp[0] - var = temp[1] - multi_mp_lamb_update(weight[sidx:eidx], - grad[sidx:eidx], - mean, var, - list(zip(*state[sidx:eidx]))[0], - out=weight[sidx:eidx], - step_count=step_count[sidx:eidx], - 
lrs=lrs[sidx:eidx], - wds=wds[sidx:eidx], - **kwargs) - updated_tensors += self.aggregate_num - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -# pylint: enable=line-too-long -@register -class DCASGD(Optimizer): - """The DCASGD optimizer. - - This class implements the optimizer described in *Asynchronous Stochastic Gradient Descent - with Delay Compensation for Distributed Deep Learning*, - available at https://arxiv.org/abs/1609.08326. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - - lamda : float, optional - Scale DC value. - """ - def __init__(self, momentum=0.0, lamda=0.04, **kwargs): - super(DCASGD, self).__init__(**kwargs) - self.momentum = momentum - self.weight_previous = {} - self.lamda = lamda - - def create_state(self, index, weight): - if self.momentum == 0.0: - return (None, - weight.copy()) # previous weight - else: - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # momentum - weight.copy()) # previous weight - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - mom, previous_weight = state - if mom: - mom[:] *= self.momentum - mom[:] += -lr * (grad + wd * weight + self.lamda \ - * grad * grad * (weight - previous_weight)) - else: - assert(self.momentum == 0.0) - mom = -lr * (grad + wd * weight + self.lamda \ - * grad * grad * (weight - previous_weight)) - previous_weight[:] = weight - weight[:] += mom - -@register -class NAG(Optimizer): - """Nesterov accelerated gradient. - - This optimizer updates each weight by:: - - state = momentum * state + grad + wd * weight - weight = weight - (lr * (grad + momentum * state)) - - Parameters - ---------- - momentum : float, optional - The momentum value. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - """ - def __init__(self, momentum=0.0, **kwargs): - super(NAG, self).__init__(**kwargs) - self.momentum = momentum - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. 
" - "Consider using multi_precision=True option of the " - "NAG optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def _update_impl(self, index, weight, grad, state, multi_precision=False): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if not multi_precision: - if state is not None: - nag_mom_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_nag_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 \ - and isinstance(state, (tuple, list)) - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - - -@register -class SGLD(Optimizer): - """Stochastic Gradient Riemannian Langevin Dynamics. - - This class implements the optimizer described in the paper *Stochastic Gradient - Riemannian Langevin Dynamics on the Probability Simplex*, available at - https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. - - """ - def __init__(self, **kwargs): - super(SGLD, self).__init__(**kwargs) - - def create_state(self, index, weight): - return None - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - weight[:] += - lr/2 * (grad + wd * weight) - weight[:] += normal(0, math.sqrt(lr), shape=weight.shape, - dtype=weight.dtype, ctx=weight.context) - - - -@register # pylint: disable=invalid-name -class ccSGD(SGD): - """[DEPRECATED] Same as `SGD`. Left here for backward compatibility.""" - def __init__(self, *args, **kwargs): - super(ccSGD, self).__init__(*args, **kwargs) - -@register -class Adam(Optimizer): - """The Adam optimizer. - - This class implements the optimizer described in *Adam: A Method for - Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. 
- - If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \ - **lazy updates** at step t are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad + wd * weight[row], clip_gradient) - m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row] - v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2) - lr = learning_rate * sqrt(1 - beta1**t) / (1 - beta2**t) - w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon) - - The lazy update only updates the mean and var for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all indices. - Compared with the original update, it can provide large improvements in model training - throughput for some applications. However, it provides slightly different semantics than - the original update, and may lead to different empirical results. - - Otherwise, **standard updates** at step t are applied by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m + (1 - beta1) * rescaled_grad - v = beta2 * v + (1 - beta2) * (rescaled_grad**2) - lr = learning_rate * sqrt(1 - beta1**t) / (1 - beta2**t) - w = w - lr * m / (sqrt(v) + epsilon) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - For details of the update algorithm, see :class:`~mxnet.ndarray.adam_update`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - epsilon : float, optional - Small value to avoid division by 0. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - lazy_update=True, **kwargs): - super(Adam, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lazy_update = lazy_update - - def create_state(self, index, weight): - stype = weight.stype if self.lazy_update else 'default' - return (zeros(weight.shape, weight.context, dtype=weight.dtype, - stype=stype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype, - stype=stype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - coef1 = 1. - self.beta1**t - coef2 = 1. - self.beta2**t - lr *= math.sqrt(coef2)/coef1 - - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - mean, var = state - adam_update(weight, grad, mean, var, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - -@register -class AdaGrad(Optimizer): - """AdaGrad optimizer. - - This class implements the AdaGrad optimizer described in *Adaptive Subgradient - Methods for Online Learning and Stochastic Optimization*, and available at - http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. 
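The standard (dense) Adam update removed above can be cross-checked with a short NumPy reference. The sketch follows the implementation, which applies the usual bias correction `lr * sqrt(1 - beta2**t) / (1 - beta1**t)`; gradient clipping is omitted:

```python
# NumPy reference for one dense Adam step; not MXNet's fused kernel.
import numpy as np

def adam_step(weight, mean, var, grad, t, lr=0.001, beta1=0.9, beta2=0.999,
              epsilon=1e-8, wd=0.0, rescale_grad=1.0):
    """One Adam step; `weight`, `mean` and `var` are updated in place."""
    grad = grad * rescale_grad + wd * weight
    mean[:] = beta1 * mean + (1. - beta1) * grad
    var[:] = beta2 * var + (1. - beta2) * np.square(grad)
    lr_t = lr * (1. - beta2 ** t) ** 0.5 / (1. - beta1 ** t)
    weight[:] -= lr_t * mean / (np.sqrt(var) + epsilon)
```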
- - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad, clip_gradient) - history += square(grad) - div = grad / sqrt(history + float_stable_eps) - weight += (div + weight * wd) * -lr - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - See Also - ---------- - :meth:`mxnet.ndarray.sparse.adagrad_update`. - - Parameters - ---------- - eps: float, optional - Initial value of the history accumulator. Avoids division by 0. - - """ - def __init__(self, eps=1e-7, **kwargs): - super(AdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps - - def create_state(self, index, weight): - return zeros(weight.shape, weight.context, stype=weight.stype) # history - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - is_sparse = grad.stype == 'row_sparse' - history = state - - if is_sparse: - kwargs = {'epsilon': self.float_stable_eps, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) - else: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += square(grad) - div = grad / sqrt(history + self.float_stable_eps) - weight[:] += (div + weight * wd) * -lr - -@register -class RMSProp(Optimizer): - """The RMSProp optimizer. - - Two versions of RMSProp are implemented: - - If ``centered=False``, we follow - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012. - For details of the update algorithm see :class:`~mxnet.ndarray.rmsprop_update`. - - If ``centered=True``, we follow http://arxiv.org/pdf/1308.0850v5.pdf (38)-(45) - by Alex Graves, 2013. - For details of the update algorithm see :class:`~mxnet.ndarray.rmspropalex_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - gamma1: float, optional - A decay factor of moving average over past squared gradient. - gamma2: float, optional - A "momentum" factor. Only used if `centered`=``True``. - epsilon : float, optional - Small value to avoid division by 0. - centered : bool, optional - Flag to control which version of RMSProp to use.:: - - True: will use Graves's version of `RMSProp`, - False: will use Tieleman & Hinton's version of `RMSProp`. - - clip_weights : float, optional - Clips weights into range ``[-clip_weights, clip_weights]``. 
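Likewise, the AdaGrad accumulator update removed above has a direct dense NumPy equivalent (the `row_sparse` path only touches the rows present in `grad.indices`):

```python
# Dense NumPy sketch of the AdaGrad update; clipping omitted.
import numpy as np

def adagrad_step(weight, history, grad, lr=0.01, wd=0.0,
                 rescale_grad=1.0, eps=1e-7):
    """One AdaGrad step; `weight` and `history` are updated in place."""
    grad = grad * rescale_grad
    history[:] += np.square(grad)
    div = grad / np.sqrt(history + eps)
    weight[:] -= lr * (div + wd * weight)
```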
- """ - def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, - epsilon=1e-8, centered=False, clip_weights=None, **kwargs): - super(RMSProp, self).__init__(learning_rate=learning_rate, **kwargs) - self.gamma1 = gamma1 - self.gamma2 = gamma2 - self.centered = centered - self.epsilon = epsilon - self.clip_weights = clip_weights - - def create_state(self, index, weight): - if self.centered: - return ( - zeros(weight.shape, weight.context, stype=weight.stype), # n - zeros(weight.shape, weight.context, stype=weight.stype), # g - zeros(weight.shape, weight.context, stype=weight.stype)) # delta - else: - return (zeros(weight.shape, weight.context, stype=weight.stype),) # n - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'gamma1': self.gamma1, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad} - if self.centered: - kwargs['gamma2'] = self.gamma2 - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.clip_weights: - kwargs['clip_weights'] = self.clip_weights - - if not self.centered: - (n, ) = state - rmsprop_update( - weight, grad, n, out=weight, lr=lr, wd=wd, **kwargs) - else: - n, g, delta = state - rmspropalex_update(weight, grad, n, g, delta, out=weight, - lr=lr, wd=wd, **kwargs) - -@register -class AdaDelta(Optimizer): - """The AdaDelta optimizer. - - This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive - learning rate method*, available at https://arxiv.org/abs/1212.5701. - - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - acc_grad = rho * acc_grad + (1. - rho) * grad * grad - delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad - acc_delta = rho * acc_delta + (1. - rho) * delta * delta - weight -= (delta + wd * weight) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - rho: float - Decay rate for both squared gradients and delta. - epsilon : float - Small value to avoid division by 0. - """ - def __init__(self, rho=0.90, epsilon=1e-5, **kwargs): - super(AdaDelta, self).__init__(**kwargs) - self.rho = rho - self.epsilon = epsilon - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context), # accumulated g - zeros(weight.shape, weight.context)) # accumulated delta - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - wd = self._get_wd(index) - self._update_count(index) - - # preprocess grad - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, - self.clip_gradient, self.clip_gradient) - - # accumulated g and delta initlization - acc_g, acc_delta = state - - # update g, delta - acc_g[:] *= self.rho - acc_g[:] += (1. - self.rho) * grad * grad - current_delta = sqrt(acc_delta + self.epsilon) / sqrt(acc_g + self.epsilon) * grad - acc_delta[:] *= self.rho - acc_delta[:] += (1. - self.rho) * current_delta * current_delta - - # update weight - weight[:] -= current_delta + wd * weight - -#pylint: disable=invalid-name -#pylint: disable=line-too-long -@register -class Ftrl(Optimizer): - """The Ftrl optimizer. - - Referenced from *Ad Click Prediction: a View from the Trenches*, available at - http://dl.acm.org/citation.cfm?id=2488200. - - eta : - .. 
math:: - \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}} - - The optimizer updates the weight by:: - - rescaled_grad = clip(grad * rescale_grad, clip_gradient) - z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate - n += rescaled_grad**2 - w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) - - If the storage types of weight, state and grad are all ``row_sparse``, \ - **sparse updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) - z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate - n[row] += rescaled_grad[row]**2 - w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) - - The sparse update only updates the z and n for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - lamda1 : float, optional - L1 regularization coefficient. - learning_rate : float, optional - The initial learning rate. - beta : float, optional - Per-coordinate learning rate correlation parameter. - """ - - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, **kwargs): - super(Ftrl, self).__init__(**kwargs) - self.lamda1 = lamda1 - self.beta = beta - self.lr = learning_rate - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, stype=weight.stype), # z - zeros(weight.shape, weight.context, stype=weight.stype)) # n - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - wd = self._get_wd(index) - lr = self._get_lr(index) - - kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - # accumulated g and delta initialization - z, n = state - ftrl_update(weight, grad, z, n, out=weight, - lr=lr, wd=wd, **kwargs) - -# pylint: enable=line-too-long -@register -class Adamax(Optimizer): - """The AdaMax optimizer. - - It is a variant of Adam based on the infinity norm - available at http://arxiv.org/abs/1412.6980 Section 7. - - The optimizer updates the weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m_t + (1 - beta1) * grad - u = maximum(beta2 * u, abs(grad)) - weight -= lr / (1 - beta1**t) * m / u - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. 
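The AdaMax update described in the docstring above (Adam based on the infinity norm) can be sketched the same way; clipping is again omitted and the defaults follow the docstring:

```python
# Dense NumPy sketch of one AdaMax step.
import numpy as np

def adamax_step(weight, m, u, grad, t, lr=0.002, beta1=0.9, beta2=0.999,
                wd=0.0, rescale_grad=1.0):
    """One AdaMax step; `weight`, `m` and `u` are updated in place."""
    grad = grad * rescale_grad + wd * weight
    m[:] = beta1 * m + (1. - beta1) * grad
    u[:] = np.maximum(beta2 * u, np.abs(grad))
    weight[:] -= lr / (1. - beta1 ** t) * m / u
```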
- """ - def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs): - super(Adamax, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - lr /= (1. - self.beta1**t) - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - # update m_t and u_t - m_t, u_t = state - m_t[:] *= self.beta1 - m_t[:] += (1. - self.beta1) * grad - u_t[:] = maximum(self.beta2 * u_t, NDabs(grad)) - - # update weight - weight[:] -= lr * m_t / u_t - -@register -class Nadam(Optimizer): - """The Nesterov Adam optimizer. - - Much like Adam is essentially RMSprop with momentum, - Nadam is Adam RMSprop with Nesterov momentum available - at http://cs229.stanford.edu/proj2015/054_report.pdf. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - epsilon : float, optional - Small value to avoid division by 0. - schedule_decay : float, optional - Exponential decay rate for the momentum schedule - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - schedule_decay=0.004, **kwargs): - super(Nadam, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.schedule_decay = schedule_decay - self.m_schedule = 1. - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - # warming momentum schedule - momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay))) - momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay))) - self.m_schedule = self.m_schedule * momentum_t - m_schedule_next = self.m_schedule * momentum_t_1 - - # update m_t and v_t - m_t, v_t = state - m_t[:] *= self.beta1 - m_t[:] += (1. - self.beta1) * grad - v_t[:] *= self.beta2 - v_t[:] += (1. - self.beta2) * grad * grad - - grad_prime = grad / (1. - self.m_schedule) - m_t_prime = m_t / (1. - m_schedule_next) - v_t_prime = v_t / (1. - pow(self.beta2, t)) - m_t_bar = (1. 
- momentum_t) * grad_prime + momentum_t_1 * m_t_prime - - # update weight - weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon) - +# pylint: disable=W0223 @register class Test(Optimizer): """The Test optimizer""" @@ -2037,139 +567,14 @@ def create_state(self, index, weight): """Creates a state to duplicate weight.""" return zeros(weight.shape, weight.context) - def update(self, index, weight, grad, state): + def step(self, indices, weights, grads, states): """Performs w += rescale_grad * grad.""" - weight[:] += grad * self.rescale_grad - state[:] = weight - -# backward compatibility wrapper for Optimizer.CreateOptimizer -create = Optimizer.create_optimizer # pylint: disable=invalid-name - - -def _as_classic(a, allow_np): - # TODO(junwu): This is a temp solution for allowing converting - # np.ndarray to mx.nd.NDArray to be fed into the optimizer since - # users may have custom optimizers implemented using mx.nd.NDArray ops. - from ..numpy import ndarray as np_ndarray - if isinstance(a, (tuple, list)): - if any(isinstance(x, np_ndarray) for x in a): - if allow_np: - return [x.as_nd_ndarray() for x in a] - else: - raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - else: - if isinstance(a, np_ndarray): - if allow_np: - return a.as_nd_ndarray() - else: - raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - return a - - - -class Updater(object): - """Updater for kvstore.""" - def __init__(self, optimizer): - self.optimizer = optimizer - self.states = {} - self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 - - def __call__(self, index, grad, weight): - """Updates weight given gradient and index.""" - allow_np = self.optimizer.allow_np_array if hasattr(self.optimizer, "allow_np_array") else is_np_array() - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [_as_classic(grad, allow_np)] - weights = [_as_classic(weight, allow_np)] - else: - indices = index - grads = _as_classic(grad, allow_np) - weights = _as_classic(weight, allow_np) - if weights: - self.optimizer._set_current_context(weights[0].context.device_id) - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - 
self.optimizer.update_multi_precision(i, w, g, self.states[i]) - - def sync_state_context(self, state, context): - """sync state context.""" - if isinstance(state, NDArray): - return state.as_in_context(context) - elif isinstance(state, (tuple, list)): - synced_state = (self.sync_state_context(i, context) for i in state) - if isinstance(state, tuple): - return tuple(synced_state) - else: - return list(synced_state) - else: - return state - - def set_states(self, states): - """Sets updater states.""" - states = pickle.loads(states) - if isinstance(states, tuple) and len(states) == 2: - self.states, self.optimizer = states - else: - self.states = states - self.states_synced = dict.fromkeys(self.states.keys(), False) - - def get_states(self, dump_optimizer=False): - """Gets updater states. - - Parameters - ---------- - dump_optimizer : bool, default False - Whether to also save the optimizer itself. This would also save optimizer - information such as learning rate and weight decay schedules. - """ - return pickle.dumps((self.states, self.optimizer) if dump_optimizer else self.states) - -def get_updater(optimizer): - """Returns a closure of the updater needed for kvstore. + for index, weight, grad in zip(indices, weights, grads): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + grad = self.rescale_grad * grad + weight[:] -= lr * (grad + wd * weight) - Parameters - ---------- - optimizer: Optimizer - The optimizer. - Returns - ------- - updater: function - The closure of the updater. - """ - return Updater(optimizer) +create = Optimizer.create_optimizer # pylint: disable=invalid-name diff --git a/python/mxnet/optimizer/rmsprop.py b/python/mxnet/optimizer/rmsprop.py new file mode 100644 index 000000000000..2d4b2d618d64 --- /dev/null +++ b/python/mxnet/optimizer/rmsprop.py @@ -0,0 +1,181 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""RMSProp optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import (rmsprop_update, rmspropalex_update) +from .optimizer import Optimizer, register + +__all__ = ['RMSProp'] + + +@register +class RMSProp(Optimizer): + """The RMSProp optimizer. + + Two versions of RMSProp are implemented: + + If ``centered=False``, we follow + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by + Tieleman & Hinton, 2012. + For details of the update algorithm see :class:`~mxnet.ndarray.rmsprop_update`. + + If ``centered=True``, we follow http://arxiv.org/pdf/1308.0850v5.pdf (38)-(45) + by Alex Graves, 2013. + For details of the update algorithm see :class:`~mxnet.ndarray.rmspropalex_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. 
+ + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + rho: float, default 0.9 + A decay factor of moving average over past squared gradient. + momentum: float, default 0.9 + Heavy ball momentum factor. Only used if `centered`=``True``. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + centered : bool, default False + Flag to control which version of RMSProp to use.:: + + True: will use Graves's version of `RMSProp`, + False: will use Tieleman & Hinton's version of `RMSProp`. + + clip_weights : float, optional + Clips weights into range ``[-clip_weights, clip_weights]``. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.9, + epsilon=1e-8, centered=False, clip_weights=None, + use_fused_step=True, **kwargs): + super(RMSProp, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.rho = rho + self.momentum = momentum + self.centered = centered + self.epsilon = epsilon + self.clip_weights = clip_weights + + def create_state(self, index, weight): + if self.centered: + return ( + zeros(weight.shape, weight.context, stype=weight.stype), # mean + zeros(weight.shape, weight.context, stype=weight.stype), # var + zeros(weight.shape, weight.context, stype=weight.stype)) # mom + else: + return zeros(weight.shape, weight.context, stype=weight.stype) # var + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + if not self.centered: + # update var + var = state + var[:] *= self.rho + var[:] += (1 - self.rho) * square(grad) + + # update weight + d = grad / (sqrt(var) + self.epsilon) + weight[:] -= lr * d + else: + # update mean, var, mom + mean, var, mom = state + mean[:] *= self.rho + mean[:] += (1 - self.rho) * grad + var[:] *= self.rho + var[:] += (1 - self.rho) * square(grad) + mom[:] *= self.momentum + mom[:] -= lr * grad / sqrt(var - square(mean) + self.epsilon) + + # update weight + weight[:] += mom + + if self.clip_weights: + clip(weight, -self.clip_weights, self.clip_weights, out=weight) + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. 
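    Depending on ``centered``, this dispatches either to
    :class:`~mxnet.ndarray.rmsprop_update` (``centered=False``) or to
    :class:`~mxnet.ndarray.rmspropalex_update` (``centered=True``), as shown in
    the body below.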
+ + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + kwargs = {'rho': self.rho, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.centered: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.clip_weights: + kwargs['clip_weights'] = self.clip_weights + + # update weight with fused kernel + if not self.centered: + var = state + rmsprop_update(weight, grad, var, out=weight, lr=lr, wd=wd, **kwargs) + else: + mean, var, mom = state + rmspropalex_update(weight, grad, mean, var, mom, out=weight, + lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/sgd.py b/python/mxnet/optimizer/sgd.py new file mode 100644 index 000000000000..7b2905710806 --- /dev/null +++ b/python/mxnet/optimizer/sgd.py @@ -0,0 +1,242 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""SGD optimizer""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip) +from ..ndarray import (sgd_update, sgd_mom_update, + mp_sgd_update, mp_sgd_mom_update, + multi_sgd_update, multi_sgd_mom_update, + multi_mp_sgd_update, multi_mp_sgd_mom_update) +from .optimizer import Optimizer, register +from .utils import _flatten_list + +__all__ = ['SGD'] + + +@register +class SGD(Optimizer): + """The SGD optimizer with momentum and weight decay. + + If the storage types of grad is ``row_sparse`` and ``lazy_update`` is True, \ + **lazy updates** are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(rescale_grad * grad[row] + wd * weight[row], clip_gradient) + state[row] = momentum[row] * state[row] + lr * rescaled_grad[row] + weight[row] = weight[row] - state[row] + + The sparse update only updates the momentum for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all + indices. Compared with the original update, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original update, and + may lead to different empirical results. 
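    As an illustration only (an editor's sketch, not part of this patch), lazy
    sparse updates can be requested explicitly when constructing the optimizer,
    assuming the usual ``mxnet`` import::

        import mxnet as mx
        opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9,
                               lazy_update=True, use_fused_step=True)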
+ + In the case when ``update_on_kvstore`` is set to False (either globally via + MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in + :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update + of parameters, which may lead to improved performance. The aggregation size + is controlled by ``aggregate_num`` and defaults to 4. + + Otherwise, **standard updates** are applied by:: + + rescaled_grad = clip(rescale_grad * grad, clip_gradient)) + wd * weight + state = momentum * state + lr * rescaled_grad + weight = weight - state + + For details of the update algorithm see + :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0. + The momentum value. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + multi_precision: bool, default False + Flag to control the internal precision of the optimizer. + False: results in using the same precision as the weights (default), + True: makes internal 32-bit copy of the weights and applies gradients + in 32-bit precision even if actual weights used in the model have lower precision. + Turning this on can improve convergence and accuracy when training with float16. + aggregate_num : int, default 1 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.0, lazy_update=False, + multi_precision=False, use_fused_step=True, aggregate_num=1, **kwargs): + super(SGD, self).__init__(learning_rate=learning_rate, + multi_precision=multi_precision, + aggregate_num=aggregate_num, + use_fused_step=use_fused_step, + **kwargs) + if not self.use_fused_step: + assert not lazy_update, \ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + if lazy_update: + assert not multi_precision, \ + 'When lazy_update is set to True, multi_precision has be turned off.' + self.momentum = momentum + self.lazy_update = lazy_update + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + stype = weight.stype if self.lazy_update else 'default' + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
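        Each parameter is handled independently: the gradient is rescaled,
        optionally clipped, weight decay is added, and then either a plain SGD
        step or a momentum step is applied, as in the body below.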
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + else: + mom = -lr * grad + + # update weight + weight[:] += mom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + # When either weight or gradient is sparse, aggregate is False. + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + if aggregate: + # update `aggregate_num` number of weights in a single kernel. + # this does not support sparse weight or gradient. + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + if self.momentum > 0: + multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + states = list(zip(*states)) + weights32, moms = states + if self.momentum > 0: + multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, + moms, weights32)), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + multi_mp_sgd_update(*_flatten_list(zip(weights, grads, + weights32)), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + if not multi_precision: + mom = state + if mom is not None: + sgd_mom_update(weight, grad, mom, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + # weight32 is a float32 copy of weight. + # in the kernel, we firstly update weight32, + # and then cast the result to float16 and save it to weight. + weight32, mom = state + if mom is not None: + mp_sgd_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. 
+ """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(SGD, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/sgld.py b/python/mxnet/optimizer/sgld.py new file mode 100644 index 000000000000..cc97fa5ebcc5 --- /dev/null +++ b/python/mxnet/optimizer/sgld.py @@ -0,0 +1,88 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=W0223 +"""SGLD optimizer.""" +from __future__ import absolute_import +import math +from ..ndarray import clip +from ..random import normal +from .optimizer import Optimizer, register + +__all__ = ['SGLD'] + + +@register +class SGLD(Optimizer): + """Stochastic Gradient Riemannian Langevin Dynamics. + + This class implements the optimizer described in the paper *Stochastic Gradient + Riemannian Langevin Dynamics on the Probability Simplex*, available at + https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, use_fused_step=False, **kwargs): + super(SGLD, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + + def create_state(self, index, weight): + return None + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad in zip(indices, weights, grads): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update weight + weight[:] -= lr / 2 * grad + weight[:] += normal(0, math.sqrt(lr), shape=weight.shape, + dtype=weight.dtype, ctx=weight.context) diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py new file mode 100644 index 000000000000..0bb44f9bb4cd --- /dev/null +++ b/python/mxnet/optimizer/signum.py @@ -0,0 +1,158 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Signum optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip) +from ..ndarray import (signsgd_update, signum_update) +from .optimizer import Optimizer, register + +__all__ = ['Signum'] + + +@register +class Signum(Optimizer): + r"""The Signum optimizer that takes the sign of gradient or momentum. + + The optimizer updates the weight by:: + + rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight + state = momentum * state + (1-momentum)*rescaled_grad + weight = (1 - lr * wd_lh) * weight - lr * sign(state) + + References + ---------- + Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). + signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. + + See: https://arxiv.org/abs/1802.04434 + + For details of the update algorithm see + :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, optional + The momentum value. + wd_lh : float, optional + The amount of decoupled weight decay regularization, see details in the original paper at:\ + https://arxiv.org/abs/1711.05101 + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. 
+ """ + def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, use_fused_step=True, **kwargs): + super(Signum, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + self.wd_lh = wd_lh + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + if state is not None: + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + mom[:] *= self.momentum + mom[:] -= (1 - self.momentum) * grad + + # update weight + weight[:] *= 1 - lr * self.wd_lh + weight[:] += lr * ((mom > 0) - (mom < 0)) + else: + # update weight + weight[:] *= 1 - lr * (wd + self.wd_lh) + weight[:] -= lr * ((grad > 0) - (grad < 0)) + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + # update weight with fused kernel + if state is not None: + if self.wd_lh: + kwargs['wd_lh'] = self.wd_lh + signum_update(weight, grad, state, out=weight, + lr=lr, wd=wd, **kwargs) + else: + wd += self.wd_lh + signsgd_update(weight, grad, out=weight, + lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/updater.py b/python/mxnet/optimizer/updater.py new file mode 100644 index 000000000000..62b700455075 --- /dev/null +++ b/python/mxnet/optimizer/updater.py @@ -0,0 +1,140 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Updater class.""" +from __future__ import absolute_import +import pickle +import numpy +from ..base import py_str +from ..ndarray import NDArray +from ..util import is_np_array +from .utils import _as_classic + +__all__ = ['Updater', 'get_updater'] + + +class Updater(object): + """Updater for kvstore.""" + def __init__(self, optimizer): + self.optimizer = optimizer + self.states = {} + self.states_synced = {} + self.aggregate_updates = optimizer.aggregate_num > 1 + + def __call__(self, index, grad, weight): + """Updates weight given gradient and index.""" + allow_np = self.optimizer.allow_np_array if hasattr(self.optimizer, "allow_np_array") else is_np_array() + if not isinstance(index, (list, tuple)): + indices = [index] + grads = [_as_classic(grad, allow_np)] + weights = [_as_classic(weight, allow_np)] + else: + indices = index + grads = _as_classic(grad, allow_np) + weights = _as_classic(weight, allow_np) + if weights: + self.optimizer._set_current_context(weights[0].context.device_id) + for i, idx in enumerate(indices): + # convert ctypes.char_p.value back to python str if needed + if isinstance(idx, bytes): + indices[i] = py_str(idx) + idx = indices[i] + if idx not in self.states: + self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) + self.states_synced[idx] = True + elif not self.states_synced[idx]: + self.states[idx] = \ + self.sync_state_context(self.states[idx], weights[i].context) + self.states_synced[idx] = True + if self.aggregate_updates: + # segregate values based on type + if self.optimizer.aggregate_num is not numpy.inf: + type_map = {} + for i, w, g in zip(indices, weights, grads): + if w.dtype in type_map: + type_map[w.dtype].append((i, w, g)) + else: + type_map[w.dtype] = [(i, w, g)] + for idx in type_map: + current_index = 0 + indices, weights, grads = zip(*type_map[idx]) + while current_index < len(indices): + states = [] + step = min(self.optimizer.aggregate_num, len(indices) - current_index) + for j in range(step): + states.append(self.states[indices[current_index + j]]) + self.optimizer.update_multi_precision( + indices[current_index:current_index + self.optimizer.aggregate_num], + weights[current_index:current_index + self.optimizer.aggregate_num], + grads[current_index:current_index + self.optimizer.aggregate_num], + states) + current_index += self.optimizer.aggregate_num + else: + states = [self.states[i] for i in indices] + self.optimizer.update_multi_precision(indices, weights, grads, states) + else: + for i, w, g in zip(indices, weights, grads): + self.optimizer.update_multi_precision([i], [w], [g], [self.states[i]]) + + def sync_state_context(self, state, context): + """sync state context.""" + if isinstance(state, NDArray): + return state.as_in_context(context) + elif isinstance(state, (tuple, list)): + synced_state = (self.sync_state_context(i, context) for i in state) + if isinstance(state, tuple): + return tuple(synced_state) + else: + return 
list(synced_state) + else: + return state + + def set_states(self, states): + """Sets updater states.""" + states = pickle.loads(states) + if isinstance(states, tuple) and len(states) == 2: + self.states, self.optimizer = states + else: + self.states = states + self.states_synced = dict.fromkeys(self.states.keys(), False) + + def get_states(self, dump_optimizer=False): + """Gets updater states. + + Parameters + ---------- + dump_optimizer : bool, default False + Whether to also save the optimizer itself. This would also save optimizer + information such as learning rate and weight decay schedules. + """ + return pickle.dumps((self.states, self.optimizer) if dump_optimizer else self.states) + + +def get_updater(optimizer): + """Returns a closure of the updater needed for kvstore. + + Parameters + ---------- + optimizer: Optimizer + The optimizer. + + Returns + ------- + updater: function + The closure of the updater. + """ + return Updater(optimizer) diff --git a/python/mxnet/optimizer/utils.py b/python/mxnet/optimizer/utils.py new file mode 100644 index 000000000000..f7dc136c10c7 --- /dev/null +++ b/python/mxnet/optimizer/utils.py @@ -0,0 +1,43 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Optimizer utility functions.""" +from __future__ import absolute_import + + +def _flatten_list(nested_list): + return [item for sublist in nested_list for item in sublist] + + +def _as_classic(a, allow_np): + # TODO(junwu): This is a temp solution for allowing converting + # np.ndarray to mx.nd.NDArray to be fed into the optimizer since + # users may have custom optimizers implemented using mx.nd.NDArray ops. 
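    # Accepts either a single array or a flat list/tuple of arrays; numpy-front-end
    # ndarrays are converted with as_nd_ndarray() only when allow_np is True,
    # otherwise a ValueError is raised.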
+ from ..numpy import ndarray as np_ndarray + if isinstance(a, (tuple, list)): + if any(isinstance(x, np_ndarray) for x in a): + if allow_np: + return [x.as_nd_ndarray() for x in a] + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + else: + if isinstance(a, np_ndarray): + if allow_np: + return a.as_nd_ndarray() + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + return a diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py index da8d5d738f46..8756b6a78ac9 100644 --- a/python/mxnet/symbol/numpy/_symbol.py +++ b/python/mxnet/symbol/numpy/_symbol.py @@ -43,7 +43,7 @@ 'trunc', 'logical_not', 'arcsinh', 'arccosh', 'arctanh', 'argsort', 'sort', 'tensordot', 'eye', 'linspace', 'logspace', 'expand_dims', 'tile', 'arange', 'array_split', 'split', 'hsplit', 'vsplit', 'dsplit', 'concatenate', 'append', 'stack', 'vstack', 'row_stack', 'column_stack', 'hstack', 'dstack', - 'average', 'mean', 'maximum', 'minimum', 'around', 'round', 'round_', + 'average', 'mean', 'maximum', 'minimum', 'around', 'round', 'round_', 'flatnonzero', 'swapaxes', 'clip', 'argmax', 'argmin', 'std', 'var', 'indices', 'copysign', 'ravel', 'unravel_index', 'diag_indices_from', 'hanning', 'hamming', 'blackman', 'flip', 'flipud', 'fliplr', 'hypot', 'bitwise_and', 'bitwise_xor', 'bitwise_or', 'rad2deg', 'deg2rad', 'unique', 'lcm', @@ -4664,6 +4664,32 @@ def unravel_index(indices, shape, order='C'): # pylint: disable=redefined-outer- raise NotImplementedError('Don not support column-major (Fortran-style) order at this moment') +def flatnonzero(a): + r""" + Return indices that are non-zero in the flattened version of a. + + This is equivalent to np.nonzero(np.ravel(a))[0]. + + Parameters + ---------- + a : _Symbol + Input data. + + Returns + ------- + res : _Symbol + Output array, containing the indices of the elements of `a.ravel()` + that are non-zero. + + See Also + -------- + nonzero : Return the indices of the non-zero elements of the input array. + ravel : Return a 1-D array containing the elements of the input array. 
+ """ + out = _npi.nonzero(ravel(a)) + return out.reshape(-1,) + + def diag_indices_from(arr): """ This returns a tuple of indices that can be used to access the main diagonal of an array diff --git a/python/mxnet/symbol/numpy_extension/random.py b/python/mxnet/symbol/numpy_extension/random.py index 35bc8489c27e..bad6a74d139f 100644 --- a/python/mxnet/symbol/numpy_extension/random.py +++ b/python/mxnet/symbol/numpy_extension/random.py @@ -165,18 +165,22 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): ctx = current_context() if batch_shape == (): batch_shape = None + else: + if isinstance(batch_shape, int): + batch_shape = (batch_shape,) + batch_shape = (-2,) + batch_shape if input_type == (True, True): - return _npi.uniform_n(low, high, low=None, high=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(low, high, low=None, high=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (False, True): - return _npi.uniform_n(high, low=low, high=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(high, low=low, high=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (True, False): - return _npi.uniform_n(low, low=None, high=high, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(low, low=None, high=high, size=batch_shape, + ctx=ctx, dtype=dtype) else: - return _npi.uniform_n(low=low, high=high, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.uniform(low=low, high=high, size=batch_shape, + ctx=ctx, dtype=dtype) def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): @@ -252,15 +256,19 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): ctx = current_context() if batch_shape == (): batch_shape = None + else: + if isinstance(batch_shape, int): + batch_shape = (batch_shape,) + batch_shape = (-2,) + batch_shape if input_type == (True, True): - return _npi.normal_n(loc, scale, loc=None, scale=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(loc, scale, loc=None, scale=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (False, True): - return _npi.normal_n(scale, loc=loc, scale=None, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(scale, loc=loc, scale=None, size=batch_shape, + ctx=ctx, dtype=dtype) elif input_type == (True, False): - return _npi.normal_n(loc, loc=None, scale=scale, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(loc, loc=None, scale=scale, size=batch_shape, + ctx=ctx, dtype=dtype) else: - return _npi.normal_n(loc=loc, scale=scale, size=batch_shape, - ctx=ctx, dtype=dtype) + return _npi.normal(loc=loc, scale=scale, size=batch_shape, + ctx=ctx, dtype=dtype) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 9a70f6e268e6..6a2c2456f05f 100755 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -2268,6 +2268,7 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc str(buckets), str(probs))) return cs_ret_l + def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): """Compare ndarray tuple.""" if t1 is None or t2 is None: @@ -2280,11 +2281,14 @@ def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): assert_almost_equal(t1, t2, rtol=rtol, atol=atol) -def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5, compare_states=True, ntensors=1): +def compare_optimizer(opt1, opt2, shapes, dtype, w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, 
compare_states=True): """Compare opt1 and opt2.""" - if not isinstance(shape, list): - assert(ntensors == 1) + + w1_list, w2_list = [], [] + g1_list, g2_list = [], [] + s1_list, s2_list = [], [] + for i, shape in enumerate(shapes): if w_stype == 'default': w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) w1 = w2.copyto(default_context()) @@ -2301,37 +2305,77 @@ def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='defa g1 = g2.copyto(default_context()).tostype('default') else: raise Exception("type not supported yet") + s1 = opt1.create_state_multi_precision(i, w1) + s2 = opt2.create_state_multi_precision(i, w2) - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) if compare_states: - compare_ndarray_tuple(state1, state2) + compare_ndarray_tuple(s1, s2) + + w1_list.append(w1) + w2_list.append(w2) + g1_list.append(g1) + g2_list.append(g2) + s1_list.append(s1) + s2_list.append(s2) + + indices = list(range(len(shapes))) + opt1.update_multi_precision(indices, w1_list, g1_list, s1_list) + opt2.update_multi_precision(indices, w2_list, g2_list, s2_list) + if compare_states: + compare_ndarray_tuple(tuple(s1_list), tuple(s2_list), rtol=rtol, atol=atol) + compare_ndarray_tuple(tuple(w1_list), tuple(w2_list), rtol=rtol, atol=atol) + + +def compare_optimizer_noise_seeded(opt1, opt2, shapes, dtype, noise_seed, + w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): + """Compare opt1 and opt2 with the added functionality that the seed for generating random noise + in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. + + """ + w1_list, w2_list = [], [] + g1_list, g2_list = [], [] + s1_list, s2_list = [], [] + for i, shape in enumerate(shapes): + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype in ('row_sparse', 'csr'): + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype in ('row_sparse', 'csr'): + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + s1 = opt1.create_state_multi_precision(i, w1) + s2 = opt2.create_state_multi_precision(i, w2) - opt1.update_multi_precision(0, w1, g1, state1) - opt2.update_multi_precision(0, w2, g2, state2) if compare_states: - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1, w2, rtol=rtol, atol=atol) - else: - # test multi-tensor: Opt1 single-tensor reference, Opt2 multi-tensor - from copy import deepcopy - w1, g1 = [], [] - for s in shape: - w1.append(mx.random.uniform(shape=s, ctx=default_context(), dtype=dtype)) - g1.append(mx.random.uniform(shape=s, ctx=default_context(), dtype=dtype)) - w1 = tuple(w1) - w2 = deepcopy(w1) - g1 = tuple(g1) - g2 = deepcopy(g1) - state2 = [opt2.create_state_multi_precision(0, w2[i]) for i in range(ntensors)] - - opt2.update_multi_precision(list(range(ntensors)), w2, g2, state2) - for i in range(ntensors): - state1 = opt1.create_state_multi_precision(i, w1[i]) - opt1.update_multi_precision(i, w1[i], g1[i], state1) - if compare_states: - compare_ndarray_tuple(state1, state2[i], rtol, 
atol) - compare_ndarray_tuple(w1[i], w2[i], rtol, atol) + compare_ndarray_tuple(s1, s2) + + w1_list.append(w1) + w2_list.append(w2) + g1_list.append(g1) + g2_list.append(g2) + s1_list.append(s1) + s2_list.append(s2) + + indices = list(range(len(shapes))) + # set seed for Gaussian noise replication + mx.random.seed(noise_seed) + opt1.update_multi_precision(indices, w1_list, g1_list, s1_list) + mx.random.seed(noise_seed) + opt2.update_multi_precision(indices, w2_list, g2_list, s2_list) + if compare_states: + compare_ndarray_tuple(tuple(s1_list), tuple(s2_list), rtol=rtol, atol=atol) + compare_ndarray_tuple(tuple(w1_list), tuple(w2_list), rtol=rtol, atol=atol) + def same_symbol_structure(sym1, sym2): """Compare two symbols to check if they have the same computation graph structure. diff --git a/scala-package/README.md b/scala-package/README.md index b1a5aa413e52..9d0572448cb9 100644 --- a/scala-package/README.md +++ b/scala-package/README.md @@ -101,7 +101,7 @@ Also, add the dependency which corresponds to your platform to the ```dependenci org.apache.mxnet mxnet-full_2.11-linux-x86_64-gpu - [1.6.0-SNAPSHOT,) + [1.7.0-SNAPSHOT,) ``` @@ -113,7 +113,7 @@ Also, add the dependency which corresponds to your platform to the ```dependenci org.apache.mxnet mxnet-full_2.11-linux-x86_64-cpu - [1.6.0-SNAPSHOT,) + [1.7.0-SNAPSHOT,) ``` @@ -124,11 +124,11 @@ Also, add the dependency which corresponds to your platform to the ```dependenci org.apache.mxnet mxnet-full_2.11-osx-x86_64-cpu - [1.6.0-SNAPSHOT,) + [1.7.0-SNAPSHOT,) ``` -**Note:** ```[1.6.0-SNAPSHOT,)``` indicates that we will fetch packages with version 1.6.0 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven Snapshot repository. +**Note:** ```[1.7.0-SNAPSHOT,)``` indicates that we will fetch packages with version 1.7.0 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven Snapshot repository. Build From Source ----------------- @@ -186,7 +186,7 @@ Adding the following configuration in `pom.xml` org.apache.mxnet mxnet-full_2.11-INTERNAL - 1.6.0 + 1.7.0 system path_to_jar/mxnet-full_2.11-INTERNAL.jar diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala b/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala index 49fca6a1242e..71b20b8c356d 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala @@ -26,15 +26,15 @@ import org.apache.mxnet.NDArrayConversions._ * http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. * * @param learningRate Float, Step size. - * @param gamma1 Float, decay factor of moving average for gradient, gradient^^2. - * @param gamma2 Float, momentum factor of moving average for gradient. + * @param rho Float, decay factor of moving average for gradient, gradient^^2. + * @param momentum Float, momentum factor of moving average for gradient. * @param rescaleGradient Float, rescaling factor of gradient. 
* @param wd Float, L2 regularization coefficient add to all the weights * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] * @param lrScheduler The learning rate scheduler */ class RMSProp(val learningRate: Float = 0.002f, rescaleGradient: Float = 1.0f, - gamma1: Float = 0.95f, gamma2: Float = 0.9f, wd: Float = 0.0f, + rho: Float = 0.95f, momentum: Float = 0.9f, wd: Float = 0.0f, lrScheduler: LRScheduler = null, clipGradient: Float = 0f) extends Optimizer { /** @@ -57,18 +57,18 @@ class RMSProp(val learningRate: Float = 0.002f, rescaleGradient: Float = 1.0f, oldResdGrad.dispose() } - val nUpdated = ((1 - this.gamma1) * (resdGrad * resdGrad) + this.gamma1 * n) + val nUpdated = ((1 - this.rho) * (resdGrad * resdGrad) + this.rho * n) .disposeDepsExcept(resdGrad, n) n.set(nUpdated) nUpdated.dispose() - val gUpdated = ((1 - this.gamma1) * resdGrad + this.gamma1 * g) + val gUpdated = ((1 - this.rho) * resdGrad + this.rho * g) .disposeDepsExcept(resdGrad, g) g.set(gUpdated) gUpdated.dispose() val deltaUpdated = - (this.gamma2 * delta - lr * (resdGrad / NDArray.sqrt(n - g * g + 1e-4f) + wd * weight)) + (this.momentum * delta - lr * (resdGrad / NDArray.sqrt(n - g * g + 1e-4f) + wd * weight)) .disposeDepsExcept(delta, resdGrad, n, g, weight) delta.set(deltaUpdated) deltaUpdated.dispose() diff --git a/scala-package/mxnet-demo/java-demo/README.md b/scala-package/mxnet-demo/java-demo/README.md index 40a9a4e829fe..2f5849982b0f 100644 --- a/scala-package/mxnet-demo/java-demo/README.md +++ b/scala-package/mxnet-demo/java-demo/README.md @@ -120,5 +120,5 @@ sudo apt install libopencv-imgcodecs3.4 Is there any other version available? -You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.6.0-SNAPSHOT~~). -Please keep the same version in the pom file or [other versions in here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~) to run this demo. \ No newline at end of file +You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.7.0-SNAPSHOT~~). +Please keep the same version in the pom file or [other versions in here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~) to run this demo. diff --git a/scala-package/mxnet-demo/java-demo/pom.xml b/scala-package/mxnet-demo/java-demo/pom.xml index a8337b1f3fc6..f6936b64bdbe 100644 --- a/scala-package/mxnet-demo/java-demo/pom.xml +++ b/scala-package/mxnet-demo/java-demo/pom.xml @@ -27,7 +27,7 @@ 1.8 1.8 - [1.6.0-SNAPSHOT, ) + [1.7.0-SNAPSHOT, ) 2.11 @@ -41,7 +41,7 @@ - [1.6.0-SNAPSHOT, ) + [1.7.0-SNAPSHOT, ) @@ -114,4 +114,4 @@ - \ No newline at end of file + diff --git a/scala-package/mxnet-demo/scala-demo/pom.xml b/scala-package/mxnet-demo/scala-demo/pom.xml index 8fab50e8cb9c..72dbc1b8d8e6 100644 --- a/scala-package/mxnet-demo/scala-demo/pom.xml +++ b/scala-package/mxnet-demo/scala-demo/pom.xml @@ -35,7 +35,7 @@ - [1.6.0-SNAPSHOT, ) + [1.7.0-SNAPSHOT, ) diff --git a/snapcraft.yaml b/snapcraft.yaml index 5e03a0935430..6a6048642a29 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -16,7 +16,7 @@ # under the License. name: mxnet -version: '1.6.0' +version: '1.7.0' summary: MXNet is a deep learning framework designed for efficiency and flexibility. 
description: | MXNet is a deep learning framework designed for both efficiency and diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h index fd556a4231cb..2276b9375012 100644 --- a/src/operator/contrib/optimizer_op-inl.h +++ b/src/operator/contrib/optimizer_op-inl.h @@ -130,7 +130,7 @@ template struct GroupAdagradDnsRspKernel { // clang-format off const DType grad_rescaled = get_grad_rescaled(j); index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + const DType div = lr * grad_rescaled / (square_root::Map(state_data[grad_idx[i]]) + eps); out_data[data_j] = weight_data[data_j] - div; // clang-format on } diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 83bbcdab833d..be6d30587368 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -61,7 +61,7 @@ Updates are applied by:: grad = clip(grad * rescale_grad, clip_gradient) history += mean(square(grad), axis=1, keepdims=True) - div = grad / sqrt(history + float_stable_eps) + div = grad / (sqrt(history) + epsilon) weight -= div * lr Weights are updated lazily if the gradient is sparse. diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 2d2a0dea575d..9106ee222542 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -347,6 +347,12 @@ struct mixed_rpower { }; #endif + +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wint-in-bool-context" +#pragma GCC diagnostic ignored "-Wbool-compare" +#endif MXNET_BINARY_MATH_OP_NC_WITH_BOOL(mul, a * b); MXNET_BINARY_MATH_OP_NC_WITH_BOOL(div, a / b); @@ -354,6 +360,7 @@ MXNET_BINARY_MATH_OP_NC_WITH_BOOL(div, a / b); MXNET_BINARY_MATH_OP_NC_WITH_BOOL(plus, a + b); MXNET_BINARY_MATH_OP_NC_WITH_BOOL(minus, a - b); +#pragma GCC diagnostic pop MXNET_UNARY_MATH_OP(negation, -a); @@ -683,6 +690,10 @@ struct fix : public mxnet_op::tunable { } }; +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wbool-compare" +#endif /*! \brief used to determine whether a number is Not A Number*/ struct isnan : public mxnet_op::tunable { template @@ -722,6 +733,7 @@ struct isneginf : public mxnet_op::tunable { return IsInf(a) && a < 0; } }; +#pragma GCC diagnostic pop /*! \brief used for generate gradient of MAE loss*/ MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? DType(1) : -DType(1)); @@ -1301,7 +1313,12 @@ struct nrm2 { /*! \brief finalize reduction result */ template MSHADOW_XINLINE static void Finalize(volatile DType& sum_of_squares, volatile DType& scale) { // NOLINT(*) +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wint-in-bool-context" +#endif sum_of_squares = scale * math::sqrt(sum_of_squares); +#pragma GCC diagnostic pop } /*! *\brief calculate gradient of redres with respect to redsrc, @@ -1395,6 +1412,11 @@ struct nanprod_grad : public mxnet_op::tunable { } }; +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wint-in-bool-context" +#pragma GCC diagnostic ignored "-Wbool-compare" +#endif /*! 
\brief used for computing binary lowest common multiple */ struct lcm : public mxnet_op::tunable { template @@ -1436,6 +1458,7 @@ struct lcm : public mxnet_op::tunable { return DType(0.0f); } }; +#pragma GCC diagnostic pop } // namespace mshadow_op } // namespace op diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index aaeda76bd459..fa036237c97c 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -158,14 +158,6 @@ static inline bool SupportMKLDNN(int dtype, const mxnet::TShape &shape) { (ndim == 1 || ndim == 2 || ndim == 4); } -static inline bool SupportMKLDNNRnn(const NDArray &input) { - if (input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 3 - && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) { - return true; - } - return false; -} - static inline bool SupportMKLDNNQuantize(int dtype) { return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 || dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16; diff --git a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h b/src/operator/nn/mkldnn/mkldnn_rnn-inl.h index a4104bf1a437..1d914876506f 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_rnn-inl.h @@ -441,6 +441,18 @@ class MKLDNNRnnOp { const std::vector &outputs); }; +inline bool SupportMKLDNNRnn(const int input_dtype) { + if (input_dtype == mshadow::kFloat32 && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) { + return true; + } + return false; +} + +inline bool SupportMKLDNNRnn(const RNNParam ¶m, const int input_dtype) { + if (param.projection_size.has_value()) return false; + return SupportMKLDNNRnn(input_dtype); +} + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_rnn.cc b/src/operator/nn/mkldnn/mkldnn_rnn.cc index 8af0e997483e..6180e3d8ceb2 100644 --- a/src/operator/nn/mkldnn/mkldnn_rnn.cc +++ b/src/operator/nn/mkldnn/mkldnn_rnn.cc @@ -339,7 +339,6 @@ FUNC(MKLDNN_ARG_DIFF_##NAME, ARGS.at(MKLDNN_ARG_##NAME).get_desc(), HANDLE) void MKLDNNRnnForward::SetNewDataMem(void* x, void* hx, void* cx, void* y, void* hy, void* cy, const int dtype) { - using dims = mkldnn::memory::dims; using desc = mkldnn::memory::desc; using format_tag = mkldnn::memory::format_tag; auto& cpu_engine = CpuEngine::Get()->get_engine(); @@ -632,7 +631,6 @@ void MKLDNNRnnOp::Init(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { - using memory = mkldnn::memory; using format_tag = mkldnn::memory::format_tag; // In the `autograd.record()` context, RNNOp is required to run into diff --git a/src/operator/numpy/linalg/np_norm-inl.h b/src/operator/numpy/linalg/np_norm-inl.h index 643554f502f8..e244c65b1d08 100644 --- a/src/operator/numpy/linalg/np_norm-inl.h +++ b/src/operator/numpy/linalg/np_norm-inl.h @@ -71,6 +71,10 @@ struct nrmlp { /*! \brief do stable reduction into dst */ template MSHADOW_XINLINE void Reduce(volatile AType& sum_of_powers, volatile DType src, volatile DType& scale) { // NOLINT(*) +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wint-in-bool-context" +#endif if (src != 0) { DType src_abs = abs::Map(src); if (scale < src_abs) { @@ -81,6 +85,7 @@ struct nrmlp { sum_of_powers = sum_of_powers + AType(lp_power(static_cast(src_abs / scale), lp)); } } +#pragma GCC diagnostic pop } /*! \brief combine the results of two reducers */ @@ -111,9 +116,14 @@ struct nrmlp { /*! 
\brief finalize reduction result */ template MSHADOW_XINLINE void Finalize(volatile DType& sum_of_powers, volatile DType& scale) { // NOLINT(*) +#pragma GCC diagnostic push +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wint-in-bool-context" +#endif if (lp != 0.0) { sum_of_powers = scale * DType(lp_power(static_cast(sum_of_powers), 1.0 / lp)); } +#pragma GCC diagnostic pop } /*! diff --git a/src/operator/numpy/np_cumsum-inl.h b/src/operator/numpy/np_cumsum-inl.h index 375d83b2240f..65e658115dc4 100644 --- a/src/operator/numpy/np_cumsum-inl.h +++ b/src/operator/numpy/np_cumsum-inl.h @@ -60,17 +60,17 @@ struct CumsumParam : public dmlc::Parameter { struct cumsum_forward { template - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(index_t i, OType *out, const IType *in, - const int middle, - const int trailing) { - int left = i / trailing, right = i % trailing; - int offset = left * middle * trailing + right; + const index_t middle, + const index_t trailing) { + index_t left = i / trailing, right = i % trailing; + index_t offset = left * middle * trailing + right; const IType *lane_in = in + offset; OType *lane_out = out + offset; lane_out[0] = OType(lane_in[0]); - for (int j = 1; j < middle; ++j) { + for (index_t j = 1; j < middle; ++j) { lane_out[j * trailing] = lane_out[(j - 1) * trailing] + OType(lane_in[j * trailing]); } } @@ -125,17 +125,17 @@ void CumsumForward(const nnvm::NodeAttrs& attrs, struct cumsum_backward { template - MSHADOW_XINLINE static void Map(int i, + MSHADOW_XINLINE static void Map(index_t i, IType *igrad, const OType *ograd, - const int middle, - const int trailing) { - int left = i / trailing, right = i % trailing; - int offset = left * middle * trailing + right; + const index_t middle, + const index_t trailing) { + index_t left = i / trailing, right = i % trailing; + index_t offset = left * middle * trailing + right; const OType *lane_ograd = ograd + offset; IType *lane_igrad = igrad + offset; lane_igrad[(middle - 1) * trailing] = IType(lane_ograd[(middle - 1) * trailing]); - for (int j = middle - 2; j >= 0; --j) { + for (index_t j = middle - 2; j >= 0; --j) { lane_igrad[j * trailing] = lane_igrad[(j + 1) * trailing] + IType(lane_ograd[j * trailing]); } } diff --git a/src/operator/numpy/random/dist_common.h b/src/operator/numpy/random/dist_common.h index e8358294eaf0..375b8d225ddf 100644 --- a/src/operator/numpy/random/dist_common.h +++ b/src/operator/numpy/random/dist_common.h @@ -143,33 +143,60 @@ template inline bool TwoparamsDistOpShape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, std::vector *out_attrs) { + // The inferShape function for sampling Ops has two modes: Concat/Broadcast, + // if size[0] == -2, the Concat schema will be selected: + // output_size = (size[1:],) + broadcast(param1.shape, param2.shape) + // otherwise output_size = broadcast(param1.shape, param2.shape, size) const DistParam ¶m = nnvm::get(attrs.parsed); + // Variable indicating the mode. + bool concat_mode = false; + // Variable storing the info from `size` parameter. + std::vector oshape_vec; if (param.size.has_value()) { // Size declared. - std::vector oshape_vec; const mxnet::Tuple &size = param.size.value(); - for (int i = 0; i < size.ndim(); ++i) { + int head = size[0]; + if (head == -2) { + concat_mode = true; + } else { + oshape_vec.emplace_back(head); + } + for (int i = 1; i < size.ndim(); ++i) { oshape_vec.emplace_back(size[i]); } + // If under the broadcast mode, `size` is equivalent to the final output_size. 
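  // Worked example (illustrative only): size = (-2, 3, 4) with parameter shapes
  // (5, 2) and (2,) selects concat mode and yields output shape (3, 4, 5, 2),
  // while size = (3, 4) with two scalar parameters stays in broadcast mode and
  // yields (3, 4).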
+ if (!concat_mode) { SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape(oshape_vec)); for (size_t input_idx = 0; input_idx < in_attrs->size(); input_idx++) { CheckBroadcastable((*in_attrs)[input_idx], (*out_attrs)[0]); + } } - } else { - // Size undeclared. + } + // Under concat mode, or `size` is not declared. + if (concat_mode || (!param.size.has_value())) { + // broadcast(param1.shape, param2.shape). + mxnet::TShape param_broadcast_shape; if (in_attrs->size() == 2U) { // Both params from ndarray. - mxnet::TShape &low = (*in_attrs)[0]; - mxnet::TShape &high = (*in_attrs)[1]; - mxnet::TShape out(std::max(low.ndim(), high.ndim()), -1); - InferBroadcastShape(low, high, &out); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, out); + mxnet::TShape ¶m1 = (*in_attrs)[0]; + mxnet::TShape ¶m2 = (*in_attrs)[1]; + mxnet::TShape out(std::max(param1.ndim(), param2.ndim()), -1); + InferBroadcastShape(param1, param2, &out); + param_broadcast_shape = out; } else if (in_attrs->size() == 1U) { // One param from ndarray. - SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)) + param_broadcast_shape = in_attrs->at(0); } else if (in_attrs->size() == 0) { // Two scalar case. - SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape(0, -1)) + param_broadcast_shape = TShape(0, -1); + } + if (concat_mode) { + for (int i = 0; i < param_broadcast_shape.ndim(); ++i) { + oshape_vec.emplace_back(param_broadcast_shape[i]); + } + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape(oshape_vec)); + } else { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, param_broadcast_shape); } } if (out_attrs->size() == 2U) { diff --git a/src/operator/numpy/random/np_bernoulli_op.cc b/src/operator/numpy/random/np_bernoulli_op.cc index d67ad1b8d7f6..1377d525015d 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cc +++ b/src/operator/numpy/random/np_bernoulli_op.cc @@ -53,7 +53,7 @@ NNVM_REGISTER_OP(_npi_bernoulli) return (num_inputs == 0) ? std::vector() : std::vector{"input1"}; }) .set_attr_parser(ParamParser) -.set_attr("FInferShape", UnaryDistOpShape) +.set_attr("FInferShape", TwoparamsDistOpShape) .set_attr("FInferType", NumpyBernoulliOpType) .set_attr("FResourceRequest", [](const nnvm::NodeAttrs& attrs) { diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 291d98013d8a..6f13a70d277a 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -231,23 +231,18 @@ struct MultiSGDKernel { if (i < static_cast(param.sizes[index])) { MPDType w = has_mixed_precision ? param.weights32[index][i] : MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); + MPDType rescale_grad = param.rescale_grad * static_cast(param.grads[index][i]); if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param.clip_gradient); } + rescale_grad += param.wds[index] * w; if (has_momentum) { - param.mom[index][i] = mom; + param.mom[index][i] *= param.momentum; + param.mom[index][i] -= param.lrs[index] * rescale_grad; + w = w + param.mom[index][i]; + } else { + w -= param.lrs[index] * rescale_grad; } - w = w + mom; if (has_mixed_precision) { param.weights32[index][i] = w; } @@ -385,16 +380,12 @@ struct SGDKernel { const DType* grad_data, const DType param_clip_gradient, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr) - * mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient)); - } else { - KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr*param_rescale_grad)*grad_data[i]); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * weight_data[i]; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - (param_lr * rescale_grad)); } }; @@ -439,13 +430,12 @@ struct SGDDnsRspKernel { const dim_t col_id = i % row_length; const dim_t row_offset = grad_idx[row_id] * row_length; const dim_t data_i = row_offset + col_id; + DType grad_rescaled = rescale_grad * grad_val[i]; if (clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[i], clip_gradient)); - } else { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr * rescale_grad) * grad_val[i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight[data_i]; + KERNEL_ASSIGN(out[data_i], req, weight[data_i] - (lr * grad_rescaled)); } }; @@ -464,13 +454,12 @@ struct SGDDnsRspKernel { for (index_t j = 0; j < row_length; j++) { index_t data_i = grad_idx[i] * row_length + j; index_t grad_i = i * row_length + j; + DType grad_rescaled = rescale_grad * grad_val[grad_i]; if (clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[grad_i], clip_gradient)); - } else { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr * rescale_grad) * grad_val[grad_i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight[data_i]; + KERNEL_ASSIGN(out[data_i], req, weight[data_i] - (lr * grad_rescaled)); } } }; @@ -505,7 +494,7 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param, // apply standard weight decay if not lazy update if (!param.lazy_update) { Kernel, xpu>::Launch(s, weight.Size(), - weight_data, weight_data, static_cast(1 - param.lr * param.wd)); + weight_data, weight_data, static_cast(1 - param.lr * param.wd)); wd = 0; } if (!grad.storage_initialized()) return; @@ 
-605,16 +594,13 @@ struct SGDMomKernel { const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - - param_lr*param_wd*weight_data[i] - - param_lr - *mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient); - } else { - mom_data[i] = param_momentum*mom_data[i] - - param_lr*param_wd*weight_data[i] - - param_lr*param_rescale_grad*grad_data[i]; + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * rescale_grad; KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]); } }; @@ -659,20 +645,15 @@ struct MP_SGDKernel { const DType* grad_data, float* weight32, const float param_clip_gradient, const float param_lr, const float param_wd, const float param_rescale_grad, const OpReqType req) { + float w = weight32[i]; + float rescale_grad = param_rescale_grad * static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - float w = weight32[i]; - w = (1.f - param_lr*param_wd)*w - - (param_lr) * mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, (DType)w); - } else { - float w = weight32[i]; - w = (1.f-param_lr*param_wd)*w - - (param_lr*param_rescale_grad)*static_cast(grad_data[i]); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, (DType)w); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * w; + w -= param_lr * rescale_grad; + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, (DType)w); } }; @@ -705,17 +686,13 @@ struct MP_SGDMomKernel { const float param_wd, const float param_rescale_grad, const OpReqType req) { float w = weight32[i]; float mom = mom_data[i]; + float grad_rescaled = param_rescale_grad*static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - mom = param_momentum*mom - - param_lr*param_wd*w - - param_lr - *mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient); - } else { - mom = param_momentum*mom - - param_lr*param_wd*w - - param_lr*param_rescale_grad*static_cast(grad_data[i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * w; + mom *= param_momentum; + mom -= param_lr * grad_rescaled; mom_data[i] = mom; w = w + mom; weight32[i] = w; @@ -754,21 +731,16 @@ struct SGDMomDnsRspDnsKernel { DType* mom_data, const DType* weight_data, const IType* grad_idx, const DType* grad_data, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { - const DType rate = lr * wd; for (index_t j = 0; j < row_length; j++) { index_t data_i = grad_idx[i] * row_length + j; index_t grad_i = i * row_length + j; + DType grad_rescaled = rescale_grad * grad_data[grad_i]; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad_data[grad_i], - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad_data[grad_i]; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + 
mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } } @@ -782,21 +754,16 @@ struct SGDMomDnsRspDnsKernel { const DType* grad_data, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { using nnvm::dim_t; - const DType rate = lr * wd; const dim_t row_id = i / row_length; const dim_t col_id = i % row_length; const dim_t data_i = grad_idx[row_id] * row_length + col_id; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad_data[i], - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad_data[i]; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } }; @@ -1066,20 +1033,15 @@ struct NAGMomKernel { const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType grad_rescaled = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i]; - KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) - *(mom_data[i]-(param_lr*(mshadow_op::clip::Map(param_rescale_grad - *grad_data[i], param_clip_gradient)+(param_wd*weight_data[i]))))); - mom_data[i] = mom_data[i] - (param_lr*((mshadow_op::clip::Map(param_rescale_grad*grad_data[i], - param_clip_gradient))+(param_wd*weight_data[i]))); - } else { - mom_data[i] = param_momentum*mom_data[i]; - KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) - *(mom_data[i]-(param_lr*(param_rescale_grad*grad_data[i]+param_wd*weight_data[i])))); - mom_data[i] = mom_data[i] - param_lr*((param_rescale_grad*grad_data[i]) - +(param_wd*weight_data[i])); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * grad_rescaled; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] + (param_momentum * mom_data[i]) + - (param_lr * grad_rescaled)); } }; @@ -1116,25 +1078,16 @@ struct MP_NAGMomKernel { const float param_wd, const float param_rescale_grad, const OpReqType req) { float w = weight32[i]; + float grad_rescaled = param_rescale_grad * static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i]; - w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr - *(mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient)+(param_wd*w))); - mom_data[i] = mom_data[i] - param_lr - *((mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient))+(param_wd*w)); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, w); - } else { - mom_data[i] = param_momentum*mom_data[i]; - w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr - *(param_rescale_grad*static_cast(grad_data[i])+(param_wd*w))); - mom_data[i] = mom_data[i] - param_lr - *((param_rescale_grad*static_cast(grad_data[i]))+(param_wd*w)); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, w); + 
grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * w; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * grad_rescaled; + w += (param_momentum * mom_data[i]) - (param_lr * grad_rescaled); + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, w); } }; @@ -1212,7 +1165,7 @@ struct FTMLKernel { const OpReqType req) { using namespace mshadow_op; const DType grad_i = clip_grad >= 0.0f - ? clip::Map(rescale_grad * grad[i] + wd * weight[i], clip_grad) + ? clip::Map(rescale_grad * grad[i], clip_grad) + wd * weight[i] : (rescale_grad * grad[i] + wd * weight[i]); v[i] = beta2 * v[i] + (1 - beta2) * square::Map(grad_i); const DType d_t = (1 - power::Map(beta1, t)) / lr * @@ -1300,10 +1253,11 @@ struct AdamUpdateKernel { const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; + DType grad_rescaled = grad_data[i] * rescale_grad; if (clip_gradient >= 0.f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[i] * wd; mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; var_data[i] = beta2 * var_data[i] + @@ -1363,17 +1317,13 @@ struct AdamDnsRspDnsKernel { const dim_t data_i = row_offset + j; // index in grad const dim_t grad_i = i * row_length + j; - const DType grad_rescaled = grad_data[grad_i] * rescale_grad + weight_data[data_i] * wd; + DType grad_rescaled = grad_data[grad_i] * rescale_grad; if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * grad_rescaled * grad_rescaled; + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] +(1.f - beta2) * grad_rescaled * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / (square_root::Map(var_data[data_i]) + epsilon)); } @@ -1396,10 +1346,11 @@ struct AdamDnsRspDnsKernel { // index in data/mean/var const dim_t data_i = row_offset + col_id; // index in grad - DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[data_i] * wd; + DType grad_rescaled = grad_data[i] * rescale_grad; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * grad_rescaled * grad_rescaled; @@ -1915,8 +1866,8 @@ inline void MPLambUpdatePhaseTwo(const nnvm::NodeAttrs& attrs, // by Alex Graves, 2013. 
struct RMSPropAlexParam : public dmlc::Parameter { float lr; - float gamma1; - float gamma2; + float rho; + float momentum; float epsilon; float wd; float rescale_grad; @@ -1925,9 +1876,9 @@ struct RMSPropAlexParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(RMSPropAlexParam) { DMLC_DECLARE_FIELD(lr) .describe("Learning rate"); - DMLC_DECLARE_FIELD(gamma1).set_default(0.95f) + DMLC_DECLARE_FIELD(rho).set_default(0.95f) .describe("Decay rate."); - DMLC_DECLARE_FIELD(gamma2).set_default(0.9f) + DMLC_DECLARE_FIELD(momentum).set_default(0.9f) .describe("Decay rate."); DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f) .describe("A small constant for numerical stability."); @@ -1957,25 +1908,26 @@ struct RMSPropAlexUpdateKernel { DType* state_n_data, DType* state_g_data, DType* delta_data, const DType* weight_data, const DType* grad_data, const DType clip_gradient, const DType rescale_grad, - const DType gamma1, const DType gamma2, + const DType rho, const DType momentum, const DType lr, const DType wd, const DType clip_weights, const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; - state_n_data[i] = (1.f - gamma1) * grad_rescaled * grad_rescaled + - gamma1 * state_n_data[i]; - state_g_data[i] = (1.f - gamma1) * grad_rescaled + - gamma1 * state_g_data[i]; - delta_data[i] = gamma2 * delta_data[i] - + state_n_data[i] = (1.f - rho) * square::Map(grad_rescaled) + + rho * state_n_data[i]; + state_g_data[i] = (1.f - rho) * grad_rescaled + + rho * state_g_data[i]; + delta_data[i] = momentum * delta_data[i] - (lr * (grad_rescaled) / (square_root::Map(state_n_data[i] - - state_g_data[i] * state_g_data[i] + epsilon))); + square::Map(state_g_data[i]) + epsilon))); if (clip_weights >= 0.0f) { const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights); @@ -1998,15 +1950,15 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { DType* weight_data = inputs[0].dptr(); DType* grad_data = inputs[1].dptr(); - DType* state_n_data = inputs[2].dptr(); - DType* state_g_data = inputs[3].dptr(); + DType* state_g_data = inputs[2].dptr(); + DType* state_n_data = inputs[3].dptr(); DType* delta_data = inputs[4].dptr(); DType* out_data = outputs[0].dptr(); Kernel::Launch(s, inputs[0].shape_.Size(), out_data, state_n_data, state_g_data, delta_data, weight_data, grad_data, static_cast(param.clip_gradient), static_cast(param.rescale_grad), - static_cast(param.gamma1), static_cast(param.gamma2), + static_cast(param.rho), static_cast(param.momentum), static_cast(param.lr), static_cast(param.wd), static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); @@ -2017,7 +1969,7 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, // by Tieleman & Hinton, 2012 struct RMSPropParam : public dmlc::Parameter { float lr; - float gamma1; + float rho; float epsilon; float wd; float rescale_grad; @@ -2026,7 +1978,7 @@ struct RMSPropParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(RMSPropParam) { DMLC_DECLARE_FIELD(lr) .describe("Learning rate"); - DMLC_DECLARE_FIELD(gamma1).set_default(0.95f) + DMLC_DECLARE_FIELD(rho).set_default(0.95f) .describe("The decay rate of momentum estimates."); DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f) .describe("A small 
constant for numerical stability."); @@ -2056,20 +2008,21 @@ struct RMSPropUpdateKernel { DType* out_data, DType* state_n_data, const DType* weight_data, const DType* grad_data, const DType clip_gradient, const DType rescale_grad, - const DType gamma1, const DType lr, const DType wd, + const DType rho, const DType lr, const DType wd, const DType clip_weights, const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; - state_n_data[i] = (1.f - gamma1) * (grad_rescaled * grad_rescaled) + gamma1 * state_n_data[i]; + state_n_data[i] = (1.f - rho) * square::Map(grad_rescaled) + rho * state_n_data[i]; DType weight = weight_data[i] - - lr * (grad_rescaled / square_root::Map(state_n_data[i] + epsilon)); + lr * (grad_rescaled) / (square_root::Map(state_n_data[i]) + epsilon); if (clip_weights >= 0.0f) { weight = clip::Map(weight, clip_weights); } @@ -2094,7 +2047,7 @@ inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx, Kernel::Launch(s, inputs[0].shape_.Size(), out_data, state_n_data, weight_data, grad_data, static_cast(param.clip_gradient), static_cast(param.rescale_grad), - static_cast(param.gamma1), static_cast(param.lr), static_cast(param.wd), + static_cast(param.rho), static_cast(param.lr), static_cast(param.wd), static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); } @@ -2151,10 +2104,9 @@ struct FtrlUpdateKernel { weight_data[i] / lr; n_data[i] += square::Map(grad_rescaled); - KERNEL_ASSIGN(out_data[i], req, - (sign::Map(z_data[i]) * lamda1 - z_data[i]) / - ((beta + square_root::Map(n_data[i])) / lr + wd) * - gt::Map(abs::Map(z_data[i]), lamda1)); + DType d = - sign::Map(z_data[i]) * maximum::Map(abs::Map(z_data[i]) - lamda1, + static_cast(0)); + KERNEL_ASSIGN(out_data[i], req, d / ((beta + square_root::Map(n_data[i])) / lr + wd)); } }; @@ -2198,23 +2150,19 @@ struct FtrlDnsRspDnsKernel { const dim_t data_i = row_offset + j; // index in grad const dim_t grad_i = i * row_length + j; - const DType grad_rescaled = grad_data[grad_i] * rescale_grad; + DType grad_rescaled = grad_data[grad_i] * rescale_grad; if (clip_gradient >= 0.0f) { - z_data[data_i] += clip::Map(grad_rescaled, clip_gradient) - - (square_root::Map(n_data[data_i] + - square::Map(clip::Map(grad_rescaled, clip_gradient))) - - square_root::Map(n_data[data_i])) * weight_data[data_i] / lr; - n_data[data_i] += square::Map(clip::Map(grad_rescaled, clip_gradient)); - } else { - z_data[data_i] += grad_rescaled - (square_root::Map(n_data[data_i] + - square::Map(grad_rescaled)) - square_root::Map(n_data[data_i])) * - weight_data[data_i] / lr; - n_data[data_i] += square::Map(grad_rescaled); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + z_data[data_i] += grad_rescaled - (square_root::Map(n_data[data_i] + + square::Map(grad_rescaled)) - square_root::Map(n_data[data_i])) * + weight_data[data_i] / lr; + n_data[data_i] += square::Map(grad_rescaled); + + DType d = - sign::Map(z_data[data_i]) * maximum::Map(abs::Map(z_data[data_i]) - lamda1, + static_cast(0)); KERNEL_ASSIGN(out_data[data_i], req, - (sign::Map(z_data[data_i]) * lamda1 - z_data[data_i]) / - ((beta + square_root::Map(n_data[data_i])) / lr + wd) * - gt::Map(abs::Map(z_data[data_i]), lamda1)); + d / ((beta + square_root::Map(n_data[data_i])) / lr + wd)); } } }; 
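Most of the optimizer kernel changes above follow one pattern: rescale the raw gradient, clip the rescaled gradient if `clip_gradient` is set, and only then add the weight-decay term before applying the usual momentum or adaptive update. A minimal NumPy sketch of that pattern for the momentum case follows; the function name and signature are illustrative, not MXNet API.

```python
# Sketch of the refactored update used by the SGD/momentum/NAG/Signum/Adam/
# RMSProp kernels above (illustrative only, not the MXNet C++ kernels).
import numpy as np

def sgd_mom_step(weight, mom, grad, lr, wd, momentum,
                 rescale_grad=1.0, clip_gradient=None):
    g = rescale_grad * grad
    if clip_gradient is not None and clip_gradient >= 0:
        # Clipping acts on the rescaled gradient only ...
        g = np.clip(g, -clip_gradient, clip_gradient)
    g = g + wd * weight        # ... the weight-decay term enters afterwards
    mom *= momentum            # momentum buffer is updated in place,
    mom -= lr * g              # mirroring mom_data[i] in the kernels
    return weight + mom
```

The key behavioral point is that clipping now acts on the rescaled gradient alone, and the `wd * weight` contribution is added afterwards, uniformly across these kernels.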
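The `FtrlUpdateKernel` and `FtrlDnsRspDnsKernel` changes just above also replace the masked expression `(sign(z) * lamda1 - z) * 1{|z| > lamda1}` with the equivalent soft-threshold form `-sign(z) * max(|z| - lamda1, 0)`, which drops the explicit comparison. A small NumPy check of that equivalence (helper names are illustrative):

```python
# Illustrative check that the new soft-threshold numerator equals the old
# masked expression for the FTRL weight, assuming lamda1 >= 0.
import numpy as np

def ftrl_weight_old(z, n, lr, lamda1, beta, wd):
    mask = (np.abs(z) > lamda1).astype(z.dtype)
    return (np.sign(z) * lamda1 - z) / ((beta + np.sqrt(n)) / lr + wd) * mask

def ftrl_weight_new(z, n, lr, lamda1, beta, wd):
    d = -np.sign(z) * np.maximum(np.abs(z) - lamda1, 0.0)
    return d / ((beta + np.sqrt(n)) / lr + wd)

z, n = np.random.randn(8), np.random.rand(8)
assert np.allclose(ftrl_weight_old(z, n, 0.1, 0.01, 1.0, 0.0),
                   ftrl_weight_new(z, n, 0.1, 0.01, 1.0, 0.0))
```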
@@ -2351,8 +2299,8 @@ struct SignSGDKernel { // param_clip_gradient has no effect for SignSGD KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr)*((grad_data[i] > 0) - (grad_data[i] < 0))); + (1.f - param_lr * param_wd) * weight_data[i] + - (param_lr) * ((grad_data[i] > 0) - (grad_data[i] < 0))); } }; @@ -2418,18 +2366,15 @@ struct SignumKernel { const DType param_lr, const DType param_wd, const DType param_rescale_grad, const DType param_wd_lh, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - - (1-param_momentum)*param_wd*weight_data[i] - - (1-param_momentum) - *mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient); - } else { - mom_data[i] = param_momentum*mom_data[i] - - (1-param_momentum)*param_wd*weight_data[i] - - (1-param_momentum)*param_rescale_grad*grad_data[i]; + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } - KERNEL_ASSIGN(out_data[i], req, (1.f-param_lr*param_wd_lh)*weight_data[i] - + (param_lr)*((mom_data[i] > 0) - (mom_data[i] < 0))); + rescale_grad += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= (1 - param_momentum) * rescale_grad; + KERNEL_ASSIGN(out_data[i], req, (1.f - param_lr * param_wd_lh) * weight_data[i] + + (param_lr) * ((mom_data[i] > 0) - (mom_data[i] < 0))); } }; @@ -2526,7 +2471,7 @@ struct AdagradDnsRspDnsKernel { } const DType grad_squared = grad_rescaled * grad_rescaled; state_data[data_j] += grad_squared; - const DType div = grad_rescaled / square_root::Map(state_data[data_j] + epsilon); + const DType div = grad_rescaled / (square_root::Map(state_data[data_j]) + epsilon); // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace out_data[data_j] = weight_data[data_j] - div * lr; } @@ -2551,7 +2496,7 @@ struct AdagradDnsRspDnsKernel { } const DType grad_squared = grad_rescaled * grad_rescaled; state_data[data_i] += grad_squared; - const DType div = grad_rescaled / square_root::Map(state_data[data_i] + epsilon); + const DType div = grad_rescaled / (square_root::Map(state_data[data_i]) + epsilon); // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace out_data[data_i] = weight_data[data_i] - div * lr; } diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 93e1267cc8c7..2ac3673e4a09 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -112,7 +112,6 @@ struct SGDMomStdDnsRspDnsKernel { DType* mom_data, const DType* weight_data, const IType* grad_idx, const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { - const DType rate = lr * wd; const bool non_zero = (i == 0) ? prefix_sum[0] > 0 : prefix_sum[i] > prefix_sum[i-1]; @@ -122,17 +121,13 @@ struct SGDMomStdDnsRspDnsKernel { const index_t data_i = row_i + j; const DType grad = non_zero ? 
grad_data[grad_i + j] : static_cast(0); + DType grad_rescaled = rescale_grad * grad; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad, - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } } @@ -208,20 +203,16 @@ struct AdamStdDnsRspDnsKernel { const RType grad_i = (prefix_sum[i]-1) * row_length; for (index_t j = 0; j < row_length; j++) { const index_t data_i = row_i + j; - const DType grad_rescaled = non_zero ? static_cast( - grad_data[grad_i + j] * rescale_grad + - weight_data[data_i] * wd) - : static_cast(weight_data[data_i] * wd); + DType grad_rescaled = non_zero ? static_cast( + grad_data[grad_i + j] * rescale_grad) + : static_cast(0); if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * square::Map(grad_rescaled); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * square::Map(grad_rescaled); KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / (square_root::Map(var_data[data_i]) + epsilon)); } @@ -780,7 +771,7 @@ gradient and :math:`E[g^2]_t` is the decaying average over past squared gradient The :math:`E[g^2]_t` is given by: .. math:: - E[g^2]_t = \gamma * E[g^2]_{t-1} + (1-\gamma) * g_t^2 + E[g^2]_t = \rho * E[g^2]_{t-1} + (1-\rho) * g_t^2 The update step is @@ -791,7 +782,7 @@ The RMSProp code follows the version in http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf Tieleman & Hinton, 2012. -Hinton suggests the momentum term :math:`\gamma` to be 0.9 and the learning rate +Hinton suggests the momentum term :math:`\rho` to be 0.9 and the learning rate :math:`\eta` to be 0.001. )code" ADD_FILELINE) @@ -819,19 +810,19 @@ Define :math:`E[g^2]_t` is the decaying average over past squared gradient and :math:`E[g]_t` is the decaying average over past gradient. .. math:: - E[g^2]_t = \gamma_1 * E[g^2]_{t-1} + (1 - \gamma_1) * g_t^2\\ - E[g]_t = \gamma_1 * E[g]_{t-1} + (1 - \gamma_1) * g_t\\ - \Delta_t = \gamma_2 * \Delta_{t-1} - \frac{\eta}{\sqrt{E[g^2]_t - E[g]_t^2 + \epsilon}} g_t\\ + E[g^2]_t = \rho * E[g^2]_{t-1} + (1 - \rho) * g_t^2\\ + E[g]_t = \rho * E[g]_{t-1} + (1 - \rho) * g_t\\ + momentum_t = \gamma * momentum_{t-1} - \frac{\eta}{\sqrt{E[g^2]_t - E[g]_t^2 + \epsilon}} g_t\\ The update step is .. math:: - \theta_{t+1} = \theta_t + \Delta_t + \theta_{t+1} = \theta_t + momentum_t The RMSPropAlex code follows the version in http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. 
-Graves suggests the momentum term :math:`\gamma_1` to be 0.95, :math:`\gamma_2` +Graves suggests the momentum term :math:`\rho` to be 0.95, :math:`\gamma` to be 0.9 and the learning rate :math:`\eta` to be 0.0001. )code" ADD_FILELINE) .set_num_inputs(5) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index fe724ffbe361..95f45be981ce 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -37,7 +37,6 @@ struct SGDMomStdDnsRspDnsKernel { const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { using nnvm::dim_t; - const DType rate = lr * wd; const dim_t row_id = i / row_length; const dim_t col_id = i % row_length; const dim_t nnr = prefix_sum[row_id]; @@ -46,14 +45,13 @@ struct SGDMomStdDnsRspDnsKernel { const RType grad_i = (nnr - 1) * row_length + col_id; const DType grad = non_zero ? grad_data[grad_i] : static_cast(0); + DType grad_rescaled = rescale_grad * grad; if (clip_gradient >= 0.0f) { - mom_data[i] = momentum * mom_data[i] - - rate * weight_data[i] - - lr * mshadow_op::clip::Map(rescale_grad * grad, clip_gradient); - } else { - mom_data[i] = momentum * mom_data[i] - - rate * weight_data[i] - lr * rescale_grad * grad; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; + mom_data[i] *= momentum; + mom_data[i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]); } }; @@ -139,12 +137,12 @@ struct AdamStdDnsRspDnsKernel { const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 : prefix_sum[row_id] > prefix_sum[row_id - 1]; const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; - DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad - + weight_data[i] * wd) - : static_cast(weight_data[i] * wd); + DType grad_rescaled = non_zero ? 
static_cast(grad_data[grad_offset] * rescale_grad) + : static_cast(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[i] * wd; mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; var_data[i] = beta2 * var_data[i] + (1.f - beta2) * square::Map(grad_rescaled); diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc index bc69cb5e9bf7..86acac880cb2 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc @@ -40,7 +40,7 @@ static void MKLDNNQuantizedActForward(const nnvm::NodeAttrs& attrs, << "_contrib_quantized_act op only supports uint8 and int8 as input " "type"; - MKLDNNActivationForward(attrs, ctx, in_data[0], req[0], out_data[0]); + MKLDNNRun(MKLDNNActivationForward, attrs, ctx, in_data[0], req[0], out_data[0]); out_data[1].data().dptr()[0] = in_data[1].data().dptr()[0]; out_data[2].data().dptr()[0] = in_data[2].data().dptr()[0]; } diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 3d47f8c0d361..557c1117739a 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -185,6 +185,7 @@ inline int GetRnnBiasSize(int num_layer, inline size_t GetRNNWorkspaceSize(int seq_length, int batch_size, int hidden_size, + int projection_size, int direction, int mode) { size_t size = 0; @@ -324,6 +325,7 @@ void RNNForwardInference(DType* ws, const int batch_size, const int input_size, const int state_size, + const int projection_size, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -336,8 +338,8 @@ void RNNForwardInference(DType* ws, switch (mode) { case rnn_enum::kLstm: LstmForwardInference(ws, state_outputs, num_layers, direction, seq_length, - batch_size, input_size, state_size, x_ptr, hx_ptr, cx_ptr, - w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr); + batch_size, input_size, state_size, projection_size, + x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr); break; case rnn_enum::kGru: GruForwardInference(ws, state_outputs, num_layers, direction, seq_length, @@ -511,10 +513,7 @@ class RNNOp { this->temp_init_space_ = false; this->reserve_cpu_space_size_ = 0; this->temp_cpu_space_size_ = 0; - if (param_.projection_size.has_value()) { - LOG(FATAL) << - "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; - } + if (param_.lstm_state_clip_min.has_value() || param_.lstm_state_clip_max.has_value()) { LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; @@ -843,9 +842,14 @@ class RNNOp { #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) if (ctx_.dev_type == kCPU) { + int projection_size = 0; + if (param_.projection_size.has_value()) { + projection_size = param_.projection_size.value(); + } + // allocate temp space const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); + param_.state_size, projection_size, direction, param_.mode); if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) { temp_cpu_space_size_ = work_cpu_space_size; temp_cpu_space_ = NDArray(TShape({static_cast(temp_cpu_space_size_)}), ctx_, @@ -856,6 +860,9 @@ class RNNOp { if (ctx.is_train || ctx.need_grad) { // allocate reserve space + if (param_.projection_size.has_value()) { + LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; + } const size_t r_size = 
GetRNNReserveSpaceSize(param_.num_layers, direction, param_.seq_length_, param_.batch_size_, @@ -896,6 +903,7 @@ class RNNOp { param_.batch_size_, param_.input_size_, param_.state_size, + projection_size, x.dptr_, hx.dptr_, cx_ptr, @@ -1096,10 +1104,17 @@ class RNNOp { #endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__) if (ctx_.dev_type == kCPU) { + int projection_size = 0; + if (param_.projection_size.has_value()) { + // TODO(zixuanweeei): Add training support for LSTM with projection on CPU. + // projection_size = param_.projection_size.value(); + LOG(FATAL) << "No training support for LSTM with projection on CPU currently."; + } + // allocate temp space const size_t work_cpu_space_size = - GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, param_.state_size, + projection_size, direction, param_.mode); if (!temp_init_space_ || temp_cpu_space_size_ != work_cpu_space_size) { LOG(FATAL) << "Check temp init error"; } diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index f468b60de744..ac5e17d49133 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -190,20 +190,19 @@ static std::vector RNNResourceEx(const NodeAttrs& attrs, const return request; } +#if MXNET_USE_MKLDNN == 1 inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, DispatchMode* dispatch_mode, std::vector *in_attrs, std::vector *out_attrs) { - DispatchMode wanted_mode = DispatchMode::kFCompute; - -#if MXNET_USE_MKLDNN == 1 - wanted_mode = DispatchMode::kFComputeEx; -#endif // MXNET_USE_MKLDNN == 1 - - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); + const RNNParam& param = nnvm::get(attrs.parsed); + const bool support_mkldnn_rnn = + !param.projection_size.has_value() && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1); + return MKLDNNStorageType(attrs, dev_mask, support_mkldnn_rnn, + dispatch_mode, in_attrs, out_attrs); } +#endif // MXNET_USE_MKLDNN == 1 struct RNNGrad { const char *op_name; @@ -246,9 +245,7 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, } #if MXNET_USE_MKLDNN == 1 - if ((in_types[0] == mshadow::kFloat32 || in_types[0] == mshadow::kFloat16) - && in_shapes[0].ndim() == 3 && ctx.dev_type == kCPU - && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) { + if (ctx.dev_type == kCPU && SupportMKLDNNRnn(param, in_types[rnn_enum::kData])) { const mxnet::TShape& data_shape = in_shapes[rnn_enum::kData]; state = OpStatePtr::Create(param, data_shape[0], data_shape[1], data_shape[2]); @@ -274,7 +271,7 @@ static void RNNStatefulComputeExCPU(const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNRnn(inputs[0])) { + if (SupportMKLDNNRnn(inputs[rnn_enum::kData].dtype())) { MKLDNNRnnOp& op = state_ptr.get_state(); op.Forward(ctx, inputs, req, outputs); } else { @@ -287,7 +284,7 @@ static void RNNStatefulGradComputeExCPU(const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (SupportMKLDNNRnn(inputs[0])) { + if (SupportMKLDNNRnn(inputs[rnn_enum::kData].dtype())) { MKLDNNRnnOp& op = state_ptr.get_state(); op.Backward(ctx, inputs, req, outputs); } else { @@ -338,6 +335,23 @@ Long Short-Term Memory - Hochreiter, 1997. 
http://www.bioinf.jku.at/publications h_t = o_t * \tanh(c_t) \end{array} +With the projection size being set, LSTM could use the projection feature to reduce the parameters +size and give some speedups without significant damage to the accuracy. + +Long Short-Term Memory Based Recurrent Neural Network Architectures for Large Vocabulary Speech +Recognition - Sak et al. 2014. https://arxiv.org/abs/1402.1128 + +.. math:: + \begin{array}{ll} + i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\ + f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\ + g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rc} r_{(t-1)} + b_{rg}) \\ + o_t = \mathrm{sigmoid}(W_{io} x_t + b_{o} + W_{ro} r_{(t-1)} + b_{ro}) \\ + c_t = f_t * c_{(t-1)} + i_t * g_t \\ + h_t = o_t * \tanh(c_t) + r_t = W_{hr} h_t + \end{array} + **GRU** Gated Recurrent Unit - Cho et al. 2014. http://arxiv.org/abs/1406.1078 @@ -385,10 +399,10 @@ The definition of GRU here is slightly different from paper but compatible with }) .set_attr("FInferShape", RNNShape) .set_attr("FInferType", RNNType) -.set_attr("FInferStorageType", RNNStorageType) .set_attr("FCreateOpState", CreateRNNState) .set_attr("FStatefulCompute", RNNStatefulCompute) #if MXNET_USE_MKLDNN == 1 +.set_attr("FInferStorageType", RNNStorageType) .set_attr("TIsMKLDNN", true) .set_attr("FStatefulComputeEx", RNNStatefulComputeExCPU) #endif @@ -427,9 +441,9 @@ NNVM_REGISTER_OP(_backward_RNN) .set_attr_parser(ParamParser) .set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) -.set_attr("FInferStorageType", RNNStorageType) .set_attr("FStatefulCompute", RNNStatefulGradCompute) #if MXNET_USE_MKLDNN == 1 +.set_attr("FInferStorageType", RNNStorageType) .set_attr("TIsMKLDNN", true) .set_attr("FStatefulComputeEx", RNNStatefulGradComputeExCPU) #endif diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 3aa643421857..008ba7d315c6 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -209,6 +209,7 @@ void LstmForwardInferenceSingleLayer(DType* ws, const int N, const int I, const int H, + const int P, const Tensor &x, const Tensor &hx, const Tensor &cx, @@ -219,7 +220,9 @@ void LstmForwardInferenceSingleLayer(DType* ws, DType* cy_ptr) { using namespace mshadow; const Tensor wx(w_ptr, Shape2(H * 4, I)); - const Tensor wh(w_ptr + I * H * 4, Shape2(H * 4, H)); + const Tensor wh(w_ptr + I * H * 4, Shape2(H * 4, (P ? P : H))); + Tensor whr(w_ptr, Shape2(1, 1)); + if (P > 0) whr = Tensor(wh.dptr_ + P * 4 * H, Shape2(P, H)); const Tensor bx(b_ptr, Shape2(4, H)); const Tensor bh(b_ptr + H * 4, Shape2(4, H)); Tensor yx_flat(ws, Shape2(T * N, H * 4)); @@ -228,7 +231,10 @@ void LstmForwardInferenceSingleLayer(DType* ws, const Tensor yh(yh_flat.dptr_, Shape3(N, 4, H)); Tensor h(yh_flat.dptr_ + N * H * 4, Shape2(N, H)); Tensor c(h.dptr_ + N * H, Shape2(N, H)); + Tensor r(hy_ptr, Shape2(1, 1)); + if (P > 0) r = Tensor(hy_ptr, Shape2(N, P)); const int offset = bid ? H : 0; + const int proj_offset = bid ? P : 0; const DType alpha = 1.0; const DType beta = 0.0; const int cell_size = N * H; @@ -237,7 +243,11 @@ void LstmForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < T; ++i) { int t = bid ? T - 1 - i : i; - linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); + if (P > 0) { + linalg_gemm(i ? r : hx, wh, yh_flat, alpha, beta, false, true); + } else { + linalg_gemm(i ? 
h : hx, wh, yh_flat, alpha, beta, false, true); + } #pragma omp parallel for num_threads(omp_threads) for (int jk = 0; jk < cell_size; ++jk) { int j = jk / H; @@ -248,14 +258,21 @@ void LstmForwardInferenceSingleLayer(DType* ws, DType ot = sigmoid(yx[t][j][3][k] + yh[j][3][k] + bx[3][k] + bh[3][k]); DType ct = (i ? c[j][k] : cx[j][k]) * ft + it * gt; DType ht = ot * tanh(ct); - y[t][j][k + offset] = ht; + if (P == 0) y[t][j][k + offset] = ht; if (i == T - 1 && state_outputs) { - hy_ptr[jk] = ht; + if (P == 0) hy_ptr[jk] = ht; cy_ptr[jk] = ct; } else { - h[j][k] = ht; c[j][k] = ct; } + h[j][k] = ht; + } + if (P > 0) { + linalg_gemm(h, whr, r, alpha, beta, false, true); + #pragma omp parallel for num_threads(omp_threads) + for (int j = 0; j < N; ++j) { + std::memcpy(y[t][j].dptr_ + proj_offset, r[j].dptr_, P * sizeof(DType)); + } } } } @@ -269,6 +286,7 @@ void LstmForwardInference(DType* ws, const int N, const int I, const int H, + const int P, DType* x_ptr, DType* hx_ptr, DType* cx_ptr, @@ -278,25 +296,29 @@ void LstmForwardInference(DType* ws, DType* hy_ptr, DType* cy_ptr) { const int total_layers = D * L; - Tensor hx(hx_ptr, Shape3(total_layers, N, H)); + Tensor hx(hx_ptr, Shape3(total_layers, N, P ? P : H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); const int b_size = 2 * H * 4; const int cell_size = N * H; + const int projection_size = (P ? P : H) * N; DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2; DType* y_cur_ptr = y_ptr; int idx = 0; // state & cell state's idx; bool flag = L % 2 ? false : true; for (int i = 0; i < L; ++i) { - const int input_size = i ? H * D : I; - const int w_size = (input_size + H) * H * 4; + const int input_size = i ? (P ? P : H) * D : I; + int w_size = (input_size + (P ? P : H)) * H * 4; + if (P > 0) { + w_size += P * H; + } // If bidirectional, need space to save current layer output y. if (D == 2) { y_cur_ptr = flag ? y_tmp_ptr : y_ptr; flag = !flag; } Tensor x(x_ptr, Shape2(T * N, input_size)); - Tensor y(y_cur_ptr, Shape3(T, N, H * D)); - LstmForwardInferenceSingleLayer(ws, state_outputs, false, T, N, input_size, H, + Tensor y(y_cur_ptr, Shape3(T, N, (P ? P : H) * D)); + LstmForwardInferenceSingleLayer(ws, state_outputs, false, T, N, input_size, H, P, x, hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr); // If bidirectional, then calculate the reverse direction's forward result. if (D == 2) { @@ -304,10 +326,10 @@ void LstmForwardInference(DType* ws, b_ptr += b_size; ++idx; if (state_outputs) { - hy_ptr += cell_size; + hy_ptr += projection_size; cy_ptr += cell_size; } - LstmForwardInferenceSingleLayer(ws, state_outputs, true, T, N, input_size, H, + LstmForwardInferenceSingleLayer(ws, state_outputs, true, T, N, input_size, H, P, x, hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr); } // Don't need to move pointer in the last layer. 
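The CPU inference path added above implements LSTM with a projection layer: the recurrent input to the gates is the projected state `r_{t-1} = W_hr * h_{t-1}` of size P rather than the full hidden state of size H, and `r_t` is what gets written to the layer output and `hy`. A minimal NumPy sketch of one such time step, assuming the i, f, g, o gate order used by the kernel; the weight packing shown is illustrative, not MXNet's exact layout:

```python
# Illustrative single time step of an LSTM with projection (LSTMP).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstmp_step(x, r_prev, c_prev, wx, wh, whr, bx, bh):
    """x: (N, I), r_prev: (N, P), c_prev: (N, H),
    wx: (4H, I), wh: (4H, P), whr: (P, H), bx/bh: (4H,)."""
    gates = x @ wx.T + r_prev @ wh.T + bx + bh           # (N, 4H)
    i, f, g, o = np.split(gates, 4, axis=1)              # gate order: i, f, g, o
    c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(g)    # cell state (N, H)
    h = sigmoid(o) * np.tanh(c)                          # hidden state (N, H)
    r = h @ whr.T                                        # projected state (N, P)
    return r, h, c
```

This mirrors why the kernel allocates the recurrent GEMM input with width `P ? P : H` and copies `r` rather than `h` into the output when `P > 0`.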
@@ -317,7 +339,7 @@ void LstmForwardInference(DType* ws, x_ptr = y_cur_ptr; ++idx; if (state_outputs) { - hy_ptr += cell_size; + hy_ptr += projection_size; cy_ptr += cell_size; } } diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index f00caf32332d..9e63730ec001 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -289,7 +289,7 @@ static void TransposeComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); if (SupportMKLDNNTranspose(param, inputs[0]) && req[0] == kWriteTo) { - MKLDNNTransposeForward(attrs, ctx, inputs[0], req[0], outputs[0]); + MKLDNNRun(MKLDNNTransposeForward, attrs, ctx, inputs[0], req[0], outputs[0]); return; } FallBackCompute(Transpose, attrs, ctx, inputs, req, outputs); diff --git a/src/optimizer/sgd-inl.h b/src/optimizer/sgd-inl.h index 12738f8e4053..00afe2ad079a 100644 --- a/src/optimizer/sgd-inl.h +++ b/src/optimizer/sgd-inl.h @@ -82,7 +82,7 @@ void sgd_mom_update(RunContext ctx, TBlob weight, const TBlob grad, TBlob mom, Tensor grad2d = grad.FlatTo2D(s); if (param.clip_gradient > 0.0f) { mom2d = param.momentum*mom2d - - lr*(param.rescale_grad*F(grad2d, param.clip_gradient) + wd*weight2d); + lr*(F(param.rescale_grad * grad2d, param.clip_gradient) + wd*weight2d); } else { mom2d = param.momentum*mom2d - lr*(param.rescale_grad*grad2d + wd*weight2d); } @@ -98,7 +98,7 @@ void sgd_update(RunContext ctx, TBlob weight, const TBlob grad, Tensor weight2d = weight.FlatTo2D(s); Tensor grad2d = grad.FlatTo2D(s); if (param.clip_gradient >= 0.0f) { - weight2d -= lr*(param.rescale_grad*F(grad2d, param.clip_gradient) + + weight2d -= lr*(F(param.rescale_grad * grad2d, param.clip_gradient) + wd*weight2d); } else { weight2d -= lr*(param.rescale_grad*grad2d + wd*weight2d); diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index ee57f172c1c9..222c4525ae50 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -504,6 +504,16 @@ def check_ravel(): assert out.shape[0] == LARGE_TENSOR_SHAPE + def check_cumsum(): + a = nd.ones((LARGE_X, SMALL_Y)) + axis = 1 + + res = nd.cumsum(a=a, axis=axis) + + assert res.shape[0] == LARGE_X + assert res.shape[1] == SMALL_Y + assert res[0][SMALL_Y - 1] == 50. + check_gluon_embedding() check_fully_connected() check_dense() @@ -527,6 +537,7 @@ def check_ravel(): check_embedding() check_spatial_transformer() check_ravel() + check_cumsum() def test_tensor(): diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py index 7cfd0217aa31..5f7c51f257b3 100644 --- a/tests/python/unittest/test_contrib_optimizer.py +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -25,76 +25,39 @@ from common import with_seed -# * GroupAdaGrad -class PyGroupAdaGrad(mx.optimizer.Optimizer): - """The python reference of Group AdaGrad optimizer. - - Parameters - ---------- - eps: float, optional - Small value to avoid division by 0. 
- - """ - - def __init__(self, eps=1e-5, **kwargs): - super(PyGroupAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps - - def create_state(self, index, weight): - assert len(weight.shape) == 2 - history = mx.nd.zeros( - (weight.shape[0], 1), weight.context, stype=weight.stype) - return history - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - assert wd == 0 - - history = state - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) - div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) - weight[:] -= div - - def test_group_adagrad(): mx.random.seed(0) - opt1 = PyGroupAdaGrad + opt1 = mx.optimizer.contrib.GroupAdaGrad opt2 = mx.optimizer.contrib.GroupAdaGrad - shape = (3, 4) - eps_options = [{}, {'eps': 1e-8}] + shapes = [(3, 4), [5, 6]] + eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: - for options in itertools.product(eps_options, cg_options, rg_options): + for options in itertools.product(eps_options, cg_options, rg_options, agg_options): kwarg = dict(wd=0.0) for option in options: kwarg.update(option) compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - compare_states=False) + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, + dtype) compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, w_stype='row_sparse', - g_stype='row_sparse', - compare_states=False) + g_stype='row_sparse') compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, - g_stype='row_sparse', - compare_states=False) + g_stype='row_sparse') @with_seed() diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 0f27f53f83a8..c18f6fa5f4e7 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -51,6 +51,62 @@ def test_lstm(): assert outs == [(10, 100), (10, 100), (10, 100)] +@with_seed() +@assert_raises_cudnn_not_satisfied(min_version='7.2.1') +def test_lstmp(): + hidden_size, projection_size = 512, 256 + rtol, atol = 1e-4, 1e-4 + batch_size, seq_len = 5, 3 + input_size = 128 + lstm_input = mx.nd.uniform(shape=(seq_len, batch_size, input_size)) + + # ==== Unidirectional Layer ==== + for num_layers in [1, 3]: + fused_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, + num_layers=num_layers, layout='TNC', bidirectional=False) + fused_layer.collect_params().initialize() + + params = fused_layer.collect_params() + stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix='lstm0_', params=params) + with stack_layer.name_scope(): + for i in range(num_layers): + stack_layer.add(gluon.contrib.rnn.LSTMPCell(hidden_size, + projection_size=projection_size, + prefix='l%d_' % i)) + stack_layer.initialize() + + fused_output = fused_layer(lstm_input.copy()) + stack_output = stack_layer.unroll(seq_len, lstm_input.copy(), layout='TNC', + merge_outputs=True)[0] + + 
assert_almost_equal(fused_output.asnumpy(), stack_output.asnumpy(), rtol=rtol, atol=atol) + + # ==== Bidirectional Layer ==== + for num_layers in [1, 3]: + fused_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, + num_layers=num_layers, layout='TNC', bidirectional=True) + fused_layer.collect_params().initialize() + + params = fused_layer.collect_params() + stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix='lstm0_', params=params) + with stack_layer.name_scope(): + for i in range(num_layers): + stack_layer.add( + gluon.rnn.BidirectionalCell(gluon.contrib.rnn.LSTMPCell(hidden_size, + projection_size=projection_size, + prefix='l%d_' % i), + gluon.contrib.rnn.LSTMPCell(hidden_size, + projection_size=projection_size, + prefix='r%d_' % i))) + stack_layer.initialize() + + fused_output = fused_layer(lstm_input.copy()) + stack_output = stack_layer.unroll(seq_len, lstm_input.copy(), layout='TNC', + merge_outputs=True)[0] + + assert_almost_equal(fused_output.asnumpy(), stack_output.asnumpy(), rtol=rtol, atol=atol) + + def test_lstm_forget_bias(): forget_bias = 2.0 stack = gluon.rnn.SequentialRNNCell() diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 18b26579f740..298d565dc237 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -2222,8 +2222,10 @@ def _add_workload_extract(): OpArgMngr.add_workload('extract', condition, arr) -def _add_workload_flatnonzero(): +def _add_workload_flatnonzero(array_pool): x = np.array([-2, -1, 0, 1, 2]) + OpArgMngr.add_workload('flatnonzero', array_pool['4x1']) + OpArgMngr.add_workload('flatnonzero', array_pool['1x2']) OpArgMngr.add_workload('flatnonzero', x) @@ -2911,7 +2913,7 @@ def _prepare_workloads(): _add_workload_digitize() _add_workload_divmod() _add_workload_extract() - _add_workload_flatnonzero() + _add_workload_flatnonzero(array_pool) _add_workload_float_power() _add_workload_frexp() _add_workload_histogram2d() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 583b1f791f75..268d58c7026a 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -3764,10 +3764,9 @@ def __init__(self, shape, op_name): def hybrid_forward(self, F, param1, param2): op = getattr(F.npx.random, self._op_name, None) assert op is not None - # return param1 + param2 + op(batch_shape=self._shape) return op(param1, param2, batch_shape=self._shape) - batch_shapes = [(10,), (2, 3), 6, (), None] + batch_shapes = [(10,), (2, 3), 6, ()] event_shapes = [(), (2,), (2,2)] dtypes = ['float16', 'float32', 'float64'] op_names = ['uniform_n', 'normal_n'] @@ -6299,6 +6298,36 @@ def hybrid_forward(self, F, x): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) +@with_seed() +@use_np +def test_np_flatnonzero(): + class TestFlatnonzero(HybridBlock): + def __init__(self): + super(TestFlatnonzero, self).__init__() + + def hybrid_forward(self, F, a): + return F.np.flatnonzero(a) + + shapes = [(1,), (4, 3), (4, 5), (2, 1), (6, 5, 6), (4, 2, 1, 2), + (5, 1, 3, 3), (3, 3, 1, 0),] + types = ['int32', 'int64', 'float32', 'float64'] + hybridizes = [True, False] + for hybridize, oneType, shape in itertools.product(hybridizes, types, shapes): + rtol, atol = 1e-3, 1e-5 + test_flatnonzero = TestFlatnonzero() + if hybridize: + test_flatnonzero.hybridize() + x = rand_ndarray(shape, dtype=oneType).as_np_ndarray() + np_out = 
_np.flatnonzero(x.asnumpy()) + mx_out = test_flatnonzero(x) + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) + + mx_out = np.flatnonzero(x) + np_out = _np.flatnonzero(x.asnumpy()) + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=rtol, atol=atol) + + @with_seed() @use_np def test_np_round(): diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 6d7cf40f29f7..2a15e3407862 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -72,145 +72,51 @@ def test_lr_wd_mult(): args2 = {k: v.asnumpy() for k, v in args2.items()} assert mod._optimizer.lr_mult == {'fc1_bias': 1.0, 'fc1_weight': 0.0} - assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5, 'fc1_bias': 0.0} + assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5} assert mx.test_utils.almost_equal(args1['fc1_weight'], args2['fc1_weight'], 1e-10) assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1) assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -# SGD - -class PySGD(mx.optimizer.Optimizer): - """python reference implemenation of sgd""" - def __init__(self, learning_rate=0.01, momentum=0.0, multi_precision=False, **kwargs): - super(PySGD, self).__init__(learning_rate=learning_rate, **kwargs) - self.momentum = momentum - self.multi_precision = multi_precision - - def create_state(self, index, weight): - """Create additional optimizer state: momentum - - Parameters - ---------- - weight : NDArray - The weight data - - """ - momentum = None - weight_master_copy = None - do_multi_precision = self.multi_precision and weight.dtype == np.float16 - if do_multi_precision: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) - weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) - return (momentum, weight_master_copy) - else: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def create_state_multi_precision(self, index, weight): - return self.create_state(index, weight) - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. 
- """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - use_multi_precision = isinstance(state, list) or isinstance(state, tuple) - - if not use_multi_precision: - if self.momentum == 0.0: - if self.clip_gradient is not None: - weight[:] = ((1 - lr*wd)*weight - - lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - else: - weight[:] = (1 - lr*wd)*weight - lr*self.rescale_grad*grad - else: - mom = state - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - lr*wd*weight - - lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight += mom - else: - mom[:] = self.momentum*mom - lr*wd*weight - lr*self.rescale_grad*grad - weight += mom - else: - grad32 = array(grad, ctx=grad.context, dtype=np.float32) - mom = state[0] - weight32 = state[1] - if self.momentum == 0.0: - if self.clip_gradient is not None: - weight32[:] = ((1 - lr*wd)*weight32 - - lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - else: - weight32[:] = (1 - lr*wd)*weight32 - lr*self.rescale_grad*grad32 - else: - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - lr*wd*weight32 - - lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight32 += mom - else: - mom[:] = self.momentum*mom - lr*wd*weight32 - lr*self.rescale_grad*grad32 - weight32 += mom - tmp = weight32.astype(weight.dtype) - tmp.copyto(weight) - - def update_multi_precision(self, index, weight, grad, state): - self.update(index, weight, grad, state) @with_seed() def test_sgd(): - opt1 = PySGD + opt1 = mx.optimizer.SGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - if dtype == np.float16: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3) - else: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - # test operator fallback on cpu - if dtype != np.float16: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2], - dtype, w_stype='csr', g_stype='csr') + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + + for dtype in [np.float16, np.float32]: + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + if dtype == np.float16: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) + else: + 
compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype) + # test operator fallback on cpu + if dtype != np.float16: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + [shapes[0][:2], shapes[1]], + dtype, w_stype='csr', g_stype='csr') + class PySparseSGD(mx.optimizer.Optimizer): """python reference implemenation of sgd""" - def __init__(self, learning_rate=0.01, momentum=0.0, **kwargs): + def __init__(self, learning_rate=0.1, momentum=0.0, **kwargs): super(PySparseSGD, self).__init__(learning_rate=learning_rate, **kwargs) self.momentum = momentum @@ -228,478 +134,240 @@ def create_state(self, index, weight): else: return mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - def update(self, index, weight, grad, state): - """Update the parameters. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - num_rows = weight.shape[0] - if self.momentum == 0.0: - # Update on a per row basis, skip all-zero rows - for row in range(num_rows): - grad_row = grad[row].asnumpy() - all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) - if all_zeros: - continue - if self.clip_gradient is not None: - weight[row] = ((1 - lr*wd)*weight[row] - - lr*mx.nd.clip(grad[row]*self.rescale_grad, - -self.clip_gradient, self.clip_gradient)) - else: - weight[row] = (1 - lr*wd)*weight[row] - lr*self.rescale_grad*grad[row] - else: - mom = state - for row in range(num_rows): - grad_row = grad[row].asnumpy() - all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) - if all_zeros: - continue - if self.clip_gradient is not None: - mom[row] = (self.momentum*mom[row] - lr*wd*weight[row] - - lr*mx.nd.clip(grad[row]*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight[row] += mom[row] - else: - mom[row] = self.momentum*mom[row] - lr*wd*weight[row] - lr*self.rescale_grad*grad[row] - weight[row] += mom[row] + for index, weight, grad, state in zip(indices, weights, grads, states): + lr = self._get_lr(index) + wd = self._get_wd(index) + self._update_count(index) + num_rows = weight.shape[0] + if self.momentum == 0.0: + # Update on a per row basis, skip all-zero rows + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + grad[row] = mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient) + grad[row] += wd * weight[row] + weight[row] -= lr * grad[row] + else: + mom = state + for row in range(num_rows): + grad_row = grad[row].asnumpy() + 
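The momentum-free branch of PySparseSGD.step above walks the weight row by row, skips rows whose gradient is entirely zero, and otherwise rescales, clips, adds weight decay, and takes the SGD step. A rough NumPy transcription of that branch, on dense arrays purely for illustration:

import numpy as np

def sparse_sgd_rows(weight, grad, lr=0.1, wd=0.0, rescale_grad=1.0, clip_gradient=None):
    """Row-wise SGD step that skips rows whose gradient is entirely zero (illustrative)."""
    for row in range(weight.shape[0]):
        g = grad[row]
        if not np.any(g):          # lazy update: untouched rows keep their weights
            continue
        g = g * rescale_grad
        if clip_gradient is not None:
            g = np.clip(g, -clip_gradient, clip_gradient)
        g = g + wd * weight[row]
        weight[row] -= lr * g
    return weight

w = np.ones((3, 4))
g = np.zeros((3, 4)); g[1] = 0.5   # only row 1 carries a gradient
sparse_sgd_rows(w, g, lr=0.1)      # rows 0 and 2 are left unchanged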
all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + grad[row] = mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient) + grad[row] += wd * weight[row] + mom[row] *= self.momentum + mom[row] -= lr * grad[row] + weight[row] += mom[row] + @with_seed() def test_sparse_sgd(): opt1 = PySparseSGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='default', g_stype='row_sparse') + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + compare_optimizer(opt1(**kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, dtype, + w_stype='default', g_stype='row_sparse') @with_seed() def test_std_sparse_sgd(): - opt1 = PySGD + opt1 = mx.optimizer.SGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) - mom_options = [{'momentum': 0.0}, {'momentum': 0.9}] + shapes = [(3, 4, 5), (10, 4), (7,)] + mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - for dtype in [np.float32]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, - w_stype='default', g_stype='row_sparse') - - -class PyNAG(PySGD): - def __init__(self, **kwargs): - super(PyNAG, self).__init__(**kwargs) + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] - def create_state(self, index, weight): - """Create additional optimizer state: momentum - - Parameters - ---------- - weight : NDArray - The weight data - - """ - momentum = None - weight_master_copy = None - do_multi_precision = self.multi_precision and weight.dtype == np.float16 - if do_multi_precision: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, 
dtype=np.float32) - weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) - return (momentum, weight_master_copy) - else: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def create_state_multi_precision(self, index, weight): - return self.create_state(index, weight) - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray + for dtype in [np.float32]: + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + w_stype='default', g_stype='row_sparse') - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. - """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - use_multi_precision = isinstance(state, list) or isinstance(state, tuple) - if not use_multi_precision: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - if self.momentum == 0.0: - weight[:] += -lr * (grad + wd * weight) - else: - mom = state - weight[:] += (self.momentum**2 * mom) - lr*(self.momentum + 1)*(grad + wd*weight) - mom[:] = (self.momentum*mom) - lr*(grad + wd*weight) - else: - grad32 = array(grad, ctx=grad.context, dtype=np.float32) - grad32 = grad32 * self.rescale_grad - if self.clip_gradient is not None: - grad32 = mx.nd.clip(grad32, -self.clip_gradient, self.clip_gradient) - mom = state[0] - weight32 = state[1] - if self.momentum == 0.0: - weight32[:] += -lr * (grad32 + wd * weight32) - else: - weight32[:] += (self.momentum**2 * mom) - lr*(self.momentum+1)*(grad32 + wd*weight32) - mom[:] = (self.momentum*mom) - lr*(grad32 + wd*weight32) - tmp = weight32.astype(weight.dtype) - tmp.copyto(weight) @with_seed() def test_nag(): - opt1 = PyNAG + opt1 = mx.optimizer.NAG opt2 = mx.optimizer.NAG - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] - for dtype in [np.float16, np.float32, np.float64]: + for dtype in [np.float16, np.float32]: for params in itertools.product(mom_options, cg_options, rg_options, - wd_options, mp_options): + wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + not kwarg['multi_precision'])): continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, 
atol=1e-4) + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) -# LAMB optimizer -class PyLAMB(mx.optimizer.Optimizer): - """ - Python reference implementation of LAMB optimizer. - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, - lower_bound=None, upper_bound=None, bias_correction=True, **kwargs): - super(PyLAMB, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lower_bound = lower_bound - self.upper_bound = upper_bound - self.bias_correction = bias_correction - - def create_state(self, index, weight): - stype = weight.stype - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype), - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype)) - - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - mean, var = state - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - - mean[:] = self.beta1 * mean + (1. - self.beta1) * grad - var[:] = self.beta2 * var + (1. - self.beta2) * mx.nd.square(grad) - - mean_hat = mean - var_hat = var - r1 = weight.norm() - if self.lower_bound: - r1 = mx.nd.maximum(r1, self.lower_bound) - if self.upper_bound: - r1 = mx.nd.minimum(r1, self.upper_bound) - if self.bias_correction: - mean_hat = mean / (1. - mx.nd.power(self.beta1, t)) - var_hat = var / (1. - mx.nd.power(self.beta2, t)) - - g = mean_hat / (mx.nd.sqrt(var_hat) + self.epsilon) + wd * weight - - r2 = g.norm() - # calculate lamb_trust_ratio - r = 1. if r1 == 0. or r2 == 0. 
else r1 / r2 - lr *= r - # update weight - weight[:] -= lr * g +@with_seed() +def test_lars(): + opt1 = mx.optimizer.LARS + opt2 = mx.optimizer.LARS + shapes = [(3, 4, 5), (10, 4), (7,)] + eta_options = [{}, {'eta': 0.002}, {'eta': 0.01}] + mom_options = [{'momentum': 0.0}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}] + rg_options = [{}, {'rescale_grad': 0.14}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eta_options, mom_options, cg_options, rg_options, + wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-3) @with_seed() def test_lamb(): - opt1 = PyLAMB + opt1 = mx.optimizer.LAMB opt2 = mx.optimizer.LAMB - shape = (3, 4, 5) - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - bc_options = [{}, {'bias_correction': False}, {'bias_correction': True}] - lb_options = [{}, {'lower_bound': None}, {'lower_bound': 1e-3}] - ub_options = [{}, {'upper_bound': None}, {'upper_bound': 10}] - for params in itertools.product(cg_options, rg_options, wd_options, bc_options, lb_options, ub_options): - kwarg = {k: v for param in params for k, v in param.items()} - kwarg['multi_precision'] = False - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - kwarg['multi_precision'] = True - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float16, rtol=1e-3, atol=1e-3) - -@with_seed() -def test_multilamb(): - opt1 = PyLAMB - opt2 = mx.optimizer.LAMB - - # shapes as Bert-large - dims_x = [1024, 4096, 1024, 1024] - dims_y = [1, 1, 1024, 4096] - dims_occurrences = [9, 1, 4, 2] - nlayers = 4 # 24 - # extra_dims_x=[30522, 512, 30522] - # extra_dims_y=[1, 1024, 1024] - shapes=[] - for l in range(nlayers): - for i, (dx,dy) in enumerate(zip(dims_x, dims_y)): - for j in range(dims_occurrences[i]): - shapes.append((dx,dy)) - # for dx,dy in zip(extra_dims_x, extra_dims_y): - # shapes.append((dx,dy)) - - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - bias_options = [{'bias_correction': False}, {'bias_correction': True}] - - for dtype in [np.float16, np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for bias_option in bias_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(bias_option) - if (dtype == np.float16): - kwarg.update({'multi_precision': True}) - atol = 1e-3 - rtol = 1e-6 - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype, - rtol=rtol, atol=atol, ntensors=len(shapes)) - -#SGLD -class PySGLD(mx.optimizer.Optimizer): - """python reference implementation of SGLD""" - - def __init__(self, **kwargs): - super(PySGLD, self).__init__(**kwargs) - - def create_state(self, index, weight): - return None - - def 
update(self, index, weight, grad, state): - assert(isinstance(weight, mx.nd.NDArray)) - assert(isinstance(grad, mx.nd.NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - weight[:] += - lr/2 * (grad + wd * weight) + mx.random.normal(0, math.sqrt(lr), shape=weight.shape, - dtype=weight.dtype, ctx=weight.context) - + + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}] + beta2_options = [{}, {'beta2': 0.8}] + cg_options = [{}, {'clip_gradient': 0.4}] + rg_options = [{}, {'rescale_grad': 0.14}] + wd_options = [{}, {'wd': 0.03}] + bc_options = [{'bias_correction': False}, {'bias_correction': True}] + lb_options = [{'lower_bound': None}, {'lower_bound': 1e-3}] + ub_options = [{'upper_bound': None}, {'upper_bound': 10}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, rg_options, + wd_options, bc_options, lb_options, ub_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-3) @with_seed() def test_sgld(): - opt1 = PySGLD + opt1 = mx.optimizer.SGLD opt2 = mx.optimizer.SGLD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] ns_options = [1234, 42] - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - - - def compare_optimizer_noise_seeded(opt1, opt2, shape, dtype, noise_seed, - w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5, compare_states=True): - """Compare opt1 and opt2 with the added functionality that the seed for generating random noise - in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. 
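compare_optimizer_noise_seeded can compare two SGLD updates only because the random seed is reset to the same value before each update, so both optimizers draw identical Gaussian noise. A NumPy sketch of that idea, using the SGLD step from the reference above; np.random stands in for mx.random here, and gradient rescaling and clipping are omitted:

import numpy as np

def sgld_step(weight, grad, lr=0.01, wd=0.0):
    """Langevin SGD step: half a gradient step plus N(0, sqrt(lr)) noise (illustrative)."""
    noise = np.random.normal(0.0, np.sqrt(lr), size=weight.shape)
    return weight - lr / 2.0 * (grad + wd * weight) + noise

w = np.zeros((3, 4)); g = np.ones((3, 4))
np.random.seed(1234)
w1 = sgld_step(w.copy(), g)
np.random.seed(1234)          # same seed -> identical noise -> identical result
w2 = sgld_step(w.copy(), g)
assert np.allclose(w1, w2)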
- - """ - if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': - w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': - g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) - if compare_states: - compare_ndarray_tuple(state1, state2) - - # set seed for Gaussian noise replication - mx.random.seed(noise_seed) - opt1.update_multi_precision(0, w1, g1, state1) - mx.random.seed(noise_seed) - opt2.update_multi_precision(0, w2, g2, state2) - if compare_states: - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for seed in ns_options: - for dtype in [np.float16, np.float32, np.float64]: - for params in itertools.product(cg_options, wd_options, mp_options): + for dtype in [np.float16, np.float32]: + for params in itertools.product(cg_options, wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + if (dtype == np.float16 and ('multi_precision' not in kwarg + or not kwarg['multi_precision'])): continue atol = 1e-2 if dtype == np.float16 else 1e-3 rtol = 1e-4 if dtype == np.float16 else 1e-5 - compare_optimizer_noise_seeded(opt1(**kwarg), opt2(**kwarg), shape, dtype, seed, atol=atol, rtol=rtol) + compare_optimizer_noise_seeded(opt1(**kwarg), + opt2(**kwarg), + shapes, dtype, seed, atol=atol, rtol=rtol) - -# FTML - -class PyFTML(mx.optimizer.Optimizer): - """python reference implemenation of FTML""" - def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): - super(PyFTML, self).__init__(**kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 - - def update(self, index, weight, grad, state): - assert(isinstance(weight, mx.nd. 
NDArray)) - assert(isinstance(grad, mx.nd.NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - # get previous states - prev_d, prev_v, prev_z = state - # compute states - v_t = self.beta2 * prev_v + (1 - self.beta2) * mx.nd.square(grad) - d_t = (1 - pow(self.beta1, t)) / lr * (mx.nd.sqrt(v_t / (1 - pow(self.beta2, t))) + self.epsilon) - sigma_t = d_t - self.beta1 * prev_d - z_t = self.beta1 * prev_z + (1 - self.beta1) * grad - sigma_t * weight - # update weight - weight[:] = - z_t / d_t - # update states - prev_d[:] = d_t - prev_v[:] = v_t - prev_z[:] = z_t - @with_seed() def test_ftml(): - opt1 = PyFTML + opt1 = mx.optimizer.FTML opt2 = mx.optimizer.FTML - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - for dtype in [np.float32]: - for beta1_option in beta1_options: - for beta2_option in beta2_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(beta1_option) - kwarg.update(beta2_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4) - - -# ADAM -class PyAdam(mx.optimizer.Optimizer): - """python reference implemenation of adam""" + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) + + +# Sparse ADAM +class PySparseAdam(mx.optimizer.Optimizer): + """python reference implemenation of sparse adam""" def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - lazy_update=True, **kwargs): - super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) + lazy_update=False, **kwargs): + super(PySparseAdam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon @@ -717,391 +385,214 @@ def create_state(self, index, weight): return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - def update(self, index, weight, grad, state): - """Update the parameters. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - An unique integer key used to index the parameters + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. 
Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] - weight : NDArray - weight ndarray + mean, variance = state + num_rows = weight.shape[0] - grad : NDArray - grad ndarray + coef1 = 1. - self.beta1 ** t + coef2 = 1. - self.beta2 ** t + lr *= math.sqrt(coef2) / coef1 - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. - """ - lr = self._get_lr(index) - self._update_count(index) - - t = self._index_update_count[index] - mean, variance = state - - wd = self._get_wd(index) - num_rows = weight.shape[0] - coef1 = 1. - self.beta1**t - coef2 = 1. - self.beta2**t - lr *= math.sqrt(coef2)/coef1 - for row in range(num_rows): - # check row slices of all zeros - all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - # skip zeros during lazy update - if all_zeros and self.lazy_update: - continue - grad[row] = grad[row] * self.rescale_grad + wd * weight[row] - # clip gradients - if self.clip_gradient is not None: - mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) - # update mean - mean[row] *= self.beta1 - mean[row] += grad[row] * (1. - self.beta1) - # update variance - variance[row] *= self.beta2 - variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) - # update weight - weight[row] -= lr*mean[row]/(mx.nd.sqrt(variance[row]) + self.epsilon) + for row in range(num_rows): + # check row slices of all zeros + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), + np.zeros_like(grad[row].asnumpy())) + # skip zeros during lazy update + if all_zeros and self.lazy_update: + continue + grad[row] *= self.rescale_grad + # clip gradients + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + grad[row] += wd * weight[row] + # update mean + mean[row] *= self.beta1 + mean[row] += grad[row] * (1. 
- self.beta1) + # update variance + variance[row] *= self.beta2 + variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) + # update weight + weight[row] -= lr * mean[row] / (mx.nd.sqrt(variance[row]) + self.epsilon) @with_seed() def test_adam(): - opt1 = PyAdam + opt1 = mx.optimizer.Adam opt2 = mx.optimizer.Adam - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - # atol 2e-5 needed to pass with seed 1248389097 - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(**kwarg), shape, dtype, - rtol=1e-4, atol=2e-5) - # atol 2e-5 needed to pass with seed 781809840 - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, - dtype, w_stype='row_sparse', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, - dtype, w_stype='row_sparse', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, - dtype, w_stype='default', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, - dtype, w_stype='default', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - - -# AdaMax -class PyAdamax(mx.optimizer.Optimizer): - """The python reference of AdaMax optimizer. - - This class implements the AdaMax optimizer, one variant of Adam based on the infinity norm, - available at http://arxiv.org/abs/1412.6980 Section 7. - - The optimizer updates the weight by:: - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m_t + (1 - beta1) * grad - u = maximum(beta2 * u, abs(grad)) - weight -= lr / (1 - beta1**t) * m / u - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - """ - def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs): - super(PyAdamax, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - lr /= (1. 
- self.beta1**t) - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + # atol 2e-5 needed to pass with seed 1248389097 + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) - # update m_t and u_t - m_t, u_t = state - m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad - u_t[:] = mx.nd.maximum(self.beta2 * u_t, mx.nd.abs(grad)) - # update weight - weight[:] -= lr * m_t / u_t +@with_seed() +def test_sparse_adam(): + opt1 = PySparseAdam + opt2 = mx.optimizer.Adam + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}] + beta2_options = [{}, {'beta2': 0.8}] + cg_options = [{}, {'clip_gradient': 0.4}] + rg_options = [{}, {'rescale_grad': 0.14}] + wd_options = [{}, {'wd': 0.03}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + # atol 2e-5 needed to pass with seed 1248389097 + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) + # atol 2e-5 needed to pass with seed 781809840 + compare_optimizer(opt1(lazy_update=True, **kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=True, **kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, + dtype, w_stype='default', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, + dtype, w_stype='default', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) @with_seed() def test_adamax(): - opt1 = PyAdamax + opt1 = mx.optimizer.Adamax opt2 = mx.optimizer.Adamax - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, 
np.float32, np.float64]: - for params in itertools.product(cg_options, rg_options, wd_options, mp_options): + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - -# Signum -class PySignum(mx.optimizer.Optimizer): - """The python reference of Signum optimizer. - - The optimizer updates the weight by: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) - - See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf - - For details of the update algorithm see - :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - wd_lh : float, optitional - The amount of decoupled weight decay regularization. - """ - def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh = 0.0, **kwargs): - super(PySignum, self).__init__(learning_rate = learning_rate, **kwargs) - self.momentum = momentum - self.wd_lh = wd_lh - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - if state is not None: - mom = state - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - (1-self.momentum)*(wd*weight + - mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient))) - else: - mom[:] = self.momentum*mom - (1-self.momentum)*wd*weight - (1-self.momentum)*self.rescale_grad*grad - weight[:] = (1 - lr*self.wd_lh)*weight + lr*mx.nd.sign(mom) - else: - weight[:] = (1 - lr*(wd+self.wd_lh))*weight - lr*mx.nd.sign(grad) @with_seed() def test_signum(): - opt1 = PySignum + opt1 = mx.optimizer.Signum opt2 = mx.optimizer.Signum - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] wd_lh_options = [{}, {'wd_lh': 0.015}, {'wd_lh': 0.0}] mom_options = [{}, {'momentum': 0.9}] lr_options = [{'learning_rate': 0.05},{'learning_rate': 0.01}] - for dtype in [np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in wd_lh_options: - for lr_option in lr_options: - for mom_option in mom_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - kwarg.update(lr_option) - 
kwarg.update(mom_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - -# RMSProp -class PyRMSProp(mx.optimizer.Optimizer): - """RMSProp optimizer of Tieleman & Hinton, 2012, - - For centered=False, the code follows the version in - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012 - - For centered=True, the code follows the version in - http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. - - Parameters - ---------- - learning_rate : float, optional - Step size. - Default value is set to 0.001. - gamma1: float, optional - decay factor of moving average for gradient, gradient^2. - Default value is set to 0.9. - gamma2: float, optional - "momentum" factor. - Default value if set to 0.9. - Only used if centered=True - epsilon : float, optional - Default value is set to 1e-8. - centered : boolean, optional - Use Graves or Tielemans & Hintons version of RMSProp - wd : float, optional - L2 regularization coefficient add to all the weights - rescale_grad : float, optional - rescaling factor of gradient. - clip_gradient : float, optional - clip gradient in range [-clip_gradient, clip_gradient] - clip_weights : float, optional - clip weights in range [-clip_weights, clip_weights] - - """ - def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, - epsilon=1e-8, centered=False, clip_weights=None, **kwargs): - super(PyRMSProp, self).__init__(learning_rate=learning_rate, **kwargs) - self.centered = centered - self.gamma1 = gamma1 - self.gamma2 = gamma2 - self.epsilon = epsilon - self.clip_weights = clip_weights - - def create_state(self, index, weight): - """Create additional optimizer state. - - For centered=False: n - For centered=True: n, g, delta - - Parameters - ---------- - weight : NDArray - The weight data - """ - if self.centered: - return (mx.nd.zeros(weight.shape, weight.context), # n - mx.nd.zeros(weight.shape, weight.context), # g - mx.nd.zeros(weight.shape, weight.context)) # delta - else: - return (mx.nd.zeros(weight.shape, weight.context), ) # n - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. 
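As the removed PyRMSProp docstring above notes, centered=False follows Tieleman & Hinton's RMSProp while centered=True follows Graves' variant; the difference is a running mean of the gradient subtracted inside the square root plus a momentum-like delta buffer. A condensed NumPy sketch of both branches, with gradient rescaling, weight decay, gradient clipping and weight clipping omitted for brevity:

import numpy as np

def rmsprop_step(w, g, state, lr=0.001, gamma1=0.9, gamma2=0.9, eps=1e-8, centered=False):
    """One RMSProp step; `state` is (n,) or (n, g_avg, delta) as in the reference (illustrative)."""
    if not centered:
        (n,) = state
        n[:] = (1 - gamma1) * g * g + gamma1 * n
        w -= lr * g / np.sqrt(n + eps)
    else:
        n, g_avg, delta = state
        n[:] = (1 - gamma1) * g * g + gamma1 * n
        g_avg[:] = (1 - gamma1) * g + gamma1 * g_avg
        delta[:] = gamma2 * delta - lr * g / np.sqrt(n - g_avg * g_avg + eps)
        w += delta
    return w

w = np.ones((3, 4)); g = np.full((3, 4), 0.1)
rmsprop_step(w, g, (np.zeros_like(w),))                                              # plain RMSProp
rmsprop_step(w, g, (np.zeros_like(w), np.zeros_like(w), np.zeros_like(w)), centered=True)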
- """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - grad = grad * self.rescale_grad + wd * weight - - if not self.centered: - (n, ) = state - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n - weight[:] -= lr * grad/(mx.nd.sqrt(n + self.epsilon)) - - else: - n, g, delta = state - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n - g[:] = (1 - self.gamma1) * grad + self.gamma1 * g - delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g + self.epsilon)) - weight[:] += delta + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(cg_options, rg_options, wd_options, + wd_lh_options, mom_options, lr_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + rtol, atol = (1e-3, 1e-4) if dtype is np.float16 else (1e-4, 1e-5) + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=rtol, atol=atol) - if self.clip_weights: - mx.ndarray.clip(weight, -self.clip_weights, self.clip_weights, out=weight) @with_seed() def test_rms(): - opt1 = PyRMSProp + opt1 = mx.optimizer.RMSProp opt2 = mx.optimizer.RMSProp - shape = (3, 4, 5) - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + shapes = [(3, 4, 5), (10, 4), (7,)] + rho_options = [{}, {'rho': 0.5}] + cg_options = [{}, {'clip_gradient': 0.4}] cw_options = [{}, {'clip_weights': 0.01}] - center_options = [{}, {'centered': False}, {'centered': True}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + center_options = [{'centered': False}, {'centered': True}] + rg_options = [{}, {'rescale_grad': 0.14}] + wd_options = [{}, {'wd': 0.03}] + mom_options = [{'momentum': 0.0}, {'momentum': 0.9}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float16, np.float32]: # Reduce foating point compare tolerance to avoid flaky test failure. rtol, atol = (1e-1, 1e-1) if dtype is np.float16 else (1e-2, 1e-2) - for cw_option in cw_options: - for cg_option in cg_options: - for center_option in center_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(cw_option) - kwarg.update(cg_option) - kwarg.update(center_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=rtol, atol=atol) - if (default_context() == mx.cpu()): - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) - -class PyFtrl(mx.optimizer.Optimizer): - """The Ftrl optimizer. 
+ for params in itertools.product(rho_options, cg_options, cw_options, + center_options, rg_options, wd_options, + mom_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=rtol, atol=atol) + if default_context() == mx.cpu(): + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) + + +class PySparseFtrl(mx.optimizer.Optimizer): + """python reference implemenation of sparse Ftrl optimizer. Referenced from *Ad Click Prediction: a View from the Trenches*, available at http://dl.acm.org/citation.cfm?id=2488200. @@ -1119,224 +610,290 @@ class PyFtrl(mx.optimizer.Optimizer): \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^t}} """ - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, lazy_update=False, **kwargs): - super(PyFtrl, self).__init__(**kwargs) + def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, **kwargs): + super(PySparseFtrl, self).__init__(**kwargs) self.lamda1 = lamda1 self.beta = beta self.lr = learning_rate - self.lazy_update = lazy_update def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # dn + return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # z mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # n - def update(self, index, weight, grad, state): - self._update_count(index) - wd = self._get_wd(index) - lr = self._get_lr(index) - num_rows = weight.shape[0] + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. - dn, n = state - for row in range(num_rows): - all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - if all_zeros and self.lazy_update: - continue - grad[row] = grad[row] * self.rescale_grad - if self.clip_gradient is not None: - mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
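        Notes
        -----
        For a dense tensor, the row-wise update implemented below is equivalent to
        the following sketch, where ``g`` is the rescaled and optionally clipped
        gradient:

            sigma   = (sqrt(n + g**2) - sqrt(n)) / lr
            z      += g - sigma * weight
            n      += g**2
            weight  = -sign(z) * max(|z| - lamda1, 0) / ((beta + sqrt(n)) / lr + wd)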
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + wd = self._get_wd(index) + lr = self._get_lr(index) + num_rows = weight.shape[0] + + z, n = state + for row in range(num_rows): + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + + # update z[row], n[row] + sigma = - mx.nd.sqrt(n[row]) + n[row] += mx.nd.square(grad[row]) + denom = mx.nd.sqrt(n[row]) + sigma += denom + sigma /= lr + z[row] += grad[row] - sigma * weight[row] - #update dn, n - dn[row] += grad[row] - (mx.nd.sqrt(n[row] + grad[row] * grad[row]) - mx.nd.sqrt(n[row])) * weight[row] / lr - n[row] += grad[row] * grad[row] + # update weight + denom += self.beta + denom /= lr + denom += wd + d = mx.nd.sign(z[row]) * mx.nd.maximum(mx.nd.abs(z[row]) - self.lamda1, 0) + weight[row] = - d / denom - # update weight - weight[row] = (mx.nd.sign(dn[row]) * self.lamda1 - dn[row]) / \ - ((self.beta + mx.nd.sqrt(n[row])) / lr + wd) * (mx.nd.abs(dn[row]) > self.lamda1) @with_seed() def test_ftrl(): - opt1 = PyFtrl + opt1 = mx.optimizer.Ftrl opt2 = mx.optimizer.Ftrl - shape = (3, 4, 5) - kwargs = [{}, - {'clip_gradient': 0.5}, - {'clip_gradient': 0.4, 'rescale_grad': 0.14}, - {'rescale_grad': 0.8}, - {'clip_gradient': 0.5, 'wd': 0.07}, - {'clip_gradient': 0.4, 'rescale_grad': 0.14, 'wd': 0.03}, - {'rescale_grad': 0.8, 'wd': 0.05}, - {'rescale_grad': 0.8, 'wd': 0.05, 'lamda1': 0.01}, - {'clip_gradient': 0.5, 'wd': 0.07, 'lamda1': 1.0}] - for kwarg in kwargs: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, - np.float32, w_stype='row_sparse', g_stype='row_sparse') + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda1_options = [{'lamda1': 0.}, {'lamda1': 0.1}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda1_options, cg_options, + rg_options, wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=1e-4) + + +@with_seed() +def test_sparse_ftrl(): + opt1 = PySparseFtrl + opt2 = mx.optimizer.Ftrl + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda1_options = [{'lamda1': 0.}, {'lamda1': 0.1}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda1_options, cg_options, + rg_options, wd_options, + mp_options, 
agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=1e-4) + @with_seed() def test_nadam(): + opt1 = mx.optimizer.Nadam + opt2 = mx.optimizer.Nadam + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}] + beta2_options = [{}, {'beta2': 0.8}] + schedule_decay_options = [{}, {'schedule_decay': 0.008}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}] + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + schedule_decay_options, rg_options, wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) + - def get_net(num_hidden, flatten=True): - data = mx.symbol.Variable('data') - fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128, flatten=flatten) - act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64, flatten=flatten) - act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) - return fc3 - - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=5, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.L1Loss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.001, 'wd': 0.0005}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), - optimizer='nadam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.11 - -# AdaGrad -class PyAdaGrad(mx.optimizer.Optimizer): - """The python reference of AdaGrad optimizer. +class PySparseAdaGrad(mx.optimizer.Optimizer): + """python reference implemenation of sparse Adagrad optimizer. This class implements the AdaGrad optimizer described in *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. - Updates are applied by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - history = history + square(rescaled_grad) - w = w - learning_rate * rescaled_grad / sqrt(history + epsilon) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - Parameters ---------- - eps: float, optional + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. 
If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 Small value to avoid division by 0. - """ - def __init__(self, eps=1e-7, **kwargs): - super(PyAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps + + def __init__(self, learning_rate=0.01, epsilon=1e-6, **kwargs): + super(PySparseAdaGrad, self).__init__(learning_rate=learning_rate, + **kwargs) + self.epsilon = epsilon def create_state(self, index, weight): - return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) + return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) # history - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + wd = self._get_wd(index) + lr = self._get_lr(index) + num_rows = weight.shape[0] + + history = state + for row in range(num_rows): + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + grad[row] += wd * weight[row] + + # update history[row] + history[row] += mx.nd.square(grad[row]) + denom = mx.nd.sqrt(history[row]) + denom += self.epsilon + + # update weight + weight[row] -= lr * grad[row] / denom - history = state - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mx.nd.square(grad) - div = grad / mx.nd.sqrt(history + self.float_stable_eps) - weight[:] += (div + weight * wd) * -lr @with_seed() def test_adagrad(): - opt1 = PyAdaGrad + opt1 = mx.optimizer.AdaGrad opt2 = mx.optimizer.AdaGrad - shape = (3, 4, 5) - eps_options = [{}, {'eps': 1e-8}] + shapes = [(3, 4, 5), (10, 4), (7,)] + eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.0}] - for dtype in [np.float32]: - for eps_option in eps_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(eps_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - if wd_option.get('wd', 0.0) == 0.0: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - g_stype='row_sparse') - -# AdaDelta -class PyAdaDelta(mx.optimizer.Optimizer): - """The python reference of AdaDelta optimizer. 
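PySparseAdaGrad.step above accumulates squared gradients per row and scales each step by the inverse square root of that history. The same update on a dense tensor, as a rough NumPy sketch with illustrative defaults:

import numpy as np

def adagrad_step(weight, grad, history, lr=0.01, wd=0.0, eps=1e-6,
                 rescale_grad=1.0, clip_gradient=None):
    """Dense AdaGrad step mirroring the row-wise reference above (illustrative)."""
    g = grad * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    g = g + wd * weight
    history += g * g                           # accumulate squared gradients
    weight -= lr * g / (np.sqrt(history) + eps)
    return weight, history

w = np.ones((3, 4)); h = np.zeros((3, 4))
adagrad_step(w, np.full((3, 4), 0.5), h, lr=0.1)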
-
-    This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
-    learning rate method*, available at https://arxiv.org/abs/1212.5701.
-
-    This optimizer updates each weight by::
-
-        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
-        acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
-        cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
-        acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
-        weight -= (cur_delta + wd * weight)
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by :class:`.Optimizer`.
-
-    Parameters
-    ----------
-    rho: float
-        Decay rate for both squared gradients and delta.
-    epsilon : float
-        Small value to avoid division by 0.
-    """
-    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
-        super(PyAdaDelta, self).__init__(**kwargs)
-        self.rho = rho
-        self.epsilon = epsilon
-
-    def create_state(self, index, weight):
-        return (mx.nd.zeros(weight.shape, weight.context),
-                mx.nd.zeros(weight.shape, weight.context))
-
-    def update(self, index, weight, grad, state):
-        self._update_count(index)
-        wd = self._get_wd(index)
-
-        grad *= self.rescale_grad
-        if self.clip_gradient is not None:
-            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
+                   {'aggregate_num': 4}, {'aggregate_num': np.inf}]
+    for dtype in [np.float16, np.float32]:
+        for params in itertools.product(eps_options, cg_options,
+                                        rg_options, wd_options, agg_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if dtype is np.float16:
+                kwarg.update({'multi_precision': True})
+            compare_optimizer(opt1(use_fused_step=False, **kwarg),
+                              opt2(use_fused_step=True, **kwarg), shapes, dtype)
-        acc_grad, acc_delta = state
-        acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2
-        current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) /
-                         mx.nd.sqrt(acc_grad + self.epsilon)) * grad
-        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta ** 2
+@with_seed()
+def test_sparse_adagrad():
+    opt1 = PySparseAdaGrad
+    opt2 = mx.optimizer.AdaGrad
+    shapes = [(3, 4, 5), (10, 4), (7,)]
+    eps_options = [{}, {'epsilon': 1e-8}]
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.0}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
+                   {'aggregate_num': 4}, {'aggregate_num': np.inf}]
+    for dtype in [np.float16, np.float32]:
+        for params in itertools.product(eps_options, cg_options,
+                                        rg_options, wd_options, agg_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if dtype is np.float16:
+                kwarg.update({'multi_precision': True})
+            if kwarg.get('wd', 0.0) == 0.0:
+                compare_optimizer(opt1(**kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype,
+                                  w_stype='row_sparse', g_stype='row_sparse')
+                compare_optimizer(opt1(**kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype,
+                                  g_stype='row_sparse')
-        # update weight
-        weight[:] -= current_delta + wd * weight

 @with_seed()
 def test_adadelta():
-    opt1 = PyAdaDelta
+    opt1 = mx.optimizer.AdaDelta
     opt2 = mx.optimizer.AdaDelta
-    shape = (3, 4, 5)
+    shapes = [(3, 4, 5), (10, 4), (7,)]
     rho_options = [{'rho': 0.9}]
     eps_options = [{}, {'epsilon': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.0}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
+                   {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(rho_options, eps_options, cg_options,
-                                        rg_options, wd_options):
+                                        rg_options, wd_options, agg_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if dtype is np.float16:
+                kwarg.update({'multi_precision': True})
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype)
+
+
+@with_seed()
+def test_dcasgd():
+    opt1 = mx.optimizer.DCASGD
+    opt2 = mx.optimizer.DCASGD
+    shapes = [(3, 4, 5), (10, 4), (7,)]
+    lamda_options = [{}, {'lamda': 0.01}, {'lamda': 0.1}]
+    mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}]
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
+                   {'aggregate_num': 4}, {'aggregate_num': np.inf}]
+    for dtype in [np.float16, np.float32]:
+        for params in itertools.product(lamda_options, mom_options, cg_options,
+                                        rg_options, wd_options, agg_options):
             kwarg = {k: v for param in params for k, v in param.items()}
             if dtype is np.float16:
                 kwarg.update({'multi_precision': True})
-            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype)

 def test_factor_scheduler():
@@ -1353,6 +910,7 @@ def test_factor_scheduler():
     np.testing.assert_almost_equal(sched(201), base_lr * factor * factor)
     np.testing.assert_almost_equal(sched(1000), 1e-4)
+
 def test_multifactor_scheduler():
     base_lr = 0.1
     steps = [15, 25]
@@ -1368,6 +926,7 @@ def test_multifactor_scheduler():
     np.testing.assert_almost_equal(sched(26), base_lr * factor * factor)
     np.testing.assert_almost_equal(sched(100), base_lr * factor * factor)
+
 def test_poly_scheduler():
     base_lr = 3
     final_lr = 0
@@ -1382,6 +941,7 @@ def test_poly_scheduler():
     assert (poly_sched(500) < 1.6)
     np.testing.assert_almost_equal(poly_sched(steps), final_lr)
+
 def test_cosine_scheduler():
     # also tests case without warmup
     base_lr = 3
@@ -1392,6 +952,8 @@ def test_cosine_scheduler():
     np.testing.assert_almost_equal(cosine_sched(steps), final_lr)
     assert (cosine_sched(500) > 1.5)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
+
diff --git a/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st b/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st
index 6baec42951d5..1ace86d36f61 100644
--- a/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st
+++ b/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st
@@ -18,7 +18,7 @@
 !>
-gamma1 =
+rho =
 epsilon =
@@ -26,7 +26,7 @@ epsilon =
 optimizer_params={'learning_rate':base_lr<\\>
 , 'wd':wd<\\>
-, 'gamma1':gamma1<\\>
+, 'rho':rho<\\>
 , 'epsilon':epsilon}<\\>
 module.init_optimizer(optimizer='RMSProp', optimizer_params=optimizer_params)
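
Note on the PySparseAdaGrad reference added above: its row-wise update can be summarized, outside the patch, by the NumPy sketch below. The function name and toy usage are illustrative only and are not part of the patch or of MXNet's API; the update rule mirrors the steps in `step` (rescale, clip, add weight decay, accumulate the squared gradient, divide by sqrt(history) + epsilon), skipping rows whose gradient is all zeros so that row_sparse gradients leave untouched rows unchanged.

    import numpy as np

    def adagrad_row_update(weight, grad, history, lr, wd, epsilon=1e-6,
                           rescale_grad=1.0, clip_gradient=None):
        # Apply one AdaGrad step to every row that received a non-zero gradient.
        for row in range(weight.shape[0]):
            g = grad[row]
            if not np.any(g):
                continue  # untouched row: weight and history stay as they are
            g = g * rescale_grad
            if clip_gradient is not None:
                g = np.clip(g, -clip_gradient, clip_gradient)
            g = g + wd * weight[row]
            history[row] += np.square(g)   # accumulate squared gradient per row
            weight[row] -= lr * g / (np.sqrt(history[row]) + epsilon)
        return weight, history

    # toy usage: only the first row has a gradient, so only it is updated
    w = np.ones((2, 3), dtype=np.float32)
    h = np.zeros_like(w)
    g = np.zeros_like(w)
    g[0] = 0.5
    w, h = adagrad_row_update(w, g, h, lr=0.01, wd=0.0)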