diff --git a/CMakeLists.txt b/CMakeLists.txt index 483108a68419..000bbbf17ea5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -372,13 +372,13 @@ endif() # ---[ LAPack if(USE_LAPACK) + message("USE_LAPACK is ON") add_definitions(-DMXNET_USE_LAPACK=1) if (NOT MSVC) list(APPEND mxnet_LINKER_LIBS lapack) endif() endif() -message("USE LAPACK ${USE_LAPACK}") # ---[ jemalloc if(USE_JEMALLOC) diff --git a/Jenkinsfile b/Jenkinsfile index 6d21f496426e..9d7792066e37 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,6 +26,8 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a' // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' +// mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. 
+mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' // timeout in minutes @@ -233,6 +235,17 @@ try { } } }, + 'CPU: Openblas, debug': { + node('mxnetlinux-cpu') { + ws('workspace/build-cpu-openblas') { + timeout(time: max_time, unit: 'MINUTES') { + init_git() + docker_run('ubuntu_cpu', 'build_ubuntu_cpu_cmake_debug', false) + pack_lib('cpu_debug', mx_cmake_lib_debug) + } + } + } + }, 'CPU: Clang 3.9': { node('mxnetlinux-cpu') { ws('workspace/build-cpu-clang39') { @@ -378,28 +391,8 @@ try { ws('workspace/build-cpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { init_git_win() - bat """mkdir build_vc14_cpu - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_vc14_cpu - cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DUSE_MKL_IF_AVAILABLE=0 ${env.WORKSPACE}""" - bat 'C:\\mxnet\\build_vc14_cpu.bat' - - bat '''rmdir /s/q pkg_vc14_cpu - mkdir pkg_vc14_cpu\\lib - mkdir pkg_vc14_cpu\\python - mkdir pkg_vc14_cpu\\include - mkdir pkg_vc14_cpu\\build - copy build_vc14_cpu\\Release\\libmxnet.lib pkg_vc14_cpu\\lib - copy build_vc14_cpu\\Release\\libmxnet.dll pkg_vc14_cpu\\build - xcopy python pkg_vc14_cpu\\python /E /I /Y - xcopy include pkg_vc14_cpu\\include /E /I /Y - xcopy 3rdparty\\dmlc-core\\include pkg_vc14_cpu\\include /E /I /Y - xcopy 
3rdparty\\mshadow\\mshadow pkg_vc14_cpu\\include\\mshadow /E /I /Y - xcopy 3rdparty\\nnvm\\include pkg_vc14_cpu\\nnvm\\include /E /I /Y - del /Q *.7z - 7z.exe a vc14_cpu.7z pkg_vc14_cpu\\ - ''' - stash includes: 'vc14_cpu.7z', name: 'vc14_cpu' + powershell 'python ci/build_windows.py -f WIN_CPU' + stash includes: 'windows_package.7z', name: 'windows_package_cpu' } } } @@ -411,28 +404,9 @@ try { timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-gpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { - init_git_win() - bat """mkdir build_vc14_gpu - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_vc14_gpu - cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=52 -DCUDA_ARCH_PTX=52 -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release -DUSE_MKL_IF_AVAILABLE=0 ${env.WORKSPACE}""" - bat 'C:\\mxnet\\build_vc14_gpu.bat' - bat '''rmdir /s/q pkg_vc14_gpu - mkdir pkg_vc14_gpu\\lib - mkdir pkg_vc14_gpu\\python - mkdir pkg_vc14_gpu\\include - mkdir pkg_vc14_gpu\\build - copy build_vc14_gpu\\libmxnet.lib pkg_vc14_gpu\\lib - copy build_vc14_gpu\\libmxnet.dll pkg_vc14_gpu\\build - xcopy python pkg_vc14_gpu\\python /E /I /Y - xcopy include pkg_vc14_gpu\\include /E /I /Y - xcopy 3rdparty\\dmlc-core\\include pkg_vc14_gpu\\include /E /I /Y - xcopy 3rdparty\\mshadow\\mshadow pkg_vc14_gpu\\include\\mshadow /E /I /Y - xcopy 3rdparty\\nnvm\\include pkg_vc14_gpu\\nnvm\\include /E /I /Y - del /Q *.7z - 7z.exe a vc14_gpu.7z pkg_vc14_gpu\\ - ''' - stash includes: 'vc14_gpu.7z', name: 'vc14_gpu' + init_git_win() + powershell 'python ci/build_windows.py -f WIN_GPU' + stash includes: 'windows_package.7z', name: 'windows_package_gpu' } } } @@ -443,37 +417,9 @@ try { 
timeout(time: max_time, unit: 'MINUTES') { ws('workspace/build-gpu') { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0','BUILD_NAME=vc14_gpu_mkldnn']) { - init_git_win() - bat """mkdir build_%BUILD_NAME% - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_%BUILD_NAME% - copy ${env.WORKSPACE}\\3rdparty\\mkldnn\\config_template.vcxproj.user ${env.WORKSPACE}\\config_template.vcxproj.user /y - cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN=52 -DCUDA_ARCH_PTX=52 -DUSE_MKLDNN=1 -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}""" - bat ''' - call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" - cd build_%BUILD_NAME% - set /a cores=%NUMBER_OF_PROCESSORS% * 2 - jom -j %cores% - ''' - bat '''rmdir /s/q pkg_%BUILD_NAME% - mkdir pkg_%BUILD_NAME%\\lib - mkdir pkg_%BUILD_NAME%\\python - mkdir pkg_%BUILD_NAME%\\include - mkdir pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\libmxnet.lib pkg_%BUILD_NAME%\\lib - copy build_%BUILD_NAME%\\libmxnet.dll pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\3rdparty\\mkldnn\\src\\mkldnn.dll pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\libiomp5md.dll pkg_%BUILD_NAME%\\build - copy build_%BUILD_NAME%\\mklml.dll pkg_%BUILD_NAME%\\build - xcopy python pkg_%BUILD_NAME%\\python /E /I /Y - xcopy include pkg_%BUILD_NAME%\\include /E /I /Y - xcopy 3rdparty\\dmlc-core\\include pkg_%BUILD_NAME%\\include /E /I /Y - xcopy 3rdparty\\mshadow\\mshadow pkg_%BUILD_NAME%\\include\\mshadow /E /I /Y - xcopy 3rdparty\\nnvm\\include pkg_%BUILD_NAME%\\nnvm\\include /E /I /Y - del /Q *.7z - 7z.exe a %BUILD_NAME%.7z pkg_%BUILD_NAME%\\ - ''' - stash includes: 
'vc14_gpu_mkldnn.7z', name: 'vc14_gpu_mkldnn' + init_git_win() + powershell 'python ci/build_windows.py -f WIN_GPU_MKLDNN' + stash includes: 'windows_package.7z', name: 'windows_package_gpu_mkldnn' } } } @@ -574,6 +520,20 @@ try { } } }, + 'Python3: CPU debug': { + node('mxnetlinux-cpu') { + ws('workspace/ut-python3-cpu-debug') { + try { + init_git() + unpack_lib('cpu_debug', mx_cmake_lib_debug) + python3_ut('ubuntu_cpu') + } finally { + collect_test_results_unix('nosetests_unittest.xml', 'nosetests_python3_cpu_debug_unittest.xml') + collect_test_results_unix('nosetests_quantization.xml', 'nosetests_python3_cpu_debug_quantization.xml') + } + } + } + }, 'Python2: GPU': { node('mxnetlinux-gpu') { ws('workspace/ut-python2-gpu') { @@ -843,16 +803,8 @@ try { ws('workspace/ut-python-cpu') { try { init_git_win() - unstash 'vc14_cpu' - bat '''rmdir /s/q pkg_vc14_cpu - 7z x -y vc14_cpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 - pip install mock - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc - C:\\mxnet\\test_cpu.bat""" + unstash 'windows_package_cpu' + powershell 'ci/windows/test_py2_cpu.ps1' } finally { collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python2_cpu.xml') } @@ -866,15 +818,8 @@ try { ws('workspace/ut-python-cpu') { try { init_git_win() - unstash 'vc14_cpu' - bat '''rmdir /s/q pkg_vc14_cpu - 7z x -y vc14_cpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc - C:\\mxnet\\test_cpu.bat""" + unstash 'windows_package_cpu' + powershell 'ci/windows/test_py3_cpu.ps1' } finally { collect_test_results_windows('nosetests_unittest.xml', 'nosetests_unittest_windows_python3_cpu.xml') } @@ -888,19 +833,11 @@ try { 
ws('workspace/ut-python-gpu') { try { init_git_win() - unstash 'vc14_gpu' - bat '''rmdir /s/q pkg_vc14_gpu - 7z x -y vc14_gpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 - pip install mock - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc - C:\\mxnet\\test_gpu.bat""" + unstash 'windows_package_gpu' + powershell 'ci/windows/test_py2_gpu.ps1' } finally { - collect_test_results_windows('nosetests_gpu_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') - collect_test_results_windows('nosetests_gpu_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') + collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python2_gpu.xml') + collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python2_gpu.xml') } } } @@ -912,18 +849,11 @@ try { ws('workspace/ut-python-gpu') { try { init_git_win() - unstash 'vc14_gpu' - bat '''rmdir /s/q pkg_vc14_gpu - 7z x -y vc14_gpu.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc - C:\\mxnet\\test_gpu.bat""" + unstash 'windows_package_gpu' + powershell 'ci/windows/test_py3_gpu.ps1' } finally { - collect_test_results_windows('nosetests_gpu_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') - collect_test_results_windows('nosetests_gpu_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') + collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu.xml') + collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu.xml') } } } @@ -935,18 +865,11 @@ try { ws('workspace/ut-python-gpu') { try { init_git_win() - unstash 'vc14_gpu_mkldnn' - bat '''rmdir /s/q 
pkg_vc14_gpu_mkldnn - 7z x -y vc14_gpu_mkldnn.7z''' - bat """xcopy C:\\mxnet\\data data /E /I /Y - xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 - set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python - del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu_mkldnn\\python\\*.pyc - C:\\mxnet\\test_gpu.bat""" + unstash 'windows_package_gpu_mkldnn' + powershell 'ci/windows/test_py3_gpu.ps1' } finally { - collect_test_results_windows('nosetests_gpu_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') - collect_test_results_windows('nosetests_gpu_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') + collect_test_results_windows('nosetests_forward.xml', 'nosetests_gpu_forward_windows_python3_gpu_mkldnn.xml') + collect_test_results_windows('nosetests_operator.xml', 'nosetests_gpu_operator_windows_python3_gpu_mkldnn.xml') } } } diff --git a/R-package/R/mlp.R b/R-package/R/mlp.R index ecc30999d1c2..aa510d103f53 100644 --- a/R-package/R/mlp.R +++ b/R-package/R/mlp.R @@ -8,7 +8,7 @@ #' @param activation either a single string or a vector containing the names of the activation functions. #' @param out_activation a single string containing the name of the output activation function. #' @param ctx whether train on cpu (default) or gpu. -#' @param eval_metric the evaluation metric/ +#' @param eval.metric the evaluation metric/ #' @param ... 
other parameters passing to \code{mx.model.FeedForward.create}/ #' #' @examples diff --git a/ci/build.py b/ci/build.py index 09f2d4709bdd..a9d6a63537f2 100755 --- a/ci/build.py +++ b/ci/build.py @@ -39,6 +39,7 @@ from itertools import chain from subprocess import call, check_call from typing import * +from util import * CCACHE_MAXSIZE = '500G' @@ -138,24 +139,9 @@ def _get_local_image_id(docker_binary, docker_tag): return image_id -def get_mxnet_root() -> str: - curpath = os.path.abspath(os.path.dirname(__file__)) - - def is_mxnet_root(path: str) -> bool: - return os.path.exists(os.path.join(path, ".mxnet_root")) - - while not is_mxnet_root(curpath): - parent = os.path.abspath(os.path.join(curpath, os.pardir)) - if parent == curpath: - raise RuntimeError("Got to the root and couldn't find a parent folder with .mxnet_root") - curpath = parent - return curpath - - def buildir() -> str: return os.path.join(get_mxnet_root(), "build") - def default_ccache_dir() -> str: # Share ccache across containers if 'CCACHE_DIR' in os.environ: diff --git a/ci/build_windows.py b/ci/build_windows.py new file mode 100755 index 000000000000..5eca58db7b74 --- /dev/null +++ b/ci/build_windows.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""User friendly / multi platform builder script""" + +import subprocess +import logging +import os +import tempfile +import sys +from distutils import spawn +import logging +from subprocess import check_call +import platform +import argparse +from util import * +import json +from enum import Enum +import time +import datetime +import shutil +import glob +from distutils.dir_util import copy_tree + +KNOWN_VCVARS = [ + # VS 2015 + r'C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64\vcvarsx86_amd64.bat' + # VS 2017 + , r'c:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat' +] + +class BuildFlavour(Enum): + WIN_CPU = 'WIN_CPU' + WIN_CPU_MKLDNN = 'WIN_CPU_MKLDNN' + WIN_GPU = 'WIN_GPU' + WIN_GPU_MKLDNN = 'WIN_GPU_MKLDNN' + +CMAKE_FLAGS = { + 'WIN_CPU': '-DUSE_CUDA=0 \ + -DUSE_CUDNN=0 \ + -DUSE_NVRTC=0 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DBUILD_CPP_EXAMPLES=1 \ + -DUSE_MKL_IF_AVAILABLE=0' + + ,'WIN_CPU_MKLDNN': '-DUSE_CUDA=0 \ + -DUSE_CUDNN=0 \ + -DUSE_NVRTC=0 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DUSE_MKL_IF_AVAILABLE=1' + + ,'WIN_GPU': '-DUSE_CUDA=1 \ + -DUSE_CUDNN=1 \ + -DUSE_NVRTC=1 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DCUDA_ARCH_NAME=Manual \ + -DCUDA_ARCH_BIN=52 \ + -DCUDA_ARCH_PTX=52 \ + -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" \ + -DUSE_MKL_IF_AVAILABLE=0 \ + -DCMAKE_BUILD_TYPE=Release' + + ,'WIN_GPU_MKLDNN': '-DUSE_CUDA=1 \ + -DUSE_CUDNN=1 \ + -DUSE_NVRTC=1 \ + -DUSE_OPENCV=1 \ + -DUSE_OPENMP=1 \ + -DUSE_PROFILER=1 \ + -DUSE_BLAS=open \ + -DUSE_LAPACK=1 \ + -DUSE_DIST_KVSTORE=0 \ + -DCUDA_ARCH_NAME=Manual \ + 
-DCUDA_ARCH_BIN=52 \ + -DCUDA_ARCH_PTX=52 \ + -DUSE_MKLDNN=1 \ + -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 \ + /DNDEBUG" \ + -DCMAKE_BUILD_TYPE=Release' + +} + + +def get_vcvars_environment(architecture, vcvars): + """ + Returns a dictionary containing the environment variables set up by vcvars + """ + result = None + python = sys.executable + + vcvars_list = [vcvars] + vcvars_list.extend(KNOWN_VCVARS) + for vcvars in vcvars_list: + if os.path.isfile(vcvars): + process = subprocess.Popen('("%s" %s>nul) && "%s" -c "import os; import json; print(json.dumps(dict(os.environ)))"' % (vcvars, architecture, python), stdout=subprocess.PIPE, shell=True) + stdout, stderr = process.communicate() + exitcode = process.wait() + if exitcode == 0: + logging.info("Using build environment from: %s", vcvars) + return(json.loads(stdout.strip())) + else: + raise RuntimeError('Failed cloning environment from vcvars file: %s stdout: %s stderr: %s', vcvars, stdout, stderr) + raise RuntimeError('Couldn\'t find vcvars batch file: %s', vcvars) + + +def windows_build(args): + vcvars_env = get_vcvars_environment(args.arch, args.vcvars) + logging.debug("vcvars environment: %s", vcvars_env) + os.environ.update(vcvars_env) + + path = args.output + os.makedirs(path, exist_ok=True) + mxnet_root = get_mxnet_root() + logging.info("Found mxnet root: {}".format(mxnet_root)) + with remember_cwd(): + os.chdir(path) + logging.info("Generating project with CMake") + check_call("cmake -G \"Visual Studio 14 2015 Win64\" {} {}".format(CMAKE_FLAGS[args.flavour], mxnet_root), shell=True) + logging.info("Building with visual studio") + t0 = int(time.time()) + check_call(["msbuild", "mxnet.sln","/p:configuration=release;platform=x64", "/maxcpucount","/v:minimal"]) + logging.info("Build flavour: %s complete in directory: \"%s\"", args.flavour, os.path.abspath(path)) + logging.info("Build took %s" , datetime.timedelta(seconds=int(time.time()-t0))) + windows_package(args) + +def windows_package(args): + pkgfile = 
'windows_package.7z' + pkgdir = os.path.abspath('windows_package') + logging.info("Packaging libraries and headers in package: %s", pkgfile) + j = os.path.join + pkgdir_lib = os.path.abspath(j(pkgdir, 'lib')) + with remember_cwd(): + os.chdir(args.output) + logging.info("Looking for static libraries and dlls in: \"%s", os.getcwd()) + libs = list(glob.iglob('**/*.lib', recursive=True)) + dlls = list(glob.iglob('**/*.dll', recursive=True)) + os.makedirs(pkgdir_lib, exist_ok=True) + for lib in libs: + logging.info("packing lib: %s", lib) + shutil.copy(lib, pkgdir_lib) + for dll in dlls: + logging.info("packing dll: %s", dll) + shutil.copy(dll, pkgdir_lib) + os.chdir(get_mxnet_root()) + logging.info('packing python bindings') + copy_tree('python', j(pkgdir, 'python')) + logging.info('packing headers') + copy_tree('include', j(pkgdir, 'include')) + copy_tree(j('3rdparty','dmlc-core','include'), j(pkgdir, 'include')) + copy_tree(j('3rdparty','mshadow', 'mshadow'), j(pkgdir, 'include', 'mshadow')) + copy_tree(j('3rdparty','tvm','nnvm', 'include'), j(pkgdir,'include', 'nnvm', 'include')) + logging.info("Compressing package: %s", pkgfile) + check_call(['7z', 'a', pkgfile, pkgdir]) + + +def nix_build(args): + path = args.output + os.makedirs(path, exist_ok=True) + with remember_cwd(): + os.chdir(path) + logging.info("Generating project with CMake") + check_call("cmake \ + -DUSE_CUDA=OFF \ + -DUSE_OPENCV=OFF \ + -DUSE_OPENMP=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + -GNinja ..", shell=True) + check_call("ninja", shell=True) + +def main(): + logging.getLogger().setLevel(logging.INFO) + logging.basicConfig(format='%(asctime)-15s %(message)s') + logging.info("MXNet Windows build helper") + + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output", + help="output directory", + default='build', + type=str) + + parser.add_argument("--vcvars", + help="vcvars batch file location, typically inside vs studio install dir", + default=r'c:\Program Files (x86)\Microsoft Visual 
Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat', + type=str) + + parser.add_argument("--arch", + help="architecture", + default='x64', + type=str) + + parser.add_argument("-f", "--flavour", + help="build flavour", + default='WIN_CPU', + choices=[x.name for x in BuildFlavour], + type=str) + + args = parser.parse_args() + logging.info("Build flavour: %s", args.flavour) + + system = platform.system() + if system == 'Windows': + logging.info("Detected Windows platform") + if 'OpenBLAS_HOME' not in os.environ: + os.environ["OpenBLAS_HOME"] = "C:\\mxnet\\openblas" + if 'OpenCV_DIR' not in os.environ: + os.environ["OpenCV_DIR"] = "C:\\mxnet\\opencv_vc14" + if 'CUDA_PATH' not in os.environ: + os.environ["CUDA_PATH"] = "C:\\CUDA\\v8.0" + windows_build(args) + + elif system == 'Linux' or system == 'Darwin': + nix_build(args) + + else: + logging.error("Don't know how to build for {} yet".format(platform.system())) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 52a2650a1cc4..1c861beb916c 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -110,23 +110,6 @@ build_jetson() { popd } -report_ccache_usage() { - set -ex - pushd . - - # Show global ccache summary at the end of each run. - ccache -s - if [ -e $CCACHE_LOGFILE ] - then - # Display local ccache log, excluding some overly verbose output. - cat $CCACHE_LOGFILE | grep -v "Config:" | grep -v "stats.lock" - else - echo "No ccache log found." 
- fi - - popd -} - # # ARM builds # @@ -159,7 +142,6 @@ build_armv6() { -G Ninja /work/mxnet ninja -v - report_ccache_usage build_wheel popd } @@ -191,7 +173,6 @@ build_armv7() { -G Ninja /work/mxnet ninja -v - report_ccache_usage build_wheel popd } @@ -210,7 +191,6 @@ build_armv8() { -DUSE_MKL_IF_AVAILABLE=OFF\ -G Ninja /work/mxnet ninja -v - report_ccache_usage build_wheel } @@ -237,7 +217,6 @@ build_android_armv7() { -DUSE_MKL_IF_AVAILABLE=OFF\ -G Ninja /work/mxnet ninja -v - report_ccache_usage } build_android_armv8() { @@ -270,8 +249,6 @@ build_centos7_cpu() { USE_BLAS=openblas \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_amzn_linux_cpu() { @@ -289,7 +266,6 @@ build_amzn_linux_cpu() { -DUSE_DIST_KVSTORE=ON\ -G Ninja /work/mxnet ninja -v - report_ccache_usage } @@ -306,8 +282,6 @@ build_centos7_mkldnn() { USE_MKLDNN=1 \ USE_BLAS=openblas \ -j$(nproc) - - report_ccache_usage } build_centos7_gpu() { @@ -341,26 +315,38 @@ build_ubuntu_cpu_openblas() { USE_BLAS=openblas \ USE_DIST_KVSTORE=1 \ -j$(nproc) +} + +build_ubuntu_cpu_cmake_debug() { + set -ex + pushd . 
+ cd /work/build + cmake \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DUSE_CUDA=OFF \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_OPENMP=OFF \ + -DUSE_OPENCV=ON \ + -DCMAKE_BUILD_TYPE=Debug \ + -G Ninja \ + /work/mxnet - report_ccache_usage + ninja -v + popd } build_ubuntu_cpu_clang39() { set -ex - - export CXX=clang++-3.9 + export CXX=clang++-3.9 export CC=clang-3.9 - - build_ccache_wrappers - - make \ + build_ccache_wrappers + make \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ USE_OPENMP=0 \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_clang50() { @@ -377,8 +363,6 @@ build_ubuntu_cpu_clang50() { USE_OPENMP=1 \ USE_DIST_KVSTORE=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_clang39_mkldnn() { @@ -395,8 +379,6 @@ build_ubuntu_cpu_clang39_mkldnn() { USE_MKLDNN=1 \ USE_OPENMP=0 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_clang50_mkldnn() { @@ -413,8 +395,6 @@ build_ubuntu_cpu_clang50_mkldnn() { USE_MKLDNN=1 \ USE_OPENMP=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_cpu_mkldnn() { @@ -428,8 +408,6 @@ build_ubuntu_cpu_mkldnn() { USE_BLAS=openblas \ USE_MKLDNN=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_gpu() { @@ -450,8 +428,6 @@ build_ubuntu_gpu_mkldnn() { USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_gpu_mkldnn_nocudnn() { @@ -467,8 +443,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() { USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=0 \ -j$(nproc) - - report_ccache_usage } build_ubuntu_gpu_cuda91_cudnn7() { @@ -515,7 +489,6 @@ build_ubuntu_gpu_cmake_mkldnn() { /work/mxnet ninja -v - report_ccache_usage # libmkldnn.so.0 is a link file. We need an actual binary file named libmkldnn.so.0. 
cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0 @@ -537,7 +510,6 @@ build_ubuntu_gpu_cmake() { /work/mxnet ninja -v - report_ccache_usage } @@ -759,6 +731,7 @@ integrationtest_ubuntu_cpu_dist_kvstore() { ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --no-multiprecision ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=compressed_cpu ../../tools/launch.py -n 7 --launcher local python dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision + ../../tools/launch.py -n 3 --launcher local python test_server_profiling.py } integrationtest_ubuntu_gpu_scala() { diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 6637ec377165..7a6d1106d38d 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -31,7 +31,6 @@ import subprocess import json import build as build_util -from joblib import Parallel, delayed @@ -43,6 +42,7 @@ def build_save_containers(platforms, registry, load_cache) -> int: :param load_cache: Load cache before building :return: 1 if error occurred, 0 otherwise """ + from joblib import Parallel, delayed if len(platforms) == 0: return 0 diff --git a/ci/util.py b/ci/util.py new file mode 100644 index 000000000000..22631f30435f --- /dev/null +++ b/ci/util.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import contextlib + +def get_mxnet_root() -> str: + curpath = os.path.abspath(os.path.dirname(__file__)) + + def is_mxnet_root(path: str) -> bool: + return os.path.exists(os.path.join(path, ".mxnet_root")) + + while not is_mxnet_root(curpath): + parent = os.path.abspath(os.path.join(curpath, os.pardir)) + if parent == curpath: + raise RuntimeError("Got to the root and couldn't find a parent folder with .mxnet_root") + curpath = parent + return curpath + +@contextlib.contextmanager +def remember_cwd(): + ''' + Restore current directory when exiting context + ''' + curdir = os.getcwd() + try: yield + finally: os.chdir(curdir) + + diff --git a/ci/windows/test_py2_cpu.ps1 b/ci/windows/test_py2_cpu.ps1 new file mode 100644 index 000000000000..1623d2956103 --- /dev/null +++ b/ci/windows/test_py2_cpu.ps1 @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_train.xml tests\python\train +if (! $?) { Throw ("Error running train tests") } diff --git a/ci/windows/test_py2_gpu.ps1 b/ci/windows/test_py2_gpu.ps1 new file mode 100644 index 000000000000..13cd5366e0db --- /dev/null +++ b/ci/windows/test_py2_gpu.ps1 @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py2\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) 
{ Throw ("Error running unittest") } +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py2\python.exe -m nose -v --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py2\python.exe -m nose -v tests\python\train +if (! $?) { Throw ("Error running tests") } diff --git a/ci/windows/test_py3_cpu.ps1 b/ci/windows/test_py3_cpu.ps1 new file mode 100644 index 000000000000..98d4e410e8f5 --- /dev/null +++ b/ci/windows/test_py3_cpu.ps1 @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_train.xml tests\python\train +if (! $?) 
{ Throw ("Error running train tests") } diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1 new file mode 100644 index 000000000000..b94b4f389be8 --- /dev/null +++ b/ci/windows/test_py3_gpu.ps1 @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z +$env:PYTHONPATH=join-path $pwd.Path windows_package\python +$env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 +c:\Anaconda3\envs\py3\Scripts\pip install -r tests\requirements.txt +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_unittest.xml tests\python\unittest +if (! $?) { Throw ("Error running unittest") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_operator.xml tests\python\gpu\test_operator_gpu.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_forward.xml tests\python\gpu\test_forward.py +if (! $?) { Throw ("Error running tests") } +c:\Anaconda3\envs\py3\python.exe -m nose -v --with-xunit --xunit-file nosetests_train.xml tests\python\train +if (! $?) 
{ Throw ("Error running tests") } diff --git a/contrib/clojure-package/examples/scripts/get_cifar_data.sh b/contrib/clojure-package/examples/scripts/get_cifar_data.sh index 372c7bb5781e..12b3770c2700 100755 --- a/contrib/clojure-package/examples/scripts/get_cifar_data.sh +++ b/contrib/clojure-package/examples/scripts/get_cifar_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/examples/scripts/get_mnist_data.sh b/contrib/clojure-package/examples/scripts/get_mnist_data.sh index 6f32b85f480b..703ece207a1f 100755 --- a/contrib/clojure-package/examples/scripts/get_mnist_data.sh +++ b/contrib/clojure-package/examples/scripts/get_mnist_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/scripts/get_cifar_data.sh b/contrib/clojure-package/scripts/get_cifar_data.sh index 372c7bb5781e..12b3770c2700 100755 --- a/contrib/clojure-package/scripts/get_cifar_data.sh +++ b/contrib/clojure-package/scripts/get_cifar_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/scripts/get_mnist_data.sh b/contrib/clojure-package/scripts/get_mnist_data.sh index 6f32b85f480b..703ece207a1f 100755 --- a/contrib/clojure-package/scripts/get_mnist_data.sh +++ b/contrib/clojure-package/scripts/get_mnist_data.sh @@ -20,8 +20,8 @@ set -evx -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! 
-z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj index 22ab761547e2..ab6d345fe91d 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj @@ -309,7 +309,6 @@ (defn load-checkpoint "Create a model from previously saved checkpoint. - - mod module - opts map of - prefix Path prefix of saved model files. You should have prefix-symbol.json, prefix-xxxx.params, and optionally prefix-xxxx.states, @@ -341,7 +340,7 @@ (util/->option (when workload-list (util/vec->indexed-seq workload-list))) (util/->option (when fixed-param-names (util/vec->set fixed-param-names))))) ([prefix epoch] - (load-checkpoint mod {:prefix prefix :epoch epoch}))) + (load-checkpoint {:prefix prefix :epoch epoch}))) (defn load-optimizer-states [mod fname] (.mod load fname)) @@ -670,4 +669,3 @@ (fit-params {:allow-missing true}) (fit-params {})) - diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj index f3d4e75e8c97..0f71b5a850cc 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj @@ -101,13 +101,20 @@ (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})}) (m/update) (m/save-checkpoint {:prefix "test" :epoch 0 :save-opt-states true})) - (let [mod2 (m/load-checkpoint {:prefix "test" :epoch 0 :load-optimizer-states true})] (-> mod2 (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]}) (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})})) - (is (= (-> mod m/symbol sym/to-json) (-> mod2 m/symbol sym/to-json))) - (is (= (-> mod m/params first) (-> mod2 m/params 
first)))))) + (is (= (-> mod m/symbol sym/to-json) (-> mod2 m/symbol sym/to-json))) + (is (= (-> mod m/params first) (-> mod2 m/params first)))) + ;; arity 2 version of above. `load-optimizer-states` is `false` here by default, + ;; but optimizer states aren't checked here so it's not relevant to the test outcome. + (let [mod3 (m/load-checkpoint "test" 0)] + (-> mod3 + (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]}) + (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})})) + (is (= (-> mod m/symbol sym/to-json) (-> mod3 m/symbol sym/to-json))) + (is (= (-> mod m/params first) (-> mod3 m/params first)))))) (deftest test-module-save-load-multi-device (let [s (sym/variable "data") @@ -321,4 +328,3 @@ (comment (m/data-shapes x)) - diff --git a/docs/api/python/contrib/onnx.md b/docs/api/python/contrib/onnx.md index 3fe2048001fc..d7c34ec1e01f 100644 --- a/docs/api/python/contrib/onnx.md +++ b/docs/api/python/contrib/onnx.md @@ -13,7 +13,7 @@ With ONNX format support for MXNet, developers can build and train models with a ``` ### Installation Instructions -- To use this module developers need to **install ONNX**, which requires the protobuf compiler to be installed separately. Please follow the [instructions to install ONNX and its dependencies](https://github.com/onnx/onnx#installation). **MXNet currently supports ONNX v1.1.1**. Once installed, you can go through the tutorials on how to use this module. +- To use this module developers need to **install ONNX**, which requires the protobuf compiler to be installed separately. Please follow the [instructions to install ONNX and its dependencies](https://github.com/onnx/onnx#installation). **MXNet currently supports ONNX v1.2.1**. Once installed, you can go through the tutorials on how to use this module. This document describes all the ONNX-MXNet APIs. @@ -24,6 +24,7 @@ 
mxnet.contrib.onnx.import_model mxnet.contrib.onnx.get_model_metadata + mxnet.contrib.onnx.import_to_gluon mxnet.contrib.onnx.export_model ``` @@ -49,10 +50,10 @@ This document describes all the ONNX-MXNet APIs. ```eval_rst -.. automodule:: mxnet.contrib.onnx - :members: import_model - :members: get_model_metadata - :members: export_model +.. automodule:: mxnet.contrib.onnx.import_model +.. automodule:: mxnet.contrib.onnx.get_model_metadata +.. automodule:: mxnet.contrib.onnx.import_to_gluon +.. automodule:: mxnet.contrib.onnx.export_model ``` diff --git a/docs/community/ecosystem.md b/docs/community/ecosystem.md index 5ca6d7a0b479..54f8c8993ea9 100644 --- a/docs/community/ecosystem.md +++ b/docs/community/ecosystem.md @@ -57,7 +57,7 @@ Community contributions to MXNet have added many new valuable features and funct ## Model Serving -* [MXNet Model Server (MMS)](https://github.com/apache/incubator-mxnet/tree/master/example/model-server/mms.md) - simple yet scalable solution for model inference. +* [MXNet Model Server (MMS)](https://github.com/awslabs/mxnet-model-server) - simple yet scalable solution for model inference. 
## Model Zoos diff --git a/docs/community/mxnet_channels.md b/docs/community/mxnet_channels.md index ef3963f7dabc..18dc1bc55ec8 100644 --- a/docs/community/mxnet_channels.md +++ b/docs/community/mxnet_channels.md @@ -2,7 +2,7 @@ Converse with the MXNet community via the following channels: -- [Forum](https://discuss.mxnet.io/): [discuss.mxnet.io](discuss.mxnet.io) +- [Forum](https://discuss.mxnet.io/): [discuss.mxnet.io](https://discuss.mxnet.io/) - [MXNet Apache developer mailing list](https://lists.apache.org/list.html?dev@mxnet.apache.org) (dev@mxnet.apache.org): To subscribe, send an email to dev-subscribe@mxnet.apache.org - [MXNet Apache user mailing list](https://lists.apache.org/list.html?user@mxnet.apache.org) (user@mxnet.apache.org): To subscribe, send an email to user-subscribe@mxnet.apache.org - [MXNet Slack channel](https://apache-mxnet.slack.com): To request an invitation to the channel please subscribe to the mailing list above and then email: dev@mxnet.apache.org diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 881bc14fdc89..6e9a3594168f 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -152,6 +152,10 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'``` - The repository url to be used for Gluon datasets and pre-trained models. +* MXNET_HOME + - Data directory in the filesystem for storage, for example when downloading gluon models. + - Default is `~/.mxnet` on *nix, and `%APPDATA%\mxnet` on Windows. + Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/docs/install/index.md b/docs/install/index.md index d4704df2ee7b..57c50eb9bb06 100644 --- a/docs/install/index.md +++ b/docs/install/index.md @@ -1784,7 +1784,7 @@ Next, we install the ```graphviz``` library that we use for visualizing network
Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/windows/). -You can [build MXNet-R from source](windows_setup.html#install-the-mxnet-package-for-r), or you can use a pre-built binary: +You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary: ```r cran <- getOption("repos") @@ -1797,14 +1797,15 @@ install.packages("mxnet")
-You can [build MXNet-R from source](windows_setup.html#install-the-mxnet-package-for-r), or you can use a pre-built binary: +You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for-r), or you can use a pre-built binary: ```r -cran <- getOption("repos") -cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU" -options(repos = cran) -install.packages("mxnet") + cran <- getOption("repos") + cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92" + options(repos = cran) + install.packages("mxnet") ``` +Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md index 9d03474b5949..40ddeb8182d8 100755 --- a/docs/install/windows_setup.md +++ b/docs/install/windows_setup.md @@ -218,11 +218,11 @@ For GPU package: ```r cran <- getOption("repos") - cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cuX" + cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/GPU/cu92" options(repos = cran) install.packages("mxnet") ``` -Change X to 80,90,91 or 92 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA. +Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA. #### Building MXNet from Source Code(GPU) After you have installed above software, continue with the following steps to build MXNet-R: 1. Clone the MXNet github repo. diff --git a/docs/settings.ini b/docs/settings.ini index 8459486c684c..b438a071f708 100644 --- a/docs/settings.ini +++ b/docs/settings.ini @@ -4,7 +4,7 @@ build_mxnet = 0 [document_sets_default] clojure_docs = 1 doxygen_docs = 1 -r_docs = 1 +r_docs = 0 scala_docs = 1 [document_sets_1.2.0] diff --git a/docs/tutorials/scala/index.md b/docs/tutorials/scala/index.md index cd9b2e219fcc..f14337f90f08 100644 --- a/docs/tutorials/scala/index.md +++ b/docs/tutorials/scala/index.md @@ -6,8 +6,8 @@ Using MXNet-Scala is easiest with Maven. You have a couple of options for settin **Note:** Windows is not yet supported. 
-* [MXNet-Scala Setup Guide Using Maven](../install/scala_setup.html) -* [Setup Scala with MXNet and Create a MXNet-Scala Project with IntelliJ](mxnet_scala_on_intellij.html) +* [MXNet-Scala Setup Guide Using Maven](../../install/scala_setup.md) +* [Setup Scala with MXNet and Create a MXNet-Scala Project with IntelliJ](mxnet_scala_on_intellij.md) ## Tutorials diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 67cda78172b6..b3b13053addf 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -135,6 +135,12 @@ def add_fit_args(parser): help='the epochs to ramp-up lr to scaled large-batch value') train.add_argument('--warmup-strategy', type=str, default='linear', help='the ramping-up strategy for large batch sgd') + train.add_argument('--profile-worker-suffix', type=str, default='', + help='profile workers actions into this file. During distributed training\ + filename saved will be rank1_ followed by this suffix') + train.add_argument('--profile-server-suffix', type=str, default='', + help='profile server actions into a file with name like rank1_ followed by this suffix \ + during distributed training') return train @@ -150,6 +156,17 @@ def fit(args, network, data_loader, **kwargs): if args.gc_type != 'none': kv.set_gradient_compression({'type': args.gc_type, 'threshold': args.gc_threshold}) + if args.profile_server_suffix: + mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server') + mx.profiler.set_state(state='run', profile_process='server') + + if args.profile_worker_suffix: + if kv.num_workers > 1: + filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix + else: + filename = args.profile_worker_suffix + mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker') + mx.profiler.set_state(state='run', profile_process='worker') # logging head = '%(asctime)-15s Node[' + 
str(kv.rank) + '] %(message)s' @@ -180,7 +197,6 @@ def fit(args, network, data_loader, **kwargs): logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i, args.disp_batches * args.batch_size / (time.time() - tic)) tic = time.time() - return # load model @@ -314,3 +330,8 @@ def fit(args, network, data_loader, **kwargs): epoch_end_callback=checkpoint, allow_missing=True, monitor=monitor) + + if args.profile_server_suffix: + mx.profiler.set_state(state='run', profile_process='server') + if args.profile_worker_suffix: + mx.profiler.set_state(state='run', profile_process='worker') \ No newline at end of file diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 75147cfd706d..6bbe9dfe8f0a 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -230,7 +230,19 @@ MXNET_DLL int MXRandomSeedContext(int seed, int dev_type, int dev_id); MXNET_DLL int MXNotifyShutdown(); /*! - * \brief Set up configuration of profiler + * \brief Set up configuration of profiler for the process passed as profile_process in keys + * \param num_params Number of parameters + * \param keys array of parameter keys + * \param vals array of parameter values + * \param kvstoreHandle handle to kvstore + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXSetProcessProfilerConfig(int num_params, const char* const* keys, + const char* const* vals, + KVStoreHandle kvstoreHandle); + +/*! + * \brief Set up configuration of profiler for worker/current process * \param num_params Number of parameters * \param keys array of parameter keys * \param vals array of parameter values @@ -239,7 +251,21 @@ MXNET_DLL int MXNotifyShutdown(); MXNET_DLL int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals); /*! 
- * \brief Set up state of profiler + * \brief Set up state of profiler for either worker or server process + * \param state indicate the working state of profiler, + * profiler not running when state == 0, + * profiler running when state == 1 + * \param profile_process an int, + * when 0 command is for worker/current process, + * when 1 command is for server process + * \param kvstoreHandle handle to kvstore, needed for server process profiling + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXSetProcessProfilerState(int state, int profile_process, + KVStoreHandle kvStoreHandle); + +/*! + * \brief Set up state of profiler for current process * \param state indicate the working state of profiler, * profiler not running when state == 0, * profiler running when state == 1 @@ -250,11 +276,22 @@ MXNET_DLL int MXSetProfilerState(int state); /*! * \brief Save profile and stop profiler * \param finished true if stat output should stop after this point + * \param profile_process an int, + * when 0 command is for worker/current process, + * when 1 command is for server process + * \param kvstoreHandle handle to kvstore * \return 0 when success, -1 when failure happens. */ -MXNET_DLL int MXDumpProfile(int finished); +MXNET_DLL int MXDumpProcessProfile(int finished, int profile_process, KVStoreHandle kvStoreHandle); +/*! + * \brief Save profile and stop profiler for worker/current process + * \param finished true if stat output should stop after this point + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXDumpProfile(int finished); + /*! * \brief Print aggregate stats to the a string * \param out_str Will receive a pointer to the output string @@ -267,6 +304,16 @@ MXNET_DLL int MXAggregateProfileStatsPrint(const char **out_str, int reset); /*! * \brief Pause profiler tuning collection * \param paused If nonzero, profiling pauses. 
Otherwise, profiling resumes/continues + * \param profile_process integer which denotes whether to process worker or server process + * \param kvstoreHandle handle to kvstore + * \return 0 when success, -1 when failure happens. + * \note pausing and resuming is global and not recursive + */ +MXNET_DLL int MXProcessProfilePause(int paused, int profile_process, KVStoreHandle kvStoreHandle); + +/*! + * \brief Pause profiler tuning collection for worker/current process + * \param paused If nonzero, profiling pauses. Otherwise, profiling resumes/continues * \return 0 when success, -1 when failure happens. * \note pausing and resuming is global and not recursive */ @@ -2145,8 +2192,7 @@ typedef void (MXKVStoreServerController)(int head, void *controller_handle); /** - * \return Run as server (or scheduler) - * + * \brief Run as server (or scheduler) * \param handle handle to the KVStore * \param controller the user-defined server controller * \param controller_handle helper handle for implementing controller @@ -2157,8 +2203,7 @@ MXNET_DLL int MXKVStoreRunServer(KVStoreHandle handle, void *controller_handle); /** - * \return Send a command to all server nodes - * + * \brief Send a command to all server nodes * \param handle handle to the KVStore * \param cmd_id the head of the command * \param cmd_body the body of the command diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h index e10bd213aa26..a73d96356132 100644 --- a/include/mxnet/kvstore.h +++ b/include/mxnet/kvstore.h @@ -38,6 +38,18 @@ #endif // MXNET_USE_DIST_KVSTORE namespace mxnet { + +/*! + * \brief enum to denote types of commands kvstore sends to server regarding profiler + * kSetConfig sets profiler configs. Similar to mx.profiler.set_config() + * kState allows changing state of profiler to stop or run + * kPause allows pausing and resuming of profiler + * kDump asks profiler to dump output + */ +enum class KVStoreServerProfilerCommand { + kSetConfig, kState, kPause, kDump +}; + /*! 
* \brief distributed key-value store * @@ -364,6 +376,20 @@ class KVStore { */ virtual void SendCommandToServers(int cmd_id, const std::string& cmd_body) { } + /** + * \brief Sends server profiler commands to all server nodes + * Only the worker with rank=0 sends the command which will be received by all servers + * \param type ProfilerCommand type + * \param params parameters for that command in the form of a string + */ + virtual void SetServerProfilerCommand(const KVStoreServerProfilerCommand type, + const std::string& params) { + LOG(INFO) << "Unable to pass server the profiler command. If you are using " + << "distributed kvstore, you need to compile with USE_DIST_KVSTORE=1." + << "If you are training on single machine, then there is no server process" + << "to profile. Please profile the worker process instead."; + } + /** * \brief the prototype of a server controller */ diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 4df794bdfe37..3d8ee0191757 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -22,11 +22,11 @@ import atexit import ctypes -import inspect import os import sys import warnings - +import inspect +import platform import numpy as np from . 
import libinfo @@ -59,6 +59,26 @@ py_str = lambda x: x +def data_dir_default(): + """ + + :return: default data directory depending on the platform and environment variables + """ + system = platform.system() + if system == 'Windows': + return os.path.join(os.environ.get('APPDATA'), 'mxnet') + else: + return os.path.join(os.path.expanduser("~"), '.mxnet') + + +def data_dir(): + """ + + :return: data directory in the filesystem for storage, for example when downloading models + """ + return os.getenv('MXNET_HOME', data_dir_default()) + + class _NullType(object): """Placeholder for arguments""" def __repr__(self): diff --git a/python/mxnet/contrib/text/embedding.py b/python/mxnet/contrib/text/embedding.py index 6598718e6b01..38defb4b90bc 100644 --- a/python/mxnet/contrib/text/embedding.py +++ b/python/mxnet/contrib/text/embedding.py @@ -34,6 +34,7 @@ from . import vocab from ... import ndarray as nd from ... import registry +from ... import base def register(embedding_cls): @@ -496,7 +497,7 @@ class GloVe(_TokenEmbedding): ---------- pretrained_file_name : str, default 'glove.840B.300d.txt' The name of the pre-trained token embedding file. - embedding_root : str, default os.path.join('~', '.mxnet', 'embeddings') + embedding_root : str, default $MXNET_HOME/embeddings The root directory for storing embedding-related files. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. @@ -541,7 +542,7 @@ def _get_download_file_name(cls, pretrained_file_name): return archive def __init__(self, pretrained_file_name='glove.840B.300d.txt', - embedding_root=os.path.join('~', '.mxnet', 'embeddings'), + embedding_root=os.path.join(base.data_dir(), 'embeddings'), init_unknown_vec=nd.zeros, vocabulary=None, **kwargs): GloVe._check_pretrained_file_names(pretrained_file_name) @@ -600,7 +601,7 @@ class FastText(_TokenEmbedding): ---------- pretrained_file_name : str, default 'wiki.en.vec' The name of the pre-trained token embedding file. 
- embedding_root : str, default os.path.join('~', '.mxnet', 'embeddings') + embedding_root : str, default $MXNET_HOME/embeddings The root directory for storing embedding-related files. init_unknown_vec : callback The callback used to initialize the embedding vector for the unknown token. @@ -642,7 +643,7 @@ def _get_download_file_name(cls, pretrained_file_name): return '.'.join(pretrained_file_name.split('.')[:-1])+'.zip' def __init__(self, pretrained_file_name='wiki.simple.vec', - embedding_root=os.path.join('~', '.mxnet', 'embeddings'), + embedding_root=os.path.join(base.data_dir(), 'embeddings'), init_unknown_vec=nd.zeros, vocabulary=None, **kwargs): FastText._check_pretrained_file_names(pretrained_file_name) diff --git a/python/mxnet/gluon/contrib/data/text.py b/python/mxnet/gluon/contrib/data/text.py index 98fe6b657f2b..9e78e3c2e23c 100644 --- a/python/mxnet/gluon/contrib/data/text.py +++ b/python/mxnet/gluon/contrib/data/text.py @@ -30,8 +30,7 @@ from ...data import dataset from ...utils import download, check_sha1, _get_repo_file_url from ....contrib import text -from .... import nd - +from .... import nd, base class _LanguageModelDataset(dataset._DownloadedDataset): # pylint: disable=abstract-method def __init__(self, root, namespace, vocabulary): @@ -116,7 +115,7 @@ class WikiText2(_WikiText): Parameters ---------- - root : str, default '~/.mxnet/datasets/wikitext-2' + root : str, default $MXNET_HOME/datasets/wikitext-2 Path to temp folder for storing data. segment : str, default 'train' Dataset segment. Options are 'train', 'validation', 'test'. @@ -127,7 +126,7 @@ class WikiText2(_WikiText): The sequence length of each sample, regardless of the sentence boundary. 
""" - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-2'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'wikitext-2'), segment='train', vocab=None, seq_len=35): self._archive_file = ('wikitext-2-v1.zip', '3c914d17d80b1459be871a5039ac23e752a53cbe') self._data_file = {'train': ('wiki.train.tokens', @@ -154,7 +153,7 @@ class WikiText103(_WikiText): Parameters ---------- - root : str, default '~/.mxnet/datasets/wikitext-103' + root : str, default $MXNET_HOME/datasets/wikitext-103 Path to temp folder for storing data. segment : str, default 'train' Dataset segment. Options are 'train', 'validation', 'test'. @@ -164,7 +163,7 @@ class WikiText103(_WikiText): seq_len : int, default 35 The sequence length of each sample, regardless of the sentence boundary. """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-103'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'wikitext-103'), segment='train', vocab=None, seq_len=35): self._archive_file = ('wikitext-103-v1.zip', '0aec09a7537b58d4bb65362fee27650eeaba625a') self._data_file = {'train': ('wiki.train.tokens', diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index 74a5aebf17bb..2c98000389ad 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -30,7 +30,7 @@ from .. import dataset from ...utils import download, check_sha1, _get_repo_file_url -from .... import nd, image, recordio +from .... import nd, image, recordio, base class MNIST(dataset._DownloadedDataset): @@ -40,7 +40,7 @@ class MNIST(dataset._DownloadedDataset): Parameters ---------- - root : str, default '~/.mxnet/datasets/mnist' + root : str, default $MXNET_HOME/datasets/mnist Path to temp folder for storing data. train : bool, default True Whether to load the training or testing set. 
@@ -51,7 +51,7 @@ class MNIST(dataset._DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'mnist'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'mnist'), train=True, transform=None): self._train = train self._train_data = ('train-images-idx3-ubyte.gz', @@ -101,7 +101,7 @@ class FashionMNIST(MNIST): Parameters ---------- - root : str, default '~/.mxnet/datasets/fashion-mnist' + root : str, default $MXNET_HOME/datasets/fashion-mnist Path to temp folder for storing data. train : bool, default True Whether to load the training or testing set. @@ -112,7 +112,7 @@ class FashionMNIST(MNIST): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'fashion-mnist'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'fashion-mnist'), train=True, transform=None): self._train = train self._train_data = ('train-images-idx3-ubyte.gz', @@ -134,7 +134,7 @@ class CIFAR10(dataset._DownloadedDataset): Parameters ---------- - root : str, default '~/.mxnet/datasets/cifar10' + root : str, default $MXNET_HOME/datasets/cifar10 Path to temp folder for storing data. train : bool, default True Whether to load the training or testing set. @@ -145,7 +145,7 @@ class CIFAR10(dataset._DownloadedDataset): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'cifar10'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'cifar10'), train=True, transform=None): self._train = train self._archive_file = ('cifar-10-binary.tar.gz', 'fab780a1e191a7eda0f345501ccd62d20f7ed891') @@ -197,7 +197,7 @@ class CIFAR100(CIFAR10): Parameters ---------- - root : str, default '~/.mxnet/datasets/cifar100' + root : str, default $MXNET_HOME/datasets/cifar100 Path to temp folder for storing data. 
fine_label : bool, default False Whether to load the fine-grained (100 classes) or coarse-grained (20 super-classes) labels. @@ -210,7 +210,7 @@ class CIFAR100(CIFAR10): transform=lambda data, label: (data.astype(np.float32)/255, label) """ - def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'cifar100'), + def __init__(self, root=os.path.join(base.data_dir(), 'datasets', 'cifar100'), fine_label=False, train=True, transform=None): self._train = train self._archive_file = ('cifar-100-binary.tar.gz', 'a0bb982c76b83111308126cc779a992fa506b90b') diff --git a/python/mxnet/gluon/model_zoo/model_store.py b/python/mxnet/gluon/model_zoo/model_store.py index 7eead68f0dbf..11ac47bae905 100644 --- a/python/mxnet/gluon/model_zoo/model_store.py +++ b/python/mxnet/gluon/model_zoo/model_store.py @@ -21,8 +21,10 @@ __all__ = ['get_model_file', 'purge'] import os import zipfile +import logging from ..utils import download, check_sha1 +from ... import base, util _model_sha1 = {name: checksum for checksum, name in [ ('44335d1f0046b328243b32a26a4fbd62d9057b45', 'alexnet'), @@ -68,7 +70,7 @@ def short_hash(name): raise ValueError('Pretrained model for {name} is not available.'.format(name=name)) return _model_sha1[name][:8] -def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): +def get_model_file(name, root=os.path.join(base.data_dir(), 'models')): r"""Return location for the pretrained on local file system. This function will download from online model zoo when model cannot be found or has mismatch. @@ -78,7 +80,7 @@ def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): ---------- name : str Name of the model. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. 
Returns @@ -95,12 +97,11 @@ def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): if check_sha1(file_path, sha1_hash): return file_path else: - print('Mismatch in the content of model file detected. Downloading again.') + logging.warning('Mismatch in the content of model file detected. Downloading again.') else: - print('Model file is not found. Downloading.') + logging.info('Model file not found. Downloading to %s.', file_path) - if not os.path.exists(root): - os.makedirs(root) + util.makedirs(root) zip_file_path = os.path.join(root, file_name+'.zip') repo_url = os.environ.get('MXNET_GLUON_REPO', apache_repo_url) @@ -118,12 +119,12 @@ def get_model_file(name, root=os.path.join('~', '.mxnet', 'models')): else: raise ValueError('Downloaded file has different hash. Please try again.') -def purge(root=os.path.join('~', '.mxnet', 'models')): +def purge(root=os.path.join(base.data_dir(), 'models')): r"""Purge all pretrained model files in local file store. Parameters ---------- - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ root = os.path.expanduser(root) diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py b/python/mxnet/gluon/model_zoo/vision/__init__.py index a6e5dc137d48..7d33ce409b21 100644 --- a/python/mxnet/gluon/model_zoo/vision/__init__.py +++ b/python/mxnet/gluon/model_zoo/vision/__init__.py @@ -101,7 +101,7 @@ def get_model(name, **kwargs): Number of classes for the output layer. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. 
Returns diff --git a/python/mxnet/gluon/model_zoo/vision/alexnet.py b/python/mxnet/gluon/model_zoo/vision/alexnet.py index fdb006258c2a..daf4617cd12e 100644 --- a/python/mxnet/gluon/model_zoo/vision/alexnet.py +++ b/python/mxnet/gluon/model_zoo/vision/alexnet.py @@ -25,6 +25,7 @@ from ....context import cpu from ...block import HybridBlock from ... import nn +from .... import base # Net class AlexNet(HybridBlock): @@ -68,7 +69,7 @@ def hybrid_forward(self, F, x): # Constructor def alexnet(pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""AlexNet model from the `"One weird trick..." `_ paper. Parameters @@ -77,7 +78,7 @@ def alexnet(pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = AlexNet(**kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/densenet.py b/python/mxnet/gluon/model_zoo/vision/densenet.py index b03f5ce8d52a..83febd3658c4 100644 --- a/python/mxnet/gluon/model_zoo/vision/densenet.py +++ b/python/mxnet/gluon/model_zoo/vision/densenet.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from ...contrib.nn import HybridConcurrent, Identity +from .... import base # Helpers def _make_dense_block(num_layers, bn_size, growth_rate, dropout, stage_index): @@ -122,7 +123,7 @@ def hybrid_forward(self, F, x): # Constructor def get_densenet(num_layers, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""Densenet-BC model from the `"Densely Connected Convolutional Networks" `_ paper. @@ -134,7 +135,7 @@ def get_densenet(num_layers, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. 
ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ num_init_features, growth_rate, block_config = densenet_spec[num_layers] @@ -154,7 +155,7 @@ def densenet121(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(121, **kwargs) @@ -169,7 +170,7 @@ def densenet161(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(161, **kwargs) @@ -184,7 +185,7 @@ def densenet169(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(169, **kwargs) @@ -199,7 +200,7 @@ def densenet201(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_densenet(201, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/inception.py b/python/mxnet/gluon/model_zoo/vision/inception.py index 7c54691f1b59..6bdc526a6a13 100644 --- a/python/mxnet/gluon/model_zoo/vision/inception.py +++ b/python/mxnet/gluon/model_zoo/vision/inception.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... 
import nn from ...contrib.nn import HybridConcurrent +from .... import base # Helpers def _make_basic_conv(**kwargs): @@ -199,7 +200,7 @@ def hybrid_forward(self, F, x): # Constructor def inception_v3(pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""Inception v3 model from `"Rethinking the Inception Architecture for Computer Vision" `_ paper. @@ -210,7 +211,7 @@ def inception_v3(pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = Inception3(**kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/mobilenet.py b/python/mxnet/gluon/model_zoo/vision/mobilenet.py index 1a2c9b946190..1a84e05af208 100644 --- a/python/mxnet/gluon/model_zoo/vision/mobilenet.py +++ b/python/mxnet/gluon/model_zoo/vision/mobilenet.py @@ -30,6 +30,7 @@ from ... import nn from ....context import cpu from ...block import HybridBlock +from .... import base # Helpers @@ -188,7 +189,7 @@ def hybrid_forward(self, F, x): # Constructor def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_ paper. @@ -203,7 +204,7 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. 
""" net = MobileNet(multiplier, **kwargs) @@ -219,7 +220,7 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" @@ -235,7 +236,7 @@ def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = MobileNetV2(multiplier, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py index da279b89583e..48390decb11b 100644 --- a/python/mxnet/gluon/model_zoo/vision/resnet.py +++ b/python/mxnet/gluon/model_zoo/vision/resnet.py @@ -32,6 +32,7 @@ from ....context import cpu from ...block import HybridBlock from ... import nn +from .... import base # Helpers def _conv3x3(channels, stride, in_channels): @@ -356,7 +357,7 @@ def hybrid_forward(self, F, x): # Constructor def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""ResNet V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. ResNet V2 model from `"Identity Mappings in Deep Residual Networks" @@ -372,7 +373,7 @@ def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. 
""" assert num_layers in resnet_spec, \ @@ -400,7 +401,7 @@ def resnet18_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 18, **kwargs) @@ -415,7 +416,7 @@ def resnet34_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 34, **kwargs) @@ -430,7 +431,7 @@ def resnet50_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 50, **kwargs) @@ -445,7 +446,7 @@ def resnet101_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 101, **kwargs) @@ -460,7 +461,7 @@ def resnet152_v1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 152, **kwargs) @@ -475,7 +476,7 @@ def resnet18_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. 
- root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 18, **kwargs) @@ -490,7 +491,7 @@ def resnet34_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 34, **kwargs) @@ -505,7 +506,7 @@ def resnet50_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 50, **kwargs) @@ -520,7 +521,7 @@ def resnet101_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 101, **kwargs) @@ -535,7 +536,7 @@ def resnet152_v2(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 152, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/squeezenet.py b/python/mxnet/gluon/model_zoo/vision/squeezenet.py index aaff4c36dfa0..b97d1274a6f0 100644 --- a/python/mxnet/gluon/model_zoo/vision/squeezenet.py +++ b/python/mxnet/gluon/model_zoo/vision/squeezenet.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from ...contrib.nn import HybridConcurrent +from .... 
import base # Helpers def _make_fire(squeeze_channels, expand1x1_channels, expand3x3_channels): @@ -110,7 +111,7 @@ def hybrid_forward(self, F, x): # Constructor def get_squeezenet(version, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""SqueezeNet model from the `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" `_ paper. SqueezeNet 1.1 model from the `official SqueezeNet repo @@ -126,7 +127,7 @@ def get_squeezenet(version, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = SqueezeNet(version, **kwargs) @@ -145,7 +146,7 @@ def squeezenet1_0(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_squeezenet('1.0', **kwargs) @@ -162,7 +163,7 @@ def squeezenet1_1(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_squeezenet('1.1', **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/vgg.py b/python/mxnet/gluon/model_zoo/vision/vgg.py index a3b1685b4130..9a740e633182 100644 --- a/python/mxnet/gluon/model_zoo/vision/vgg.py +++ b/python/mxnet/gluon/model_zoo/vision/vgg.py @@ -30,6 +30,7 @@ from ....initializer import Xavier from ...block import HybridBlock from ... import nn +from .... 
import base class VGG(HybridBlock): @@ -94,7 +95,7 @@ def hybrid_forward(self, F, x): # Constructors def get_vgg(num_layers, pretrained=False, ctx=cpu(), - root=os.path.join('~', '.mxnet', 'models'), **kwargs): + root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""VGG model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -106,7 +107,7 @@ def get_vgg(num_layers, pretrained=False, ctx=cpu(), Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ layers, filters = vgg_spec[num_layers] @@ -128,7 +129,7 @@ def vgg11(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(11, **kwargs) @@ -143,7 +144,7 @@ def vgg13(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(13, **kwargs) @@ -158,7 +159,7 @@ def vgg16(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(16, **kwargs) @@ -173,7 +174,7 @@ def vgg19(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. 
- root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(19, **kwargs) @@ -189,7 +190,7 @@ def vgg11_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True @@ -206,7 +207,7 @@ def vgg13_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True @@ -223,7 +224,7 @@ def vgg16_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True @@ -240,7 +241,7 @@ def vgg19_bn(**kwargs): Whether to load the pretrained weights for model. ctx : Context, default CPU The context in which to load the pretrained weights. - root : str, default '~/.mxnet/models' + root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index 418c497ce832..4a7a0be2bc30 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -23,12 +23,11 @@ from __future__ import print_function __all__ = ['RNN', 'LSTM', 'GRU'] -from ... import ndarray -from .. import Block +from ... import ndarray, symbol +from .. import HybridBlock, tensor_types from . 
import rnn_cell - -class _RNNLayer(Block): +class _RNNLayer(HybridBlock): """Implementation of recurrent layers.""" def __init__(self, hidden_size, num_layers, layout, dropout, bidirectional, input_size, @@ -52,33 +51,28 @@ def __init__(self, hidden_size, num_layers, layout, self._gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode] - self.i2h_weight = [] - self.h2h_weight = [] - self.i2h_bias = [] - self.h2h_bias = [] - ng, ni, nh = self._gates, input_size, hidden_size for i in range(num_layers): - for j in (['l', 'r'] if self._dir == 2 else ['l']): - self.i2h_weight.append( - self.params.get('%s%d_i2h_weight'%(j, i), shape=(ng*nh, ni), - init=i2h_weight_initializer, - allow_deferred_init=True)) - self.h2h_weight.append( - self.params.get('%s%d_h2h_weight'%(j, i), shape=(ng*nh, nh), - init=h2h_weight_initializer, - allow_deferred_init=True)) - self.i2h_bias.append( - self.params.get('%s%d_i2h_bias'%(j, i), shape=(ng*nh,), - init=i2h_bias_initializer, - allow_deferred_init=True)) - self.h2h_bias.append( - self.params.get('%s%d_h2h_bias'%(j, i), shape=(ng*nh,), - init=h2h_bias_initializer, - allow_deferred_init=True)) + for j in ['l', 'r'][:self._dir]: + self._register_param('{}{}_i2h_weight'.format(j, i), + shape=(ng*nh, ni), + init=i2h_weight_initializer) + self._register_param('{}{}_h2h_weight'.format(j, i), + shape=(ng*nh, nh), + init=h2h_weight_initializer) + self._register_param('{}{}_i2h_bias'.format(j, i), + shape=(ng*nh,), + init=i2h_bias_initializer) + self._register_param('{}{}_h2h_bias'.format(j, i), + shape=(ng*nh,), + init=h2h_bias_initializer) ni = nh * self._dir - self._unfused = self._unfuse() + def _register_param(self, name, shape, init): + p = self.params.get(name, shape=shape, init=init, + allow_deferred_init=True) + setattr(self, name, p) + return p def __repr__(self): s = '{name}({mapping}, {_layout}' @@ -89,12 +83,23 @@ def __repr__(self): if self._dir == 2: s += ', bidirectional' s += ')' - shape = self.i2h_weight[0].shape + 
shape = self.l0_i2h_weight.shape mapping = '{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0] // self._gates) return s.format(name=self.__class__.__name__, mapping=mapping, **self.__dict__) + def _collect_params_with_prefix(self, prefix=''): + if prefix: + prefix += '.' + def convert_key(key): # for compatibility with old parameter format + key = key.split('_') + return '_unfused.{}.{}_cell.{}'.format(key[0][1:], key[0][0], '_'.join(key[1:])) + ret = {prefix + convert_key(key) : val for key, val in self._reg_params.items()} + for name, child in self._children.items(): + ret.update(child._collect_params_with_prefix(prefix + name)) + return ret + def state_info(self, batch_size=0): raise NotImplementedError @@ -111,7 +116,7 @@ def _unfuse(self): 'gru': lambda **kwargs: rnn_cell.GRUCell(self._hidden_size, **kwargs)}[self._mode] - stack = rnn_cell.SequentialRNNCell(prefix=self.prefix, params=self.params) + stack = rnn_cell.HybridSequentialRNNCell(prefix=self.prefix, params=self.params) with stack.name_scope(): ni = self._input_size for i in range(self._num_layers): @@ -169,55 +174,42 @@ def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs): states.append(func(name='%sh0_%d'%(self.prefix, i), **info)) return states - def forward(self, inputs, states=None): - batch_size = inputs.shape[self._layout.find('N')] + def hybrid_forward(self, F, inputs, states=None, **kwargs): + if F is ndarray: + batch_size = inputs.shape[self._layout.find('N')] skip_states = states is None if skip_states: - states = self.begin_state(batch_size, ctx=inputs.context) - if isinstance(states, ndarray.NDArray): + if F is ndarray: + states = self.begin_state(batch_size, ctx=inputs.context) + else: + states = self.begin_state(0, func=symbol.zeros) + if isinstance(states, tensor_types): states = [states] - for state, info in zip(states, self.state_info(batch_size)): - if state.shape != info['shape']: - raise ValueError( - "Invalid recurrent state shape. 
Expecting %s, got %s."%( - str(info['shape']), str(state.shape))) - if self._input_size == 0: - for i in range(self._dir): - self.i2h_weight[i].shape = (self._gates*self._hidden_size, inputs.shape[2]) - self.i2h_weight[i]._finish_deferred_init() - out = self._forward_kernel(inputs, states) + if F is ndarray: + for state, info in zip(states, self.state_info(batch_size)): + if state.shape != info['shape']: + raise ValueError( + "Invalid recurrent state shape. Expecting %s, got %s."%( + str(info['shape']), str(state.shape))) + out = self._forward_kernel(F, inputs, states, **kwargs) # out is (output, state) return out[0] if skip_states else out - def _forward(self, inputs, states): - """forward using gluon cell""" - ns = len(states) - axis = self._layout.find('T') - states = sum(zip(*((j for j in i) for i in states)), ()) - outputs, states = self._unfused.unroll( - inputs.shape[axis], inputs, states, - layout=self._layout, merge_outputs=True) - new_states = [] - for i in range(ns): - state = ndarray.concat(*(j.reshape((1,)+j.shape) for j in states[i::ns]), dim=0) - new_states.append(state) - - return outputs, new_states - - def _forward_kernel(self, inputs, states): + def _forward_kernel(self, F, inputs, states, **kwargs): """ forward using CUDNN or CPU kenrel""" if self._layout == 'NTC': - inputs = ndarray.swapaxes(inputs, dim1=0, dim2=1) - ctx = inputs.context - params = sum(zip(self.i2h_weight, self.h2h_weight), ()) - params += sum(zip(self.i2h_bias, self.h2h_bias), ()) - params = (i.data(ctx).reshape((-1,)) for i in params) - params = ndarray.concat(*params, dim=0) - - rnn = ndarray.RNN(inputs, params, *states, state_size=self._hidden_size, - num_layers=self._num_layers, bidirectional=self._dir == 2, - p=self._dropout, state_outputs=True, mode=self._mode) + inputs = F.swapaxes(inputs, dim1=0, dim2=1) + params = (kwargs['{}{}_{}_{}'.format(d, l, g, t)].reshape(-1) + for t in ['weight', 'bias'] + for l in range(self._num_layers) + for d in ['l', 'r'][:self._dir] + 
for g in ['i2h', 'h2h']) + params = F._internal._rnn_param_concat(*params, dim=0) + + rnn = F.RNN(inputs, params, *states, state_size=self._hidden_size, + num_layers=self._num_layers, bidirectional=self._dir == 2, + p=self._dropout, state_outputs=True, mode=self._mode) if self._mode == 'lstm': outputs, states = rnn[0], [rnn[1], rnn[2]] @@ -225,7 +217,7 @@ def _forward_kernel(self, inputs, states): outputs, states = rnn[0], [rnn[1]] if self._layout == 'NTC': - outputs = ndarray.swapaxes(outputs, dim1=0, dim2=1) + outputs = F.swapaxes(outputs, dim1=0, dim2=1) return outputs, states diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py index 609733659753..a54817501391 100644 --- a/python/mxnet/kvstore.py +++ b/python/mxnet/kvstore.py @@ -28,6 +28,7 @@ from .base import check_call, string_types, mx_uint, py_str from .base import NDArrayHandle, KVStoreHandle from . import optimizer as opt +from .profiler import set_kvstore_handle def _ctype_key_value(keys, vals): """ @@ -88,7 +89,8 @@ def _get_kvstore_server_command_type(command): 'kSetMultiPrecision': 1, 'kStopServer': 2, 'kSyncMode': 3, - 'kSetGradientCompression': 4} + 'kSetGradientCompression': 4, + 'kSetProfilerParams': 5} assert (command in command_types), "Unknown command type to send to server" return command_types[command] @@ -670,4 +672,6 @@ def create(name='local'): handle = KVStoreHandle() check_call(_LIB.MXKVStoreCreate(c_str(name), ctypes.byref(handle))) - return KVStore(handle) + kv = KVStore(handle) + set_kvstore_handle(kv.handle) + return kv diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index f758af5f982c..ab7dadb17a54 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -24,7 +24,7 @@ import warnings import numpy from .base import py_str -from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs) +from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from .ndarray import (sgd_update, 
sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, signsgd_update, signum_update) @@ -449,7 +449,7 @@ class SGD(Optimizer): **lazy updates** are applied by:: for row in grad.indices: - rescaled_grad[row] = lr * rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row] + rescaled_grad[row] = lr * (rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]) state[row] = momentum[row] * state[row] + rescaled_grad[row] weight[row] = weight[row] - state[row] @@ -462,7 +462,7 @@ class SGD(Optimizer): Otherwise, **standard updates** are applied by:: - rescaled_grad = lr * rescale_grad * clip(grad, clip_gradient) + wd * weight + rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) state = momentum * state + rescaled_grad weight = weight - state @@ -616,6 +616,14 @@ class FTML(Optimizer): *FTML - Follow the Moving Leader in Deep Learning*, available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. + Denote time step by t. The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) + v = beta2 * v + (1 - beta2) * square(rescaled_grad) + d_t = (1 - power(beta1, t)) / lr * square_root(v / (1 - power(beta2, t))) + epsilon) + z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight + weight = - z / d_t + This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. @@ -1080,6 +1088,13 @@ class AdaGrad(Optimizer): Methods for Online Learning and Stochastic Optimization*, and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. 
+ This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + history += square(grad) + div = grad / sqrt(history + float_stable_eps) + weight += (div + weight * wd) * -lr + This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. @@ -1207,6 +1222,14 @@ class AdaDelta(Optimizer): This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive learning rate method*, available at https://arxiv.org/abs/1212.5701. + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad + wd * weight, clip_gradient) + acc_grad = rho * acc_grad + (1. - rho) * grad * grad + delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad + acc_delta = rho * acc_delta + (1. - rho) * delta * delta + weight -= (delta + wd * weight) + This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. @@ -1333,6 +1356,13 @@ class Adamax(Optimizer): It is a variant of Adam based on the infinity norm available at http://arxiv.org/abs/1412.6980 Section 7. + The optimizer updates the weight by:: + + grad = clip(grad * rescale_grad + wd * weight, clip_gradient) + m = beta1 * m_t + (1 - beta1) * grad + u = maximum(beta2 * u, abs(grad)) + weight -= lr / (1 - beta1**t) * m / u + This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. 
diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py index 0e7a31c687ef..0b5e85b1eb54 100644 --- a/python/mxnet/profiler.py +++ b/python/mxnet/profiler.py @@ -22,8 +22,13 @@ from __future__ import absolute_import import ctypes import warnings -from .base import _LIB, check_call, c_str, ProfileHandle, c_str_array, py_str +from .base import _LIB, check_call, c_str, ProfileHandle, c_str_array, py_str, KVStoreHandle +profiler_kvstore_handle = KVStoreHandle() + +def set_kvstore_handle(handle): + global profiler_kvstore_handle + profiler_kvstore_handle = handle def set_config(**kwargs): """Set up the configure of profiler (only accepts keyword arguments). @@ -49,12 +54,17 @@ def set_config(**kwargs): aggregate_stats : boolean, whether to maintain aggregate stats in memory for console dump. Has some negative performance impact. + profile_process : string + whether to profile kvstore `server` or `worker`. + server can only be profiled when kvstore is of type dist. + if this is not passed, defaults to `worker` """ kk = kwargs.keys() vv = kwargs.values() - check_call(_LIB.MXSetProfilerConfig(len(kwargs), - c_str_array([key for key in kk]), - c_str_array([str(val) for val in vv]))) + check_call(_LIB.MXSetProcessProfilerConfig(len(kwargs), + c_str_array([key for key in kk]), + c_str_array([str(val) for val in vv]), + profiler_kvstore_handle)) def profiler_set_config(mode='symbolic', filename='profile.json'): @@ -73,10 +83,10 @@ def profiler_set_config(mode='symbolic', filename='profile.json'): keys = c_str_array([key for key in ["profile_" + mode, "filename"]]) values = c_str_array([str(val) for val in [True, filename]]) assert len(keys) == len(values) - check_call(_LIB.MXSetProfilerConfig(len(keys), keys, values)) + check_call(_LIB.MXSetProcessProfilerConfig(len(keys), keys, values, profiler_kvstore_handle)) -def set_state(state='stop'): +def set_state(state='stop', profile_process='worker'): """Set up the profiler state to 'run' or 'stop'. 
Parameters @@ -84,9 +94,16 @@ def set_state(state='stop'): state : string, optional Indicates whether to run the profiler, can be 'stop' or 'run'. Default is `stop`. + profile_process : string + whether to profile kvstore `server` or `worker`. + server can only be profiled when kvstore is of type dist. + if this is not passed, defaults to `worker` """ state2int = {'stop': 0, 'run': 1} - check_call(_LIB.MXSetProfilerState(ctypes.c_int(state2int[state]))) + profile_process2int = {'worker': 0, 'server': 1} + check_call(_LIB.MXSetProcessProfilerState(ctypes.c_int(state2int[state]), + profile_process2int[profile_process], + profiler_kvstore_handle)) def profiler_set_state(state='stop'): @@ -102,7 +119,7 @@ def profiler_set_state(state='stop'): 'Please use profiler.set_state() instead') set_state(state) -def dump(finished=True): +def dump(finished=True, profile_process='worker'): """Dump profile and stop profiler. Use this to save profile in advance in case your program cannot exit normally. @@ -111,9 +128,16 @@ def dump(finished=True): finished : boolean Indicates whether to stop statistic output (dumping) after this dump. Default is True + profile_process : string + whether to profile kvstore `server` or `worker`. + server can only be profiled when kvstore is of type dist. + if this is not passed, defaults to `worker` """ - fin = 1 if finished is True else False - check_call(_LIB.MXDumpProfile(fin)) + fin = 1 if finished is True else 0 + profile_process2int = {'worker': 0, 'server': 1} + check_call(_LIB.MXDumpProcessProfile(fin, + profile_process2int[profile_process], + profiler_kvstore_handle)) def dump_profile(): @@ -138,14 +162,37 @@ def dumps(reset=False): return py_str(debug_str.value) -def pause(): - """Pause profiling.""" - check_call(_LIB.MXProfilePause(int(1))) +def pause(profile_process='worker'): + """Pause profiling. + + Parameters + ---------- + profile_process : string + whether to profile kvstore `server` or `worker`. 
+ server can only be profiled when kvstore is of type dist. + if this is not passed, defaults to `worker` + """ + profile_process2int = {'worker': 0, 'server': 1} + check_call(_LIB.MXProcessProfilePause(int(1), + profile_process2int[profile_process], + profiler_kvstore_handle)) + +def resume(profile_process='worker'): + """ + Resume paused profiling. -def resume(): - """Resume paused profiling.""" - check_call(_LIB.MXProfilePause(int(0))) + Parameters + ---------- + profile_process : string + whether to profile kvstore `server` or `worker`. + server can only be profiled when kvstore is of type dist. + if this is not passed, defaults to `worker` + """ + profile_process2int = {'worker': 0, 'server': 1} + check_call(_LIB.MXProcessProfilePause(int(0), + profile_process2int[profile_process], + profiler_kvstore_handle)) class Domain(object): diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index 884288364b3d..1d42cf7c18f8 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -486,12 +486,12 @@ def _union_inputs(*graphs): input_id_to_loc = {} # Dict[int, int], given id(sym), input_id_to_loc maps it # to a `loc`, where inputs[loc] = sym for graph in graphs: - # input_syms: all inputs to the `graph` - name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # some loop_vars are inputs to `graph`, some are not name_to_loop_vars = {sym.name: sym for sym in loop_vars} # other inputs to `graph` created by cut_graph name_to_cut_g_syms = {sym.list_outputs()[0]: sym for sym in _cut_subgraph(graph)} + # input_syms: all inputs to the `graph` + name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # also we collect the mapping from var's name to var's loc in loop_vars name_to_var_locs = {sym.name: i for i, sym in enumerate(loop_vars)} # collect arguments for each subgraph @@ -644,12 +644,12 @@ def _union_inputs(*graphs): input_id_to_loc = {} # Dict[int, int], given id(sym), input_id_to_loc 
maps it # to a `loc`, where inputs[loc] = sym for graph in graphs: - # input_syms: all inputs to the `graph` - name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # some input_vars are inputs to `graph`, some are not name_to_input_vars = {sym.name: sym for sym in inputs} # other inputs to `graph` created by cut_graph name_to_cut_g_syms = {sym.list_outputs()[0]: sym for sym in _cut_subgraph(graph)} + # input_syms: all inputs to the `graph` + name_to_input_syms = {sym.name: sym for sym in _get_graph_inputs(graph)} # collect arguments for each subgraph input_locs = [] # results from the second step for name in graph.list_inputs(): @@ -696,5 +696,4 @@ def _union_inputs(*graphs): else_input_locs=else_input_locs, num_outputs=then_num_outputs ) - result = _to_symbol_tuple(result, "result") - return list(result) + return [result[i] for i in range(then_num_outputs)] diff --git a/python/mxnet/util.py b/python/mxnet/util.py new file mode 100644 index 000000000000..57bc2bf76389 --- /dev/null +++ b/python/mxnet/util.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""general utility functions""" + +import os +import sys + + +def makedirs(d): + """Create directories recursively if they don't exist. 
os.makedirs(exist_ok=True) is not + available in Python2""" + if sys.version_info[0] < 3: + from distutils.dir_util import mkpath + mkpath(d) + else: + os.makedirs(d, exist_ok=True) diff --git a/scala-package/core/scripts/get_cifar_data.sh b/scala-package/core/scripts/get_cifar_data.sh index 9ec1c39a4f99..b061c1895e4a 100755 --- a/scala-package/core/scripts/get_cifar_data.sh +++ b/scala-package/core/scripts/get_cifar_data.sh @@ -20,8 +20,8 @@ set -e -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/scala-package/core/scripts/get_mnist_data.sh b/scala-package/core/scripts/get_mnist_data.sh index 97e151bf8333..ded206fbb134 100755 --- a/scala-package/core/scripts/get_mnist_data.sh +++ b/scala-package/core/scripts/get_mnist_data.sh @@ -20,8 +20,8 @@ set -e -if [ ! -z "$MXNET_DATA_DIR" ]; then - data_path="$MXNET_DATA_DIR" +if [ ! -z "$MXNET_HOME" ]; then + data_path="$MXNET_HOME" else data_path="./data" fi diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala b/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala index 1187757a0331..4fc8ec9826c1 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/TestUtil.scala @@ -24,7 +24,7 @@ class TestUtil { * @return Data direcotry path ()may be relative) */ def getDataDirectory: String = { - var dataDir = System.getenv("MXNET_DATA_DIR") + var dataDir = System.getenv("MXNET_HOME") if(dataDir == null) { dataDir = "data" } else { diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala index 6186989b74f6..70846eebfb8e 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala +++ 
b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala @@ -181,7 +181,7 @@ object GanMnist { try { parser.parseArgument(args.toList.asJava) - val dataPath = if (anst.mnistDataPath == null) System.getenv("MXNET_DATA_DIR") + val dataPath = if (anst.mnistDataPath == null) System.getenv("MXNET_HOME") else anst.mnistDataPath assert(dataPath != null) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala index b0ecc7d29ccf..bd0ce45ffe5f 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainMnist.scala @@ -112,7 +112,7 @@ object TrainMnist { try { parser.parseArgument(args.toList.asJava) - val dataPath = if (inst.dataDir == null) System.getenv("MXNET_DATA_DIR") + val dataPath = if (inst.dataDir == null) System.getenv("MXNET_HOME") else inst.dataDir val (dataShape, net) = diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala index e886b908ba26..3bbd780d39b9 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExample.scala @@ -119,13 +119,13 @@ object ImageClassifierExample { parser.parseArgument(args.toList.asJava) - val modelPathPrefix = if (inst.modelPathPrefix == null) System.getenv("MXNET_DATA_DIR") + val modelPathPrefix = if (inst.modelPathPrefix == null) System.getenv("MXNET_HOME") else inst.modelPathPrefix - val inputImagePath = if (inst.inputImagePath == null) 
System.getenv("MXNET_DATA_DIR") + val inputImagePath = if (inst.inputImagePath == null) System.getenv("MXNET_HOME") else inst.inputImagePath - val inputImageDir = if (inst.inputImageDir == null) System.getenv("MXNET_DATA_DIR") + val inputImageDir = if (inst.inputImageDir == null) System.getenv("MXNET_HOME") else inst.inputImageDir val singleOutput = runInferenceOnSingleImage(modelPathPrefix, inputImagePath, context) diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala index dab977019097..b86f6751e45b 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala @@ -44,21 +44,24 @@ import scala.collection.mutable.{ArrayBuffer, ListBuffer} * This will run as a part of "make scalatest" */ class MultiTaskSuite extends FunSuite { - test("Multitask Test") { val logger = LoggerFactory.getLogger(classOf[MultiTaskSuite]) - logger.info("Multitask Test...") + if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && + System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + logger.info("Multitask Test...") - val batchSize = 100 - val numEpoch = 10 - val ctx = Context.cpu() + val batchSize = 100 + val numEpoch = 3 + val ctx = Context.gpu() - val modelPath = ExampleMultiTask.getTrainingData - val (executor, evalMetric) = ExampleMultiTask.train(batchSize, numEpoch, ctx, modelPath) - evalMetric.get.foreach { case (name, value) => - assert(value >= 0.95f) + val modelPath = ExampleMultiTask.getTrainingData + val (executor, evalMetric) = ExampleMultiTask.train(batchSize, numEpoch, ctx, modelPath) + evalMetric.get.foreach { case (name, value) => + assert(value >= 0.95f) + } + executor.dispose() + } else { + logger.info("GPU test only, skipped...") } - executor.dispose() } - } diff --git 
a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml index 3f66fe68e041..e1c63104f9ad 100644 --- a/scala-package/native/osx-x86_64-cpu/pom.xml +++ b/scala-package/native/osx-x86_64-cpu/pom.xml @@ -73,6 +73,8 @@ -Wl,-exported_symbol,_Java_* -Wl,-x ${lddeps} + -force_load ${project.basedir}/../../../lib/libmxnet.a + -force_load ${project.basedir}/../../../3rdparty/tvm/nnvm/lib/libnnvm.a ${ldflags} diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 118af6793156..ed513c0d7785 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -443,6 +443,8 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, API_BEGIN(); NDArray *arr = static_cast(handle); nnvm::Tuple shape(dims, dims+ndim); + CHECK_GT(arr->shape().Size(), 0) << "Source ndarray's shape is undefined. Input shape: " + << arr->shape(); TShape new_shape = mxnet::op::InferReshapeShape(shape, arr->shape(), reverse); *ptr = arr->ReshapeWithRecord(new_shape); *out = ptr; diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc index c5841775794d..9c03b339e3ca 100644 --- a/src/c_api/c_api_profile.cc +++ b/src/c_api/c_api_profile.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "./c_api_common.h" #include "../profiler/profiler.h" @@ -197,6 +198,10 @@ struct PythonProfileObjects { }; static PythonProfileObjects python_profile_objects; +enum class ProfileProcess { + kWorker, kServer +}; + struct ProfileConfigParam : public dmlc::Parameter { bool profile_all; bool profile_symbolic; @@ -207,6 +212,7 @@ struct ProfileConfigParam : public dmlc::Parameter { bool continuous_dump; float dump_period; bool aggregate_stats; + int profile_process; DMLC_DECLARE_PARAMETER(ProfileConfigParam) { DMLC_DECLARE_FIELD(profile_all).set_default(false) .describe("Profile all."); @@ -228,6 +234,13 @@ struct ProfileConfigParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(aggregate_stats).set_default(false) .describe("Maintain aggregate stats, required for 
MXDumpAggregateStats. Note that " "this can have anegative performance impact."); + DMLC_DECLARE_FIELD(profile_process) + .add_enum("worker", static_cast(ProfileProcess::kWorker)) + .add_enum("server", static_cast(ProfileProcess::kServer)) + .set_default(static_cast(ProfileProcess::kWorker)) + .describe("Specifies which process to profile: " + "worker: this is default. for single node training it should always be worker." + "server: for distributed training, this profiles server process"); } }; @@ -248,7 +261,8 @@ struct ProfileMarkerScopeParam : public dmlc::Parameter DMLC_REGISTER_PARAMETER(ProfileMarkerScopeParam); -int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals) { +int MXSetProcessProfilerConfig(int num_params, const char* const* keys, const char* const* vals, + KVStoreHandle kvstoreHandle) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); std::vector> kwargs; @@ -260,19 +274,37 @@ int MXSetProfilerConfig(int num_params, const char* const* keys, const char* con } ProfileConfigParam param; param.Init(kwargs); - int mode = 0; - if (param.profile_api || param.profile_all) { mode |= profiler::Profiler::kAPI; } - if (param.profile_symbolic || param.profile_all) { mode |= profiler::Profiler::kSymbolic; } - if (param.profile_imperative || param.profile_all) { mode |= profiler::Profiler::kImperative; } - if (param.profile_memory || param.profile_all) { mode |= profiler::Profiler::kMemory; } - profiler::Profiler::Get()->SetConfig(profiler::Profiler::ProfilerMode(mode), - std::string(param.filename), - param.continuous_dump, - param.dump_period, - param.aggregate_stats); + if (static_cast(param.profile_process) == ProfileProcess::kServer) { + std::ostringstream os; + for (int i = 0; i < num_params; ++i) { + // this will be sent to the server now, those configs shouldn't have profile server again + if (strcmp(keys[i], "profile_process") == 0) continue; + os << keys[i] << ":" << vals[i]; + if (i != num_params - 1) os << ","; + } 
+ CHECK(kvstoreHandle) << "KVStoreHandle passed to profiler is null"; + static_cast(kvstoreHandle)->SetServerProfilerCommand( + mxnet::KVStoreServerProfilerCommand::kSetConfig, os.str()); + } else { + int mode = 0; + if (param.profile_api || param.profile_all) { mode |= profiler::Profiler::kAPI; } + if (param.profile_symbolic || param.profile_all) { mode |= profiler::Profiler::kSymbolic; } + if (param.profile_imperative || + param.profile_all) { mode |= profiler::Profiler::kImperative; } + if (param.profile_memory || param.profile_all) { mode |= profiler::Profiler::kMemory; } + profiler::Profiler::Get()->SetConfig(profiler::Profiler::ProfilerMode(mode), + std::string(param.filename), + param.continuous_dump, + param.dump_period, + param.aggregate_stats); + } API_END(); } +int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals) { + return MXSetProcessProfilerConfig(num_params, keys, vals, nullptr); +} + int MXAggregateProfileStatsPrint(const char **out_str, int reset) { MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); API_BEGIN(); @@ -293,19 +325,40 @@ int MXAggregateProfileStatsPrint(const char **out_str, int reset) { } int MXDumpProfile(int finished) { + return MXDumpProcessProfile(finished, static_cast(ProfileProcess::kWorker), nullptr); +} + +int MXDumpProcessProfile(int finished, int profile_process, KVStoreHandle kvStoreHandle) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); + if (static_cast(profile_process) == ProfileProcess::kServer) { + CHECK(kvStoreHandle) << "Kvstore Handle passed to profiler is null"; + static_cast(kvStoreHandle)->SetServerProfilerCommand( + mxnet::KVStoreServerProfilerCommand::kDump, + std::to_string(finished)); + } else { profiler::Profiler *profiler = profiler::Profiler::Get(); CHECK(profiler->IsEnableOutput()) << "Profiler hasn't been run. 
Config and start profiler first"; profiler->DumpProfile(finished != 0); + } API_END() } int MXSetProfilerState(int state) { + return MXSetProcessProfilerState(state, static_cast(ProfileProcess::kWorker), nullptr); +} + +int MXSetProcessProfilerState(int state, int profile_process, KVStoreHandle kvStoreHandle) { mxnet::IgnoreProfileCallScope ignore; // state, kNotRunning: 0, kRunning: 1 API_BEGIN(); + if (static_cast(profile_process) == ProfileProcess::kServer) { + CHECK(kvStoreHandle) << "Kvstore Handle passed to profiler is null"; + static_cast(kvStoreHandle)->SetServerProfilerCommand( + mxnet::KVStoreServerProfilerCommand::kState, + std::to_string(state)); + } else { switch (state) { case profiler::Profiler::kNotRunning: profiler::vtune::vtune_pause(); @@ -315,6 +368,7 @@ int MXSetProfilerState(int state) { break; } profiler::Profiler::Get()->SetState(profiler::Profiler::ProfilerState(state)); + } API_END(); } @@ -450,8 +504,18 @@ int MXProfileDurationStop(ProfileHandle duration_handle) { } int MXProfilePause(int paused) { + return MXProcessProfilePause(paused, static_cast(ProfileProcess::kWorker), nullptr); +} + +int MXProcessProfilePause(int paused, int profile_process, KVStoreHandle kvStoreHandle) { mxnet::IgnoreProfileCallScope ignore; API_BEGIN(); + if (static_cast(profile_process) == ProfileProcess::kServer) { + CHECK(kvStoreHandle) << "Kvstore Handle passed to profiler is null"; + static_cast(kvStoreHandle)->SetServerProfilerCommand( + mxnet::KVStoreServerProfilerCommand::kPause, + std::to_string(paused)); + } else { if (paused) { profiler::vtune::vtune_pause(); profiler::Profiler::Get()->set_paused(true); @@ -459,6 +523,7 @@ int MXProfilePause(int paused) { profiler::Profiler::Get()->set_paused(false); profiler::vtune::vtune_resume(); } + } API_END(); } diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 7386de4d12e3..33c6f574a044 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1282,7 
+1282,7 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { for (size_t i = 0; i < pool_info.size(); i++) { sorted_pool_index.push_back(i); } - auto pool_comparator = [&pool_info](int lhs, int rhs){ + auto pool_comparator = [&pool_info](size_t lhs, size_t rhs){ return pool_info[lhs].bytes > pool_info[rhs].bytes; }; std::sort(sorted_pool_index.begin(), sorted_pool_index.end(), pool_comparator); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index d4da99ea9e85..1e7f8e0de1b3 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -821,12 +821,11 @@ OpStatePtr CachedOp::DynamicForward( const auto& dispatch_modes = g.GetAttr("dispatch_mode"); - if (recording && !inlining_) Imperative::Get()->set_is_recording(false); - + // If we are already recording, we don't need RunGraph to record all + // computation again. RunGraph(false, idx, arrays, 0, idx.num_nodes(), std::move(array_reqs), - std::move(ref_count), &states, dispatch_modes); - - Imperative::Get()->set_is_recording(recording); + std::move(ref_count), &states, dispatch_modes, + !recording || inlining_); return op_state; } @@ -947,7 +946,8 @@ void CachedOp::DynamicBackward( const auto& dispatch_modes = g.GetAttr("dispatch_mode"); RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(), - std::move(array_reqs), std::move(ref_count), &states, dispatch_modes); + std::move(array_reqs), std::move(ref_count), &states, dispatch_modes, + Imperative::Get()->is_recording()); if (retain_graph) { buff.resize(num_forward_entries); diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index e1654259a2fb..0c5ff8417754 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -495,7 +495,8 @@ std::vector Imperative::Backward( int prev_bulk_size = Engine::Get()->set_bulk_size(backward_bulk_size_); RunGraph(retain_graph, idx, arrays, num_forward_nodes, idx.num_nodes(), - std::move(array_reqs), std::move(ref_count), 
&states, dispatch_modes); + std::move(array_reqs), std::move(ref_count), &states, dispatch_modes, + is_recording()); Engine::Get()->set_bulk_size(prev_bulk_size); set_is_recording(prev_recording); diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc index 464aefc220de..c84a3b9be502 100644 --- a/src/imperative/imperative_utils.cc +++ b/src/imperative/imperative_utils.cc @@ -30,7 +30,8 @@ void RunGraph( std::vector&& array_reqs, std::vector&& ref_count, std::vector *p_states, - const DispatchModeVector &dispatch_modes) { + const DispatchModeVector &dispatch_modes, + bool recording) { using namespace nnvm; using namespace imperative; static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); @@ -40,7 +41,6 @@ void RunGraph( const auto imp = Imperative::Get(); std::vector& states = *p_states; - bool recording = imp->is_recording(); std::vector ndinputs, ndoutputs; ShapeVector arg_shapes; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 6daf96e60d0b..9c86843ca7af 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -994,7 +994,8 @@ void RunGraph(const bool retain_graph, std::vector&& array_reqs, std::vector&& ref_count, std::vector *p_states, - const DispatchModeVector &dispatch_modes); + const DispatchModeVector &dispatch_modes, + bool recording); } // namespace imperative } // namespace mxnet diff --git a/src/initialize.cc b/src/initialize.cc index 1fd92628e9b2..342b0ee0141b 100644 --- a/src/initialize.cc +++ b/src/initialize.cc @@ -26,6 +26,9 @@ #include #include #include "./engine/openmp.h" +#if MXNET_USE_OPENCV +#include +#endif // MXNET_USE_OPENCV namespace mxnet { #if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE @@ -57,6 +60,9 @@ class LibraryInitializer { // Make children single threaded since they are typically workers dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1); dmlc::SetEnv("OMP_NUM_THREADS", 1); +#if MXNET_USE_OPENCV + cv::setNumThreads(0); // 
disable opencv threading +#endif // MXNET_USE_OPENCV engine::OpenMP::Get()->set_enabled(false); Engine::Get()->Start(); }); diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index e94a0570d1f4..e4a06fa9a1f2 100644 --- a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -23,31 +23,14 @@ * \author Rahul Huilgol */ -#include #include +#include "kvstore_local.h" #include "gradient_compression.h" #include "gradient_compression-inl.h" namespace mxnet { namespace kvstore { -/*! - * \brief Splits a string into smaller strings using char as delimiter - * Example: "a,b,c,,d" is split into ["a","b","c","","d"] - * \param s string to split - * \param delim char to split string around - * \param result container for tokens extracted after splitting - */ -template -void split(const std::string &s, const char delim, Out result) { - std::stringstream ss; - ss.str(s); - std::string item; - while (std::getline(ss, item, delim)) { - *(result++) = item; - } -} - DMLC_REGISTER_PARAMETER(GradientCompressionParam); GradientCompression::GradientCompression() { @@ -90,7 +73,7 @@ std::string GradientCompression::EncodeParams() { void GradientCompression::DecodeParams(const std::string &s) { std::vector elems; - split(s, ',', std::back_inserter(elems)); + mxnet::kvstore::split(s, ',', std::back_inserter(elems)); type_ = static_cast(stoi(elems[0])); if (elems.size() > 1) { if (!elems[1].empty()) { diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 7e2f5cb5faa9..23fbf67474ee 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -93,6 +93,15 @@ class KVStoreDist : public KVStoreLocal { } } + void SetServerProfilerCommand(const KVStoreServerProfilerCommand type, + const std::string& params) override { + if (get_rank() == 0) { + SendCommandToServers(static_cast(CommandType::kSetProfilerParams), + params + std::to_string(static_cast(type))); + } + } + + void Barrier() override { 
ps::Postoffice::Get()->Barrier(ps_worker_->get_customer()->customer_id(), ps::kWorkerGroup); } diff --git a/src/kvstore/kvstore_dist_server.h b/src/kvstore/kvstore_dist_server.h index 451fb78a6229..372b58dbbf3d 100644 --- a/src/kvstore/kvstore_dist_server.h +++ b/src/kvstore/kvstore_dist_server.h @@ -24,6 +24,9 @@ */ #ifndef MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ #define MXNET_KVSTORE_KVSTORE_DIST_SERVER_H_ +#include +#include +#include #include #include #include @@ -32,8 +35,7 @@ #include #include #include -#include "ps/ps.h" -#include "mxnet/kvstore.h" +#include "../profiler/profiler.h" #include "../operator/tensor/elemwise_binary_op-inl.h" #include "../operator/tensor/init_op.h" @@ -42,7 +44,8 @@ namespace kvstore { // maintain same order in frontend. enum class CommandType { - kController, kSetMultiPrecision, kStopServer, kSyncMode, kSetGradientCompression, + kController, kSetMultiPrecision, kStopServer, kSyncMode, + kSetGradientCompression, kSetProfilerParams }; enum class RequestType { @@ -164,6 +167,7 @@ class KVStoreDistServer { } ~KVStoreDistServer() { + profiler::Profiler::Get()->SetState(profiler::Profiler::ProfilerState(0)); delete ps_server_; } @@ -194,27 +198,37 @@ class KVStoreDistServer { void CommandHandle(const ps::SimpleData& recved, ps::SimpleApp* app) { CommandType recved_type = static_cast(recved.head); - if (recved_type == CommandType::kStopServer) { - exec_.Stop(); - } else if (recved_type == CommandType::kSyncMode) { - sync_mode_ = true; - } else if (recved_type == CommandType::kSetGradientCompression) { - gradient_compression_->DecodeParams(recved.body); - } else if (recved_type == CommandType::kSetMultiPrecision) { - // uses value 1 for message id from frontend - if (!multi_precision_) { - multi_precision_ = true; - CreateMultiPrecisionCopies(); - } - } else if (recved_type == CommandType::kController) { - // value of 0 - // let the main thread to execute ctrl, which is necessary for python - exec_.Exec([this, recved]() { - 
CHECK(controller_); - controller_(recved.head, recved.body); - }); - } else { - LOG(FATAL) << "Unknown command type received " << recved.head; + switch (recved_type) { + case CommandType::kStopServer: + exec_.Stop(); + break; + case CommandType::kSyncMode: + sync_mode_ = true; + break; + case CommandType::kSetGradientCompression: + gradient_compression_->DecodeParams(recved.body); + break; + case CommandType::kSetProfilerParams: + // last char is the type of profiler command + ProcessServerProfilerCommands(static_cast + (recved.body.back() - '0'), + recved.body); + break; + case CommandType::kSetMultiPrecision: + // uses value 1 for message id from frontend + if (!multi_precision_) { + multi_precision_ = true; + CreateMultiPrecisionCopies(); + } + break; + case CommandType::kController: + // this uses value 0 for message id from frontend + // let the main thread to execute ctrl, which is necessary for python + exec_.Exec([this, recved]() { + CHECK(controller_); + controller_(recved.head, recved.body); + }); + break; } app->Response(recved); } @@ -225,11 +239,11 @@ class KVStoreDistServer { * some keys are initialized before optimizer is set. 
*/ void CreateMultiPrecisionCopies() { - for (auto const& stored_entry : store_) { + for (auto const &stored_entry : store_) { const int key = stored_entry.first; - const NDArray& stored = stored_entry.second; + const NDArray &stored = stored_entry.second; if (stored.dtype() != mshadow::kFloat32) { - auto& stored_realt = store_realt_[key]; + auto &stored_realt = store_realt_[key]; if (stored.storage_type() == kRowSparseStorage) { stored_realt = NDArray(kRowSparseStorage, stored.shape(), stored.ctx(), true, mshadow::kFloat32); @@ -237,7 +251,7 @@ class KVStoreDistServer { stored_realt = NDArray(stored.shape(), stored.ctx(), false, mshadow::kFloat32); } - auto& update = update_buf_[key]; + auto &update = update_buf_[key]; if (!update.merged.is_none()) { if (update.merged.storage_type() == kRowSparseStorage) { update.merged = NDArray(kRowSparseStorage, update.merged.shape(), update.merged.ctx(), @@ -254,11 +268,60 @@ class KVStoreDistServer { CopyFromTo(stored, stored_realt); } } - for (auto const& stored_realt_entry : store_realt_) { + for (auto const &stored_realt_entry : store_realt_) { stored_realt_entry.second.WaitToRead(); } } + void ProcessServerProfilerCommands(KVStoreServerProfilerCommand type, const std::string& body) { + switch (type) { + case KVStoreServerProfilerCommand::kSetConfig: + SetProfilerConfig(body.substr(0, body.size() - 1)); + break; + case KVStoreServerProfilerCommand::kState: + MXSetProfilerState(static_cast(body.front() - '0')); + break; + case KVStoreServerProfilerCommand::kPause: + MXProfilePause(static_cast(body.front() - '0')); + break; + case KVStoreServerProfilerCommand::kDump: + MXDumpProfile(static_cast(body.front() - '0')); + break; + } + } + + void SetProfilerConfig(std::string params_str) { + std::vector elems; + mxnet::kvstore::split(params_str, ',', std::back_inserter(elems)); + std::vector ckeys; + std::vector cvals; + ckeys.reserve(elems.size()); + cvals.reserve(elems.size()); + + for (size_t i=0; i < elems.size(); i++) { + 
std::vector parts; + mxnet::kvstore::split(elems[i], ':', std::back_inserter(parts)); + CHECK_EQ(parts.size(), 2) << "Improper profiler config passed from worker"; + CHECK(!parts[0].empty()) << "ProfilerConfig parameter is empty"; + CHECK(!parts[1].empty()) << "ProfilerConfig value is empty for parameter "<< parts[0]; + if (parts[0] == "filename") { + parts[1] = "rank" + std::to_string(ps::MyRank()) + "_" + parts[1]; + } + char* ckey = new char[parts[0].length() + 1]; + std::snprintf(ckey, parts[0].length() + 1, "%s", parts[0].c_str()); + ckeys.push_back(ckey); + + char* cval = new char[parts[1].length() + 1]; + std::snprintf(cval, parts[1].length() + 1, "%s", parts[1].c_str()); + cvals.push_back(cval); + } + MXSetProfilerConfig(elems.size(), &ckeys[0], &cvals[0]); + for (size_t i=0; i < ckeys.size(); i++) { + delete[] ckeys[i]; + delete[] cvals[i]; + } + } + void DataHandleEx(const ps::KVMeta& req_meta, const ps::KVPairs& req_data, ps::KVServer* server) { diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index 324bc2c9558a..4e004a3a3008 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -40,6 +40,22 @@ namespace mxnet { namespace kvstore { +/*! 
+ * \brief Splits a string into smaller strings using char as delimiter + * Example: "a,b,c,,d" is split into ["a","b","c","","d"] + * \param s string to split + * \param delim char to split string around + * \param result container for tokens extracted after splitting + */ +template +void split(const std::string &s, const char delim, Out result) { + std::stringstream ss; + ss.str(s); + std::string item; + while (std::getline(ss, item, delim)) { + *(result++) = item; + } +} enum KeyType { kUndefinedKey = -1, diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index 266ccb1b1a14..7c7f403d6985 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -74,6 +74,65 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, return dshape.Size() != 0; } +// Concat for RNN param deals with the reverse shape inference from output +// for the special case of concatenating RNN parameters. +// The first (and sometimes the second) input may be unknown on the target axis. +// If the two inputs are unknown, they always have the same shape. 
+static bool RNNParamConcatShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + const ConcatParam& param_ = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + TShape dshape; + index_t size = 0; + int num_zero = 0; + int axis = -1; + for (int i = 0; i < param_.num_args; ++i) { + TShape tmp = (*in_shape)[i]; + if (tmp.ndim()) { + axis = CheckAxis(param_.dim, tmp.ndim()); + num_zero += tmp[axis] == 0; + size += tmp[axis]; + tmp[axis] = 0; + shape_assign(&dshape, tmp); + } + } + + TShape tmp = (*out_shape)[0]; + if (tmp.ndim()) { + axis = CheckAxis(param_.dim, tmp.ndim()); + tmp[axis] = 0; + shape_assign(&dshape, tmp); + } + + if (dshape.ndim() == 0) return false; + + for (int i = 0; i < param_.num_args; ++i) { + CHECK(shape_assign(&(*in_shape)[i], dshape)) + << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; + } + + if (!num_zero) dshape[axis] = size; + CHECK(shape_assign(&(*out_shape)[0], dshape)) + << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; + if ((*out_shape)[0][axis] != 0 && num_zero) { + int residual = (*out_shape)[0][axis] - size; + CHECK_GE(residual, 0) + << "Input size already exceeds output size. Residual: " << residual; + CHECK(num_zero <= 2 && num_zero >= 0) + << "Expecting 1 or 2 inputs that need shape inference. 
Got: " << num_zero; + bool need_infer = !(*out_shape)[0].Size(); + for (int i = 0; i < num_zero; i++) { + (*in_shape)[i*2][axis] = residual / num_zero; + need_infer = need_infer || !(*in_shape)[i].Size(); + } + return !need_infer; + } + + return dshape.Size() != 0; +} + static bool ConcatType(const nnvm::NodeAttrs& attrs, std::vector *in_type, std::vector *out_type) { @@ -228,6 +287,34 @@ struct ConcatGrad { DMLC_REGISTER_PARAMETER(ConcatParam); +#define CONCAT_FORWARD_ATTRS \ +.set_num_inputs([](const NodeAttrs& attrs) { \ + const ConcatParam& params = nnvm::get(attrs.parsed); \ + return params.num_args; \ +}) \ +.set_num_outputs(1) \ +.set_attr_parser(ParamParser) \ +.set_attr("FListInputNames", \ + [](const NodeAttrs& attrs) { \ + const ConcatParam& params = nnvm::get(attrs.parsed); \ + std::vector ret; \ + for (int i = 0; i < params.num_args; ++i) { \ + ret.push_back(std::string("arg") + std::to_string(i)); \ + } \ + return ret; \ +}) \ +.set_attr("FListOutputNames", \ + [](const NodeAttrs& attrs) { \ + return std::vector{"output"}; \ +}) \ +.set_attr("FInferType", ConcatType) \ +.set_attr("FInferStorageType", ConcatForwardInferStorageType) \ +.set_attr("FCompute", ConcatCompute) \ +.set_attr("FComputeEx", ConcatComputeExCPU) \ +.set_attr("FGradient", ConcatGrad{"_backward_Concat"}) \ +.set_attr("key_var_num_args", "num_args") + + NNVM_REGISTER_OP(Concat) MXNET_ADD_SPARSE_OP_ALIAS(concat) .add_alias("concat") @@ -268,37 +355,13 @@ Example:: [ 5., 5., 8., 8.]] )code" ADD_FILELINE) -.set_num_inputs([](const NodeAttrs& attrs) { - const ConcatParam& params = nnvm::get(attrs.parsed); - return params.num_args; -}) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - const ConcatParam& params = nnvm::get(attrs.parsed); - std::vector ret; - for (int i = 0; i < params.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; -}) -.set_attr("FListOutputNames", - [](const 
NodeAttrs& attrs) { - return std::vector{"output"}; -}) #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) #endif +CONCAT_FORWARD_ATTRS .set_attr("FInferShape", ConcatShape) -.set_attr("FInferType", ConcatType) -.set_attr("FInferStorageType", ConcatForwardInferStorageType) -.set_attr("FCompute", ConcatCompute) -.set_attr("FComputeEx", ConcatComputeExCPU) -.set_attr("FGradient", ConcatGrad{"_backward_Concat"}) -.set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") .add_arguments(ConcatParam::__FIELDS__()); @@ -320,5 +383,19 @@ NNVM_REGISTER_OP(_backward_Concat) #endif .set_attr("FCompute", ConcatGradCompute); +// _rnn_param_concat is a custom concat op with specialized infer_shape, +// which handles the case where the first one or two inputs may have +// unknown shape that can be inferred from output shape. +NNVM_REGISTER_OP(_rnn_param_concat) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +CONCAT_FORWARD_ATTRS +.set_attr("FInferShape", RNNParamConcatShape) +.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") +.add_arguments(ConcatParam::__FIELDS__()); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/concat.cu b/src/operator/nn/concat.cu index 4f6b8fc9ebef..2872d527898e 100644 --- a/src/operator/nn/concat.cu +++ b/src/operator/nn/concat.cu @@ -50,6 +50,10 @@ NNVM_REGISTER_OP(Concat) .set_attr("FCompute", ConcatCompute) .set_attr("FComputeEx", ConcatComputeExGPU); +NNVM_REGISTER_OP(_rnn_param_concat) +.set_attr("FCompute", ConcatCompute) +.set_attr("FComputeEx", ConcatComputeExGPU); + NNVM_REGISTER_OP(_backward_Concat) .set_attr("FCompute", ConcatGradCompute); diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc index 1e670a9047f0..73ef4f0f42a7 100644 --- a/src/operator/rnn.cc 
+++ b/src/operator/rnn.cc @@ -45,12 +45,12 @@ Operator *RNNProp::CreateOperatorEx(Context ctx, DMLC_REGISTER_PARAMETER(RNNParam); MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) -.describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are +.describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. **Vanilla RNN** -Applies a single-gate recurrent layer to input X. Two kinds of activation function are supported: +Applies a single-gate recurrent layer to input X. Two kinds of activation function are supported: ReLU and Tanh. With ReLU activation function: @@ -63,7 +63,7 @@ With Tanh activtion function: .. math:: h_t = \tanh(W_{ih} * x_t + b_{ih} + W_{hh} * h_{(t-1)} + b_{hh}) -Reference paper: Finding structure in time - Elman, 1988. +Reference paper: Finding structure in time - Elman, 1988. https://crl.ucsd.edu/~elman/Papers/fsit.pdf **LSTM** diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index be3d1f9223f4..33bf72798fd6 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -268,7 +268,11 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { Shape coord = unravel(idx, sshape); int j = ravel(coord, bshape); - assign(&small[idx], addto, OP::Map(big[j])); + DType val, residual; + Reducer::SetInitValue(val, residual); + Reducer::Reduce(val, OP::Map(big[j]), residual); + Reducer::Finalize(val, residual); + assign(&small[idx], addto, val); } } @@ -287,7 +291,10 @@ __global__ void reduce_kernel_M1(const int N, const bool addto, int idx_big = ravel(coord, big_shape); int idx_lhs = ravel(coord, lhs_shape); int idx_rhs = ravel(coord, rhs_shape); - DType val = OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])); + DType 
val, residual; + Reducer::SetInitValue(val, residual); + Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], rhs[idx_rhs])), residual); + Reducer::Finalize(val, residual); assign(&small[idx], addto, val); } } diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 0f96e2cc2f72..ef59145bb4a9 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -28,6 +28,27 @@ namespace mxnet { namespace op { +/* + * \brief returns true if all indices are between [min, max] + * \param data_ptr the indices to check + * \param data_size the number of indices to examine + * \param min the expected min value for indices + * \param max the expected max value for indices + */ +template +bool CheckIndexOutOfBound(const DType* data_ptr, size_t data_size, + const DType min, const DType max) { + bool is_valid = true; + for (size_t i = 0; i < data_size; i++) { + if (data_ptr[i] > max || data_ptr[i] < min) { + is_valid = false; + break; + } + } + return is_valid; +} + + template<> void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, const TBlob& data, @@ -48,18 +69,16 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, return; } // check out-of-bound indices - bool is_valid = true; MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { DType min = 0; DType max = static_cast(weight.shape()[0] - 1); // check with single thread is faster since data is small DType* data_ptr = data.dptr(); size_t data_size = data.shape_.Size(); - for (size_t i = 0; i < data_size; i++) { - if (data_ptr[i] > max || data_ptr[i] < min) is_valid = false; - } + bool is_valid = CheckIndexOutOfBound(data_ptr, data_size, + min, max); + CHECK(is_valid) << "SparseEmbedding input contains data out of bound"; }) - CHECK(is_valid) << "SparseEmbedding input contains data out of bound"; // the weight is actually dense if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) { EmbeddingOpForwardDnsImpl(s, data, weight.data(), req, output); @@ 
-101,6 +120,15 @@ inline void SparseEmbeddingOpBackwardRspImpl(const bool deterministic, MSHADOW_TYPE_SWITCH(data.type_flag_, IType, { MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, { MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, { + // check out of bound indices + { + IType min = 0; + IType max = static_cast(output.shape()[0] - 1); + // check with single thread is faster since data is small + IType* data_ptr = data.dptr(); + bool is_valid = CheckIndexOutOfBound(data_ptr, data.shape_.Size(), min, max); + CHECK(is_valid) << "Embedding input contains data out of bound"; + } // mark row flags Fill(s, TBlob(row_flg, Shape1(num_rows), cpu::kDevMask), kWriteTo, 0); Kernel::Launch(s, data_size, row_flg, data.dptr()); diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 39fd81ef2001..bdc7f6e843c0 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -36,7 +36,7 @@ namespace op { struct is_valid_check { template - MSHADOW_XINLINE static void Map(int i, int32_t* out, const DType* data, + MSHADOW_XINLINE static void Map(int i, char* out, const DType* data, const DType min, const DType max) { if (data[i] < min || data[i] > max) *out = 1; } @@ -116,6 +116,27 @@ struct AddTakeGradRspDeterministicKernel { } }; +/* + * \brief returns true if all indices are between [min, max] + * \param s the stream + * \param data_ptr the indices on the stream + * \param data_size the number of indices to examine + * \param min the expected min value for indices + * \param max the expected max value for indices + * \param is_valid_ptr the temparary workspace + */ +template +bool CheckIndexOutOfBound(mshadow::Stream *s, const DType* data_ptr, size_t data_size, + const DType min, const DType max, char* is_valid_ptr) { + using namespace mxnet_op; + int32_t is_valid = 0; + Kernel::Launch(s, 1, is_valid_ptr); + Kernel::Launch(s, data_size, is_valid_ptr, data_ptr, min, max); + CUDA_CALL(cudaMemcpy(&is_valid, 
is_valid_ptr, sizeof(char), + cudaMemcpyDeviceToHost)); + return is_valid == 0; +} + template<> void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, const TBlob& data, @@ -136,21 +157,17 @@ void SparseEmbeddingOpForwardRspImpl(const OpContext& ctx, return; } // check out-of-bound indices - int32_t is_valid = 0; MSHADOW_TYPE_SWITCH(data.type_flag_, DType, { DType min = 0; DType max = static_cast(weight.shape()[0] - 1); DType* data_ptr = data.dptr(); size_t data_size = data.shape_.Size(); Tensor workspace = ctx.requested[0] - .get_space_typed(Shape1(sizeof(int32_t)), s); - int32_t* is_valid_ptr = reinterpret_cast(workspace.dptr_); - Kernel::Launch(s, 1, is_valid_ptr); - Kernel::Launch(s, data_size, is_valid_ptr, data_ptr, min, max); - CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(int32_t), - cudaMemcpyDeviceToHost)); + .get_space_typed(Shape1(1), s); + char* is_valid_ptr = reinterpret_cast(workspace.dptr_); + bool is_valid = CheckIndexOutOfBound(s, data_ptr, data_size, min, max, is_valid_ptr); + CHECK(is_valid) << "SparseEmbedding input contains data out of bound"; }) - CHECK_EQ(is_valid, 0) << "SparseEmbedding input contains data out of bound"; // the weight is actually dense if (weight.aux_shape(kIdx)[0] == weight.shape()[0]) { EmbeddingOpForwardDnsImpl(s, data, weight.data(), req, output); @@ -207,6 +224,17 @@ void SparseEmbeddingDeterministicKernelLaunch(const OpContext& ctx, sorted_data_storage_bytes); temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes; + // check out-of-bound indices + { + IType min = 0; + IType max = static_cast(output.shape()[0] - 1); + IType* data_ptr = data.dptr(); + size_t data_size = data.shape_.Size(); + bool is_valid = CheckIndexOutOfBound(s, data_ptr, data_size, min, max, + reinterpret_cast(temp_storage)); + CHECK(is_valid) << "Embedding input contains data out of bound"; + } + // make a copy of the data, to be sorted TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask); auto 
sorted_data_tensor = sorted_data_blob.FlatTo1D(s); diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index eec920555ed1..78e1fa1d9c6a 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -122,7 +122,7 @@ inline TShape InferReshapeShape(const nnvm::Tuple& shape, CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1."; if (d1 == -1) d1 = d0 / d2; if (d2 == -1) d2 = d0 / d1; - CHECK_EQ(d1 * d2, static_cast(d0)) << + CHECK(d1 * d2 == static_cast(d0) || static_cast(d0) == IType(0)) << "Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0; tmp.push_back(d1); tmp.push_back(d2); @@ -151,13 +151,36 @@ inline TShape InferReshapeShape(const nnvm::Tuple& shape, return oshape; } +inline bool ReverseReshapeInferShape(TShape *in, const TShape& out) { + if (in->Size() && out.Size()) { + return true; + } else if (!out.Size()) { + return false; + } else { + int zero_axis = -1; + int non_zero_prod = 1; + for (index_t i = 0; i < in->ndim(); i++) { + if ((*in)[i] == 0) { + if (zero_axis != -1) + return false; // more than 1 zero found. 
+ else + zero_axis = i; + } else { + non_zero_prod *= (*in)[i]; + } + } + (*in)[zero_axis] = out.Size() / non_zero_prod; + return true; + } +} + inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { const ReshapeParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; CHECK_EQ(out_attrs->size(), 1U); - const TShape &dshape = (*in_attrs)[0]; + TShape &dshape = (*in_attrs)[0]; if (dshape.ndim() == 0) return false; TShape oshape; if (param_.shape.ndim() != 0) { @@ -182,14 +205,15 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, oshape[inf_idx] = dshape.Size() / oshape.Size(); } } else { - return (*out_attrs)[0].ndim(); + return (*out_attrs)[0].ndim() && ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); } + ReverseReshapeInferShape(&dshape, oshape); CHECK_EQ(oshape.Size(), dshape.Size()) << "Target shape size is different to source. 
" << "Target: " << oshape << "\nSource: " << dshape; SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); } inline bool FlattenShape(const nnvm::NodeAttrs& attrs, diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 4c61cc4e3267..8950a9270839 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -41,6 +41,8 @@ backslash = '/' s3 = boto3.resource('s3') ctx = mx.cpu(0) +atol_default = 1e-5 +rtol_default = 1e-5 def get_model_path(model_name): diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index ae368e3a0fc6..5d63e7e9bca3 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -44,7 +44,7 @@ def test_module_checkpoint_api(): old_inference_results = load_inference_results(model_name) inference_results = loaded_model.predict(data_iter) # Check whether they are equal or not ? 
- assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy()) + assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') @@ -69,7 +69,7 @@ def test_lenet_gluon_load_params_api(): loaded_model.load_params(model_name + '-params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') logging.info('Assertion passed for model : %s' % model_name) @@ -92,7 +92,7 @@ def test_lenet_gluon_hybrid_imports_api(): loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0000.params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') logging.info('Assertion passed for model : %s' % model_name) @@ -124,7 +124,7 @@ def test_lstm_gluon_load_parameters_api(): loaded_model.load_parameters(model_name + '-params') output = loaded_model(test_data) old_inference_results = mx.nd.load(model_name + '-inference')['inference'] - assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy()) + assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') logging.info('Assertion passed 
for model : %s' % model_name) diff --git a/tests/nightly/test_server_profiling.py b/tests/nightly/test_server_profiling.py new file mode 100644 index 000000000000..7d157a3e4189 --- /dev/null +++ b/tests/nightly/test_server_profiling.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import mxnet as mx +import json + +key = '99' +shape = (1200, 1200) # bigger than MXNET_KVSTORE_BIGARRAY_BOUND +kv = mx.kv.create('dist_sync') + +def init_kv(): + # init kv dns keys + kv.init(key, mx.nd.ones(shape)) + kv.set_optimizer(mx.optimizer.create('sgd')) + return kv, kv.rank, kv.num_workers + +def test_sync_push_pull(): + kv, my_rank, nworker = init_kv() + def check_default_keys(kv, my_rank): + nrepeat = 10 + # checks pull after push in loop, because behavior during + # consecutive pushes doesn't offer any guarantees + for i in range(nrepeat): + kv.push(key, mx.nd.ones(shape, dtype='float32') * (my_rank+1)) + val = mx.nd.zeros(shape, dtype='float32') + kv.pull(key, out=val) + mx.nd.waitall() + check_default_keys(kv, my_rank) + +if __name__ == "__main__": + server_filename_suffix = 'test_profile_server.json' + worker_filename_suffix = 'test_profile_worker.json' + mx.profiler.set_config(filename=server_filename_suffix, profile_all=True, profile_process='server') + mx.profiler.set_config(filename='rank' + str(kv.rank) + '_' + worker_filename_suffix, profile_all=True, profile_process='worker') + mx.profiler.set_state(state='run', profile_process='server') + mx.profiler.set_state(state='run', profile_process='worker') + test_sync_push_pull() + mx.profiler.set_state(state='stop', profile_process='server') + mx.profiler.set_state(state='stop', profile_process='worker') + + import glob, os + + # will only work when launcher mode is local, as used for integration test + if kv.rank == 0: + for rank in range(kv.num_workers): + for suffix in [worker_filename_suffix, server_filename_suffix]: + # throws value error if file is not proper json + filename = 'rank' + str(rank) + '_' + suffix + print(glob.glob('*'), os.getcwd()) + with open(filename, 'r') as f: + j = json.load(f) + + + diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py index 126ccabaa7b5..02b0256024d3 100644 --- a/tests/python/gpu/test_forward.py +++ 
b/tests/python/gpu/test_forward.py @@ -24,11 +24,13 @@ sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import setup_module, with_seed, teardown from mxnet.gluon import utils +import tarfile def _get_model(): if not os.path.exists('model/Inception-7-symbol.json'): - download('http://data.mxnet.io/models/imagenet/inception-v3.tar.gz', dirname='model') - os.system("cd model; tar -xf inception-v3.tar.gz --strip-components 1") + download('http://data.mxnet.io/models/imagenet/inception-v3.tar.gz') + with tarfile.open(name="inception-v3.tar.gz", mode="r:gz") as tf: + tf.extractall() def _dump_images(shape): import skimage.io diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py new file mode 100644 index 000000000000..42d65dab5fdc --- /dev/null +++ b/tests/python/gpu/test_gluon_gpu.py @@ -0,0 +1,203 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import print_function +import sys +import os +import time +import multiprocessing as mp +import unittest +import mxnet as mx +import numpy as np +import unittest +from nose.tools import assert_raises +from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal +from mxnet.base import MXNetError +from mxnet import autograd +from numpy.testing import assert_allclose + +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(curr_path, '../unittest')) +from common import setup_module, with_seed, teardown, assert_raises_cudnn_disabled +from test_gluon import * +from test_loss import * +from test_gluon_rnn import * + +set_default_context(mx.gpu(0)) + +def check_rnn_layer(layer): + layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)]) + with mx.gpu(0): + x = mx.nd.ones((10, 16, 30)) + states = layer.begin_state(16) + go, gs = layer(x, states) + + with mx.cpu(0): + x = mx.nd.ones((10, 16, 30)) + states = layer.begin_state(16) + co, cs = layer(x, states) + + # atol of 1e-6 required, as exposed by seed 2124685726 + assert_almost_equal(go.asnumpy(), co.asnumpy(), rtol=1e-2, atol=1e-6) + for g, c in zip(gs, cs): + assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6) + + +def check_rnn_layer_w_rand_inputs(layer): + layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)]) + x = mx.nd.uniform(shape=(10, 16, 30)) + with mx.gpu(0): + x = x.copyto(mx.gpu(0)) + states = layer.begin_state(16) + go, gs = layer(x, states) + + with mx.cpu(0): + x = x.copyto(mx.cpu(0)) + states = layer.begin_state(16) + co, cs = layer(x, states) + + assert_almost_equal(go.asnumpy(), co.asnumpy(), rtol=1e-2, atol=1e-6) + for g, c in zip(gs, cs): + assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6) + + +@with_seed() +@assert_raises_cudnn_disabled() +def test_rnn_layer(): + check_rnn_layer(gluon.rnn.RNN(100, num_layers=3)) + check_rnn_layer(gluon.rnn.RNN(100, 
activation='tanh', num_layers=3)) + check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3)) + check_rnn_layer(gluon.rnn.GRU(100, num_layers=3)) + + check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True)) + check_rnn_layer_w_rand_inputs(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True)) + + +@with_seed() +def test_gluon_ctc_consistency(): + loss = mx.gluon.loss.CTCLoss() + data = mx.nd.arange(0, 4, repeat=40, ctx=mx.gpu(0)).reshape((2,20,4)).flip(axis=0) + cpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.cpu(0)) + gpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.gpu(0)) + + cpu_data = data.copy().as_in_context(mx.cpu(0)) + cpu_data.attach_grad() + with mx.autograd.record(): + l_cpu = loss(cpu_data, cpu_label) + l_cpu.backward() + + gpu_data = data.copyto(mx.gpu(0)) + gpu_data.attach_grad() + with mx.autograd.record(): + l_gpu = loss(gpu_data, gpu_label) + l_gpu.backward() + + assert_almost_equal(cpu_data.grad.asnumpy(), gpu_data.grad.asnumpy(), atol=1e-3, rtol=1e-3) + + +@with_seed() +def test_global_norm_clip_multi_device(): + x1 = mx.nd.ones((3,3), ctx=mx.gpu(0)) + x2 = mx.nd.ones((4,4), ctx=mx.cpu(0)) + norm = gluon.utils.clip_global_norm([x1, x2], 1.0) + assert norm == 5.0 + assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5) + assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5) + + +def _check_batchnorm_result(input, num_devices=1, cuda=False): + from mxnet.gluon.utils import split_and_load + def _find_bn(module): + if isinstance(module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)): + return module + elif isinstance(module.module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)): + return module.module + + raise RuntimeError('BN not found') + + def _syncParameters(bn1, bn2, ctx): + ctx = input.context + bn2.gamma.set_data(bn1.gamma.data(ctx)) + bn2.beta.set_data(bn1.beta.data(ctx)) + bn2.running_mean.set_data(bn1.running_mean.data(ctx)) + bn2.running_var.set_data(bn1.running_var.data(ctx)) + + input1 = 
input.copy() + input2 = input.copy() + + if cuda: + input1 = input.as_in_context(mx.gpu(0)) + ctx_list = [mx.gpu(i) for i in range(num_devices)] + else: + ctx_list = [mx.cpu(0) for _ in range(num_devices)] + + nch = input.shape[1] + bn1 = mx.gluon.nn.BatchNorm(in_channels=nch) + bn2 = mx.gluon.contrib.nn.SyncBatchNorm(in_channels=nch, num_devices=num_devices) + + bn1.initialize(ctx=ctx_list[0]) + bn2.initialize(ctx=ctx_list) + + # using the same values for gamma and beta + #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0]) + + input1.attach_grad() + inputs2 = split_and_load(input2, ctx_list, batch_axis=0) + for xi in inputs2: + xi.attach_grad() + + with mx.autograd.record(): + output1 = bn1(input1) + output2 = [bn2(xi) for xi in inputs2] + loss1 = (output1 ** 2).sum() + loss2 = [(output ** 2).sum() for output in output2] + mx.autograd.backward(loss1) + mx.autograd.backward(loss2) + + output2 = mx.nd.concat(*[output.as_in_context(input.context) for output in output2], dim=0) + # assert forwarding + assert_almost_equal(input1.asnumpy(), input2.asnumpy(), atol=1e-3, rtol=1e-3) + assert_almost_equal(output1.asnumpy(), output2.asnumpy(), atol=1e-3, rtol=1e-3) + assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), + _find_bn(bn2).running_mean.data(ctx_list[0]).asnumpy(), + atol=1e-3, rtol=1e-3) + assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(), + _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(), + atol=1e-3, rtol=1e-3) + input2grad = mx.nd.concat(*[output.grad.as_in_context(input.context) for output in inputs2], dim=0) + assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=1e-3, rtol=1e-3) + + +def test_sync_batchnorm(): + def get_num_devices(): + for i in range(100): + try: + mx.nd.zeros((1,), ctx=mx.gpu(i)) + except: + return i + # no need to use SyncBN with 1 gpu + if get_num_devices() < 2: + return + ndev = 2 + # check with unsync version + for i in range(10): + 
_check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)), + num_devices=ndev, cuda=True) + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index a3e663a68274..3d799aa5319b 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -36,11 +36,8 @@ from test_operator import * from test_optimizer import * from test_random import * -from test_gluon import * -from test_loss import * from test_exc_handling import * #from test_rnn import * -from test_gluon_rnn import * from test_sparse_ndarray import * from test_sparse_operator import * from test_ndarray import * @@ -1660,17 +1657,6 @@ def check_rnn_layer_w_rand_inputs(layer): for g, c in zip(gs, cs): assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6) -@with_seed() -@assert_raises_cudnn_disabled() -def test_rnn_layer(): - check_rnn_layer(gluon.rnn.RNN(100, num_layers=3)) - check_rnn_layer(gluon.rnn.RNN(100, activation='tanh', num_layers=3)) - check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3)) - check_rnn_layer(gluon.rnn.GRU(100, num_layers=3)) - - check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True)) - check_rnn_layer_w_rand_inputs(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True)) - @with_seed() def test_sequence_reverse(): check_sequence_reverse(mx.gpu(0)) @@ -1688,28 +1674,6 @@ def test_autograd_save_memory(): x.backward() -@with_seed() -def test_gluon_ctc_consistency(): - loss = mx.gluon.loss.CTCLoss() - data = mx.nd.arange(0, 4, repeat=40, ctx=mx.gpu(0)).reshape((2,20,4)).flip(axis=0) - cpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.cpu(0)) - gpu_label = mx.nd.array([[2,1,-1,-1],[3,2,2,-1]], ctx=mx.gpu(0)) - - cpu_data = data.copy().as_in_context(mx.cpu(0)) - cpu_data.attach_grad() - with mx.autograd.record(): - l_cpu = loss(cpu_data, cpu_label) - l_cpu.backward() - - gpu_data = data.copyto(mx.gpu(0)) - 
gpu_data.attach_grad() - with mx.autograd.record(): - l_gpu = loss(gpu_data, gpu_label) - l_gpu.backward() - - assert_almost_equal(cpu_data.grad.asnumpy(), gpu_data.grad.asnumpy(), atol=1e-3, rtol=1e-3) - - @with_seed() def test_cuda_rtc(): source = r''' @@ -1740,16 +1704,6 @@ def test_cuda_rtc(): assert (y.asnumpy() == 12).all() -@with_seed() -def test_global_norm_clip_multi_device(): - x1 = mx.nd.ones((3,3), ctx=mx.gpu(0)) - x2 = mx.nd.ones((4,4), ctx=mx.cpu(0)) - norm = gluon.utils.clip_global_norm([x1, x2], 1.0) - assert norm == 5.0 - assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5) - assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5) - - @with_seed() def test_cross_device_autograd(): x = mx.nd.random.uniform(shape=(10,)) @@ -1968,84 +1922,6 @@ def test_context_num_gpus(): # Test that num_gpus reports at least one GPU, as the test is run on a GPU host. assert mx.context.num_gpus() > 0 -def _check_batchnorm_result(input, num_devices=1, cuda=False): - from mxnet.gluon.utils import split_and_load - def _find_bn(module): - if isinstance(module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)): - return module - elif isinstance(module.module, (mx.gluon.nn.BatchNorm, mx.gluon.contrib.nn.SyncBatchNorm)): - return module.module - - raise RuntimeError('BN not found') - - def _syncParameters(bn1, bn2, ctx): - ctx = input.context - bn2.gamma.set_data(bn1.gamma.data(ctx)) - bn2.beta.set_data(bn1.beta.data(ctx)) - bn2.running_mean.set_data(bn1.running_mean.data(ctx)) - bn2.running_var.set_data(bn1.running_var.data(ctx)) - - input1 = input.copy() - input2 = input.copy() - - if cuda: - input1 = input.as_in_context(mx.gpu(0)) - ctx_list = [mx.gpu(i) for i in range(num_devices)] - else: - ctx_list = [mx.cpu(0) for _ in range(num_devices)] - - nch = input.shape[1] - bn1 = mx.gluon.nn.BatchNorm(in_channels=nch) - bn2 = mx.gluon.contrib.nn.SyncBatchNorm(in_channels=nch, num_devices=num_devices) - - bn1.initialize(ctx=ctx_list[0]) - bn2.initialize(ctx=ctx_list) - - # 
using the same values for gamma and beta - #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0]) - - input1.attach_grad() - inputs2 = split_and_load(input2, ctx_list, batch_axis=0) - for xi in inputs2: - xi.attach_grad() - - with mx.autograd.record(): - output1 = bn1(input1) - output2 = [bn2(xi) for xi in inputs2] - loss1 = (output1 ** 2).sum() - loss2 = [(output ** 2).sum() for output in output2] - mx.autograd.backward(loss1) - mx.autograd.backward(loss2) - - output2 = mx.nd.concat(*[output.as_in_context(input.context) for output in output2], dim=0) - # assert forwarding - assert_almost_equal(input1.asnumpy(), input2.asnumpy(), atol=1e-3, rtol=1e-3) - assert_almost_equal(output1.asnumpy(), output2.asnumpy(), atol=1e-3, rtol=1e-3) - assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), - _find_bn(bn2).running_mean.data(ctx_list[0]).asnumpy(), - atol=1e-3, rtol=1e-3) - assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(), - _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(), - atol=1e-3, rtol=1e-3) - input2grad = mx.nd.concat(*[output.grad.as_in_context(input.context) for output in inputs2], dim=0) - assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=1e-3, rtol=1e-3) - -def test_sync_batchnorm(): - def get_num_devices(): - for i in range(100): - try: - mx.nd.zeros((1,), ctx=mx.gpu(i)) - except: - return i - # no need to use SyncBN with 1 gpu - if get_num_devices() < 2: - return - ndev = 2 - # check with unsync version - for i in range(10): - _check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)), - num_devices=ndev, cuda=True) - if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 359bbee569f8..08303c816af1 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -77,6 +77,7 @@ def 
test_dequantize_int8_to_float32(): @with_seed() +@unittest.skip('Flaky test, tracked in: https://github.com/apache/incubator-mxnet/issues/11747') def test_requantize_int32_to_int8(): def quantized_int32_to_float(qdata, min_range, max_range): assert qdata.dtype == 'int32' diff --git a/tests/python/train/test_sparse_fm.py b/tests/python/train/test_sparse_fm.py new file mode 100644 index 000000000000..99a22f54cbbd --- /dev/null +++ b/tests/python/train/test_sparse_fm.py @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +import mxnet.ndarray as nd +from mxnet.test_utils import * +import numpy as np + +def test_factorization_machine_module(verbose=False): + """ Test factorization machine model with sparse operators """ + def check_factorization_machine_module(optimizer=None, num_epochs=None): + print("check_factorization_machine_module( {} )".format(optimizer)) + + def fm(factor_size, feature_dim, init): + x = mx.symbol.Variable("data", stype='csr') + v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), + init=init, stype='row_sparse') + + w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), + init=init, stype='row_sparse') + w1_bias = mx.symbol.var('w1_bias', shape=(1)) + w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) + + v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) + x_s = mx.symbol.square(data=x) + bd_sum = mx.sym.dot(x_s, v_s) + + w2 = mx.symbol.dot(x, v) + w2_squared = 0.5 * mx.symbol.square(data=w2) + + w_all = mx.symbol.Concat(w1, w2_squared, dim=1) + sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) + sum2 = 0.5 * mx.symbol.negative(bd_sum) + model = mx.sym.elemwise_add(sum1, sum2) + + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y) + return model + + # model + init = mx.initializer.Normal(sigma=0.01) + factor_size = 4 + feature_dim = 10000 + model = fm(factor_size, feature_dim, init) + + # data iter + num_batches = 5 + batch_size = 64 + num_samples = batch_size * num_batches + # generate some random csr data + csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) + label = mx.nd.ones((num_samples,1)) + # the alternative is to use LibSVMIter + train_iter = mx.io.NDArrayIter(data=csr_nd, + label={'label':label}, + batch_size=batch_size, + last_batch_handle='discard') + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + # allocate memory by given the input data and lable shapes + 
mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + # initialize parameters by uniform random numbers + mod.init_params(initializer=init) + if optimizer == 'sgd': + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + if num_epochs is None: + num_epochs = 10 + expected_accuracy = 0.02 + elif optimizer == 'adam': + # use Sparse Adam to train + adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adam) + if num_epochs is None: + num_epochs = 10 + expected_accuracy = 0.05 + elif optimizer == 'adagrad': + # use Sparse AdaGrad with learning rate 0.1 to train + adagrad = mx.optimizer.AdaGrad(clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adagrad) + if num_epochs is None: + num_epochs = 20 + expected_accuracy = 0.09 + else: + raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") + # use accuracy as the metric + metric = mx.metric.create('MSE') + # train 'num_epochs' epoch + for epoch in range(num_epochs): + train_iter.reset() + metric.reset() + for batch in train_iter: + mod.forward(batch, is_train=True) # compute predictions + mod.update_metric(metric, batch.label) # accumulate prediction accuracy + mod.backward() # compute gradients + mod.update() # update parameters + print('Epoch %d, Training %s' % (epoch, metric.get())) + if num_epochs > 1: + assert(metric.get()[1] < expected_accuracy) + + if verbose is True: + print("============ SGD ==========================") + start = time.clock() + check_factorization_machine_module('sgd') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + print("============ ADAM ==========================") + start = time.clock() + check_factorization_machine_module('adam') + if verbose is 
True: + print("Duration: {}".format(time.clock() - start)) + print("============ ADAGRAD ==========================") + start = time.clock() + check_factorization_machine_module('adagrad') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + +# run as a script +if __name__ == "__main__": + test_factorization_machine_module() diff --git a/tests/python/unittest/test_base.py b/tests/python/unittest/test_base.py new file mode 100644 index 000000000000..3189729e1d10 --- /dev/null +++ b/tests/python/unittest/test_base.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet.base import data_dir +from nose.tools import * +import os +import unittest +import logging +import os.path as op +import platform + +class MXNetDataDirTest(unittest.TestCase): + def setUp(self): + self.mxnet_data_dir = os.environ.get('MXNET_HOME') + if 'MXNET_HOME' in os.environ: + del os.environ['MXNET_HOME'] + + def tearDown(self): + if self.mxnet_data_dir: + os.environ['MXNET_HOME'] = self.mxnet_data_dir + else: + if 'MXNET_HOME' in os.environ: + del os.environ['MXNET_HOME'] + + def test_data_dir(self,): + prev_data_dir = data_dir() + system = platform.system() + if system != 'Windows': + self.assertEqual(data_dir(), op.join(op.expanduser('~'), '.mxnet')) + os.environ['MXNET_HOME'] = '/tmp/mxnet_data' + self.assertEqual(data_dir(), '/tmp/mxnet_data') + del os.environ['MXNET_HOME'] + self.assertEqual(data_dir(), prev_data_dir) + + diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py index 67ed78ee0308..f1188b53d814 100644 --- a/tests/python/unittest/test_contrib_control_flow.py +++ b/tests/python/unittest/test_contrib_control_flow.py @@ -1159,6 +1159,7 @@ def check_contrib_rnn(cell_type, num_states): configs = [ {}, + {'inline_limit': 0}, {'static_alloc': True}, {'static_alloc': True, 'static_shape': True} ] for config in configs: diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py index 630cad87496d..3117f6646481 100644 --- a/tests/python/unittest/test_executor.py +++ b/tests/python/unittest/test_executor.py @@ -18,13 +18,7 @@ import numpy as np import mxnet as mx from common import setup_module, with_seed, teardown - - -def reldiff(a, b): - diff = np.sum(np.abs(a - b)) - norm = np.sum(np.abs(a)) - reldiff = diff / norm - return reldiff +from mxnet.test_utils import assert_almost_equal def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): @@ -64,9 +58,9 @@ def check_bind_with_uniform(uf, gf, dim, 
sf=None, lshape=None, rshape=None): out1 = uf(lhs_arr.asnumpy(), rhs_arr.asnumpy()) out3 = exec3.outputs[0].asnumpy() out4 = exec4.outputs[0].asnumpy() - assert reldiff(out1, out2) < 1e-6 - assert reldiff(out1, out3) < 1e-6 - assert reldiff(out1, out4) < 1e-6 + assert_almost_equal(out1, out2, rtol=1e-5, atol=1e-5) + assert_almost_equal(out1, out3, rtol=1e-5, atol=1e-5) + assert_almost_equal(out1, out4, rtol=1e-5, atol=1e-5) # test gradient out_grad = mx.nd.array(np.ones(out2.shape)) lhs_grad2, rhs_grad2 = gf(out_grad.asnumpy(), @@ -74,8 +68,8 @@ def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): rhs_arr.asnumpy()) executor.backward([out_grad]) - assert reldiff(lhs_grad.asnumpy(), lhs_grad2) < 1e-6 - assert reldiff(rhs_grad.asnumpy(), rhs_grad2) < 1e-6 + assert_almost_equal(lhs_grad.asnumpy(), lhs_grad2, rtol=1e-5, atol=1e-5) + assert_almost_equal(rhs_grad.asnumpy(), rhs_grad2, rtol=1e-5, atol=1e-5) @with_seed(0) @@ -118,12 +112,14 @@ def check_bind(disable_bulk_exec): check_bind(False) -@with_seed(0) +# @roywei: Removing fixed seed as flakiness in this test is fixed +# tracked at https://github.com/apache/incubator-mxnet/issues/11686 +@with_seed() def test_dot(): nrepeat = 10 maxdim = 4 for repeat in range(nrepeat): - s =tuple(np.random.randint(1, 500, size=3)) + s =tuple(np.random.randint(1, 200, size=3)) check_bind_with_uniform(lambda x, y: np.dot(x, y), lambda g, x, y: (np.dot(g, y.T), np.dot(x.T, g)), 2, @@ -131,7 +127,7 @@ def test_dot(): rshape=(s[1], s[2]), sf = mx.symbol.dot) for repeat in range(nrepeat): - s =tuple(np.random.randint(1, 500, size=1)) + s =tuple(np.random.randint(1, 200, size=1)) check_bind_with_uniform(lambda x, y: np.dot(x, y), lambda g, x, y: (g * y, g * x), 2, diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index a9a2904e1e13..4e8241ffc1ea 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -16,7 +16,7 @@ # under the 
License. import mxnet as mx -from mxnet import gluon +from mxnet import gluon, nd import numpy as np import copy from numpy.testing import assert_allclose @@ -25,7 +25,6 @@ from common import assert_raises_cudnn_disabled -@assert_raises_cudnn_disabled() def test_rnn(): cell = gluon.rnn.RNNCell(100, prefix='rnn_') inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] @@ -51,7 +50,6 @@ def test_lstm(): assert outs == [(10, 100), (10, 100), (10, 100)] -@assert_raises_cudnn_disabled() def test_lstm_forget_bias(): forget_bias = 2.0 stack = gluon.rnn.SequentialRNNCell() @@ -77,19 +75,23 @@ def test_lstm_forget_bias(): def test_lstm_cpu_inference(): # should behave the same as lstm cell EXPECTED_LSTM_OUTPUT = np.array([[[0.72045636, 0.72045636, 0.95215213, 0.95215213], - [0.72045636, 0.72045636, 0.95215213, 0.95215213]], - [[0.95215213, 0.95215213, 0.72045636, 0.72045636], - [0.95215213, 0.95215213, 0.72045636, 0.72045636]]]) + [0.72045636, 0.72045636, 0.95215213, 0.95215213]], + [[0.95215213, 0.95215213, 0.72045636, 0.72045636], + [0.95215213, 0.95215213, 0.72045636, 0.72045636]]]) x = mx.nd.ones(shape=(2, 2, 2)) model = mx.gluon.rnn.LSTM(2, num_layers=6, bidirectional=True) + model_cell = model._unfuse() model.initialize(mx.init.One()) + y = model(x).asnumpy() + y_cell = model_cell.unroll(2, x, layout='TNC', merge_outputs=True)[0].asnumpy() + mx.test_utils.assert_almost_equal(y_cell, EXPECTED_LSTM_OUTPUT, + rtol=1e-3, atol=1e-5) mx.test_utils.assert_almost_equal(y, EXPECTED_LSTM_OUTPUT, rtol=1e-3, atol=1e-5) -@assert_raises_cudnn_disabled() def test_gru(): cell = gluon.rnn.GRUCell(100, prefix='rnn_') inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] @@ -241,6 +243,46 @@ def test_bidirectional(): assert outs == [(10, 200), (10, 200), (10, 200)] +@assert_raises_cudnn_disabled() +def test_layer_bidirectional(): + class RefBiLSTM(gluon.Block): + def __init__(self, size, **kwargs): + super(RefBiLSTM, self).__init__(**kwargs) + with self.name_scope(): + 
self._lstm_fwd = gluon.rnn.LSTM(size, bidirectional=False, prefix='l0') + self._lstm_bwd = gluon.rnn.LSTM(size, bidirectional=False, prefix='r0') + + def forward(self, inpt): + fwd = self._lstm_fwd(inpt) + bwd_inpt = nd.flip(inpt, 0) + bwd = self._lstm_bwd(bwd_inpt) + bwd = nd.flip(bwd, 0) + return nd.concat(fwd, bwd, dim=2) + + size = 7 + in_size = 5 + weights = {} + for d in ['l', 'r']: + weights['lstm_{}0_i2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, in_size)) + weights['lstm_{}0_h2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, size)) + weights['lstm_{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,)) + weights['lstm_{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,)) + + net = gluon.rnn.LSTM(size, bidirectional=True, prefix='lstm_') + ref_net = RefBiLSTM(size, prefix='lstm_') + net.initialize() + ref_net.initialize() + net_params = net.collect_params() + ref_net_params = ref_net.collect_params() + for k in weights: + net_params[k].set_data(weights[k]) + ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].set_data(weights[k]) + + data = mx.random.uniform(shape=(3, 10, in_size)) + assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy()) + + + def test_zoneout(): cell = gluon.rnn.ZoneoutCell(gluon.rnn.RNNCell(100, prefix='rnn_'), zoneout_outputs=0.5, zoneout_states=0.5) @@ -341,9 +383,12 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False): layer.collect_params().initialize() inputs.attach_grad() with mx.autograd.record(): - out = layer(inputs, states) + if states is None: + out = layer(inputs) + else: + out = layer(inputs, states) if states is not None: - assert isinstance(out, tuple) and len(out) == 2 + assert isinstance(out, (list, tuple)) and len(out) == 2 out = out[0] else: assert isinstance(out, mx.nd.NDArray) @@ -355,15 +400,19 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False): layer.hybridize() with mx.autograd.record(): - out = layer(inputs, states) if 
states is not None: - assert isinstance(out, tuple) and len(out) == 2 + out = layer(inputs, states) + assert isinstance(out, (list, tuple)) and len(out) == 2 out = out[0] else: + out = layer(inputs) assert isinstance(out, mx.nd.NDArray) out.backward() - layer(inputs, states) # test is_training = false + if states is not None: + layer(inputs, states) # test is_training = false + else: + layer(inputs) if not run_only: mx.test_utils.assert_almost_equal(np_out, out.asnumpy(), rtol=1e-3, atol=1e-5) @@ -393,15 +442,26 @@ def test_rnn_layers(): check_rnn_layer_forward(gluon.rnn.GRU(10, 2, bidirectional=True, dropout=0.5), mx.nd.ones((8, 3, 20)), mx.nd.ones((4, 3, 10)), run_only=True) - net = gluon.nn.Sequential() - net.add(gluon.rnn.LSTM(10, 2, bidirectional=True)) + net = gluon.nn.HybridSequential() + net.add(gluon.rnn.LSTM(10, bidirectional=True)) net.add(gluon.nn.BatchNorm(axis=2)) net.add(gluon.nn.Flatten()) net.add(gluon.nn.Dense(3, activation='relu')) + net.hybridize() net.collect_params().initialize() with mx.autograd.record(): net(mx.nd.ones((2, 3, 10))).backward() + net2 = gluon.nn.HybridSequential() + net2.add(gluon.rnn.LSTM(10, bidirectional=True)) + net2.add(gluon.nn.BatchNorm(axis=2)) + net2.add(gluon.nn.Flatten()) + net2.add(gluon.nn.Dense(3, activation='relu')) + net2.hybridize() + net2.collect_params().initialize() + with mx.autograd.record(): + net2(mx.nd.ones((2, 3, 10))).backward() + def test_rnn_unroll_variant_length(): # Test for imperative usage @@ -487,10 +547,9 @@ def test_cell_fill_shape(): @assert_raises_cudnn_disabled() def test_layer_fill_shape(): layer = gluon.rnn.LSTM(10) - layer.hybridize() check_rnn_layer_forward(layer, mx.nd.ones((3, 2, 7))) print(layer) - assert layer.i2h_weight[0].shape[1] == 7, layer.i2h_weight[0].shape[1] + assert layer.l0_i2h_weight.shape[1] == 7, layer.l0_i2h_weight.shape[1] if __name__ == '__main__': diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 8d5b86341a88..1da6244a4906 
100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -64,7 +64,8 @@ def get_net(num_hidden, flatten=True): fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) return fc3 -@with_seed(1234) +# tracked at: https://github.com/apache/incubator-mxnet/issues/11692 +@with_seed() def test_ce_loss(): nclass = 10 N = 20 @@ -78,11 +79,12 @@ def test_ce_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam') + eval_metric=mx.metric.Loss(), optimizer='adam', + initializer=mx.init.Xavier(magnitude=2)) assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 - -@with_seed(1234) +# tracked at: https://github.com/apache/incubator-mxnet/issues/11691 +@with_seed() def test_bce_loss(): N = 20 data = mx.random.uniform(-1, 1, shape=(N, 20)) @@ -105,7 +107,7 @@ def test_bce_loss(): prob_npy = 1.0 / (1.0 + np.exp(-data.asnumpy())) label_npy = label.asnumpy() npy_bce_loss = - label_npy * np.log(prob_npy) - (1 - label_npy) * np.log(1 - prob_npy) - assert_almost_equal(mx_bce_loss, npy_bce_loss) + assert_almost_equal(mx_bce_loss, npy_bce_loss, rtol=1e-4, atol=1e-5) @with_seed() def test_bce_equal_ce2(): @@ -144,7 +146,7 @@ def test_kl_loss(): assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 -@with_seed(1234) +@with_seed() def test_l2_loss(): N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) @@ -162,7 +164,7 @@ def test_l2_loss(): assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 -@with_seed(1234) +@with_seed() def test_l1_loss(): N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) @@ -207,7 +209,7 @@ def test_ctc_loss(): mx.test_utils.assert_almost_equal(l.asnumpy(), np.array([18.82820702, 16.50581741])) -@with_seed(1234) +@with_seed() def test_ctc_loss_train(): N = 20 data = 
mx.random.uniform(-1, 1, shape=(N, 20, 10)) @@ -225,7 +227,7 @@ def test_ctc_loss_train(): assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 10 -@with_seed(1234) +@with_seed() def test_sample_weight_loss(): nclass = 10 N = 20 @@ -290,7 +292,7 @@ def test_huber_loss(): assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 -@with_seed(1234) +@with_seed() def test_hinge_loss(): N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) @@ -305,10 +307,10 @@ def test_hinge_loss(): mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.06 -@with_seed(1234) +@with_seed() def test_squared_hinge_loss(): N = 20 data = mx.random.uniform(-1, 1, shape=(N, 10)) diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 802988b43297..a21527a5a4ad 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -317,8 +317,9 @@ def create_bucketing_module(key): assert total_bytes_after == total_bytes_before - -@with_seed(11) +# roywei: Getting rid of fixed seed as flakiness could not be reproduced, +# tracked at: https://github.com/apache/incubator-mxnet/issues/11705 +@with_seed() def test_module_set_params(): # data iter data = mx.nd.array([[0.05, .10]]); @@ -381,7 +382,7 @@ def test_module_set_params(): aux_params={}, allow_missing=True, allow_extra=False) -@with_seed(11) +@with_seed() def test_monitor(): # data iter data = mx.nd.array([[0.05, .10]]); @@ -557,11 +558,12 @@ def check_shared_exec_group(sparse_embedding): for opt in sparse_embedding_opt: check_shared_exec_group(opt) -@with_seed(11) -def test_factorization_machine_module(verbose=False): +@with_seed() +def test_factorization_machine_module(): """ Test factorization machine 
model with sparse operators """ - def check_factorization_machine_module(optimizer=None, num_epochs=None): - print("check_factorization_machine_module( {} )".format(optimizer)) + # this unit test is to test the flow, training accuracy is tested in another test + def check_factorization_machine_module(num_epochs=None): + print("check_factorization_machine_module") def fm(factor_size, feature_dim, init): x = mx.symbol.Variable("data", stype='csr') @@ -613,33 +615,16 @@ def fm(factor_size, feature_dim, init): mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) # initialize parameters by uniform random numbers mod.init_params(initializer=init) - if optimizer == 'sgd': - # use Sparse SGD with learning rate 0.1 to train - sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=sgd) - if num_epochs is None: - num_epochs = 10 - expected_accuracy = 0.02 - elif optimizer == 'adam': - # use Sparse Adam to train - adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=adam) - if num_epochs is None: - num_epochs = 10 - expected_accuracy = 0.05 - elif optimizer == 'adagrad': - # use Sparse AdaGrad with learning rate 0.1 to train - adagrad = mx.optimizer.AdaGrad(clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=adagrad) - if num_epochs is None: - num_epochs = 20 - expected_accuracy = 0.09 - else: - raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") - # use accuracy as the metric + + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + if num_epochs is None: + num_epochs = 50 + expected_accuracy = 0.02 + + # use accuracy as the metric metric = mx.metric.create('MSE') # train 
'num_epochs' epoch for epoch in range(num_epochs): @@ -654,23 +639,7 @@ def fm(factor_size, feature_dim, init): if num_epochs > 1: assert(metric.get()[1] < expected_accuracy) - if verbose is True: - print("============ SGD ==========================") - start = time.clock() - check_factorization_machine_module('sgd') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - print("============ ADAM ==========================") - start = time.clock() - check_factorization_machine_module('adam') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - print("============ ADAGRAD ==========================") - start = time.clock() - check_factorization_machine_module('adagrad') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - + check_factorization_machine_module() @with_seed() def test_module_initializer(): diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index e55fa1af90e8..931f805906f0 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -862,7 +862,7 @@ def test_iter(): for i in range(x.size): assert same(y[i].asnumpy(), x[i].asnumpy()) -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. 
tracked at https://github.com/apache/incubator-mxnet/issues/8049") +@with_seed() def test_cached(): sym = mx.sym.Convolution(kernel=(3, 3), num_filter=10) + 2 op = mx.nd.CachedOp(sym) @@ -1308,25 +1308,31 @@ def test_norm(ctx=default_context()): def l1norm(input_data, axis=0, keepdims=False): return np.sum(abs(input_data), axis=axis, keepdims=keepdims) - def l2norm(input_data, axis=0, keepdims=False): + def l2norm(input_data, axis=0, keepdims=False): return sp_norm(input_data, axis=axis, keepdims=keepdims) in_data_dim = random_sample([4,5,6], 1)[0] - in_data_shape = rand_shape_nd(in_data_dim) - np_arr = np.random.uniform(-1, 1, in_data_shape).astype(np.float32) - mx_arr = mx.nd.array(np_arr, ctx=ctx) - for ord in [1,2]: - for keep_dims in [True, False]: - for i in range(4): - npy_out = l1norm(np_arr, i, keep_dims) if ord==1 else l2norm(np_arr, i, keep_dims) - mx_out = mx.nd.norm(mx_arr, ord=ord, axis=i, keepdims=keep_dims) - assert npy_out.shape == mx_out.shape - mx.test_utils.assert_almost_equal(npy_out, mx_out.asnumpy()) - if (i < 3): - npy_out = l1norm(np_arr, (i, i+1), keep_dims) if ord==1 else l2norm(np_arr, (i, i+1), keep_dims) - mx_out = mx.nd.norm(mx_arr, ord=ord, axis=(i, i+1), keepdims=keep_dims) + for force_reduce_dim1 in [True, False]: + in_data_shape = rand_shape_nd(in_data_dim) + if force_reduce_dim1: + in_data_shape = in_data_shape[:3] + (1, ) + in_data_shape[4:] + np_arr = np.random.uniform(-1, 1, in_data_shape).astype(np.float32) + mx_arr = mx.nd.array(np_arr, ctx=ctx) + for ord in [1, 2]: + for keep_dims in [True, False]: + for i in range(4): + npy_out = l1norm(np_arr, i, keep_dims) if ord == 1 else l2norm( + np_arr, i, keep_dims) + mx_out = mx.nd.norm(mx_arr, ord=ord, axis=i, keepdims=keep_dims) assert npy_out.shape == mx_out.shape mx.test_utils.assert_almost_equal(npy_out, mx_out.asnumpy()) + if (i < 3): + npy_out = l1norm(np_arr, (i, i + 1), keep_dims) if ord == 1 else l2norm( + np_arr, (i, i + 1), keep_dims) + mx_out = mx.nd.norm(mx_arr, 
ord=ord, axis=(i, i + 1), keepdims=keep_dims) + assert npy_out.shape == mx_out.shape + mx.test_utils.assert_almost_equal(npy_out, mx_out.asnumpy()) + @with_seed() def test_ndarray_cpu_shared_ctx(): diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 99d635e3565f..90e85d123d59 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1943,11 +1943,11 @@ def test_bxor(a, b): test_bmul(a, b) test_bdiv(a, b) ''' - Flaky Test Disabled due to master build failure: - http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline + Flaky Test Disabled due to master build failure: + http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1248/pipeline Github Issue: https://github.com/apache/incubator-mxnet/issues/11838 - - test_bmod(a, b) + + test_bmod(a, b) ''' test_bmod_int(a, b) test_bpow(a, b) @@ -2065,6 +2065,23 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape): assert np.square(exe.grad_dict['data'].asnumpy() - grad_npy.reshape(src_shape)).mean() < 1E-7, \ 'Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s'\ %(str(src_shape), str(shape_args), str(reverse), str(dst_shape)) + + for i in range(len(src_shape)): + holdout_src_shape = list(src_shape) + holdout_src_shape[i] = 0 + holdout_src_shape = tuple(holdout_src_shape) + net = mx.sym.Variable('data') + net = mx.sym.elemwise_add(net.reshape(shape_args, reverse=reverse), mx.sym.ones(shape=dst_shape)) + input_shape, output_shape, __ = net.infer_shape(data=holdout_src_shape) + assert output_shape[0] == dst_shape, \ + 'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst Shape = %s, ' \ + 'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse), + str(dst_shape), str(output_shape[0])) + assert input_shape[0] == src_shape, \ + 'Holdout Src Shape = %s, Shape Arguments = %s, Reverse = %s, Dst 
Shape = %s, ' \ + 'Output Shape = %s' %(str(holdout_src_shape), str(shape_args), str(reverse), + str(dst_shape), str(output_shape[0])) + # Test new api (Using shape) test_cases = [ [(2, 3, 5, 5), (0, -1), False, (2, 75)], @@ -4946,8 +4963,9 @@ def _make_lower_triangle_symm(a, ndims, m, dtype=np.float32): lt_mask = mx.sym.reshape(lt_mask, shape=shp) return mx.sym.broadcast_mul(a, lt_mask) -# Seed set because the test is not robust enough to operate on random data -@with_seed(42) +# @ankkhedia: Getting rid of fixed seed as flakiness could not be reproduced +# tracked at https://github.com/apache/incubator-mxnet/issues/11718 +@with_seed() def test_laop(): dtype = np.float64 rtol_fw = 1e-7 @@ -5448,8 +5466,9 @@ def test_laop_3(): check_grad(test_syevd_l_4, [a_batch]) -# Seed set because the test is not robust enough to operate on random data -@with_seed(1896893923) +# @piyushghai - Removing the fixed seed for this test. +# Issue for flakiness is tracked at - https://github.com/apache/incubator-mxnet/issues/11721 +@with_seed() def test_laop_4(): # Currently disabled on GPU as syevd needs cuda8 # and MxNet builds use cuda 7.5 @@ -6615,7 +6634,7 @@ def test_diag(): w = np.random.randint(2,9) a_np = np.random.random((h, w)).astype(np.float32) a = mx.nd.array(a_np).astype('float32') - + # k == 0 r = mx.nd.diag(a) assert_almost_equal(r.asnumpy(), np.diag(a_np)) @@ -6658,7 +6677,7 @@ def test_diag(): d = np.random.randint(2,9) a_np = np.random.random((d)) a = mx.nd.array(a_np) - + # k is random k = np.random.randint(-d,d) r = mx.nd.diag(a, k=k) @@ -6725,7 +6744,7 @@ def test_invalid_block_size(): invalid_shape_inp = (n , c, h, w) data = rand_ndarray(invalid_shape_inp, 'default') assertRaises(MXNetError, mx.nd.depth_to_space, data, block) - + test_invalid_depth_dim() test_invalid_space_dim() test_invalid_block_size() @@ -6771,12 +6790,12 @@ def test_invalid_block_size(): invalid_shape_inp = (n, c, h, w) data = rand_ndarray(invalid_shape_inp, 'default') assertRaises(MXNetError, 
mx.nd.space_to_depth, data, block) - + def test_invalid_depth_dim(): invalid_shape_inp = (n, 0, h, w) data = rand_ndarray(invalid_shape_inp, 'default') assertRaises(MXNetError, mx.nd.space_to_depth, data, block) - + test_invalid_space_dim() test_invalid_block_size() test_invalid_depth_dim() diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index d90dfcf856f9..43e9608934e3 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -447,6 +447,7 @@ def test_uniform_generator(): verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs) @with_seed() +@unittest.skip('Flaky test, tracked in: https://github.com/apache/incubator-mxnet/issues/9856') def test_gamma_generator(): ctx = mx.context.current_context() for dtype in ['float16', 'float32', 'float64']: diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py index 508f52301b42..7d3d58010b6e 100644 --- a/tests/python/unittest/test_sparse_ndarray.py +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -514,24 +514,22 @@ def test_sparse_nd_astype_copy(): assert (id(x) == id(y)) -@with_seed(0) +@with_seed() def test_sparse_nd_pickle(): - repeat = 1 dim0 = 40 dim1 = 40 stypes = ['row_sparse', 'csr'] densities = [0, 0.5] stype_dict = {'row_sparse': RowSparseNDArray, 'csr': CSRNDArray} - for _ in range(repeat): - shape = rand_shape_2d(dim0, dim1) - for stype in stypes: - for density in densities: - a, _ = rand_sparse_ndarray(shape, stype, density) - assert isinstance(a, stype_dict[stype]) - data = pkl.dumps(a) - b = pkl.loads(data) - assert isinstance(b, stype_dict[stype]) - assert same(a.asnumpy(), b.asnumpy()) + shape = rand_shape_2d(dim0, dim1) + for stype in stypes: + for density in densities: + a, _ = rand_sparse_ndarray(shape, stype, density) + assert isinstance(a, stype_dict[stype]) + data = pkl.dumps(a) + b = pkl.loads(data) + assert isinstance(b, stype_dict[stype]) + 
assert same(a.asnumpy(), b.asnumpy()) # @kalyc: Getting rid of fixed seed as flakiness could not be reproduced diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 000000000000..0eca73fbb02a --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,3 @@ +# Requirements for tests, those are installed before running on the virtualenv +mock +nose diff --git a/tools/license_header.py b/tools/license_header.py index 0ee4049338b1..7aef33b71213 100755 --- a/tools/license_header.py +++ b/tools/license_header.py @@ -82,7 +82,7 @@ _LANGS = {'.cc':'*', '.h':'*', '.cu':'*', '.cuh':'*', '.py':'#', '.pm':'#', '.scala':'*', '.cc':'*', '.sh':'#', '.cmake':'#', '.java':'*', '.sh':'#', '.cpp':'*', '.hpp':'*', '.c':'*', - '.bat':'rem', '.pl':'#', '.m':'%', '.R':'#', '.mk':'#', '.cfg':'#', '.t':'#'} + '.bat':'rem', '.pl':'#', '.m':'%', '.R':'#', '.mk':'#', '.cfg':'#', '.t':'#', '.ps1': '#'} # Previous license header, which will be removed _OLD_LICENSE = re.compile('.*Copyright.*by Contributors')