diff --git a/benchmark/python/ffi/benchmark_ffi.py b/benchmark/python/ffi/benchmark_ffi.py index 96d8e1d6658f..ee3fccfaa185 100644 --- a/benchmark/python/ffi/benchmark_ffi.py +++ b/benchmark/python/ffi/benchmark_ffi.py @@ -51,6 +51,9 @@ def generate_workloads(): def prepare_workloads(): pool = generate_workloads() OpArgMngr.add_workload("zeros", (2, 2)) + OpArgMngr.add_workload("einsum", "ii", pool['2x2'], optimize=False) + OpArgMngr.add_workload("unique", pool['1'], return_index=True, return_inverse=True, return_counts=True, axis=-1) + OpArgMngr.add_workload("dstack", (pool['2x1'], pool['2x1'], pool['2x1'], pool['2x1'])) OpArgMngr.add_workload("polyval", dnp.arange(10), pool['2x2']) OpArgMngr.add_workload("ediff1d", pool['2x2'], pool['2x2'], pool['2x2']) OpArgMngr.add_workload("nan_to_num", pool['2x2']) diff --git a/python/mxnet/kvstore/kvstore.py b/python/mxnet/kvstore/kvstore.py index eec6aa5453f0..59e5a4dad041 100644 --- a/python/mxnet/kvstore/kvstore.py +++ b/python/mxnet/kvstore/kvstore.py @@ -498,8 +498,9 @@ def set_gradient_compression(self, compression_params): """ Specifies type of low-bit quantization for gradient compression \ and additional arguments depending on the type of compression being used. - The 1bit compression works as follows: values which is above the threshold in the - gradient will be set to +1, whereas values below threshold will be set to -1. + The 1bit compression takes a float `threshold` and works as follows: + values which are above the threshold in the gradient will be set to +1, whereas + values below the threshold will be set to -1. 2bit Gradient Compression takes a positive float `threshold`. 
The technique works by thresholding values such that positive values in the diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index ff0e48d47664..a1b6ff8b5bac 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -951,11 +951,8 @@ def unique(ar, return_index=False, return_inverse=False, return_counts=False, ax >>> u[indices] array([1., 2., 6., 4., 2., 3., 2.]) """ - ret = _npi.unique(ar, return_index, return_inverse, return_counts, axis) - if isinstance(ret, list): - return tuple(ret) - else: - return ret + ret = list(_api_internal.unique(ar, return_index, return_inverse, return_counts, axis)) + return ret[0] if len(ret) == 1 else tuple(ret) @set_module('mxnet.ndarray.numpy') @@ -4374,7 +4371,7 @@ def dstack(arrays): [[2, 3]], [[3, 4]]]) """ - return _npi.dstack(*arrays) + return _api_internal.dstack(*arrays) @set_module('mxnet.ndarray.numpy') @@ -6772,7 +6769,7 @@ def einsum(*operands, **kwargs): subscripts = operands[0] operands = operands[1:] - return _npi.einsum(*operands, subscripts=subscripts, out=out, optimize=int(optimize_arg)) + return _api_internal.einsum(*operands, subscripts, out, int(optimize_arg)) @set_module('mxnet.ndarray.numpy') diff --git a/src/api/operator/numpy/np_einsum_op.cc b/src/api/operator/numpy/np_einsum_op.cc new file mode 100644 index 000000000000..a5b8339a619e --- /dev/null +++ b/src/api/operator/numpy/np_einsum_op.cc @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_einsum_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/np_einsum_op.cc + */ +#include +#include +#include +#include "../utils.h" +#include "../../../operator/numpy/np_einsum_op-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.einsum") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_einsum"); + nnvm::NodeAttrs attrs; + op::NumpyEinsumParam param; + int args_size = args.size(); + // param.num_args + param.num_args = args_size - 3; + // param.subscripts + param.subscripts = args[args_size - 3].operator std::string(); + // param.optimize + param.optimize = args[args_size - 1].operator int(); + + attrs.parsed = std::move(param); + attrs.op = op; + SetAttrDict(&attrs); + + // inputs + int num_inputs = param.num_args; + std::vector inputs_vec(num_inputs, nullptr); + for (int i = 0; i < num_inputs; ++i) { + inputs_vec[i] = args[i].operator mxnet::NDArray*(); + } + NDArray** inputs = inputs_vec.data(); + + // outputs + NDArray* out = args[args_size - 2].operator mxnet::NDArray*(); + NDArray** outputs = out == nullptr ? 
nullptr : &out; + int num_outputs = out != nullptr; + + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, outputs); + if (out) { + *ret = PythonArg(args_size - 2); + } else { + *ret = reinterpret_cast(ndoutputs[0]); + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy/np_matrix_op.cc b/src/api/operator/numpy/np_matrix_op.cc index ae8421ac4010..fdf8e9a081fa 100644 --- a/src/api/operator/numpy/np_matrix_op.cc +++ b/src/api/operator/numpy/np_matrix_op.cc @@ -24,6 +24,7 @@ #include #include #include "../utils.h" +#include "../../../operator/nn/concat-inl.h" #include "../../../operator/tensor/matrix_op-inl.h" #include "../../../operator/numpy/np_matrix_op-inl.h" @@ -49,6 +50,31 @@ MXNET_REGISTER_API("_npi.expand_dims") *ret = ndoutputs[0]; }); +MXNET_REGISTER_API("_npi.dstack") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_dstack"); + nnvm::NodeAttrs attrs; + op::ConcatParam param; + int args_size = args.size(); + // param.num_args + param.num_args = args_size; + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + // inputs + int num_inputs = args_size; + std::vector inputs_vec(args_size, nullptr); + for (int i = 0; i < args_size; ++i) { + inputs_vec[i] = args[i].operator mxnet::NDArray*(); + } + NDArray** inputs = inputs_vec.data(); + // outputs + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + MXNET_REGISTER_API("_npi.split") .set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { using namespace runtime; diff --git a/src/api/operator/numpy/np_unique_op.cc b/src/api/operator/numpy/np_unique_op.cc new file mode 100644 index 000000000000..288260f5dfb2 --- /dev/null +++ b/src/api/operator/numpy/np_unique_op.cc @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file np_unique_op.cc + * \brief Implementation of the API of functions in src/operator/numpy/np_unique_op.cc + */ +#include +#include +#include +#include "../utils.h" +#include "../../../operator/numpy/np_unique_op.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npi.unique") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + const nnvm::Op* op = Op::Get("_npi_unique"); + nnvm::NodeAttrs attrs; + op::NumpyUniqueParam param; + // param + param.return_index = args[1].operator bool(); + param.return_inverse = args[2].operator bool(); + param.return_counts = args[3].operator bool(); + if (args[4].type_code() == kNull) { + param.axis = dmlc::nullopt; + } else { + param.axis = args[4].operator int(); + } + attrs.parsed = std::move(param); + attrs.op = op; + SetAttrDict(&attrs); + // inputs + int num_inputs = 1; + NDArray* inputs[] = {args[0].operator mxnet::NDArray*()}; + // outputs + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + std::vector ndarray_handles; + ndarray_handles.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + ndarray_handles.emplace_back(ndoutputs[i]); + } + *ret = ADT(0, ndarray_handles.begin(), ndarray_handles.end()); +}); + 
+} // namespace mxnet diff --git a/src/kvstore/gradient_compression-inl.h b/src/kvstore/gradient_compression-inl.h index d882d6739b47..7d70dff59617 100644 --- a/src/kvstore/gradient_compression-inl.h +++ b/src/kvstore/gradient_compression-inl.h @@ -60,12 +60,12 @@ struct quantize_1bit { char *block_ptr = reinterpret_cast < char * > (compr_block); // masks used to quantize data const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01}; - for(int i = start; i < end; ++i) { + for (int i = start; i < end; ++i) { // adds offset to reach appropriate byte char *curr_byte = block_ptr + ((i - start) >> 3); // adds gradient to existing residual to get updated grad residual[i] += grad[i]; - if(residual[i] > threshold){ + if (residual[i] > threshold) { // set data to 1 *curr_byte |= bits[(i & 7)]; // reduce residual by 1 diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index 915a3dff907d..86a183dd6688 100644 --- a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -153,12 +153,12 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t } else { LOG(FATAL) << "Unsupported quantization of type " << get_type_str(); } -#else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -#endif } else { LOG(FATAL) << "unknown device mask"; } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif } } @@ -207,12 +207,12 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray } else { LOG(FATAL) << "Unsupported dequantization of type " << get_type_str(); } -#else - LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; -#endif } else { LOG(FATAL) << "unknown device mask"; } +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif } } } // namespace kvstore diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h index ffedba46c1ac..b5505d12ca45 100644 --- a/src/operator/nn/concat-inl.h +++ b/src/operator/nn/concat-inl.h @@ -55,6 +55,13 @@ struct 
ConcatParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(dim).set_default(1) .describe("the dimension to be concated."); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream num_args_s, dim_s; + num_args_s << num_args; + dim_s << dim; + (*dict)["num_args"] = num_args_s.str(); + (*dict)["dim"] = dim_s.str(); + } }; // struct ConcatParam template diff --git a/src/operator/numpy/np_einsum_op-inl.h b/src/operator/numpy/np_einsum_op-inl.h index b89e576bba23..ca80c7bc20be 100644 --- a/src/operator/numpy/np_einsum_op-inl.h +++ b/src/operator/numpy/np_einsum_op-inl.h @@ -384,6 +384,15 @@ struct NumpyEinsumParam: public dmlc::Parameter { DMLC_DECLARE_FIELD(optimize) .set_default(0); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream num_args_s, optimize_s, subscripts_s; + num_args_s << num_args; + optimize_s << optimize; + subscripts_s << subscripts; + (*dict)["num_args"] = num_args_s.str(); + (*dict)["optimize"] = optimize_s.str(); + (*dict)["subscripts"] = subscripts_s.str(); + } }; class EinsumOp { diff --git a/src/operator/numpy/np_unique_op.h b/src/operator/numpy/np_unique_op.h index bc2b6c34c19f..0a121cd69481 100644 --- a/src/operator/numpy/np_unique_op.h +++ b/src/operator/numpy/np_unique_op.h @@ -80,6 +80,17 @@ struct NumpyUniqueParam : public dmlc::Parameter { .set_default(dmlc::optional()) .describe("An integer that represents the axis to operator on."); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream return_index_s, return_inverse_s, return_counts_s, axis_s; + return_index_s << return_index; + return_inverse_s << return_inverse; + return_counts_s << return_counts; + axis_s << axis; + (*dict)["return_index"] = return_index_s.str(); + (*dict)["return_inverse"] = return_inverse_s.str(); + (*dict)["return_counts"] = return_counts_s.str(); + (*dict)["axis"] = axis_s.str(); + } }; } // namespace op diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 
d3964af0bab7..3a8325cfb6e2 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -6542,6 +6542,9 @@ def hybrid_forward(self, F, a): configs = [ ((), True, True, True, None), ((1, ), True, True, True, -1), + ((5, ), False, False, False, 0), + ((5, ), True, False, False, 0), + ((5, ), True, True, False, 0), ((5, ), True, True, True, 0), ((5, ), True, True, True, None), ((5, 4), True, True, True, None), @@ -6562,15 +6565,24 @@ def hybrid_forward(self, F, a): x = np.array(x, dtype=dtype) np_out = _np.unique(x.asnumpy(), *config[1:]) mx_out = test_unique(x) - assert mx_out[0].shape == np_out[0].shape - for i in range(4): - assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5) + if (len(mx_out)) == 1: + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + else: + for i in range(len(mx_out)): + assert mx_out[i].shape == np_out[i].shape + assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5) # Test imperative once again mx_out = np.unique(x, *config[1:]) np_out = _np.unique(x.asnumpy(), *config[1:]) - for i in range(4): - assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5) + if (len(mx_out)) == 1: + assert mx_out.shape == np_out.shape + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + else: + for i in range(len(mx_out)): + assert mx_out[i].shape == np_out[i].shape + assert_almost_equal(mx_out[i].asnumpy(), np_out[i], rtol=1e-3, atol=1e-5) @with_seed()