diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index 15709e5bc284..4d9f4f987d77 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -56,11 +56,11 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
     .describe("Turn off cudnn pooling and use MXNet pooling operator. ");
 
-    DMLC_DECLARE_FIELD(kernel)
+    DMLC_DECLARE_FIELD(kernel).set_default(TShape())  // add default value here
     .enforce_nonzero()
     .describe("Pooling kernel size: (y, x) or (d, y, x)");
 
-    DMLC_DECLARE_FIELD(pool_type)
+    DMLC_DECLARE_FIELD(pool_type).set_default(pool_enum::kMaxPooling)  // add default pooling method
     .add_enum("max", pool_enum::kMaxPooling)
     .add_enum("avg", pool_enum::kAvgPooling)
     .add_enum("sum", pool_enum::kSumPooling)
@@ -132,19 +132,23 @@ class PoolingOp {
     using namespace mshadow;
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const TShape& ishape = in_data.shape_;
+    TShape kernel = param_.kernel;
     TShape padding = param_.pad;
+    TShape stride = param_.stride;
     if (param_.global_pool) {
-      for (index_t i = 0; i < padding.ndim(); i++) {
+      kernel = TShape(ishape.data() + 2,
+                      ishape.data() + ishape.ndim());
+      padding = TShape(ishape.ndim() - 2);
+      for (index_t i = 0; i < ishape.ndim() - 2; i++) {
         padding[i] = 0;
       }
+      stride = TShape(ishape.ndim() - 2);
     }
 
     pool(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
-         param_.global_pool?
-           TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
-           : param_.kernel,
+         kernel,
          padding,
-         param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
+         stride,
          param_.pool_type, req, out_data.dptr<DType>());
   }
@@ -154,20 +158,24 @@ class PoolingOp {
     using namespace mshadow;
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const TShape& ishape = in_data.shape_;
+    TShape kernel = param_.kernel;
     TShape padding = param_.pad;
+    TShape stride = param_.stride;
     if (param_.global_pool) {
-      for (index_t i = 0; i < padding.ndim(); i++) {
+      kernel = TShape(ishape.data() + 2,
+                      ishape.data() + ishape.ndim());
+      padding = TShape(ishape.ndim() - 2);
+      for (index_t i = 0; i < ishape.ndim() - 2; i++) {
         padding[i] = 0;
       }
+      stride = TShape(ishape.ndim() - 2);
     }
 
     unpool(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
            in_grad.shape_, out_grad.shape_,
-           param_.global_pool?
-             TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
-             : param_.kernel,
+           kernel,
            padding,
-           param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
+           stride,
            param_.pool_type, req, in_grad.dptr<DType>());
   }
@@ -178,6 +186,11 @@ class PoolingOp {
 template<typename xpu, typename DType>
 PoolingOp<xpu, DType> &GetPoolingOp(const PoolingParam &param) {
   static thread_local PoolingOp<xpu, DType> op;
+  // check if filter size assigned correctly
+  if (param.global_pool == false) {
+    CHECK_GT(param.kernel.ndim(), 0U)
+        << "You need to set the kernel size if global pooling is not used";
+  }
   op.Init(param);
   return op;
 }
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index f719e0753e08..7ee655fbab9c 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -46,15 +46,14 @@ static void PoolingParamParser(nnvm::NodeAttrs *attrs) {
     if (param.stride.ndim() == 0) param.stride = Shape2(1, 1);
     if (param.pad.ndim() == 0) param.pad = Shape2(0, 0);
   } else {
-    CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim()
-                                      << "D pooling not supported";
+    // ignore kernel size only if global_pool not assigned false
+    if (param.global_pool == false) {
+      CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim()
+                                        << "D pooling not supported";
+    }
     if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1);
     if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0);
   }
-  CHECK_EQ(param.stride.ndim(), param.kernel.ndim())
-      << "stride and kernel should have the same length";
-  CHECK_EQ(param.pad.ndim(), param.kernel.ndim())
-      << "pad and kernel should have the same length";
   attrs->parsed = std::move(param);
 }
@@ -98,28 +97,37 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
       << "Pooling: Input data should be 3D in (batch, channel, x)"
       << " Or 4D in (batch, channel, y, x) "
       << " Or 5D in (batch, channel, d, y, x)";
+  CHECK_LE(dshape.ndim(), 5U)
+      << "Pooling: Input data should be 3D in (batch, channel, x)"
+      << " Or 4D in (batch, channel, y, x) "
+      << " Or 5D in (batch, channel, d, y, x)";
   TShape oshape = dshape;
   if (dshape.ndim() == 0) return false;
-  if (param.kernel.ndim() == 1) {
+  if (param.global_pool) {
+    for (size_t i{2}; i < dshape.ndim(); i++)
+      oshape[i] = 1;
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
+      out_shape->push_back(oshape);  // for workspace
+#endif
+  } else if (param.kernel.ndim() == 1) {
     CHECK_EQ(dshape.ndim(), 3U)
         << "Pooling: Input data should be 3D in (batch, channel, x)";
-    if (param.global_pool) {
-      oshape[2] = 1;
+    CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
+        << "kernel size (" << param.kernel[0] << ") exceeds input ("
+        << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
+        << ")";
+    if (param.pooling_convention == pool_enum::kValid) {
+      oshape[2] = 1 +
+                  (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
+                      param.stride[0];
     } else {
-      CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
-          << "kernel size (" << param.kernel[0] << ") exceeds input ("
-          << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
-          << ")";
-      if (param.pooling_convention == pool_enum::kValid) {
-        oshape[2] = 1 +
-                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
-                        param.stride[0];
-      } else {
-        oshape[2] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
-                                               param.kernel[0]) /
-                            param.stride[0]));
-      }
+      oshape[2] = 1 + static_cast<int>(ceil(
+                          static_cast<float>(dshape[2] + 2 * param.pad[0] -
+                                             param.kernel[0]) /
+                          param.stride[0]));
     }
     out_shape->clear();
     out_shape->push_back(oshape);  // save output shape
@@ -130,35 +138,30 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
   } else if (param.kernel.ndim() == 2) {
     CHECK_EQ(dshape.ndim(), 4U)
         << "Pooling: Input data should be 4D in (batch, channel, y, x)";
-    if (param.global_pool) {
-      oshape[2] = 1;
-      oshape[3] = 1;
+    CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
+        << "kernel size (" << param.kernel[0] << ") exceeds input ("
+        << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
+        << ")";
+    CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1])
+        << "kernel size (" << param.kernel[1] << ") exceeds input ("
+        << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1])
+        << ")";
+    if (param.pooling_convention == pool_enum::kValid) {
+      oshape[2] = 1 +
+                  (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
+                      param.stride[0];
+      oshape[3] = 1 +
+                  (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
+                      param.stride[1];
     } else {
-      CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
-          << "kernel size (" << param.kernel[0] << ") exceeds input ("
-          << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
-          << ")";
-      CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1])
-          << "kernel size (" << param.kernel[1] << ") exceeds input ("
-          << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1])
-          << ")";
-      if (param.pooling_convention == pool_enum::kValid) {
-        oshape[2] = 1 +
-                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
-                        param.stride[0];
-        oshape[3] = 1 +
-                    (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
-                        param.stride[1];
-      } else {
-        oshape[2] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
-                                               param.kernel[0]) /
-                            param.stride[0]));
-        oshape[3] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[3] + 2 * param.pad[1] -
-                                               param.kernel[1]) /
-                            param.stride[1]));
-      }
+      oshape[2] = 1 + static_cast<int>(ceil(
+                          static_cast<float>(dshape[2] + 2 * param.pad[0] -
+                                             param.kernel[0]) /
+                          param.stride[0]));
+      oshape[3] = 1 + static_cast<int>(ceil(
+                          static_cast<float>(dshape[3] + 2 * param.pad[1] -
+                                             param.kernel[1]) /
+                          param.stride[1]));
     }
     out_shape->clear();
     out_shape->push_back(oshape);  // save output shape
@@ -175,35 +178,29 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
         << "kernel size exceeds input";
     CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2])
         << "kernel size exceeds input";
-    if (param.global_pool) {
-      oshape[2] = 1;
-      oshape[3] = 1;
-      oshape[4] = 1;
+    if (param.pooling_convention == pool_enum::kValid) {
+      oshape[2] = 1 +
+                  (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
+                      param.stride[0];
+      oshape[3] = 1 +
+                  (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
+                      param.stride[1];
+      oshape[4] = 1 +
+                  (dshape[4] + 2 * param.pad[2] - param.kernel[2]) /
+                      param.stride[2];
     } else {
-      if (param.pooling_convention == pool_enum::kValid) {
-        oshape[2] = 1 +
-                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
-                        param.stride[0];
-        oshape[3] = 1 +
-                    (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
-                        param.stride[1];
-        oshape[4] = 1 +
-                    (dshape[4] + 2 * param.pad[2] - param.kernel[2]) /
-                        param.stride[2];
-      } else {
-        oshape[2] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
-                                               param.kernel[0]) /
-                            param.stride[0]));
-        oshape[3] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[3] + 2 * param.pad[1] -
-                                               param.kernel[1]) /
-                            param.stride[1]));
-        oshape[4] = 1 + static_cast<int>(ceil(
-                            static_cast<float>(dshape[4] + 2 * param.pad[2] -
-                                               param.kernel[2]) /
-                            param.stride[2]));
-      }
+      oshape[2] = 1 + static_cast<int>(ceil(
+                          static_cast<float>(dshape[2] + 2 * param.pad[0] -
+                                             param.kernel[0]) /
+                          param.stride[0]));
+      oshape[3] = 1 + static_cast<int>(ceil(
+                          static_cast<float>(dshape[3] + 2 * param.pad[1] -
+                                             param.kernel[1]) /
+                          param.stride[1]));
+      oshape[4] = 1 + static_cast<int>(ceil(
+                          static_cast<float>(dshape[4] + 2 * param.pad[2] -
+                                             param.kernel[2]) /
+                          param.stride[2]));
     }
 
     out_shape->clear();
diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h
index 4934dbeb4544..0a663265cbe7 100644
--- a/src/operator/pooling_v1-inl.h
+++ b/src/operator/pooling_v1-inl.h
@@ -55,19 +55,19 @@ struct PoolingV1Param : public dmlc::Parameter<PoolingV1Param> {
   int pooling_convention;
   bool global_pool;
   DMLC_DECLARE_PARAMETER(PoolingV1Param) {
-    DMLC_DECLARE_FIELD(global_pool).set_default(false)
-    .describe("Ignore kernel size, do global pooling based on current input feature map. ");
-
-    DMLC_DECLARE_FIELD(kernel)
+    DMLC_DECLARE_FIELD(kernel).set_default(TShape())
     .enforce_nonzero()
     .describe("pooling kernel size: (y, x) or (d, y, x)");
 
-    DMLC_DECLARE_FIELD(pool_type)
+    DMLC_DECLARE_FIELD(pool_type).set_default(pool_v1_enum::kMaxPooling)
     .add_enum("max", pool_v1_enum::kMaxPooling)
     .add_enum("avg", pool_v1_enum::kAvgPooling)
     .add_enum("sum", pool_v1_enum::kSumPooling)
     .describe("Pooling type to be applied.");
 
+    DMLC_DECLARE_FIELD(global_pool).set_default(false)
+    .describe("Ignore kernel size, do global pooling based on current input feature map. ");
+
     DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_v1_enum::kValid)
     .add_enum("full", pool_v1_enum::kFull)
     .add_enum("valid", pool_v1_enum::kValid)
@@ -105,8 +105,10 @@ class PoolingV1Op : public Operator {
     // reset padding size for global pooling
     TShape padding = param_.pad;
+    // TShape kernel = param_.kernel;
     if (param_.global_pool) {
       padding[0] = padding[1] = 0;
+      // kernel[0] = kernel[1] = 0;
     }
     Tensor<xpu, 4, DType> data = in_data[pool_v1_enum::kData].get<xpu, 4, DType>(s);
@@ -215,18 +217,20 @@ class PoolingV1Prop : public OperatorProperty {
   void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
     using namespace mshadow;
     param_.Init(kwargs);
-    if (param_.kernel.ndim() == 2) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-    } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported";
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+    if (!param_.global_pool) {
+      if (param_.kernel.ndim() == 2) {
+        if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+        if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+      } else {
+        CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported";
+        if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+        if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+      }
+      CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
+          << "stride and kernel should have the same length";
+      CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
+          << "pad and kernel should have the same length";
     }
-    CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
-        << "stride and kernel should have the same length";
-    CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
-        << "pad and kernel should have the same length";
   }
 
   std::map<std::string, std::string> GetParams() const override {
@@ -240,34 +244,43 @@ class PoolingV1Prop : public OperatorProperty {
     const TShape &dshape = (*in_shape)[0];
     CHECK_GE(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
                                 << "Or 5D in (batch, channel, d, y, x)";
+    CHECK_LE(dshape.ndim(), 5U) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
+                                << "Or 5D in (batch, channel, d, y, x)";
     TShape oshape = dshape;
     if (dshape.ndim() == 0) return false;
-    if (param_.kernel.ndim() == 2) {
-      CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
-      if (param_.global_pool) {
+    if (param_.global_pool) {
+      if (dshape.ndim() == 4) {
         oshape[2] = 1;
         oshape[3] = 1;
       } else {
-        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
-            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
-            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
-        CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
-            << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
-            << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
-        if (param_.pooling_convention == pool_v1_enum::kValid) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+        oshape[4] = 1;
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);
+    } else if (param_.kernel.ndim() == 2) {
+      CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+      CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+          << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
+          << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
+      CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
+          << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
+          << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
+      if (param_.pooling_convention == pool_v1_enum::kValid) {
         oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
         oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
-        } else {
+      } else {
         oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
                             dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0]));
         oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
                             dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1]));
-        }
       }
+
       out_shape->clear();
       out_shape->push_back(oshape);
     } else if (param_.kernel.ndim() == 3) {
@@ -275,19 +288,14 @@ class PoolingV1Prop : public OperatorProperty {
       CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
       CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
       CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-        oshape[4] = 1;
-      } else {
-        if (param_.pooling_convention == pool_v1_enum::kValid) {
+      if (param_.pooling_convention == pool_v1_enum::kValid) {
          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / param_.stride[2];
-        } else {
+      } else {
          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
                              dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
                              param_.stride[0]));
@@ -297,7 +305,6 @@ class PoolingV1Prop : public OperatorProperty {
          oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
                              dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
                              param_.stride[2]));
-        }
       }
 
       out_shape->clear();
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index cb422e2263af..bc0249129d3e 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -904,81 +904,126 @@ def test_1d_pooling(pool_type):
         kernel = (4,)
         pad = (2,)
         stride = (2,)
-
+
         ctx_list = []
         sym_list = []
-
+
         pooling_convention = 'valid'
-
+
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
+
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
+
+        ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, name='pool'))
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
-
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
-
+
+        ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=True, name='pool'))
-
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=True, name='pool'))
-
+
+        ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, cudnn_off=True, name='pool'))
+
         check_consistency(sym_list, ctx_list)
-
+
     def test_2d_pooling(pool_type):
         data = (2, 3, 20, 20)
         kernel = (4, 4)
         pad = (2, 2)
         stride = (2, 2)
-
+
         ctx_list = []
         sym_list = []
-
+
        pooling_convention = 'valid'
-
+
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                           pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
+
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type,
                                           pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
+
+        ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling_v1(pool_type=pool_type,
+                                          pooling_convention=pooling_convention, global_pool=True, name='pool'))
+
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
+
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
+        ## no kernel indicated cpu
+        ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, name='pool'))
+
+        ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, name='pool'))
+        ###
+        ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, name='pool'))
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
-
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
-
+        ## no kernel indicated gpu
+        ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
+
+        ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
+        ###
+
+        ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, cudnn_off=False, name='pool'))
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=True, name='pool'))
-
+
         ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, cudnn_off=True, name='pool'))
-
+
+        ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
+        sym_list.append(mx.sym.Pooling(pool_type=pool_type,
+                                       pooling_convention=pooling_convention, global_pool=True, cudnn_off=True, name='pool'))
+
         check_consistency(sym_list, ctx_list)
 
     test_1d_pooling('max')
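Below is a small usage sketch (not part of the patch) of the behaviour the change enables: with global_pool=True, the kernel argument can now be omitted entirely. It assumes the MXNet 1.x symbol/executor API exercised by the tests above; the variable names are illustrative only.

    import mxnet as mx
    import numpy as np

    # Global pooling without specifying a kernel size, which this patch makes legal.
    data = mx.sym.Variable('data')
    pool = mx.sym.Pooling(data=data, pool_type='max', global_pool=True, name='pool')

    # Bind and run on CPU; each channel collapses to a single value.
    x = np.random.rand(2, 3, 20, 20).astype(np.float32)
    exe = pool.simple_bind(ctx=mx.cpu(), data=x.shape)
    out = exe.forward(is_train=False, data=mx.nd.array(x))[0]
    print(out.shape)  # expected: (2, 3, 1, 1)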