ptrendx closed pull request #13362: Add NHWC layout support to Pooling (cuDNN
only)
URL: https://github.com/apache/incubator-mxnet/pull/13362
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
diff --git a/python/mxnet/gluon/nn/conv_layers.py
b/python/mxnet/gluon/nn/conv_layers.py
index 5f20d20c02a..427ceb3bbf9 100644
--- a/python/mxnet/gluon/nn/conv_layers.py
+++ b/python/mxnet/gluon/nn/conv_layers.py
@@ -673,7 +673,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1,
1), padding=(0, 0, 0),
class _Pooling(HybridBlock):
"""Abstract class for different pooling layers."""
def __init__(self, pool_size, strides, padding, ceil_mode, global_pool,
- pool_type, count_include_pad=None, **kwargs):
+ pool_type, layout, count_include_pad=None, **kwargs):
super(_Pooling, self).__init__(**kwargs)
if strides is None:
strides = pool_size
@@ -684,6 +684,7 @@ def __init__(self, pool_size, strides, padding, ceil_mode,
global_pool,
self._kwargs = {
'kernel': pool_size, 'stride': strides, 'pad': padding,
'global_pool': global_pool, 'pool_type': pool_type,
+ 'layout': layout,
'pooling_convention': 'full' if ceil_mode else 'valid'}
if count_include_pad is not None:
self._kwargs['count_include_pad'] = count_include_pad
@@ -695,7 +696,8 @@ def hybrid_forward(self, F, x):
return F.Pooling(x, name='fwd', **self._kwargs)
def __repr__(self):
- s = '{name}(size={kernel}, stride={stride}, padding={pad},
ceil_mode={ceil_mode})'
+ s = '{name}(size={kernel}, stride={stride}, padding={pad},
ceil_mode={ceil_mode}'
+ s += ', global_pool={global_pool}, pool_type={pool_type},
layout={layout})'
return s.format(name=self.__class__.__name__,
ceil_mode=self._kwargs['pooling_convention'] == 'full',
**self._kwargs)
@@ -716,7 +718,8 @@ class MaxPool1D(_Pooling):
If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.
layout : str, default 'NCW'
- Dimension ordering of data and weight. Only supports 'NCW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCW' and 'NWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
respectively. Pooling is applied on the W dimension.
ceil_mode : bool, default False
@@ -738,12 +741,13 @@ class MaxPool1D(_Pooling):
"""
def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW',
ceil_mode=False, **kwargs):
- assert layout == 'NCW', "Only supports 'NCW' layout for now"
+ assert layout in ('NCW', 'NWC'),\
+ "Only NCW and NWC layouts are valid for 1D Pooling"
if isinstance(pool_size, numeric_types):
pool_size = (pool_size,)
assert len(pool_size) == 1, "pool_size must be a number or a list of 1
ints"
super(MaxPool1D, self).__init__(
- pool_size, strides, padding, ceil_mode, False, 'max', **kwargs)
+ pool_size, strides, padding, ceil_mode, False, 'max', layout,
**kwargs)
class MaxPool2D(_Pooling):
@@ -761,7 +765,8 @@ class MaxPool2D(_Pooling):
If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.
layout : str, default 'NCHW'
- Dimension ordering of data and weight. Only supports 'NCHW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCHW' and 'NHWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'H', 'W' stands for batch, channel, height, and width
dimensions respectively. padding is applied on 'H' and 'W' dimension.
ceil_mode : bool, default False
@@ -786,12 +791,13 @@ class MaxPool2D(_Pooling):
"""
def __init__(self, pool_size=(2, 2), strides=None, padding=0,
layout='NCHW',
ceil_mode=False, **kwargs):
- assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
+ assert layout in ('NCHW', 'NHWC'),\
+ "Only NCHW and NHWC layouts are valid for 2D Pooling"
if isinstance(pool_size, numeric_types):
pool_size = (pool_size,)*2
assert len(pool_size) == 2, "pool_size must be a number or a list of 2
ints"
super(MaxPool2D, self).__init__(
- pool_size, strides, padding, ceil_mode, False, 'max', **kwargs)
+ pool_size, strides, padding, ceil_mode, False, 'max', layout,
**kwargs)
class MaxPool3D(_Pooling):
@@ -809,7 +815,8 @@ class MaxPool3D(_Pooling):
If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.
layout : str, default 'NCDHW'
- Dimension ordering of data and weight. Only supports 'NCDHW' layout
for now.
+ Dimension ordering of data and weight. Only supports 'NCDHW' and
'NDHWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and
depth dimensions respectively. padding is applied on 'D', 'H' and 'W'
dimension.
@@ -836,12 +843,13 @@ class MaxPool3D(_Pooling):
"""
def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
ceil_mode=False, layout='NCDHW', **kwargs):
- assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
+ assert layout in ('NCDHW', 'NDHWC'),\
+ "Only NCDHW and NDHWC layouts are valid for 3D Pooling"
if isinstance(pool_size, numeric_types):
pool_size = (pool_size,)*3
assert len(pool_size) == 3, "pool_size must be a number or a list of 3
ints"
super(MaxPool3D, self).__init__(
- pool_size, strides, padding, ceil_mode, False, 'max', **kwargs)
+ pool_size, strides, padding, ceil_mode, False, 'max', layout,
**kwargs)
class AvgPool1D(_Pooling):
@@ -858,7 +866,8 @@ class AvgPool1D(_Pooling):
If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.
layout : str, default 'NCW'
- Dimension ordering of data and weight. Only supports 'NCW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCW' or 'NWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
respectively. padding is applied on 'W' dimension.
ceil_mode : bool, default False
@@ -882,12 +891,14 @@ class AvgPool1D(_Pooling):
"""
def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW',
ceil_mode=False, count_include_pad=True, **kwargs):
- assert layout == 'NCW', "Only supports 'NCW' layout for now"
+ assert layout in ('NCW', 'NWC'),\
+ "Only NCW and NWC layouts are valid for 1D Pooling"
if isinstance(pool_size, numeric_types):
pool_size = (pool_size,)
assert len(pool_size) == 1, "pool_size must be a number or a list of 1
ints"
super(AvgPool1D, self).__init__(
- pool_size, strides, padding, ceil_mode, False, 'avg',
count_include_pad, **kwargs)
+ pool_size, strides, padding, ceil_mode, False, 'avg', layout,
count_include_pad,
+ **kwargs)
class AvgPool2D(_Pooling):
@@ -904,7 +915,8 @@ class AvgPool2D(_Pooling):
If padding is non-zero, then the input is implicitly
zero-padded on both sides for padding number of points.
layout : str, default 'NCHW'
- Dimension ordering of data and weight. Only supports 'NCHW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'H', 'W' stands for batch, channel, height, and width
dimensions respectively. padding is applied on 'H' and 'W' dimension.
ceil_mode : bool, default False
@@ -931,12 +943,14 @@ class AvgPool2D(_Pooling):
"""
def __init__(self, pool_size=(2, 2), strides=None, padding=0,
ceil_mode=False, layout='NCHW', count_include_pad=True,
**kwargs):
- assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
+ assert layout in ('NCHW', 'NHWC'),\
+ "Only NCHW and NHWC layouts are valid for 2D Pooling"
if isinstance(pool_size, numeric_types):
pool_size = (pool_size,)*2
assert len(pool_size) == 2, "pool_size must be a number or a list of 2
ints"
super(AvgPool2D, self).__init__(
- pool_size, strides, padding, ceil_mode, False, 'avg',
count_include_pad, **kwargs)
+ pool_size, strides, padding, ceil_mode, False, 'avg', layout,
count_include_pad,
+ **kwargs)
class AvgPool3D(_Pooling):
@@ -982,12 +996,14 @@ class AvgPool3D(_Pooling):
"""
def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0,
ceil_mode=False, layout='NCDHW', count_include_pad=True,
**kwargs):
- assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
+ assert layout in ('NCDHW', 'NDHWC'),\
+ "Only NCDHW and NDHWC layouts are valid for 3D Pooling"
if isinstance(pool_size, numeric_types):
pool_size = (pool_size,)*3
assert len(pool_size) == 3, "pool_size must be a number or a list of 3
ints"
super(AvgPool3D, self).__init__(
- pool_size, strides, padding, ceil_mode, False, 'avg',
count_include_pad, **kwargs)
+ pool_size, strides, padding, ceil_mode, False, 'avg', layout,
count_include_pad,
+ **kwargs)
class GlobalMaxPool1D(_Pooling):
@@ -997,7 +1013,8 @@ class GlobalMaxPool1D(_Pooling):
Parameters
----------
layout : str, default 'NCW'
- Dimension ordering of data and weight. Only supports 'NCW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCW' or 'NWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
respectively. Pooling is applied on the W dimension.
@@ -1011,9 +1028,10 @@ class GlobalMaxPool1D(_Pooling):
when `layout` is `NCW`.
"""
def __init__(self, layout='NCW', **kwargs):
- assert layout == 'NCW', "Only supports 'NCW' layout for now"
+ assert layout in ('NCW', 'NWC'),\
+ "Only NCW and NWC layouts are valid for 1D Pooling"
super(GlobalMaxPool1D, self).__init__(
- (1,), None, 0, True, True, 'max', **kwargs)
+ (1,), None, 0, True, True, 'max', layout, **kwargs)
class GlobalMaxPool2D(_Pooling):
@@ -1023,7 +1041,8 @@ class GlobalMaxPool2D(_Pooling):
Parameters
----------
layout : str, default 'NCHW'
- Dimension ordering of data and weight. Only supports 'NCHW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'H', 'W' stands for batch, channel, height, and width
dimensions respectively. padding is applied on 'H' and 'W' dimension.
@@ -1038,9 +1057,10 @@ class GlobalMaxPool2D(_Pooling):
`(batch_size, channels, 1, 1)` when `layout` is `NCHW`.
"""
def __init__(self, layout='NCHW', **kwargs):
- assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
+ assert layout in ('NCHW', 'NHWC'),\
+ "Only NCHW and NHWC layouts are valid for 2D Pooling"
super(GlobalMaxPool2D, self).__init__(
- (1, 1), None, 0, True, True, 'max', **kwargs)
+ (1, 1), None, 0, True, True, 'max', layout, **kwargs)
class GlobalMaxPool3D(_Pooling):
@@ -1050,7 +1070,8 @@ class GlobalMaxPool3D(_Pooling):
Parameters
----------
layout : str, default 'NCDHW'
- Dimension ordering of data and weight. Only supports 'NCDHW' layout
for now.
+ Dimension ordering of data and weight. Only supports 'NCDHW' or 'NDHWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and
depth dimensions respectively. padding is applied on 'D', 'H' and 'W'
dimension.
@@ -1066,9 +1087,10 @@ class GlobalMaxPool3D(_Pooling):
`(batch_size, channels, 1, 1, 1)` when `layout` is `NCDHW`.
"""
def __init__(self, layout='NCDHW', **kwargs):
- assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
+ assert layout in ('NCDHW', 'NDHWC'),\
+ "Only NCDHW and NDHWC layouts are valid for 3D Pooling"
super(GlobalMaxPool3D, self).__init__(
- (1, 1, 1), None, 0, True, True, 'max', **kwargs)
+ (1, 1, 1), None, 0, True, True, 'max', layout, **kwargs)
class GlobalAvgPool1D(_Pooling):
@@ -1077,7 +1099,8 @@ class GlobalAvgPool1D(_Pooling):
Parameters
----------
layout : str, default 'NCW'
- Dimension ordering of data and weight. Only supports 'NCW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCW' or 'NWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'W' stands for batch, channel, and width (time) dimensions
respectively. padding is applied on 'W' dimension.
@@ -1090,9 +1113,10 @@ class GlobalAvgPool1D(_Pooling):
- **out**: 3D output tensor with shape `(batch_size, channels, 1)`.
"""
def __init__(self, layout='NCW', **kwargs):
- assert layout == 'NCW', "Only supports 'NCW' layout for now"
+ assert layout in ('NCW', 'NWC'),\
+ "Only NCW and NWC layouts are valid for 1D Pooling"
super(GlobalAvgPool1D, self).__init__(
- (1,), None, 0, True, True, 'avg', **kwargs)
+ (1,), None, 0, True, True, 'avg', layout, **kwargs)
class GlobalAvgPool2D(_Pooling):
@@ -1101,7 +1125,8 @@ class GlobalAvgPool2D(_Pooling):
Parameters
----------
layout : str, default 'NCHW'
- Dimension ordering of data and weight. Only supports 'NCHW' layout for
now.
+ Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC'
+ (only with cuDNN) layouts for now.
'N', 'C', 'H', 'W' stands for batch, channel, height, and width
dimensions respectively.
@@ -1116,9 +1141,10 @@ class GlobalAvgPool2D(_Pooling):
`(batch_size, channels, 1, 1)` when `layout` is `NCHW`.
"""
def __init__(self, layout='NCHW', **kwargs):
- assert layout == 'NCHW', "Only supports 'NCHW' layout for now"
+ assert layout in ('NCHW', 'NHWC'),\
+ "Only NCHW and NHWC layouts are valid for 2D Pooling"
super(GlobalAvgPool2D, self).__init__(
- (1, 1), None, 0, True, True, 'avg', **kwargs)
+ (1, 1), None, 0, True, True, 'avg', layout, **kwargs)
class GlobalAvgPool3D(_Pooling):
@@ -1143,9 +1169,10 @@ class GlobalAvgPool3D(_Pooling):
`(batch_size, channels, 1, 1, 1)` when `layout` is `NCDHW`.
"""
def __init__(self, layout='NCDHW', **kwargs):
- assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now"
+ assert layout in ('NCDHW', 'NDHWC'),\
+ "Only NCDHW and NDHWC layouts are valid for 3D Pooling"
super(GlobalAvgPool3D, self).__init__(
- (1, 1, 1), None, 0, True, True, 'avg', **kwargs)
+ (1, 1, 1), None, 0, True, True, 'avg', layout, **kwargs)
class ReflectionPad2D(HybridBlock):
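
With this change the Gluon pooling blocks accept a channels-last layout and pass it through to the backend operator. A minimal usage sketch (illustrative shapes; assumes a CUDA/cuDNN build with mx.gpu(0) available):

    import mxnet as mx
    from mxnet.gluon import nn

    # Channels-last (NHWC) max pooling; the layout string is forwarded to the
    # backend Pooling operator, which dispatches to cuDNN on the GPU.
    pool = nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2), layout='NHWC')

    x = mx.nd.random.uniform(shape=(1, 32, 32, 8), ctx=mx.gpu(0))  # N, H, W, C
    y = pool(x)
    print(y.shape)  # (1, 16, 16, 8)
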
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h
b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index 89fa73ef547..dcd9546c807 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -63,7 +63,7 @@ class CuDNNPoolingOp {
}
break;
default:
- LOG(FATAL) << "Not implmented";
+ LOG(FATAL) << "Not implemented";
}
}
@@ -73,7 +73,8 @@ class CuDNNPoolingOp {
CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
}
- void Forward(const OpContext &ctx, const TBlob &in_data,
+ // Return boolean saying whether pooling configuration is supported.
+ bool Forward(const OpContext &ctx, const TBlob &in_data,
const OpReqType &req, const TBlob &out_data) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -81,7 +82,8 @@ class CuDNNPoolingOp {
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
typename DataType<DType>::ScaleType alpha = 1.0f;
typename DataType<DType>::ScaleType beta = 0.0f;
- this->Init(s, in_data, out_data);
+ if (!this->Init(s, in_data, out_data))
+ return false;
if (param_.kernel.ndim() == 2) {
// 2d pool
Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
@@ -113,9 +115,11 @@ class CuDNNPoolingOp {
} else {
LOG(FATAL) << "Only support 2D or 3D pooling";
}
+ return true;
}
- void Backward(const OpContext &ctx, const TBlob &out_grad,
+ // Return boolean saying whether pooling configuration is supported
+ bool Backward(const OpContext &ctx, const TBlob &out_grad,
const TBlob &in_data, const TBlob &out_data,
const OpReqType &req, const TBlob &in_grad) {
using namespace mshadow;
@@ -125,7 +129,8 @@ class CuDNNPoolingOp {
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
typename DataType<DType>::ScaleType alpha = 1.0f;
typename DataType<DType>::ScaleType beta = 0.0f;
- this->Init(s, in_data, out_data);
+ if (!this->Init(s, in_data, out_data))
+ return false;
if (param_.kernel.ndim() == 2) {
// 2d pool
Tensor<gpu, 4, DType> m_out_grad = out_grad.get<gpu, 4, DType>(s);
@@ -165,55 +170,80 @@ class CuDNNPoolingOp {
} else {
LOG(FATAL) << "Only support 2D or 3D pooling";
}
+ return true;
}
private:
- inline void Init(mshadow::Stream<gpu> *s, const TBlob &in_data,
+ // Return boolean saying whether pooling configuration is supported
+ inline bool Init(mshadow::Stream<gpu> *s, const TBlob &in_data,
const TBlob &out_data) {
using namespace mshadow;
+ bool is_supported = true;
#if CUDNN_MAJOR >= 5
nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
#endif
if (param_.kernel.ndim() == 2) {
// 2d conv
+ CHECK(param_.layout.value() == mshadow::kNCHW ||
+ param_.layout.value() == mshadow::kNHWC) << "Need 2D layout";
+ cudnnTensorFormat_t cudnn_layout =
+ (param_.layout.value() == mshadow::kNCHW) ? CUDNN_TENSOR_NCHW
+ : CUDNN_TENSOR_NHWC;
Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
- mshadow::Shape<4> dshape = data.shape_;
+ // Perform shape calculations in a standard (NCHW) layout space
+ mshadow::Shape<4> dshape_nchw = (param_.layout.value() ==
mshadow::kNHWC) ?
+ ConvertLayout(data.shape_,
mshadow::kNHWC, mshadow::kNCHW) :
+ data.shape_;
+ mshadow::Shape<4> oshape_nchw = (param_.layout.value() ==
mshadow::kNHWC) ?
+ ConvertLayout(out.shape_,
mshadow::kNHWC, mshadow::kNCHW) :
+ out.shape_;
CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
- CUDNN_TENSOR_NCHW,
+ cudnn_layout,
dtype_,
- data.shape_[0],
- data.shape_[1],
- data.shape_[2],
- data.shape_[3]));
+ dshape_nchw[0],
+ dshape_nchw[1],
+ dshape_nchw[2],
+ dshape_nchw[3]));
CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
- CUDNN_TENSOR_NCHW,
+ cudnn_layout,
dtype_,
- out.shape_[0],
- out.shape_[1],
- out.shape_[2],
- out.shape_[3]));
+ oshape_nchw[0],
+ oshape_nchw[1],
+ oshape_nchw[2],
+ oshape_nchw[3]));
+ int window_height = param_.global_pool ? dshape_nchw[2] :
param_.kernel[0];
+ int window_width = param_.global_pool ? dshape_nchw[3] :
param_.kernel[1];
+ // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above.
+ // For reference see Fixed Issues section in
+ //
https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_721.html#rel_721
+ #if CUDNN_VERSION == 7104
+ is_supported = window_height <= 8 && window_width <= 8;
+ #endif
#if CUDNN_MAJOR >= 5
CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
mode_,
nan_prop_,
- param_.global_pool ? dshape[2] :
param_.kernel[0],
- param_.global_pool ? dshape[3] :
param_.kernel[1],
+ window_height,
+ window_width,
param_.global_pool ? 0 :
param_.pad[0],
param_.global_pool ? 0 :
param_.pad[1],
param_.global_pool ? 1 :
param_.stride[0],
- param_.global_pool ? 1
:param_.stride[1]));
+ param_.global_pool ? 1 :
param_.stride[1]));
#else
CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
mode_,
- param_.global_pool ? dshape[2] :
param_.kernel[0],
- param_.global_pool ? dshape[3] :
param_.kernel[1],
+ window_height,
+ window_width,
param_.global_pool ? 0 :
param_.pad[0],
- param_.global_ppol ? 0 :
param_.pad[1],
+ param_.global_pool ? 0 :
param_.pad[1],
param_.global_pool ? 1 :
param_.stride[0],
param_.global_pool ? 1 :
param_.stride[1]));
#endif
} else {
+ CHECK(param_.layout.value() == mshadow::kNCDHW ||
+ param_.layout.value() == mshadow::kNDHWC) << "Need 3D layout";
+ CHECK(param_.layout.value() == mshadow::kNCDHW) << "Only the NCDHW
layout is supported.";
Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
@@ -275,6 +305,7 @@ class CuDNNPoolingOp {
LOG(FATAL) << "3D pooling only support CUDNN v5 and above";
#endif
}
+ return is_supported;
}
cudnnDataType_t dtype_;
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index 71d85da9ba5..7c721907ee8 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -53,6 +53,7 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
bool cudnn_off;
dmlc::optional<int> p_value;
dmlc::optional<bool> count_include_pad;
+ dmlc::optional<int> layout;
DMLC_DECLARE_PARAMETER(PoolingParam) {
DMLC_DECLARE_FIELD(kernel).set_default(TShape()) // add default value here
.enforce_nonzero()
@@ -92,6 +93,16 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
"calculation. For example, with a 5*5 kernel on a 3*3 corner of
a image,"
"the sum of the 9 valid elements will be divided by 25 if this
is set to true,"
"or it will be divided by 9 if this is set to false. Defaults to
true.");
+
+ DMLC_DECLARE_FIELD(layout)
+ .add_enum("NCW", mshadow::kNCW)
+ .add_enum("NCHW", mshadow::kNCHW)
+ .add_enum("NCDHW", mshadow::kNCDHW)
+ .add_enum("NHWC", mshadow::kNHWC)
+ .add_enum("NDHWC", mshadow::kNDHWC)
+ .set_default(dmlc::optional<int>())
+ .describe("Set layout for input and output. Empty for\n "
+ "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d.");
}
bool operator==(const PoolingParam& other) const {
@@ -103,7 +114,8 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
this->global_pool == other.global_pool &&
this->cudnn_off == other.cudnn_off &&
this->p_value == other.p_value &&
- this->count_include_pad == other.count_include_pad;
+ this->count_include_pad == other.count_include_pad &&
+ this->layout == other.layout;
}
};
@@ -124,6 +136,7 @@ struct hash<mxnet::op::PoolingParam> {
ret = dmlc::HashCombine(ret, val.cudnn_off);
ret = dmlc::HashCombine(ret, val.p_value);
ret = dmlc::HashCombine(ret, val.count_include_pad);
+ ret = dmlc::HashCombine(ret, val.layout);
return ret;
}
};
@@ -149,6 +162,9 @@ class PoolingOp {
void Forward(const OpContext& ctx, const TBlob& in_data,
const OpReqType& req, const TBlob& out_data) {
using namespace mshadow;
+ CHECK(param_.layout.value() == kNCW ||
+ param_.layout.value() == kNCHW ||
+ param_.layout.value() == kNCDHW) << "Need CuDNN for layout support";
Stream<xpu> *s = ctx.get_stream<xpu>();
const TShape& ishape = in_data.shape_;
TShape kernel = param_.kernel;
@@ -198,6 +214,9 @@ class PoolingOp {
const TBlob& in_data, const TBlob& out_data,
const OpReqType& req, const TBlob& in_grad) {
using namespace mshadow;
+ CHECK(param_.layout.value() == kNCW ||
+ param_.layout.value() == kNCHW ||
+ param_.layout.value() == kNCDHW) << "Need CuDNN for layout support";
Stream<xpu> *s = ctx.get_stream<xpu>();
const TShape& ishape = in_data.shape_;
TShape kernel = param_.kernel;
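
The new layout field is also exposed directly on the Pooling operator. A small imperative sketch of the NDArray API (illustrative shapes; GPU with cuDNN assumed, since only cuDNN executes the channels-last layouts):

    import mxnet as mx

    # Channels-last (NHWC) average pooling through the NDArray API; when the
    # layout argument is left unset it defaults to NCW/NCHW/NCDHW.
    x = mx.nd.random.uniform(shape=(2, 10, 10, 3), ctx=mx.gpu(0))  # N, H, W, C
    y = mx.nd.Pooling(x, kernel=(2, 2), stride=(2, 2), pool_type='avg',
                      layout='NHWC')
    print(y.shape)  # (2, 5, 5, 3)
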
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 611568807a9..1024f110492 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -40,9 +40,11 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) {
PoolingParam param;
param.Init(attrs->dict);
if (param.kernel.ndim() == 1) {
+ param.layout = param.layout ? param.layout.value() : mshadow::kNCW;
if (param.stride.ndim() == 0) param.stride = Shape1(1);
if (param.pad.ndim() == 0) param.pad = Shape1(0);
} else if (param.kernel.ndim() == 2) {
+ param.layout = param.layout ? param.layout.value() : mshadow::kNCHW;
if (param.stride.ndim() == 0) param.stride = Shape2(1, 1);
if (param.pad.ndim() == 0) param.pad = Shape2(0, 0);
} else {
@@ -51,6 +53,7 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) {
CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim()
<< "D pooling not supported";
}
+ param.layout = param.layout ? param.layout.value(): mshadow::kNCDHW;
if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1);
if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0);
}
@@ -111,38 +114,65 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
<< "Pooling: Input data should be 3D in (batch, channel, x)"
<< " Or 4D in (batch, channel, y, x) "
<< " Or 5D in (batch, channel, d, y, x)";
- TShape oshape = dshape;
if (dshape.ndim() == 0) return false;
if (param.global_pool) {
- for (size_t i{2}; i < dshape.ndim(); i++)
- oshape[i] = 1;
- out_shape->clear();
- out_shape->push_back(oshape); // save output shape
+ TShape oshape = dshape;
+ size_t c_index = 0;
+ switch (param.layout.value()) {
+ case mshadow::kNCW:
+ case mshadow::kNCHW:
+ case mshadow::kNCDHW:
+ c_index = 1;
+ break;
+ case mshadow::kNWC:
+ case mshadow::kNHWC:
+ case mshadow::kNDHWC:
+ c_index = dshape.ndim() - 1;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported tensor layout " << param.layout.value();
+ }
+ for (size_t i{1}; i < dshape.ndim(); i++)
+ if (i != c_index)
+ oshape[i] = 1;
+ out_shape->clear();
+ out_shape->push_back(oshape); // save output shape
#if MXNET_USE_MKLDNN == 1
- if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
+ if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
out_shape->push_back(oshape); // for workspace
#endif
+ } else if (param.kernel.ndim() == 0) {
+ return false;
} else if (param.kernel.ndim() == 1) {
- CHECK_EQ(dshape.ndim(), 3U)
- << "Pooling: Input data should be 3D in (batch, channel, x)";
- CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
- << "kernel size (" << param.kernel[0] << ") exceeds input ("
- << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
- << ")";
+ CHECK_EQ(dshape.ndim(), 3U) <<
+ "Pooling: Input data should be 3D in (batch, channel, x)";
+ CHECK(param.layout.value() == mshadow::kNCW ||
+ param.layout.value() == mshadow::kNWC) << "Need 1D layout";
+ // Perform shape calculations in a standard (NCW) layout space
+ mshadow::Shape<3> dshape_ncw = (param.layout.value() == mshadow::kNWC) ?
+ ConvertLayout(dshape.get<3>(),
mshadow::kNWC, mshadow::kNCW) :
+ dshape.get<3>();
+ mshadow::Shape<3> oshape_ncw = dshape_ncw;
+ CHECK(param.kernel[0] <= dshape_ncw[2] + 2 * param.pad[0])
+ << "kernel size (" << param.kernel[0] << ") exceeds input (" <<
dshape[2]
+ << " padded to " << (dshape_ncw[2] + 2*param.pad[0]) << ")";
if (param.pooling_convention == pool_enum::kValid) {
- oshape[2] = 1 +
- (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
- param.stride[0];
+ oshape_ncw[2] = 1 +
+ (dshape_ncw[2] + 2 * param.pad[0] - param.kernel[0]) /
+ param.stride[0];
} else if (param.pooling_convention == pool_enum::kFull) {
- oshape[2] = 1 + static_cast<int>(std::ceil(
- static_cast<float>(dshape[2] + 2 * param.pad[0] -
- param.kernel[0]) /
- param.stride[0]));
+ oshape_ncw[2] = 1 + static_cast<int>(std::ceil(
+ static_cast<float>(dshape_ncw[2] + 2 *
param.pad[0] -
+ param.kernel[0]) /
+ param.stride[0]));
} else {
- oshape[2] = static_cast<int>(std::ceil(
- static_cast<float>(dshape[2] + 2 * param.pad[0]) /
+ oshape_ncw[2] = static_cast<int>(std::ceil(
+ static_cast<float>(dshape_ncw[2] + 2 * param.pad[0])
/
param.stride[0]));
}
+ // Convert back from standard (NCW) layout space to the actual layout type
+ TShape oshape = (param.layout.value() == mshadow::kNWC) ?
+ ConvertLayout(oshape_ncw, mshadow::kNCW, mshadow::kNWC) :
oshape_ncw;
out_shape->clear();
out_shape->push_back(oshape); // save output shape
#if MXNET_USE_MKLDNN == 1
@@ -150,33 +180,38 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
out_shape->push_back(oshape); // for workspace
#endif
} else if (param.kernel.ndim() == 2) {
- CHECK_EQ(dshape.ndim(), 4U)
- << "Pooling: Input data should be 4D in (batch, channel, y, x)";
- CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
- << "kernel size (" << param.kernel[0] << ") exceeds input ("
- << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
- << ")";
- CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1])
- << "kernel size (" << param.kernel[1] << ") exceeds input ("
- << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1])
- << ")";
+ CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in
(batch, channel, y, x)";
+ CHECK(param.layout.value() == mshadow::kNCHW ||
+ param.layout.value() == mshadow::kNHWC) << "Need 2D layout";
+ // Perform shape calculations in a standard (NCHW) layout space
+ mshadow::Shape<4> dshape_nchw = (param.layout.value() == mshadow::kNHWC) ?
+ ConvertLayout(dshape.get<4>(),
mshadow::kNHWC, mshadow::kNCHW) :
+ dshape.get<4>();
+ mshadow::Shape<4> oshape_nchw = dshape_nchw;
+ CHECK(param.kernel[0] <= dshape_nchw[2] + 2 * param.pad[0])
+ << "kernel size (" << param.kernel[0] << ") exceeds input (" <<
dshape_nchw[2]
+ << " padded to " << (dshape_nchw[2] + 2*param.pad[0]) << ")";
+ CHECK(param.kernel[1] <= dshape_nchw[3] + 2 * param.pad[1])
+ << "kernel size (" << param.kernel[1] << ") exceeds input (" <<
dshape_nchw[3]
+ << " padded to " << (dshape_nchw[3] + 2*param.pad[1]) << ")";
if (param.pooling_convention == pool_enum::kValid) {
- oshape[2] = 1 +
- (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
- param.stride[0];
- oshape[3] = 1 +
- (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
- param.stride[1];
+ oshape_nchw[2] = 1 + (dshape_nchw[2] + 2 * param.pad[0] -
param.kernel[0]) /
+ param.stride[0];
+ oshape_nchw[3] = 1 + (dshape_nchw[3] + 2 * param.pad[1] -
param.kernel[1]) /
+ param.stride[1];
} else {
- oshape[2] = 1 + static_cast<int>(std::ceil(
- static_cast<float>(dshape[2] + 2 * param.pad[0] -
- param.kernel[0]) /
- param.stride[0]));
- oshape[3] = 1 + static_cast<int>(std::ceil(
- static_cast<float>(dshape[3] + 2 * param.pad[1] -
- param.kernel[1]) /
- param.stride[1]));
+ oshape_nchw[2] = 1 + static_cast<int>(ceil(
+ static_cast<float>(dshape_nchw[2] + 2 *
param.pad[0] -
+ param.kernel[0]) /
+ param.stride[0]));
+ oshape_nchw[3] = 1 + static_cast<int>(ceil(
+ static_cast<float>(dshape_nchw[3] + 2 *
param.pad[1] -
+ param.kernel[1]) /
+ param.stride[1]));
}
+ // Convert back from standard (NCHW) layout space to the actual layout type
+ TShape oshape = (param.layout.value() == mshadow::kNHWC) ?
+ ConvertLayout(oshape_nchw, mshadow::kNCHW, mshadow::kNHWC)
: oshape_nchw;
out_shape->clear();
out_shape->push_back(oshape); // save output shape
#if MXNET_USE_MKLDNN == 1
@@ -185,38 +220,41 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
#endif
} else if (param.kernel.ndim() == 3) {
CHECK_EQ(dshape.ndim(), 5U)
- << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
- CHECK_LE(param.kernel[0], dshape[2] + 2 * param.pad[0])
- << "kernel size exceeds input";
- CHECK_LE(param.kernel[1], dshape[3] + 2 * param.pad[1])
- << "kernel size exceeds input";
- CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2])
- << "kernel size exceeds input";
+ << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+ CHECK(param.layout.value() == mshadow::kNCDHW ||
+ param.layout.value() == mshadow::kNDHWC) << "Need 3D layout";
+ // Perform shape calculations in a standard (NCDHW) layout space
+ mshadow::Shape<5> dshape_ncdhw = (param.layout.value() == mshadow::kNDHWC)
?
+ ConvertLayout(dshape.get<5>(),
mshadow::kNDHWC, mshadow::kNCDHW) :
+ dshape.get<5>();
+ mshadow::Shape<5> oshape_ncdhw = dshape_ncdhw;
+ CHECK_LE(param.kernel[0], dshape_ncdhw[2] + 2 * param.pad[0]) << "kernel
size exceeds input";
+ CHECK_LE(param.kernel[1], dshape_ncdhw[3] + 2 * param.pad[1]) << "kernel
size exceeds input";
+ CHECK_LE(param.kernel[2], dshape_ncdhw[4] + 2 * param.pad[2]) << "kernel
size exceeds input";
if (param.pooling_convention == pool_enum::kValid) {
- oshape[2] = 1 +
- (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
- param.stride[0];
- oshape[3] = 1 +
- (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
- param.stride[1];
- oshape[4] = 1 +
- (dshape[4] + 2 * param.pad[2] - param.kernel[2]) /
- param.stride[2];
+ oshape_ncdhw[2] = 1 + (dshape_ncdhw[2] + 2 * param.pad[0] -
param.kernel[0]) /
+ param.stride[0];
+ oshape_ncdhw[3] = 1 + (dshape_ncdhw[3] + 2 * param.pad[1] -
param.kernel[1]) /
+ param.stride[1];
+ oshape_ncdhw[4] = 1 + (dshape_ncdhw[4] + 2 * param.pad[2] -
param.kernel[2]) /
+ param.stride[2];
} else {
- oshape[2] = 1 + static_cast<int>(std::ceil(
- static_cast<float>(dshape[2] + 2 * param.pad[0] -
- param.kernel[0]) /
- param.stride[0]));
- oshape[3] = 1 + static_cast<int>(std::ceil(
- static_cast<float>(dshape[3] + 2 * param.pad[1] -
- param.kernel[1]) /
- param.stride[1]));
- oshape[4] = 1 + static_cast<int>(std::ceil(
- static_cast<float>(dshape[4] + 2 * param.pad[2] -
- param.kernel[2]) /
- param.stride[2]));
+ oshape_ncdhw[2] = 1 + static_cast<int>(ceil(
+ static_cast<float>(dshape_ncdhw[2] + 2 *
param.pad[0] -
+ param.kernel[0]) /
+ param.stride[0]));
+ oshape_ncdhw[3] = 1 + static_cast<int>(ceil(
+ static_cast<float>(dshape_ncdhw[3] + 2 *
param.pad[1] -
+ param.kernel[1]) /
+ param.stride[1]));
+ oshape_ncdhw[4] = 1 + static_cast<int>(ceil(
+ static_cast<float>(dshape_ncdhw[4] + 2 *
param.pad[2] -
+ param.kernel[2]) /
+ param.stride[2]));
}
-
+ // Convert back from standard (NCDHW) layout space to the actual layout
type
+ TShape oshape = (param.layout.value() == mshadow::kNDHWC) ?
+ ConvertLayout(oshape_ncdhw, mshadow::kNCDHW,
mshadow::kNDHWC) : oshape_ncdhw;
out_shape->clear();
out_shape->push_back(oshape); // save output shape
#if MXNET_USE_MKLDNN == 1
@@ -224,6 +262,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs,
out_shape->push_back(oshape); // for workspace
#endif
}
+
return true;
}
@@ -331,13 +370,17 @@ NNVM_REGISTER_OP(Pooling)
The shapes for 1-D pooling are
-- **data**: *(batch_size, channel, width)*,
-- **out**: *(batch_size, num_filter, out_width)*.
+- **data**: *(batch_size, channel, width)* (NCW layout) or
+ *(batch_size, width, channel)* (NWC layout, cuDNN only),
+- **out**: *(batch_size, num_filter, out_width)* (NCW layout) or
+ *(batch_size, out_width, num_filter)* (NWC layout, cuDNN only).
The shapes for 2-D pooling are
-- **data**: *(batch_size, channel, height, width)*
-- **out**: *(batch_size, num_filter, out_height, out_width)*, with::
+- **data**: *(batch_size, channel, height, width)* (NCHW layout) or
+ *(batch_size, height, width, channel)* (NHWC layout, cuDNN only),
+- **out**: *(batch_size, num_filter, out_height, out_width)* (NCHW layout) or
+ *(batch_size, out_height, out_width, num_filter)* (NHWC layout, cuDNN only),
with::
out_height = f(height, kernel[0], pad[0], stride[0])
out_width = f(width, kernel[1], pad[1], stride[1])
@@ -364,7 +407,8 @@ Three pooling options are supported by ``pool_type``:
For 3-D pooling, an additional *depth* dimension is added before
*height*. Namely the input data will have shape *(batch_size, channel, depth,
-height, width)*.
+height, width)* (NCDHW) or *(batch_size, depth, height, width, channel)*
(NDHWC,
+cuDNN only).
Notes on Lp pooling:
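
The shape inference above normalizes the input into the channels-first layout space, computes the output there, and converts back, so layout-aware shapes can be checked symbolically even without a GPU. A short sketch (illustrative shapes):

    import mxnet as mx

    data = mx.sym.Variable('data')
    pool = mx.sym.Pooling(data, kernel=(3, 3), stride=(2, 2),
                          pool_type='max', layout='NHWC')

    # Inference runs internally in NCHW space and converts the result to NHWC.
    _, out_shapes, _ = pool.infer_shape(data=(1, 10, 10, 16))  # N, H, W, C
    print(out_shapes[0])  # (1, 4, 4, 16) with the default 'valid' convention
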
diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu
index 997218620c3..b996ebec9b4 100644
--- a/src/operator/nn/pooling.cu
+++ b/src/operator/nn/pooling.cu
@@ -61,8 +61,10 @@ void PoolingCompute<gpu>(const nnvm::NodeAttrs& attrs,
switch (param.pool_type) {
case pool_enum::kMaxPooling:
case pool_enum::kAvgPooling:
- GetCuDNNPoolingOp<DType>(param).Forward(ctx, inputs[0], req[0],
outputs[0]);
- return;
+ if (GetCuDNNPoolingOp<DType>(param).Forward(ctx, inputs[0], req[0],
outputs[0])) {
+ return;
+ }
+ break;
case pool_enum::kSumPooling:
LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum
pooling is applied.";
break;
@@ -116,10 +118,12 @@ void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
switch (param.pool_type) {
case pool_enum::kMaxPooling:
case pool_enum::kAvgPooling:
- GetCuDNNPoolingOp<DType>(param).Backward(ctx, inputs[ograd_idx],
+ if (GetCuDNNPoolingOp<DType>(param).Backward(ctx, inputs[ograd_idx],
inputs[in_data_idx],
inputs[out_data_idx],
- req[0], outputs[0]);
- return;
+ req[0], outputs[0])) {
+ return;
+ }
+ break;
case pool_enum::kSumPooling:
LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum
pooling is applied.";
break;
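
Note that the fallback enabled by the boolean return value only helps channels-first layouts: there is still no native NHWC kernel, so forcing cuDNN off with a channels-last layout trips the layout check in PoolingOp. A hedged illustration (whether the error surfaces on the call or on synchronization depends on the async engine):

    import mxnet as mx
    from mxnet.base import MXNetError

    x = mx.nd.random.uniform(shape=(1, 8, 8, 4), ctx=mx.gpu(0))  # N, H, W, C
    try:
        # cudnn_off=True routes to the native implementation, which only
        # supports channels-first layouts, so this configuration is rejected.
        y = mx.nd.Pooling(x, kernel=(2, 2), pool_type='max',
                          layout='NHWC', cudnn_off=True)
        y.wait_to_read()
    except MXNetError as err:
        print('NHWC pooling requires cuDNN:', err)
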
diff --git a/tests/python/gpu/test_operator_gpu.py
b/tests/python/gpu/test_operator_gpu.py
index 8054937a84c..ba81ead671c 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -608,6 +608,52 @@ def test_convolution_versions():
@with_seed()
+def test_pooling_with_convention():
+ # While the float32 and float64 output is reliably consistent, float16
departs occasionally.
+ # We compare cpu and gpu results only within a given precision.
+ for data_type in [np.float64, np.float32, np.float16]:
+ ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10),
'type_dict': {'pool_data': data_type}},
+ {'ctx': mx.cpu(0), 'pool_data': (2, 2, 10, 10),
'type_dict': {'pool_data': data_type}}]
+ sym = mx.sym.Pooling(kernel=(3,3), pool_type='max',
pooling_convention='valid', name='pool')
+ check_consistency(sym, ctx_list)
+
+ sym = mx.sym.Pooling(kernel=(3,3), pool_type='max',
pooling_convention='full', name='pool')
+ check_consistency(sym, ctx_list)
+
+ sym = mx.sym.Pooling(kernel=(300,300), pool_type='max',
global_pool=True, name='pool')
+ check_consistency(sym, ctx_list)
+
+
+@with_seed()
+@assert_raises_cudnn_not_satisfied(min_version='7.0.1')
+def test_pooling_nhwc_with_convention():
+ def make_pooling_syms(**kwargs):
+ # Conventional NCHW layout pooling
+ sym = mx.sym.Pooling(**kwargs)
+ # NHWC pooling
+ data = mx.sym.Variable('pool_data')
+ sym_nhwc = mx.sym.transpose(data, axes=(0,2,3,1))
+ sym_nhwc = mx.sym.Pooling(sym_nhwc, layout='NHWC', **kwargs)
+ sym_nhwc = mx.sym.transpose(sym_nhwc, axes=(0,3,1,2), name='pool')
+ return [sym, sym_nhwc]
+
+ # While the float32 and float64 output is reliably consistent, float16
departs occasionally.
+ # We compare nhwc and nchw results only within a given precision.
+ for in_shape in [(3, 4, 8, 8), (2, 2, 10, 10)]:
+ for data_type in [np.float64, np.float32, np.float16]:
+ # NHWC pooling is only enabled on GPU with CUDNN
+ ctx_list = [{'ctx': mx.gpu(0), 'pool_data': in_shape, 'type_dict':
{'pool_data': data_type}}]
+ symlist = make_pooling_syms(kernel=(3,3), pool_type='max',
pooling_convention='valid', name='pool')
+ check_consistency_NxM(symlist, ctx_list)
+
+ symlist = make_pooling_syms(kernel=(3,3), pool_type='max',
pooling_convention='full', name='pool')
+ check_consistency_NxM(symlist, ctx_list)
+ # CUDNN v7.1.4 can't handle all cases, and there's no NHWC MXNet
fallback impl yet
+ if in_shape[2] <= 8 and in_shape[3] <= 8:
+ symlist = make_pooling_syms(kernel=(300,300), pool_type='max',
global_pool=True, name='pool')
+ check_consistency_NxM(symlist, ctx_list)
+
+
def test_pooling_with_type():
ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict':
{'pool_data': np.float64}},
{'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict':
{'pool_data': np.float32}},
@@ -768,26 +814,52 @@ def test_spatial_transformer_with_type():
check_consistency(sym, ctx_list)
check_consistency(sym, ctx_list, grad_req="add")
-
@with_seed()
def test_pooling_with_type2():
- ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict':
{'pool_data': np.float64}},
- {'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict':
{'pool_data': np.float32}},
- {'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict':
{'pool_data': np.float16}},
- {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict':
{'pool_data': np.float64}},
- {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict':
{'pool_data': np.float32}}]
+ # While the float32 and float64 output is reliably consistent, float16
departs occasionally.
+ # We compare cpu and gpu results only within a given precision.
+ for data_type in [np.float64, np.float32, np.float16]:
+ ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10),
'type_dict': {'pool_data': data_type}},
+ {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10),
'type_dict': {'pool_data': data_type}}]
- sym = mx.sym.Pooling(name='pool', kernel=(3,3), stride=(2,2),
pool_type='max')
- check_consistency(sym, ctx_list, rand_type=np.float16)
+ sym = mx.sym.Pooling(name='pool', kernel=(3,3), stride=(2,2),
pool_type='max')
+ check_consistency(sym, ctx_list)
- sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='avg')
- check_consistency(sym, ctx_list)
+ sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1),
pool_type='avg')
+ check_consistency(sym, ctx_list)
- sym = mx.sym.Pooling(name='pool', kernel=(5,5), pad=(2,2), pool_type='max')
- check_consistency(sym, ctx_list, rand_type=np.float16)
+ sym = mx.sym.Pooling(name='pool', kernel=(5,5), pad=(2,2),
pool_type='max')
+ check_consistency(sym, ctx_list)
- sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='sum')
- check_consistency(sym, ctx_list)
+ sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1),
pool_type='sum')
+ check_consistency(sym, ctx_list)
+
+@with_seed()
+@assert_raises_cudnn_not_satisfied(min_version='7.0.1')
+def test_pooling_nhwc_with_type():
+ def make_pooling_syms(**kwargs):
+ # Conventional NCHW layout pooling
+ sym = mx.sym.Pooling(**kwargs)
+ # NHWC pooling
+ data = mx.sym.Variable('pool_data')
+ sym_nhwc = mx.sym.transpose(data, axes=(0,2,3,1))
+ sym_nhwc = mx.sym.Pooling(sym_nhwc, layout='NHWC', **kwargs)
+ sym_nhwc = mx.sym.transpose(sym_nhwc, axes=(0,3,1,2), name='pool')
+ return [sym, sym_nhwc]
+
+ # While the float32 and float64 output is reliably consistent, float16
departs occasionally.
+ # We compare nhwc and nchw results only within a given precision.
+ for data_type in [np.float64, np.float32, np.float16]:
+ # NHWC pooling only enabled on GPU with CUDNN
+ ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10),
'type_dict': {'pool_data': data_type}}]
+ symlist = make_pooling_syms(name='pool', kernel=(3,3), stride=(2,2),
pool_type='max')
+ check_consistency_NxM(symlist, ctx_list)
+
+ symlist = make_pooling_syms(name='pool', kernel=(3,3), pad=(1,1),
pool_type='avg')
+ check_consistency_NxM(symlist, ctx_list)
+
+ symlist = make_pooling_syms(name='pool', kernel=(5,5), pad=(2,2),
pool_type='max')
+ check_consistency_NxM(symlist, ctx_list)
@unittest.skip("Flaky test
https://github.com/apache/incubator-mxnet/issues/11517")
@with_seed()
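
The new tests compare an NHWC run against the equivalent NCHW run via transposes; the same check can be reproduced standalone (illustrative shapes; GPU with cuDNN assumed):

    import mxnet as mx
    import numpy as np

    x_nchw = mx.nd.random.uniform(shape=(2, 3, 10, 10), ctx=mx.gpu(0))
    x_nhwc = mx.nd.transpose(x_nchw, axes=(0, 2, 3, 1))

    y_nchw = mx.nd.Pooling(x_nchw, kernel=(3, 3), stride=(2, 2), pool_type='max')
    y_nhwc = mx.nd.Pooling(x_nhwc, kernel=(3, 3), stride=(2, 2), pool_type='max',
                           layout='NHWC')

    # The channels-last result should match channels-first after transposing back.
    np.testing.assert_allclose(mx.nd.transpose(y_nhwc, axes=(0, 3, 1, 2)).asnumpy(),
                               y_nchw.asnumpy(), rtol=1e-3, atol=1e-3)
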
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services