D-Roberts closed pull request #13696: [MXNET-1274] Add docstring comment about 
noise replication in SGLD optimizer and a test for SGLD optimizer
URL: https://github.com/apache/incubator-mxnet/pull/13696
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b8bbd2e027..20ade5db13d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -732,7 +732,12 @@ install(TARGETS ${MXNET_INSTALL_TARGETS}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 
+# NOTE: Public headers will be installed into 
${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}, see
+#       https://cmake.org/cmake/help/v3.0/variable/CMAKE_INSTALL_PREFIX.html
+#       https://cmake.org/cmake/help/v3.0/module/GNUInstallDirs.html
+
 install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(DIRECTORY 3rdparty/tvm/nnvm/include/ DESTINATION 
${CMAKE_INSTALL_INCLUDEDIR})
 if (INSTALL_EXAMPLES)
   install(DIRECTORY example  DESTINATION 
${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME})
 endif()
diff --git a/cpp-package/CMakeLists.txt b/cpp-package/CMakeLists.txt
index f7fbc77e1a5..5d2977279d7 100644
--- a/cpp-package/CMakeLists.txt
+++ b/cpp-package/CMakeLists.txt
@@ -20,4 +20,6 @@ if(USE_CPP_PACKAGE)
     add_subdirectory(example)
   endif()
 
+  install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
 endif()
diff --git a/docs/build_version_doc/build_all_version.sh 
b/docs/build_version_doc/build_all_version.sh
index 6b8c3cbd864..3c432bbedfc 100755
--- a/docs/build_version_doc/build_all_version.sh
+++ b/docs/build_version_doc/build_all_version.sh
@@ -122,6 +122,8 @@ function checkout () {
     git pull
     # master gets warnings as errors for Sphinx builds
     OPTS="-W"
+  else
+    OPTS=
   fi
   git submodule update --init --recursive
   cd ..
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index e960829e691..374a3b50bbb 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -54,7 +54,6 @@
 from . import lr_scheduler
 # use mx.kv as short for kvstore
 from . import kvstore as kv
-from . import kvstore_server
 # Runtime compile module
 from . import rtc
 # Attribute scope to add attributes to symbolic graphs
@@ -82,3 +81,11 @@
 from . import gluon
 
 __version__ = base.__version__
+
+# Dist kvstore module which launches a separate process when role is set to 
"server".
+# This should be done after other modules are initialized.
+# Otherwise this may result in errors when unpickling custom LR 
scheduler/optimizers.
+# For example, the LRScheduler in gluoncv depends on a specific version of 
MXNet, and
+# checks the __version__ attr of MXNet, which is not set on kvstore server due 
to the
+# fact that kvstore-server module is imported before the __version__ attr is 
set.
+from . import kvstore_server
diff --git a/python/mxnet/optimizer/optimizer.py 
b/python/mxnet/optimizer/optimizer.py
index ba16132ab08..6563b4735f1 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -29,7 +29,7 @@
                        mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, 
ftml_update,
                        signsgd_update, signum_update)
 from ..ndarray import sparse
-from ..random import normal
+from ..random import normal, seed
 
 __all__ = [
     'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD',
@@ -993,8 +993,10 @@ class SGLD(Optimizer):
     This class implements the optimizer described in the paper *Stochastic 
Gradient
     Riemannian Langevin Dynamics on the Probability Simplex*, available at
     
https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf.
+    To reproduce the updates exactly, a seed for the noise generation must be 
set in the user program.
 
     """
+
     def __init__(self, **kwargs):
         super(SGLD, self).__init__(**kwargs)
 
@@ -1002,8 +1004,8 @@ def create_state(self, index, weight):
         return None
 
     def update(self, index, weight, grad, state):
-        assert(isinstance(weight, NDArray))
-        assert(isinstance(grad, NDArray))
+        assert (isinstance(weight, NDArray))
+        assert (isinstance(grad, NDArray))
         self._update_count(index)
         lr = self._get_lr(index)
         wd = self._get_wd(index)
@@ -1011,9 +1013,8 @@ def update(self, index, weight, grad, state):
         grad = grad * self.rescale_grad
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
-        weight[:] += - lr/2 * (grad + wd * weight) + normal(0, math.sqrt(lr), 
shape=weight.shape,
-                                                            
dtype=weight.dtype, ctx=weight.context)
-
+        weight[:] += - lr / 2 * (grad + wd * weight) + normal(0, 
math.sqrt(lr), shape=weight.shape,
+                                                              
dtype=weight.dtype, ctx=weight.context)
 
 
 @register  # pylint: disable=invalid-name
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 5a4cb29bc21..251bfb3f0e1 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -453,17 +453,10 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int 
dtype) {
 
   mkldnn::memory::dims dims;
  // These are shapes supported by MKLDNN.
-  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4
-      || shape.ndim() == 5) {
+  if (shape.ndim() >= 1 && shape.ndim() <= 5) {
     dims.resize(shape.ndim());
     for (size_t i = 0; i < dims.size(); i++)
       dims[i] = shape[i];
-  } else if (shape.ndim() == 3) {
-    // If there are 3 dimensions, we'll force it to 4 dimensions.
-    dims.resize(shape.ndim() + 1);
-    dims[0] = 1;
-    for (size_t i = 0; i < shape.ndim(); i++)
-      dims[i + 1] = shape[i];
   } else {
     LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions";
   }
@@ -471,6 +464,7 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int 
dtype) {
   switch (dims.size()) {
     case 1: layout = mkldnn::memory::format::x; break;
     case 2: layout = mkldnn::memory::format::nc; break;
+    case 3: layout = mkldnn::memory::format::ncw; break;
     case 4: layout = mkldnn::memory::format::nchw; break;
     // This isn't the right layout when the data has 5 dimensions in MXNet.
     // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 305eeab2117..fb920c31ce3 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -97,9 +97,10 @@ static void ActivationComputeExCPU(const nnvm::NodeAttrs& 
attrs,
                                    const std::vector<NDArray>& inputs,
                                    const std::vector<OpReqType>& req,
                                    const std::vector<NDArray>& outputs) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
-  if (SupportMKLDNN(inputs[0])) {
+  if (SupportMKLDNNAct(param, inputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
     MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]);
     MKLDNN_OPCHECK_RUN(ActivationCompute<cpu>, attrs, ctx, inputs, req, 
outputs);
@@ -115,7 +116,7 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& 
attrs,
                                 const std::vector<NDArray>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type));
-  if (SupportMKLDNN(inputs[0])) {
+  if (SupportMKLDNNAct(param, inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     // XXX: for y = relu(x), y is passed as "in_data" to Backward()
     const bool relu = param.act_type == activation::kReLU;
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc 
b/src/operator/nn/mkldnn/mkldnn_act.cc
index 440705884b3..8c64888b460 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -49,6 +49,15 @@ bool SupportMKLDNNAct(const ActivationParam& param) {
       || param.act_type == activation::kTanh;
 }
 
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) {
+  // MKL-DNN Activation supports 1d, 2d, 3d, 4d data layout
+  if ((input.shape().ndim() < 1) ||
+      (input.shape().ndim() > 4) ||
+      (input.dtype() != mshadow::kFloat32))
+    return false;
+  return SupportMKLDNNAct(param);
+}
+
 static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) 
{
   switch (param.act_type) {
     case activation::kReLU:
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h 
b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 17e74094c2b..e367f42c188 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -175,10 +175,11 @@ struct ConvolutionParam;
 struct DeconvolutionParam;
 struct SoftmaxParam;
 bool SupportMKLDNNAct(const ActivationParam& param);
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input);
 bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input);
 bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray 
&input);
 bool SupportMKLDNNSoftmax(const SoftmaxParam& param);
-}
+}  // namespace op
 
 static int GetTypeSize(int dtype) {
   int size = -1;
@@ -250,15 +251,24 @@ inline static mkldnn::memory::desc GetMemDesc(const 
NDArray &arr) {
 
 inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
                                                  int num_groups) {
+  auto ndim = arr.shape().ndim();
+  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
   if (num_groups == 1) {
     return GetMemDesc(arr);
   } else {
-    CHECK_EQ(arr.shape().ndim(), 4U);
-    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
-      static_cast<int>(arr.shape()[0] / num_groups),
-      static_cast<int>(arr.shape()[1]),
-      static_cast<int>(arr.shape()[2]),
-      static_cast<int>(arr.shape()[3])};
+    CHECK((ndim == 3) || (ndim == 4))
+        << "MKL-DNN weight currently supports 3d and 4d layout";
+    const int N = 0, H = 2, W = 3, C = 1;
+    if (ndim == 3) {
+      tz = mkldnn::memory::dims{
+          num_groups, static_cast<int>(arr.shape()[N] / num_groups),
+          static_cast<int>(arr.shape()[C]), static_cast<int>(arr.shape()[H])};
+    } else {
+      tz = mkldnn::memory::dims{
+          num_groups, static_cast<int>(arr.shape()[N] / num_groups),
+          static_cast<int>(arr.shape()[C]), static_cast<int>(arr.shape()[H]),
+          static_cast<int>(arr.shape()[W])};
+    }
     return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
                                 mkldnn::memory::format::any};
   }
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc 
b/src/operator/nn/mkldnn/mkldnn_base.cc
index 5da55f4ca70..ccb9d7ec007 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -239,39 +239,49 @@ const mkldnn::memory *GetWeights(const NDArray &arr,
     return mem;
 
   mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
+  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
+  mkldnn::memory::format format = mkldnn::memory::format::format_undef;
   auto engine = CpuEngine::Get()->get_engine();
+  const int O = 0, I = 1, H = 2, W = 3;
   if (arr.shape().ndim() == 2) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{
-      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
-  } else if (arr.shape().ndim() == 4 && num_groups == 1) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{
-      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1]),
-          static_cast<int>(arr.shape()[2]), static_cast<int>(arr.shape()[3])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
+    tz = mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
+                              static_cast<int>(arr.shape()[I])};
+    format = mkldnn::memory::format::oi;
+  } else if (arr.shape().ndim() == 3) {
+    tz = num_groups > 1
+             ? mkldnn::memory::dims{num_groups,
+                                    static_cast<int>(arr.shape()[O] /
+                                                     num_groups),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H])}
+             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H])};
+    format = num_groups > 1 ? mkldnn::memory::format::goiw
+                            : mkldnn::memory::format::oiw;
   } else if (arr.shape().ndim() == 4) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
-      static_cast<int>(arr.shape()[0] / num_groups),
-      static_cast<int>(arr.shape()[1]),
-      static_cast<int>(arr.shape()[2]),
-      static_cast<int>(arr.shape()[3])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
+    tz = num_groups > 1
+             ? mkldnn::memory::dims{num_groups,
+                                    static_cast<int>(arr.shape()[O] /
+                                                     num_groups),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H]),
+                                    static_cast<int>(arr.shape()[W])}
+             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H]),
+                                    static_cast<int>(arr.shape()[W])};
+    format = num_groups > 1 ? mkldnn::memory::format::goihw
+                            : mkldnn::memory::format::oihw;
   } else {
     LOG(FATAL) << "The weight array has an unsupported number of dimensions";
     return nullptr;
   }
+  mkldnn::memory::desc md =
+      mkldnn::memory::desc{tz, type, format};
+  mkldnn::memory::primitive_desc pd =
+      mkldnn::memory::primitive_desc{md, engine};
+  mem = arr.GetMKLDNNData(pd);
   if (mem == nullptr)
     mem = arr.GetMKLDNNDataReorder(target_pd);
   if (mem->get_primitive_desc() == target_pd) return mem;
@@ -285,6 +295,7 @@ mkldnn_memory_format_t GetDefaultFormat(int num_dims) {
   switch (num_dims) {
     case 1: return mkldnn_x;
     case 2: return mkldnn_nc;
+    case 3: return mkldnn_ncw;
     case 4: return mkldnn_nchw;
     case 5: return mkldnn_goihw;
     default:
@@ -301,6 +312,30 @@ mkldnn_memory_format_t GetDefaultFormat(const 
mkldnn::memory::desc &desc) {
       return mkldnn_oi;
     else
       return desc.data.format;
+  } else if (desc.data.ndims == 3) {
+    switch (desc.data.format) {
+      case mkldnn_ncw:
+      case mkldnn_nwc:
+      case mkldnn_nCw8c:
+      case mkldnn_nCw16c:
+        return mkldnn_ncw;
+      case mkldnn_oiw:
+      case mkldnn_wio:
+      case mkldnn_Owi8o:
+      case mkldnn_OIw8i8o:
+      case mkldnn_OIw8o8i:
+      case mkldnn_OIw16i16o:
+      case mkldnn_OIw16o16i:
+      case mkldnn_Oiw16o:
+      case mkldnn_Owi16o:
+      case mkldnn_OIw8i16o2i:
+      case mkldnn_OIw8o16i2o:
+      case mkldnn_IOw16o16i:
+        return mkldnn_oiw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 3 dimensions: " << 
desc.data.format;
+        return mkldnn_format_undef;
+    }
   } else if (desc.data.ndims == 4) {
     switch (desc.data.format) {
       case mkldnn_nchw:
@@ -329,6 +364,18 @@ mkldnn_memory_format_t GetDefaultFormat(const 
mkldnn::memory::desc &desc) {
       case mkldnn_Ohwi16o:
       case mkldnn_OhIw16o4i:
         return mkldnn_oihw;
+      case mkldnn_goiw:
+      case mkldnn_gOwi8o:
+      case mkldnn_gOIw8o8i:
+      case mkldnn_gOIw8i8o:
+      case mkldnn_gOIw16i16o:
+      case mkldnn_gOIw16o16i:
+      case mkldnn_gOiw16o:
+      case mkldnn_gOwi16o:
+      case mkldnn_gOIw8i16o2i:
+      case mkldnn_gOIw8o16i2o:
+      case mkldnn_gIOw16o16i:
+        return mkldnn_goiw;
       default:
         LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << 
desc.data.format;
         return mkldnn_format_undef;
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc 
b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index dd1f3ec07d7..7f423ce4524 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -37,9 +37,12 @@ namespace op {
 DMLC_REGISTER_PARAMETER(MKLDNNConvParam);
 
 bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
-  if (params.kernel.ndim() != 2)
+  if ((params.kernel.ndim() != 1) &&
+      (params.kernel.ndim() != 2))
     return false;
-  return SupportMKLDNNQuantize(input.dtype()) && input.shape().ndim() == 4;
+  return SupportMKLDNNQuantize(input.dtype()) &&
+         ((input.shape().ndim() == 3) ||
+          (input.shape().ndim() == 4));
 }
 
 mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
@@ -51,15 +54,26 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
   auto weight_md = GetWeightDesc(weights, param.conv_param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
-  CHECK_GE(param.conv_param.stride.ndim(), 2U);
-  CHECK_GE(param.conv_param.pad.ndim(), 2U);
-  CHECK_GE(param.conv_param.dilate.ndim(), 2U);
-  mkldnn::memory::dims strides{0, 0};
-  strides[0] = param.conv_param.stride[0];
-  strides[1] = param.conv_param.stride[1];
-  mkldnn::memory::dims padding{0, 0};
-  padding[0] = param.conv_param.pad[0];
-  padding[1] = param.conv_param.pad[1];
+  mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
+  mkldnn::memory::dims padding(param.conv_param.kernel.ndim());
+  if (param.conv_param.kernel.ndim() == 1) {
+    CHECK_GE(param.conv_param.stride.ndim(), 1U);
+    CHECK_GE(param.conv_param.pad.ndim(), 1U);
+    CHECK_GE(param.conv_param.dilate.ndim(), 1U);
+    strides[0] = param.conv_param.stride[0];
+    padding[0] = param.conv_param.pad[0];
+  } else if (param.conv_param.kernel.ndim() == 2) {
+    CHECK_GE(param.conv_param.stride.ndim(), 2U);
+    CHECK_GE(param.conv_param.pad.ndim(), 2U);
+    CHECK_GE(param.conv_param.dilate.ndim(), 2U);
+    strides[0] = param.conv_param.stride[0];
+    strides[1] = param.conv_param.stride[1];
+    padding[0] = param.conv_param.pad[0];
+    padding[1] = param.conv_param.pad[1];
+  } else {
+    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size "
+               << param.conv_param.kernel.ndim() << ", supporting only 1 or 
2.";
+  }
   mkldnn::primitive_attr attr;
   mkldnn::post_ops ops;
   if (param.mkldnn_param.with_relu) {
@@ -113,9 +127,17 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
     }
     return conv_pd;
   } else {
-    mkldnn::memory::dims dilates{0, 0};
-    dilates[0] = param.conv_param.dilate[0] - 1;
-    dilates[1] = param.conv_param.dilate[1] - 1;
+    mkldnn::memory::dims dilates(param.conv_param.kernel.ndim());
+    if (param.conv_param.dilate.ndim() == 1) {
+      dilates[0] = param.conv_param.dilate[0] - 1;
+    } else if (param.conv_param.dilate.ndim() == 2) {
+      dilates[0] = param.conv_param.dilate[0] - 1;
+      dilates[1] = param.conv_param.dilate[1] - 1;
+    } else {
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
+                 << param.conv_param.dilate.ndim()
+                 << ", supporting only 1 or 2.";
+    }
     if (bias == nullptr) {
       mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding,
@@ -151,15 +173,26 @@ static mkldnn::convolution_backward_data::primitive_desc 
GetConvBwdData(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
-  CHECK_GE(param.stride.ndim(), 2U);
-  CHECK_GE(param.pad.ndim(), 2U);
-  CHECK_GE(param.dilate.ndim(), 2U);
-  mkldnn::memory::dims strides{0, 0};
-  strides[0] = param.stride[0];
-  strides[1] = param.stride[1];
-  mkldnn::memory::dims padding{0, 0};
-  padding[0] = param.pad[0];
-  padding[1] = param.pad[1];
+  mkldnn::memory::dims strides(param.kernel.ndim());
+  mkldnn::memory::dims padding(param.kernel.ndim());
+  if (param.kernel.ndim() == 1) {
+    CHECK_GE(param.stride.ndim(), 1U);
+    CHECK_GE(param.pad.ndim(), 1U);
+    CHECK_GE(param.dilate.ndim(), 1U);
+    strides[0] = param.stride[0];
+    padding[0] = param.pad[0];
+  } else if (param.kernel.ndim() == 2) {
+    CHECK_GE(param.stride.ndim(), 2U);
+    CHECK_GE(param.pad.ndim(), 2U);
+    CHECK_GE(param.dilate.ndim(), 2U);
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else {
+    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim()
+               << ", supporting only 1 or 2.";
+  }
 
   // MKL-DNN introduced padded formats since 0.15 which require more memory
   // for computation compared with the actual tensor size. Currently, MKL-DNN
@@ -177,9 +210,16 @@ static mkldnn::convolution_backward_data::primitive_desc 
GetConvBwdData(
     }
     return conv_pd;
   } else {
-    mkldnn::memory::dims dilates{0, 0};
-    dilates[0] = param.dilate[0] - 1;
-    dilates[1] = param.dilate[1] - 1;
+    mkldnn::memory::dims dilates(param.kernel.ndim());
+    if (param.dilate.ndim() == 1) {
+      dilates[0] = param.dilate[0] - 1;
+    } else if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    } else {
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
+                 << param.dilate.ndim() << ", supporting only 1 or 2.";
+    }
     mkldnn::convolution_backward_data::desc 
desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, dilates, padding, padding,
         mkldnn::padding_kind::zero);
@@ -201,15 +241,26 @@ static 
mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
-  CHECK_GE(param.stride.ndim(), 2U);
-  CHECK_GE(param.pad.ndim(), 2U);
-  CHECK_GE(param.dilate.ndim(), 2U);
-  mkldnn::memory::dims strides{0, 0};
-  strides[0] = param.stride[0];
-  strides[1] = param.stride[1];
-  mkldnn::memory::dims padding{0, 0};
-  padding[0] = param.pad[0];
-  padding[1] = param.pad[1];
+  mkldnn::memory::dims strides(param.kernel.ndim());
+  mkldnn::memory::dims padding(param.kernel.ndim());
+  if (param.kernel.ndim() == 1) {
+    CHECK_GE(param.stride.ndim(), 1U);
+    CHECK_GE(param.pad.ndim(), 1U);
+    CHECK_GE(param.dilate.ndim(), 1U);
+    strides[0] = param.stride[0];
+    padding[0] = param.pad[0];
+  } else if (param.kernel.ndim() == 2) {
+    CHECK_GE(param.stride.ndim(), 2U);
+    CHECK_GE(param.pad.ndim(), 2U);
+    CHECK_GE(param.dilate.ndim(), 2U);
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else {
+    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim()
+               << ", supporting only 1 or 2.";
+  }
 
   // MKL-DNN introduced padded formats since 0.15 which require more memory
   // for computation compared with the actual tensor size. Currently, MKL-DNN
@@ -239,9 +290,16 @@ static 
mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
     }
     return conv_pd;
   } else {
-    mkldnn::memory::dims dilates{0, 0};
-    dilates[0] = param.dilate[0] - 1;
-    dilates[1] = param.dilate[1] - 1;
+    mkldnn::memory::dims dilates(param.kernel.ndim());
+    if (param.dilate.ndim() == 1) {
+      dilates[0] = param.dilate[0] - 1;
+    } else if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    } else {
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
+                 << param.dilate.ndim() << ", supporting only 1 or 2.";
+    }
     if (bias == nullptr) {
       mkldnn::convolution_backward_weights::desc 
desc(mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding,
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc 
b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
index dfa98d1f5ee..65e0e5c4b27 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
@@ -626,8 +626,12 @@ std::vector<std::pair<int, int>> SgMKLDNNConvInplaceOption(
 }
 
 nnvm::NodePtr SgMKLDNNConvQuantizedOp(const NodeAttrs& attrs) {
+  auto const &param = nnvm::get<MKLDNNConvFusionParam>(attrs.parsed);
   nnvm::NodePtr node = nnvm::Node::Create();
   node->attrs.op = Op::Get("_sg_mkldnn_conv");
+  CHECK_EQ(param.full_conv_param.conv_param.kernel.ndim(), 2U)
+      << "Quantized Convolution of MKL-DNN only supports 2D kernel currently."
+      << " Please exclude this layer from the quantized model.";
   node->attrs.name = "quantized_" + attrs.name;
   node->attrs.dict = attrs.dict;
   node->attrs.dict["quantized"] = "true";
diff --git a/tests/python/unittest/test_operator.py 
b/tests/python/unittest/test_operator.py
index 09157396f83..a895594ce28 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -1602,33 +1602,33 @@ def check_batchnorm_training(stype):
 def test_convolution_grouping():
     for dim in [1, 2, 3]:
         num_filter = 4
-        num_group = 2
-        kernel = (3,) * dim
-        shape = (1, 4) + (9,) * dim
-
-        x = mx.sym.Variable('x')
-        w = mx.sym.Variable('w')
-        b = mx.sym.Variable('b')
-        y1 = mx.sym.Convolution(data=x, weight=w, bias=b, 
num_filter=num_filter, num_group=num_group, kernel=kernel)
-        xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
-        wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
-        bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
-        y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], 
weight=wslice[i], bias=bslice[i],
-                                                
num_filter=num_filter//num_group, kernel=kernel)
-                           for i in range(num_group)])
-
-        exe1 = y1.simple_bind(default_context(), x=shape)
-        exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, 
shape[1]//num_group) + kernel, b=(num_filter,))
-        for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
-            arr1[:] = np.float32(np.random.normal(size=arr1.shape))
-            arr2[:] = arr1
-        exe1.forward(is_train=True)
-        exe1.backward(exe1.outputs[0])
-        exe2.forward(is_train=True)
-        exe2.backward(exe2.outputs[0])
-
-        for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + 
exe2.grad_arrays):
-            np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), 
rtol=1e-3, atol=1e-3)
+        for num_group in [1, 2]:
+            kernel = (3,) * dim
+            shape = (1, 4) + (9,) * dim
+
+            x = mx.sym.Variable('x')
+            w = mx.sym.Variable('w')
+            b = mx.sym.Variable('b')
+            y1 = mx.sym.Convolution(data=x, weight=w, bias=b, 
num_filter=num_filter, num_group=num_group, kernel=kernel)
+            xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
+            wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
+            bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
+            y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], 
weight=wslice[i], bias=bslice[i],
+                                                    
num_filter=num_filter//num_group, kernel=kernel)
+                            for i in range(num_group)])
+
+            exe1 = y1.simple_bind(default_context(), x=shape)
+            exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, 
shape[1]//num_group) + kernel, b=(num_filter,))
+            for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
+                arr1[:] = np.float32(np.random.normal(size=arr1.shape))
+                arr2[:] = arr1
+            exe1.forward(is_train=True)
+            exe1.backward(exe1.outputs[0])
+            exe2.forward(is_train=True)
+            exe2.backward(exe2.outputs[0])
+
+            for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, 
exe2.outputs + exe2.grad_arrays):
+                np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), 
rtol=1e-3, atol=1e-3)
 
 
 @unittest.skip("Flaky test 
https://github.com/apache/incubator-mxnet/issues/12203";)
@@ -6772,7 +6772,7 @@ def get_output_names_callback(name, arr):
 
 @with_seed()
 def test_activation():
-    shape=(9, 10)
+    shapes = [(9,), (9, 10), (9, 10, 10), (1, 9, 10, 10)]
     dtype_l = [np.float64, np.float32, np.float16]
     rtol_l = [1e-7, 1e-6, 1e-2]
     atol_l = [1e-7, 1e-6, 1e-2]
@@ -6803,17 +6803,19 @@ def test_activation():
     }
     # Loop over operators
     for name, op in unary_ops.items():
-        # Loop over dtype's
-        for ind in range(len(dtype_l)):
-            dtype = dtype_l[ind]
-            rtol = rtol_l[ind]
-            atol = atol_l[ind]
-            compare_forw_backw_unary_op(
-                name, op[0], op[1], op[2], shape, op[3], op[4], rtol, atol,
-                dtype)
-        # Finite difference testing
-        finite_diff_unary_op(
-            name, op[0], shape, op[3], op[4], rtol_fd, atol_fd, num_eps)
+        # Loop over shapes
+        for shape in shapes:
+            # Loop over dtype's
+            for ind in range(len(dtype_l)):
+                dtype = dtype_l[ind]
+                rtol = rtol_l[ind]
+                atol = atol_l[ind]
+                compare_forw_backw_unary_op(
+                    name, op[0], op[1], op[2], shape, op[3], op[4], rtol, atol,
+                    dtype)
+            # Finite difference testing
+            finite_diff_unary_op(
+                name, op[0], shape, op[3], op[4], rtol_fd, atol_fd, num_eps)
 
 @with_seed()
 def test_ravel():
diff --git a/tests/python/unittest/test_optimizer.py 
b/tests/python/unittest/test_optimizer.py
index eb33f9b5217..a41bdad41cc 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import itertools
 import numpy as np
 import itertools
 import mxnet as mx
@@ -436,6 +435,90 @@ def test_nag():
                             compare_optimizer(opt1(**kwarg), opt2(**kwarg), 
shape, dtype)
 
 
+#SGLD
+class PySGLD(mx.optimizer.Optimizer):
+    """python reference implementation of SGLD"""
+
+    def __init__(self, **kwargs):
+        super(PySGLD, self).__init__(**kwargs)
+
+    def create_state(self, index, weight):
+        return None
+
+    def update(self, index, weight, grad, state):
+        assert(isinstance(weight, mx.nd.NDArray))
+        assert(isinstance(grad, mx.nd.NDArray))
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+
+        grad = grad * self.rescale_grad
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+        weight[:] += - lr/2 * (grad + wd * weight) + mx.random.normal(0, 
math.sqrt(lr), shape=weight.shape,
+                                                            
dtype=weight.dtype, ctx=weight.context)
+
+
+
+@with_seed()
+def test_sgld():
+    opt1 = PySGLD
+    opt2 = mx.optimizer.SGLD
+    shape = (3, 4, 5)
+    ns_options = [1234, 42]
+
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
+
+
+    def compare_optimizer_noise_seeded(opt1, opt2, shape, dtype, noise_seed,
+                                       w_stype='default', g_stype='default',
+                                       rtol=1e-4, atol=1e-5, 
compare_states=True):
+        """Compare opt1 and opt2 with the added functionality that the seed 
for generating random noise
+        in the SGLD optimizer update is set so that the same noise is used in 
opt1 and opt2.
+
+        """
+        if w_stype == 'default':
+            w2 = mx.random.uniform(shape=shape, ctx=default_context(), 
dtype=dtype)
+            w1 = w2.copyto(default_context())
+        elif w_stype == 'row_sparse' or w_stype == 'csr':
+            w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype)
+            w1 = w2.copyto(default_context()).tostype('default')
+        else:
+            raise Exception("type not supported yet")
+        if g_stype == 'default':
+            g2 = mx.random.uniform(shape=shape, ctx=default_context(), 
dtype=dtype)
+            g1 = g2.copyto(default_context())
+        elif g_stype == 'row_sparse' or g_stype == 'csr':
+            g2 = rand_ndarray(shape, g_stype, dtype=dtype)
+            g1 = g2.copyto(default_context()).tostype('default')
+        else:
+            raise Exception("type not supported yet")
+
+        state1 = opt1.create_state_multi_precision(0, w1)
+        state2 = opt2.create_state_multi_precision(0, w2)
+        if compare_states:
+            compare_ndarray_tuple(state1, state2)
+
+        # set seed for Gaussian noise replication
+        mx.random.seed(noise_seed)
+        opt1.update_multi_precision(0, w1, g1, state1)
+        mx.random.seed(noise_seed)
+        opt2.update_multi_precision(0, w2, g2, state2)
+        if compare_states:
+            compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol)
+        assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol)
+
+    for seed in ns_options:
+        for dtype in [np.float16, np.float32, np.float64]:
+            for params in itertools.product(cg_options, wd_options, 
mp_options):
+                kwarg = {k: v for param in params for k, v in param.items()}
+                if (dtype == np.float16 and ('multi_precision' not in kwarg or
+                    not kwarg['multi_precision'])):
+                    continue
+                compare_optimizer_noise_seeded(opt1(**kwarg), opt2(**kwarg), 
shape, dtype, seed)
+
 
 # FTML
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to