eric-haibin-lin closed pull request #10293: [MXNET-72] Improve sparse sgd on GPU
URL: https://github.com/apache/incubator-mxnet/pull/10293

This is a PR merged from a forked repository. As GitHub hides the
original diff on merge, it is supplied below for the sake of provenance:

diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index 104f20a61ee..66e5059d6ea 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -108,8 +108,39 @@ inline void SGDUpdate(const nnvm::NodeAttrs& attrs,
 
 /*! \brief kernel for sparse sgd
  */
+template<int req, typename xpu>
+struct SGDDnsRspKernel;
+
 template<int req>
-struct SGDDnsRspKernel {
+struct SGDDnsRspKernel<req, gpu> {
+  // DType is the output data type
+  // IType is row sparse idx type
+  // i is the ith element in row sparse gradient
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, const index_t row_length, DType* out, const DType* weight,
+                                  const IType* grad_idx, const DType *grad_val,
+                                  const DType clip_gradient, const DType lr,
+                                  const DType wd, const DType rescale_grad) {
+    using nnvm::dim_t;
+    using namespace mshadow_op;
+    const dim_t row_id = i / row_length;
+    const dim_t col_id = i % row_length;
+    const dim_t row_offset = grad_idx[row_id] * row_length;
+    const dim_t data_i = row_offset + col_id;
+    if (clip_gradient >= 0.0f) {
+      KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] -
+                   (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[i], clip_gradient));
+    } else {
+      KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] -
+                    (lr * rescale_grad) * grad_val[i]);
+    }
+  }
+};
+
+/*! \brief kernel for sparse sgd
+ */
+template<int req>
+struct SGDDnsRspKernel<req, cpu> {
   // DType is the output data type
   // IType is row sparse idx type
   // i is the ith row in row sparse gradient
@@ -154,11 +185,15 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param,
     MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
       MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
         DType* weight_data = weight.dptr<DType>();
-        IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
-        DType* grad_val = grad.data().dptr<DType>();
-        index_t num_rows = grad.aux_shape(rowsparse::kIdx)[0];
-        auto row_length = weight.shape_.ProdShape(1, weight.ndim());
-        Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, row_length,
+        const IType* grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
+        const DType* grad_val = grad.data().dptr<DType>();
+        const nnvm::dim_t num_rows = grad.aux_shape(rowsparse::kIdx)[0];
+        const auto row_length = weight.shape_.ProdShape(1, weight.ndim());
+        size_t num_threads = num_rows;
+        if (std::is_same<xpu, gpu>::value) {
+          num_threads = num_rows * row_length;
+        }
+        Kernel<SGDDnsRspKernel<req_type, xpu>, xpu>::Launch(s, num_threads, row_length,
           out->dptr<DType>(), weight_data, grad_idx, grad_val,
           static_cast<DType>(param.clip_gradient),
           static_cast<DType>(param.lr), static_cast<DType>(param.wd),
@@ -375,8 +410,11 @@ inline void MP_SGDMomUpdate(const nnvm::NodeAttrs& attrs,
   });
 }
 
+template<int req, typename xpu>
+struct SGDMomDnsRspDnsKernel;
+
 template<int req>
-struct SGDMomDnsRspDnsKernel {
+struct SGDMomDnsRspDnsKernel<req, cpu> {
   template<typename DType, typename IType>
   MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
     DType* mom_data, const DType* weight_data, const IType* grad_idx,
@@ -402,6 +440,33 @@ struct SGDMomDnsRspDnsKernel {
   }
 };
 
+template<int req>
+struct SGDMomDnsRspDnsKernel<req, gpu> {
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* mom_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const DType clip_gradient, const DType momentum,
+    const DType lr, const DType wd, const DType rescale_grad) {
+    using nnvm::dim_t;
+    const DType rate = lr * wd;
+    const dim_t row_id = i / row_length;
+    const dim_t col_id = i % row_length;
+    const dim_t data_i = grad_idx[row_id] * row_length + col_id;
+    if (clip_gradient >= 0.0f) {
+      mom_data[data_i] = momentum * mom_data[data_i]
+              - rate * weight_data[data_i]
+              - lr *
+              mshadow_op::clip::Map(rescale_grad * grad_data[i],
+                                    clip_gradient);
+    } else {
+      mom_data[data_i] = momentum * mom_data[data_i]
+                - rate * weight_data[data_i]
+                - lr * rescale_grad * grad_data[i];
+    }
+    KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
+  }
+};
+
 template<typename xpu>
 inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
                                       const OpContext& ctx,
@@ -428,7 +493,11 @@ inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
         DType* out_data = out->dptr<DType>();
         index_t num_rows = grad.aux_shape(kIdx)[0];
         auto row_length = weight.shape_.ProdShape(1, weight.ndim());
-        Kernel<SGDMomDnsRspDnsKernel<req_type>, xpu>::Launch(s, num_rows, row_length,
+        size_t num_threads = num_rows;
+        if (std::is_same<xpu, gpu>::value) {
+          num_threads = num_rows * row_length;
+        }
+        Kernel<SGDMomDnsRspDnsKernel<req_type, xpu>, xpu>::Launch(s, num_threads, row_length,
           out_data, mom_data, weight_data, grad_idx, grad_val,
           static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
           static_cast<DType>(param.lr), static_cast<DType>(param.wd),
@@ -476,63 +545,41 @@ inline bool StdOptStorageType(const nnvm::NodeAttrs& attrs,
                               DispatchMode* dispatch_mode,
                               std::vector<int>* in_attrs,
                               std::vector<int>* out_attrs) {
+  using namespace common;
   CHECK_EQ(in_attrs->size(), static_cast<size_t>(n_rsp + n_rsp_dns));
   CHECK_EQ(out_attrs->size(), 1U);
   bool dispatched = false;
-
-  if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
+  if (!dispatched && ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
     // dns, ... -> dns
     dispatched = storage_type_assign(out_attrs, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
   }
   const std::vector<int> rsp_stypes(in_attrs->begin(), in_attrs->begin() + n_rsp);
   const std::vector<int> rsp_dns_stypes(in_attrs->begin() + n_rsp, in_attrs->end());
-  if (!dispatched && common::ContainsOnlyStorage(rsp_stypes, kRowSparseStorage) &&
-      (common::ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage) ||
-       common::ContainsOnlyStorage(rsp_dns_stypes, kDefaultStorage))) {
+  if (!dispatched && ContainsOnlyStorage(rsp_stypes, kRowSparseStorage) &&
+      (ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage) ||
+       ContainsOnlyStorage(rsp_dns_stypes, kDefaultStorage))) {
     // rsp, ..., rsp/dns, ... -> rsp
     dispatched = storage_type_assign(out_attrs, kRowSparseStorage,
                                      dispatch_mode, DispatchMode::kFComputeEx);
+    // warn users if lazy_update is turned on
+    if (dispatched && ContainsOnlyStorage(rsp_dns_stypes, kRowSparseStorage)) {
+      LogOnce("Optimizer with lazy_update = True detected. "
+      "Be aware that lazy update is different from standard update, "
+      "and may lead to different empirical results. See "
+      "https://mxnet.incubator.apache.org/api/python/optimization/optimization.html "
+      "for more details.");
+    }
   }
-
   if (!dispatched) {
     dispatched = dispatch_fallback(out_attrs, dispatch_mode);
   }
   return dispatched;
 }
 
-template<int req>
-struct SGDMomStdDnsRspDnsKernel {
-  template<typename DType, typename IType, typename RType>
-  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
-    DType* mom_data, const DType* weight_data, const IType* grad_idx,
-    const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
-    const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
-    const DType rate = lr * wd;
-    const bool non_zero = (i == 0) ? prefix_sum[0] > 0
-                                   : prefix_sum[i] > prefix_sum[i-1];
+template<int req, typename xpu>
+struct SGDMomStdDnsRspDnsKernel;
 
-    const index_t row_i = i * row_length;
-    const RType grad_i = (prefix_sum[i]-1) * row_length;
-    for (index_t j = 0; j < row_length; j++) {
-      const index_t data_i = row_i + j;
-      const DType grad = non_zero ? grad_data[grad_i + j]
-                                  : static_cast<DType>(0);
-      if (clip_gradient >= 0.0f) {
-        mom_data[data_i] = momentum * mom_data[data_i]
-                - rate * weight_data[data_i]
-                - lr *
-                mshadow_op::clip::Map(rescale_grad * grad,
-                                      clip_gradient);
-      } else {
-        mom_data[data_i] = momentum * mom_data[data_i]
-                  - rate * weight_data[data_i]
-                  - lr * rescale_grad * grad;
-      }
-      KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
-    }
-  }
-};
 
 template<typename xpu>
 void SGDMomStdUpdateDnsRspDnsImpl(const SGDMomParam& param,
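
[A note for readers of the hunks above: the new GPU specializations launch one
thread per gradient element rather than one per occupied row (hence
num_threads = num_rows * row_length), recovering the row and column from the
flat thread index. A minimal standalone sketch of that index mapping in plain
C++; the example values here are hypothetical and not part of the PR:

#include <cstdio>

int main() {
  const int num_rows = 2;                 // occupied rows in the rsp gradient
  const int row_length = 4;               // elements per row
  const int grad_idx[num_rows] = {1, 5};  // dense row ids of the occupied rows

  // One "thread" i per gradient element, as in the Kernel::Launch calls above.
  for (int i = 0; i < num_rows * row_length; ++i) {
    const int row_id = i / row_length;    // row within the sparse gradient
    const int col_id = i % row_length;    // column within that row
    const int data_i = grad_idx[row_id] * row_length + col_id;  // dense offset
    printf("thread %2d -> dense element %2d\n", i, data_i);
  }
  return 0;
}

Only elements of rows present in the gradient are touched, which is what makes
this path the "lazy" update the warning in StdOptStorageType refers to.]
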
diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
index f7ccbbb739d..7d87e2c9413 100644
--- a/src/operator/optimizer_op.cc
+++ b/src/operator/optimizer_op.cc
@@ -98,6 +98,38 @@ Where the parameter ``momentum`` is the decay rate of momentum estimates at each
 .add_argument("mom", "NDArray-or-Symbol", "Momentum")
 .add_arguments(SignumParam::__FIELDS__());
 
+template<int req>
+struct SGDMomStdDnsRspDnsKernel<req, cpu> {
+  template<typename DType, typename IType, typename RType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* mom_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
+    const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
+    const DType rate = lr * wd;
+    const bool non_zero = (i == 0) ? prefix_sum[0] > 0
+                                   : prefix_sum[i] > prefix_sum[i-1];
+
+    const index_t row_i = i * row_length;
+    const RType grad_i = (prefix_sum[i]-1) * row_length;
+    for (index_t j = 0; j < row_length; j++) {
+      const index_t data_i = row_i + j;
+      const DType grad = non_zero ? grad_data[grad_i + j]
+                                  : static_cast<DType>(0);
+      if (clip_gradient >= 0.0f) {
+        mom_data[data_i] = momentum * mom_data[data_i]
+                - rate * weight_data[data_i]
+                - lr *
+                mshadow_op::clip::Map(rescale_grad * grad,
+                                      clip_gradient);
+      } else {
+        mom_data[data_i] = momentum * mom_data[data_i]
+                  - rate * weight_data[data_i]
+                  - lr * rescale_grad * grad;
+      }
+      KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]);
+    }
+  }
+};
 
 template<>
 void SGDMomStdUpdateDnsRspDnsImpl<cpu>(const SGDMomParam& param,
@@ -139,7 +171,7 @@ void SGDMomStdUpdateDnsRspDnsImpl<cpu>(const SGDMomParam& param,
             prefix_sum[i] += prefix_sum[i - 1];
           }
         }
-        Kernel<SGDMomStdDnsRspDnsKernel<req_type>, cpu>::Launch(s, num_rows, row_length,
+        Kernel<SGDMomStdDnsRspDnsKernel<req_type, cpu>, cpu>::Launch(s, num_rows, row_length,
           out_data, mom_data, weight_data, grad_idx, grad_val, prefix_sum,
           static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
           static_cast<DType>(param.lr), static_cast<DType>(param.wd),
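
[The CPU kernel moved into optimizer_op.cc above performs the standard
(non-lazy) update: it visits every dense row and uses an inclusive prefix sum
over row occupancy to decide whether a row has a gradient and where that
gradient lives in the compact gradient array. A small sketch of the lookup,
with made-up occupancy values:

#include <cstdio>

int main() {
  const int num_rows = 5;
  // Dense rows 1 and 3 carry gradients; prefix_sum is the inclusive count of
  // occupied rows, as computed before the Kernel::Launch call above.
  const int prefix_sum[num_rows] = {0, 1, 1, 2, 2};

  for (int i = 0; i < num_rows; ++i) {
    // A row is occupied iff the running count increased at position i.
    const bool non_zero = (i == 0) ? prefix_sum[0] > 0
                                   : prefix_sum[i] > prefix_sum[i - 1];
    if (non_zero)
      printf("dense row %d reads gradient row %d\n", i, prefix_sum[i] - 1);
    else
      printf("dense row %d has a zero gradient\n", i);  // still updated: the
  }                                                     // momentum term decays
  return 0;
}]
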
diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu
index 18ee66a729c..0fd2ca83fda 100644
--- a/src/operator/optimizer_op.cu
+++ b/src/operator/optimizer_op.cu
@@ -29,6 +29,35 @@
 namespace mxnet {
 namespace op {
 
+template<int req>
+struct SGDMomStdDnsRspDnsKernel<req, gpu> {
+  template<typename DType, typename IType, typename RType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* mom_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const RType* prefix_sum, const DType clip_gradient,
+    const DType momentum, const DType lr, const DType wd, const DType rescale_grad) {
+    using nnvm::dim_t;
+    const DType rate = lr * wd;
+    const dim_t row_id = i / row_length;
+    const dim_t col_id = i % row_length;
+    const dim_t nnr = prefix_sum[row_id];
+    const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0
+                                        : nnr > prefix_sum[row_id - 1];
+    const RType grad_i = (nnr - 1) * row_length + col_id;
+    const DType grad = non_zero ? grad_data[grad_i]
+                                : static_cast<DType>(0);
+    if (clip_gradient >= 0.0f) {
+      mom_data[i] = momentum * mom_data[i]
+              - rate * weight_data[i]
+              - lr * mshadow_op::clip::Map(rescale_grad * grad, clip_gradient);
+    } else {
+      mom_data[i] = momentum * mom_data[i]
+                  - rate * weight_data[i] - lr * rescale_grad * grad;
+    }
+    KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]);
+  }
+};
+
 template<>
 void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
                                        const OpContext& ctx,
@@ -84,7 +113,8 @@ void SGDMomStdUpdateDnsRspDnsImpl<gpu>(const SGDMomParam& param,
                                         num_rows,
                                         mshadow::Stream<gpu>::GetStream(s));
         }
-        Kernel<SGDMomStdDnsRspDnsKernel<req_type>, gpu>::Launch(s, num_rows, row_length,
+        size_t num_threads = num_rows * row_length;
+        Kernel<SGDMomStdDnsRspDnsKernel<req_type, gpu>, gpu>::Launch(s, num_threads, row_length,
           out_data, mom_data, weight_data, grad_idx, grad_val, prefix_sum,
           static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
           static_cast<DType>(param.lr), static_cast<DType>(param.wd),
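
[For reference, the element-wise update these kernels implement is plain SGD
with weight decay folded into the weight term; the lazy (row-sparse) path
applies it only to rows present in the gradient. A scalar sketch of one
element's update, assuming clip behaves like mshadow_op::clip:

#include <algorithm>
#include <cstdio>

// Assumed equivalent of mshadow_op::clip::Map: clamp x to [-bound, bound].
float clip(float x, float bound) {
  return std::max(-bound, std::min(bound, x));
}

int main() {
  const float lr = 0.1f, wd = 0.01f;
  const float rescale_grad = 1.0f, clip_gradient = 5.0f;
  float weight = 2.0f;
  const float grad = 8.0f;  // hypothetical raw gradient for one element

  // w <- (1 - lr*wd) * w - lr * clip(rescale_grad * g, clip_gradient),
  // matching the clip_gradient >= 0 branch of SGDDnsRspKernel above.
  weight = (1.f - lr * wd) * weight
           - lr * clip(rescale_grad * grad, clip_gradient);
  printf("updated weight: %f\n", weight);  // 0.999*2 - 0.1*5 = 1.498
  return 0;
}]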


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services