larroy commented on a change in pull request #15545: Softmax optimization for GPU
URL: https://github.com/apache/incubator-mxnet/pull/15545#discussion_r315952016
 
 

 ##########
 File path: src/operator/nn/softmax-inl.h
 ##########
 @@ -188,89 +180,77 @@ struct log_softmax_bwd {
   }
 };
 
-
 template<typename OP1, typename OP2, int Req, bool negate,
-         typename AType, typename DType, typename OType, int ndim>
+         typename AType, typename DType, typename OType, typename IType, int ndim>
 inline void SoftmaxGrad(Stream<cpu> *s, OType *out, OType *ograd,
-                        DType *igrad, Shape<ndim> shape, int axis,
-                        const DType temperature) {
+                        DType *igrad, IType *length, Shape<ndim> shape,
+                        int axis, const DType temperature) {
   index_t M = shape[axis];
   index_t N = shape.Size()/M;
   Shape<ndim> stride = calc_stride(shape);
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
   index_t sa = stride[axis];
 
-  #pragma omp parallel for
-  for (index_t i = 0; i < N; ++i) {
-    index_t base = unravel_dot(i, sshape, stride);
+  if (length != nullptr) {
+    #pragma omp parallel for
+    for (index_t i = 0; i < N; ++i) {
+      index_t base = unravel_dot(i, sshape, stride);
+      index_t len = static_cast<index_t>(length[i]);
 
-    AType sum = AType(0);
-    for (index_t j = 0; j < M; ++j) {
-      sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
-    }
-
-    // By default temperature is 1.0.
-    // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
-    DType final_result;
-    if (temperature == 1.0) {
-      for (index_t j = 0; j < M; ++j) {
-        final_result = negate ?
-                       -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) :
-                       OP2::Map(ograd[base + j*sa], out[base + j*sa], sum);
-        KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result);
-      }
-    } else {
-      for (index_t j = 0; j < M; ++j) {
-        final_result = negate ?
-                       -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature :
-                       OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature;
-        KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result);
+      AType sum = AType(0);
+      for (index_t j = 0; j < len; ++j) {
+        sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
       }
-    }
-  }
-}
 
-template<typename OP1, typename OP2, int Req, bool negate,
-         typename AType, typename DType, typename OType, typename IType, int ndim>
-inline void SoftmaxWithLengthGrad(Stream<cpu> *s, OType *out, OType *ograd,
-                                  DType *igrad, IType *length, Shape<ndim> shape,
-                                  int axis, const DType temperature) {
-  index_t M = shape[axis];
-  index_t N = shape.Size()/M;
-  Shape<ndim> stride = calc_stride(shape);
-  Shape<ndim> sshape = shape;
-  sshape[axis] = 1;
-  index_t sa = stride[axis];
-
-  #pragma omp parallel for
-  for (index_t i = 0; i < N; ++i) {
-    index_t base = unravel_dot(i, sshape, stride);
-    index_t len = static_cast<index_t>(length[i]);
-
-    AType sum = AType(0);
-    for (index_t j = 0; j < len; ++j) {
-      sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
+      // By default temperature is 1.0.
+      // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
+      DType final_result;
+      if (temperature == 1.0) {
 
 Review comment:
   is this micro-opt really making things better?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to