apeforest commented on a change in pull request #17882:
URL: https://github.com/apache/incubator-mxnet/pull/17882#discussion_r411751450
##########
File path: src/operator/tensor/broadcast_reduce_op.h
##########
@@ -1058,22 +1065,58 @@ struct broadcast_kernel {
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape,
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape,
const OpReqType req,
- const uint32_t ndim) {
- size_t in_stride = 1;
- size_t out_stride = 1;
- index_t idx = i;
- index_t in_idx = i;
- for (int iter = ndim - 1; iter >= 0; --iter) {
- size_t dim_idx = idx % out_shape[iter];
- in_idx -= dim_idx * out_stride;
- if (in_shape[iter] != 1) {
- in_idx += dim_idx * in_stride;
- }
- idx /= out_shape[iter];
- in_stride *= in_shape[iter];
- out_stride *= out_shape[iter];
- }
- KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx]));
+ const uint32_t ndim,
+ const size_t *axes,
+ const size_t *out_stride,
+ const int num_broadcast_axes) {
+ index_t idx = i;
+ index_t init_off = 0;
+ for (int iter = ndim - 1; idx > 0 && iter >= 0; --iter) {
+ size_t dim_idx = idx % in_shape[iter];
+ init_off += dim_idx * out_stride[iter];
+ idx /= in_shape[iter];
+ }
+ index_t stride_0, stride_1, stride_2;
+ // Each case is based on the number of axes to be broadcast
+ // (1, 2 or 3) after merging axes.
+ switch (num_broadcast_axes) {
+ // when input shape is one of the forms
+ // [(x,1), (x,1,x), (1,x)]
+ // x can be any +ve number >=0 and they need not be equal to each other
Review comment:
What does +ve mean?
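
A minimal host-side sketch of the offset computation in the hunk above may
help here: it maps a flat input index i to its base offset in the broadcast
output. The shapes and strides below are illustrative assumptions, not
values from the PR; the variable names mirror the hunk.

    #include <cstddef>
    #include <cstdio>

    int main() {
      // input shape (2, 1, 3) broadcast to output shape (2, 4, 3)
      const int ndim = 3;
      size_t in_shape[]   = {2, 1, 3};
      size_t out_stride[] = {12, 3, 1};  // row-major strides of (2, 4, 3)
      for (size_t i = 0; i < 6; ++i) {   // 6 = 2 * 1 * 3 input elements
        size_t idx = i, init_off = 0;
        for (int iter = ndim - 1; idx > 0 && iter >= 0; --iter) {
          size_t dim_idx = idx % in_shape[iter];
          init_off += dim_idx * out_stride[iter];
          idx /= in_shape[iter];
        }
        std::printf("input %zu -> output base offset %zu\n", i, init_off);
      }
      return 0;
    }

Each input element is then replicated along the broadcast axis (axis 1
here) at multiples of out_stride[1] = 3, which is what case 1 of the
switch does.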
##########
File path: src/operator/numpy/np_matmul_op-inl.h
##########
@@ -157,12 +157,50 @@ inline void MatmulImpl(const OpContext& ctx,
DType* bc_b_ptr = bc_a_ptr + bc_size_a;
MSHADOW_TYPE_SWITCH_WITH_BOOL(input_a.type_flag_, IType, {
MSHADOW_TYPE_SWITCH_WITH_BOOL(input_b.type_flag_, OType, {
- Kernel<broadcast_kernel<mshadow_op::identity>, xpu>::Launch(
- s, bc_size_a, input_a.dptr<IType>(), bc_a_ptr,
- k_a_shape, k_a_shape_bc, OpReqType::kWriteTo, ndim);
- Kernel<broadcast_kernel<mshadow_op::identity>, xpu>::Launch(
- s, bc_size_b, input_b.dptr<IType>(), bc_b_ptr,
- k_b_shape, k_b_shape_bc, OpReqType::kWriteTo, ndim);
+ bool a_shape_changed = false, b_shape_changed = false;
+ size_t axes[ndim], out_stride[ndim], axes_b[ndim], out_stride_b[ndim];
+ int iter = ndim - 1, idx_a = 0, idx_b = 0;
+ out_stride[iter] = 1;
+ out_stride_b[iter] = 1;
+ if (k_a_shape[iter] != k_a_shape_bc[iter]) {
Review comment:
Can these two if blocks be included in the for loop below?
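
A minimal sketch of what the folded version might look like. The rest of
the loop is not visible in this hunk, so the stride recurrence below is an
assumption (row-major strides of the broadcast target shape k_a_shape_bc),
and only the handling of input a is shown:

    for (int iter = ndim - 1; iter >= 0; --iter) {
      // last axis has stride 1; otherwise accumulate strides from the right
      out_stride[iter] = (iter == ndim - 1)
                             ? 1
                             : out_stride[iter + 1] * k_a_shape_bc[iter + 1];
      if (k_a_shape[iter] != k_a_shape_bc[iter]) {
        axes[idx_a++] = iter;  // record a broadcast axis of input a
        a_shape_changed = true;
      }
    }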
##########
File path: src/operator/tensor/broadcast_reduce_op.h
##########
@@ -1058,22 +1065,58 @@ struct broadcast_kernel {
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape,
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape,
const OpReqType req,
- const uint32_t ndim) {
- size_t in_stride = 1;
- size_t out_stride = 1;
- index_t idx = i;
- index_t in_idx = i;
- for (int iter = ndim - 1; iter >= 0; --iter) {
- size_t dim_idx = idx % out_shape[iter];
- in_idx -= dim_idx * out_stride;
- if (in_shape[iter] != 1) {
- in_idx += dim_idx * in_stride;
- }
- idx /= out_shape[iter];
- in_stride *= in_shape[iter];
- out_stride *= out_shape[iter];
- }
- KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx]));
+ const uint32_t ndim,
+ const size_t *axes,
+ const size_t *out_stride,
+ const int num_broadcast_axes) {
+ index_t idx = i;
+ index_t init_off = 0;
+ for (int iter = ndim - 1; idx > 0 && iter >= 0; --iter) {
+ size_t dim_idx = idx % in_shape[iter];
+ init_off += dim_idx * out_stride[iter];
+ idx /= in_shape[iter];
+ }
+ index_t stride_0, stride_1, stride_2;
+ // Each case is based on the number of axes to be broadcast
+ // (1, 2 or 3) after merging axes.
+ switch (num_broadcast_axes) {
+ // when input shape is one of the forms
+ // [(x,1), (x,1,x), (1,x)]
+ // x can be any +ve number >=0 and they need not be equal to each other
+ case 1 :
+ stride_0 = out_stride[axes[0]];
+ for (int l=0; l < out_shape[axes[0]]; l++) {
+ KERNEL_ASSIGN(output[init_off + l*stride_0],
+ req, OP::Map(input[i]));
+ }
+ break;
+ // when input shape is one of the forms
+ // [(x,1,x,1), (1,x,1,x), (x,1,x,1,x)]
+ // x can be any +ve number >1 or =0 (the axis) and they need not be equal to each other
+ case 2:
+ stride_1 = out_stride[axes[1]], stride_0 = out_stride[axes[0]];
+ for (int k=0; k < out_shape[axes[1]]; k++) {
+ for (int l=0; l < out_shape[axes[0]]; l++) {
+ KERNEL_ASSIGN(output[init_off + k*stride_1 + l*stride_0],
+ req, OP::Map(input[i]));
+ }
+ }
+ break;
+ // when input shape is of the form [(1,x,1,x,1)] and
+ // x can be any +ve number >=0 and they need not be equal to each other
+ case 3:
+ stride_2 = out_stride[axes[2]], stride_1 = out_stride[axes[1]];
+ stride_0 = out_stride[axes[0]];
+ for (int j=0; j < out_shape[axes[2]]; j++) {
+ for (int k=0; k < out_shape[axes[1]]; k++) {
+ for (int l=0; l < out_shape[axes[0]]; l++) {
+ KERNEL_ASSIGN(output[init_off + j*stride_2 + k*stride_1 + l*stride_0],
+ req, OP::Map(input[i]));
+ }
+ }
+ }
+ break;
+ }
Review comment:
Add a default case to error out.
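
A sketch of such a default case, as a fragment of the switch above. The
kernel body may run on device, where LOG(FATAL) is not available, so a
device-side assert (from <cassert>) is used here as a placeholder for
whatever error mechanism the project prefers:

    default:
      // num_broadcast_axes should be 1, 2 or 3 after axis merging;
      // anything else indicates a bug in the caller, so fail loudly.
      assert(false && "num_broadcast_axes must be 1, 2 or 3");
      break;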
##########
File path: src/operator/tensor/broadcast_reduce_op.h
##########
@@ -1058,22 +1065,58 @@ struct broadcast_kernel {
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape,
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape,
const OpReqType req,
- const uint32_t ndim) {
- size_t in_stride = 1;
- size_t out_stride = 1;
- index_t idx = i;
- index_t in_idx = i;
- for (int iter = ndim - 1; iter >= 0; --iter) {
- size_t dim_idx = idx % out_shape[iter];
- in_idx -= dim_idx * out_stride;
- if (in_shape[iter] != 1) {
- in_idx += dim_idx * in_stride;
- }
- idx /= out_shape[iter];
- in_stride *= in_shape[iter];
- out_stride *= out_shape[iter];
- }
- KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx]));
+ const uint32_t ndim,
+ const size_t *axes,
+ const size_t *out_stride,
+ const int num_broadcast_axes) {
+ index_t idx = i;
+ index_t init_off = 0;
+ for (int iter = ndim - 1; idx > 0 && iter >= 0; --iter) {
+ size_t dim_idx = idx % in_shape[iter];
+ init_off += dim_idx * out_stride[iter];
+ idx /= in_shape[iter];
+ }
+ index_t stride_0, stride_1, stride_2;
+ // Each case is based on the number of axes to be broadcast
+ // (1, 2 or 3) after merging axes.
+ switch (num_broadcast_axes) {
+ // when input shape is one of the forms
+ // [(x,1), (x,1,x), (1,x)]
Review comment:
No need to use [] in the comment.
##########
File path: src/operator/tensor/broadcast_reduce_op.h
##########
@@ -1058,22 +1065,58 @@ struct broadcast_kernel {
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape,
mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape,
const OpReqType req,
- const uint32_t ndim) {
- size_t in_stride = 1;
- size_t out_stride = 1;
- index_t idx = i;
- index_t in_idx = i;
- for (int iter = ndim - 1; iter >= 0; --iter) {
- size_t dim_idx = idx % out_shape[iter];
- in_idx -= dim_idx * out_stride;
- if (in_shape[iter] != 1) {
- in_idx += dim_idx * in_stride;
- }
- idx /= out_shape[iter];
- in_stride *= in_shape[iter];
- out_stride *= out_shape[iter];
- }
- KERNEL_ASSIGN(output[i], req, OP::Map(input[in_idx]));
+ const uint32_t ndim,
+ const size_t *axes,
+ const size_t *out_stride,
+ const int num_broadcast_axes) {
+ index_t idx = i;
+ index_t init_off = 0;
+ for (int iter = ndim - 1; idx > 0 && iter >= 0; --iter) {
+ size_t dim_idx = idx % in_shape[iter];
+ init_off += dim_idx * out_stride[iter];
+ idx /= in_shape[iter];
+ }
+ index_t stride_0, stride_1, stride_2;
+ // Each case is based on the number of axes to be broadcast
Review comment:
Again, could you please explain why there are only 3 cases regarding the
number of axes?
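
One way the three-case limit can be read (a sketch of the argument, not an
authoritative answer): adjacent axes of the same kind collapse during axis
merging, so the merged shape alternates between broadcast and non-broadcast
axes. If ndim is capped at 5, the usual value of MXNET_SPECIAL_MAX_NDIM, an
alternating pattern contains at most 3 broadcast axes, reached by the shape
(1,x,1,x,1). A toy illustration of the merge:

    #include <cstdio>
    #include <vector>

    int main() {
      // 1 marks a broadcast axis, 0 a kept axis: e.g. shape (2,1,1,3,1)
      std::vector<int> pattern = {0, 1, 1, 0, 1};
      std::vector<int> merged;
      for (int kind : pattern) {
        if (merged.empty() || merged.back() != kind) merged.push_back(kind);
      }
      int num_broadcast_axes = 0;
      for (int kind : merged) num_broadcast_axes += kind;
      // prints: merged ndim = 4, broadcast axes = 2
      std::printf("merged ndim = %zu, broadcast axes = %d\n",
                  merged.size(), num_broadcast_axes);
      return 0;
    }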
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]