bartekkuncer commented on code in PR #21004:
URL: https://github.com/apache/incubator-mxnet/pull/21004#discussion_r860678925


##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;
+    for (int i = axis - 1; i >= 0; i--) {
+      preaxis_dims[ax] *= src_shape[i];
+    }
+  }
+
+  // there is no need to check further axis's elements to copy as it for sure will be larger
+  if (elements_to_copy[0] < ELEMENTS_THRESHOLD || !std::is_same<IType, OType>::value) {
+    IType* src = static_cast<IType*>(inputs[0].dptr_);
+    OType* dst = static_cast<OType*>(outputs[0].dptr_);
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape;
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape;
+    for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) {
+      if (i < dst_shape.ndim()) {
+        in_shape[i]  = src_shape[i];
+        out_shape[i] = src_shape[i];
+      } else {
+        in_shape[i]  = 1;

Review Comment:
   What is in_shape's purpose?



##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;
+    for (int i = axis - 1; i >= 0; i--) {
+      preaxis_dims[ax] *= src_shape[i];
+    }
+  }
+
+  // there is no need to check further axis's elements to copy as it for sure will be larger
+  if (elements_to_copy[0] < ELEMENTS_THRESHOLD || !std::is_same<IType, OType>::value) {
+    IType* src = static_cast<IType*>(inputs[0].dptr_);
+    OType* dst = static_cast<OType*>(outputs[0].dptr_);
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape;
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape;
+    for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) {
+      if (i < dst_shape.ndim()) {
+        in_shape[i]  = src_shape[i];
+        out_shape[i] = src_shape[i];
+      } else {
+        in_shape[i]  = 1;
+        out_shape[i] = 1;
+      }
+    }
+
+    if (dst_shape.ndim() == 2) {

Review Comment:
   Why are you using just a '2' as a parameter, while for the other one you declare a variable? Maybe instead of branching the kernel call, use the if only for determining ndim, e.g. `const int ndim = dst_shape.ndim() == 2 ? 2 : MXNET_SPECIAL_MAX_NDIM;`?
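   A minimal standalone sketch of what this suggestion amounts to, assuming the kernel takes ndim as a runtime argument; `LaunchKernel` and `kSpecialMaxNdim` below are hypothetical stand-ins, not MXNet APIs:
   ```cpp
   #include <iostream>

   // Hypothetical stand-in for MXNET_SPECIAL_MAX_NDIM.
   constexpr int kSpecialMaxNdim = 5;

   // Hypothetical launcher taking ndim as a runtime argument,
   // standing in for the actual broadcast kernel call.
   void LaunchKernel(int ndim) {
     std::cout << "broadcast kernel launched with ndim = " << ndim << '\n';
   }

   int main() {
     const int dst_ndim = 2;  // stands in for dst_shape.ndim()
     // Branch only on the value of ndim; keep a single launch site.
     const int ndim = (dst_ndim == 2) ? 2 : kSpecialMaxNdim;
     LaunchKernel(ndim);
     return 0;
   }
   ```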



##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;
+    for (int i = axis - 1; i >= 0; i--) {
+      preaxis_dims[ax] *= src_shape[i];
+    }
+  }
+
+  // there is no need to check further axis's elements to copy as it for sure will be larger
+  if (elements_to_copy[0] < ELEMENTS_THRESHOLD || !std::is_same<IType, OType>::value) {
+    IType* src = static_cast<IType*>(inputs[0].dptr_);
+    OType* dst = static_cast<OType*>(outputs[0].dptr_);
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape;
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape;
+    for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) {

Review Comment:
   Maybe stick to one notation -> `i++`



##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;

Review Comment:
   You can simplify this one:
   ```suggestion
       preaxis_dims[ax] = src_shape[0];
       for (int i = 1; i < axis; i++) {
         preaxis_dims[ax] *= src_shape[i];
       }
   ```
   You can do a similar thing above as well.
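   A minimal standalone sketch of one alternative (not part of the patch): both products can also be written with `std::accumulate`, which handles the empty-range edge cases (broadcast axis at either end) uniformly. The extents below are made up for illustration.
   ```cpp
   #include <cstddef>
   #include <functional>
   #include <numeric>
   #include <vector>

   int main() {
     // Hypothetical extents standing in for src_shape/dst_shape.
     const std::vector<std::size_t> dims = {4, 1, 5, 2};
     const int axis = 1;

     // Product of the dims after `axis` (what the elements_to_copy loop
     // computes); an empty range yields the initial value 1.
     const std::size_t after =
         std::accumulate(dims.begin() + axis + 1, dims.end(),
                         std::size_t{1}, std::multiplies<std::size_t>());
     // Product of the dims before `axis` (what the preaxis_dims loop computes).
     const std::size_t before =
         std::accumulate(dims.begin(), dims.begin() + axis,
                         std::size_t{1}, std::multiplies<std::size_t>());
     return (after == 10 && before == 4) ? 0 : 1;  // 5*2 == 10 and 4
   }
   ```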



##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;
+    for (int i = axis - 1; i >= 0; i--) {
+      preaxis_dims[ax] *= src_shape[i];
+    }
+  }
+
+  // there is no need to check further axis's elements to copy as it for sure will be larger

Review Comment:
   ```suggestion
     // there is no need to check further axis' elements to copy as it for sure will be larger
   ```



##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;
+    for (int i = axis - 1; i >= 0; i--) {
+      preaxis_dims[ax] *= src_shape[i];
+    }
+  }
+
+  // there is no need to check further axis's elements to copy as it for sure will be larger
+  if (elements_to_copy[0] < ELEMENTS_THRESHOLD || !std::is_same<IType, OType>::value) {
+    IType* src = static_cast<IType*>(inputs[0].dptr_);
+    OType* dst = static_cast<OType*>(outputs[0].dptr_);
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> in_shape;
+    mshadow::Shape<MXNET_SPECIAL_MAX_NDIM> out_shape;
+    for (int i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) {
+      if (i < dst_shape.ndim()) {
+        in_shape[i]  = src_shape[i];
+        out_shape[i] = src_shape[i];

Review Comment:
   Why src_shape and not dst_shape?



##########
src/operator/tensor/broadcast_reduce_op.h:
##########
@@ -1354,6 +1354,94 @@ struct direct_copy {
   }
 };
 
+template <typename IType, typename OType>
+void BroadcastCPU(const OpContext& ctx,
+                  const std::vector<TBlob>& inputs,
+                  const std::vector<OpReqType>& req,
+                  const std::vector<TBlob>& outputs,
+                  const mxnet::TShape& src_shape,
+                  const mxnet::TShape& dst_shape,
+                  ShapeAndStride aux_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  constexpr size_t ELEMENTS_THRESHOLD = 256;
+  Stream<cpu>* s                      = ctx.get_stream<cpu>();
+
+  std::vector<size_t> elements_to_copy(aux_data.num_broadcast_axes);
+  std::vector<size_t> preaxis_dims(aux_data.num_broadcast_axes);
+  for (int ax = 0; ax < aux_data.num_broadcast_axes; ax++) {
+    index_t axis = aux_data.axes[ax];
+
+    elements_to_copy[ax] = 1;
+    for (int i = axis + 1; i < dst_shape.ndim(); i++) {
+      elements_to_copy[ax] *= dst_shape[i];
+    }
+    preaxis_dims[ax] = 1;
+    for (int i = axis - 1; i >= 0; i--) {
+      preaxis_dims[ax] *= src_shape[i];

Review Comment:
   Why is one calculated on src_shape and the other on dst_shape?


