szha closed pull request #12997: A better take forward kernel for CPU
URL: https://github.com/apache/incubator-mxnet/pull/12997
This is a PR merged from a forked repository.
Because GitHub does not display the original diff of a pull request
from a fork after it has been merged, the diff is reproduced below
for the sake of provenance:
diff --git a/src/operator/tensor/indexing_op.cc
b/src/operator/tensor/indexing_op.cc
index 710b50275c2..973546eea69 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -28,6 +28,28 @@
namespace mxnet {
namespace op {
+template<bool clip = true>
+struct TakeCPU {
+ // assume that idx have been flattened to a 1-D tensor (N,)
+ // assume that out_data and in_data have been flattened to 2-D tensors, (N,
M) and (K, M)
+ // M is the number of columns of in_data and out_data
+ // K is the number of rows of in_data
+ // i is the index of out_data
+ template<typename DType, typename IType>
+ MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
+ const IType* idx, const size_t M, const
int64_t K) {
+ int64_t j = static_cast<int64_t>(idx[i]);
+ if (clip) {
+ if (j <= 0) j = 0;
+ else if (j >= K) j = K - 1;
+ } else {
+ j = j % K;
+ j += (j < 0) ? K : 0;
+ }
+ std::memcpy(out_data + i * M, in_data + j * M, M * sizeof(DType));
+ }
+};
+
/*
* \brief returns true if all indices are between [min, max]
* \param data_ptr the indices to check
@@ -48,6 +70,29 @@ bool CheckIndexOutOfBound(const DType* data_ptr, size_t
data_size,
return is_valid;
}
+// Embedding forward implementation with dense weight
+template<>
+void EmbeddingOpForwardDnsImpl<cpu>(mshadow::Stream<cpu>* s,
+ const TBlob& data,
+ const TBlob& weight,
+ const OpReqType req,
+ const TBlob& output) {
+ using namespace mxnet_op;
+ const TShape& ishape = data.shape_;
+ const TShape& oshape = output.shape_;
+
+ MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
+ MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
+ Tensor<cpu, 1, IType> idx = data.get_with_shape<cpu, 1, IType>(
+ Shape1(ishape.ProdShape(0, ishape.ndim())), s);
+ Tensor<cpu, 2, DType> wmat = weight.get<cpu, 2, DType>(s);
+ Tensor<cpu, 2, DType> out = output.get_with_shape<cpu, 2, DType>(
+ Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]),
s);
+ Kernel<TakeCPU<true>, cpu>::Launch(s, oshape.Size() / wmat.shape_[1],
out.dptr_, wmat.dptr_,
+ idx.dptr_, wmat.shape_[1],
wmat.shape_[0]);
+ });
+ });
+}
template<>
void SparseEmbeddingOpForwardRspImpl<cpu>(const OpContext& ctx,
@@ -227,6 +272,74 @@ void TakeOpForwardCsrImpl<cpu>(const TakeParam& params,
});
}
+template<>
+void TakeOpForward<cpu>(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ using namespace mxnet_op;
+ if (req[take_::kOut] == kNullOp) return;
+ const TakeParam& param = nnvm::get<TakeParam>(attrs.parsed);
+ CHECK_EQ(inputs.size(), 2U);
+ CHECK_EQ(outputs.size(), 1U);
+
+ const TShape& idxshape = inputs[take_::kIdx].shape_;
+ const TShape& arrshape = inputs[take_::kArr].shape_;
+ const TShape& oshape = outputs[take_::kOut].shape_;
+
+ Stream<cpu> *s = ctx.get_stream<cpu>();
+ const int actual_axis = param.axis + ((param.axis < 0) ? arrshape.ndim() :
0);
+
+ MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { // output data type
+ MSHADOW_TYPE_SWITCH(inputs[1].type_flag_, IType, { // index data type
+ if (actual_axis == 0) {
+ if (param.mode == take_::kClip) {
+ Kernel<TakeCPU<true>, cpu>::Launch(s, idxshape.Size(),
+
outputs[take_::kOut].dptr<DType>(),
+ inputs[take_::kArr].dptr<DType>(),
+ inputs[take_::kIdx].dptr<IType>(),
+ oshape.Size()/idxshape.Size(),
arrshape[0]);
+ } else {
+ Kernel<TakeCPU<false>, cpu>::Launch(s, idxshape.Size(),
+
outputs[take_::kOut].dptr<DType>(),
+
inputs[take_::kArr].dptr<DType>(),
+
inputs[take_::kIdx].dptr<IType>(),
+ oshape.Size()/idxshape.Size(),
arrshape[0]);
+ }
+ } else {
+ mshadow::Shape<10> in_strides;
+ int stride = 1;
+ for (int i = arrshape.ndim() - 1; i >= 0; stride *= arrshape[i], --i) {
+ in_strides[i] = stride;
+ }
+ mshadow::Shape<10> out_strides;
+ stride = 1;
+ for (int i = oshape.ndim() - 1; i >= 0; stride *= oshape[i], --i) {
+ out_strides[i] = stride;
+ }
+ if (param.mode == take_::kClip) {
+ Kernel<Take<true>, cpu>::Launch(s, oshape.Size(),
+ outputs[take_::kOut].dptr<DType>(),
+ inputs[take_::kArr].dptr<DType>(),
+ inputs[take_::kIdx].dptr<IType>(),
+ in_strides, out_strides,
arrshape.ndim(),
+ oshape.ndim(), idxshape.ndim(),
+ arrshape[actual_axis], actual_axis);
+ } else if (param.mode == take_::kWrap) {
+ Kernel<Take<false>, cpu>::Launch(s, oshape.Size(),
+ outputs[take_::kOut].dptr<DType>(),
+ inputs[take_::kArr].dptr<DType>(),
+ inputs[take_::kIdx].dptr<IType>(),
+ in_strides, out_strides,
arrshape.ndim(),
+ oshape.ndim(), idxshape.ndim(),
+ arrshape[actual_axis], actual_axis);
+ }
+ }
+ });
+ });
+}
+
template<>
inline void SparseEmbeddingOpBackwardRspImpl<cpu>(const bool deterministic,
const OpContext& ctx,
diff --git a/src/operator/tensor/indexing_op.cu
b/src/operator/tensor/indexing_op.cu
index bdc7f6e843c..df5db84fa4e 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -116,6 +116,31 @@ struct AddTakeGradRspDeterministicKernel {
}
};
+/*! \brief name the struct Take instead of take
+ * to avoid conflict with the take function in mshadow
+ */
+template<bool clip = true>
+struct TakeGPU {
+ // assume that idx have been flattened to a 1-D tensor (N,)
+ // assume that out_data and in_data have been flattened to 2-D tensors, (N,
M) and (K, M)
+ // M is the number of columns of in_data and out_data
+ // K is the number of rows of in_data
+ // i is the index of out_data
+ template<typename DType, typename IType>
+ MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
+ const IType* idx, const int64_t M, const
int64_t K) {
+ int64_t j = static_cast<int64_t>(idx[i/M]);
+ if (clip) {
+ if (j <= 0) j = 0;
+ else if (j >= K) j = K - 1;
+ } else {
+ j = j % K;
+ j += (j < 0) ? K : 0;
+ }
+ out_data[i] = in_data[j * M + i % M];
+ }
+};
+
/*
* \brief returns true if all indices are between [min, max]
* \param s the stream
@@ -137,6 +162,30 @@ bool CheckIndexOutOfBound(mshadow::Stream<gpu> *s, const
DType* data_ptr, size_t
return is_valid == 0;
}
+// Embedding forward implementation with dense weight
+template<>
+void EmbeddingOpForwardDnsImpl<gpu>(mshadow::Stream<gpu>* s,
+ const TBlob& data,
+ const TBlob& weight,
+ const OpReqType req,
+ const TBlob& output) {
+ using namespace mxnet_op;
+ const TShape& ishape = data.shape_;
+ const TShape& oshape = output.shape_;
+
+ MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
+ MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
+ Tensor<gpu, 1, IType> idx = data.get_with_shape<gpu, 1, IType>(
+ Shape1(ishape.ProdShape(0, ishape.ndim())), s);
+ Tensor<gpu, 2, DType> wmat = weight.get<gpu, 2, DType>(s);
+ Tensor<gpu, 2, DType> out = output.get_with_shape<gpu, 2, DType>(
+ Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]),
s);
+ Kernel<TakeGPU<true>, gpu>::Launch(s, oshape.Size(), out.dptr_,
wmat.dptr_,
+ idx.dptr_, wmat.shape_[1],
wmat.shape_[0]);
+ });
+ });
+}
+
template<>
void SparseEmbeddingOpForwardRspImpl<gpu>(const OpContext& ctx,
const TBlob& data,
@@ -414,6 +463,72 @@ inline void GatherNDBackwardImpl(int N, int M, int K,
mxnet_op::Kernel<backward_gather_nd_gpu, gpu>::Launch(s, N, N, M, K,
strides, out, data, indices);
}
+template<>
+void TakeOpForward<gpu>(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ using namespace mxnet_op;
+ if (req[take_::kOut] == kNullOp) return;
+ const TakeParam& param = nnvm::get<TakeParam>(attrs.parsed);
+ CHECK_EQ(inputs.size(), 2U);
+ CHECK_EQ(outputs.size(), 1U);
+
+ const TShape& idxshape = inputs[take_::kIdx].shape_;
+ const TShape& arrshape = inputs[take_::kArr].shape_;
+ const TShape& oshape = outputs[take_::kOut].shape_;
+
+ Stream<gpu> *s = ctx.get_stream<gpu>();
+ const int actual_axis = param.axis + ((param.axis < 0) ? arrshape.ndim() :
0);
+
+ MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { // output data type
+ MSHADOW_TYPE_SWITCH(inputs[1].type_flag_, IType, { // index data type
+ if (actual_axis == 0) {
+ if (param.mode == take_::kClip) {
+ Kernel<TakeGPU<true>, gpu>::Launch(s, oshape.Size(),
+
outputs[take_::kOut].dptr<DType>(),
+ inputs[take_::kArr].dptr<DType>(),
+ inputs[take_::kIdx].dptr<IType>(),
+ oshape.Size()/idxshape.Size(),
arrshape[0]);
+ } else {
+ Kernel<TakeGPU<false>, gpu>::Launch(s, oshape.Size(),
+
outputs[take_::kOut].dptr<DType>(),
+
inputs[take_::kArr].dptr<DType>(),
+
inputs[take_::kIdx].dptr<IType>(),
+ oshape.Size()/idxshape.Size(),
arrshape[0]);
+ }
+ } else {
+ mshadow::Shape<10> in_strides;
+ int stride = 1;
+ for (int i = arrshape.ndim() - 1; i >= 0; stride *= arrshape[i], --i) {
+ in_strides[i] = stride;
+ }
+ mshadow::Shape<10> out_strides;
+ stride = 1;
+ for (int i = oshape.ndim() - 1; i >= 0; stride *= oshape[i], --i) {
+ out_strides[i] = stride;
+ }
+ if (param.mode == take_::kClip) {
+ Kernel<Take<true>, gpu>::Launch(s, oshape.Size(),
+ outputs[take_::kOut].dptr<DType>(),
+ inputs[take_::kArr].dptr<DType>(),
+ inputs[take_::kIdx].dptr<IType>(),
+ in_strides, out_strides,
arrshape.ndim(), oshape.ndim(),
+ idxshape.ndim(),
arrshape[actual_axis], actual_axis);
+ } else if (param.mode == take_::kWrap) {
+ Kernel<Take<false>, gpu>::Launch(s, oshape.Size(),
+ outputs[take_::kOut].dptr<DType>(),
+ inputs[take_::kArr].dptr<DType>(),
+ inputs[take_::kIdx].dptr<IType>(),
+ in_strides, out_strides,
arrshape.ndim(), oshape.ndim(),
+ idxshape.ndim(),
arrshape[actual_axis], actual_axis);
+ }
+ }
+ });
+ });
+}
+
NNVM_REGISTER_OP(Embedding)
.set_attr<FCompute>("FCompute<gpu>", EmbeddingOpForward<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", SparseEmbeddingOpForwardEx<gpu>);
diff --git a/src/operator/tensor/indexing_op.h
b/src/operator/tensor/indexing_op.h
index 5282a7ea9a6..fef590c4725 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -301,25 +301,6 @@ inline bool SparseEmbeddingOpBackwardStorageType(const
nnvm::NodeAttrs& attrs,
*/
template<bool clip = true>
struct Take {
- // assume that idx have been flattened to a 1-D tensor (N,)
- // assume that out_data and in_data have been flattened to 2-D tensors, (N,
M) and (K, M)
- // M is the number of columns of in_data and out_data
- // K is the number of rows of in_data
- // i is the index of out_data
- template<typename DType, typename IType>
- MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* in_data,
- const IType* idx, const int M, const int K) {
- int j = static_cast<int>(idx[i/M]);
- if (clip) {
- if (j <= 0) j = 0;
- else if (j >= K) j = K - 1;
- } else {
- j = j % K;
- j += (j < 0) ? K : 0;
- }
- out_data[i] = in_data[j * M + i % M];
- }
-
/*!
* \brief Map function for take operator
* \param i global thread id
@@ -339,21 +320,21 @@ struct Take {
const int in_ndims, const int out_ndims,
const int idx_ndims,
const int axis_dim, const int axis) {
// i is the global flattened index in the output
- const int out_head_index = (axis == 0) ? 0 : (i / out_stride[axis - 1]);
- const int out_rest_index = (axis == 0) ? i : (i % out_stride[axis - 1]);
- const int out_mid_index = out_rest_index / in_stride[axis];
- const int out_tail_index = (axis == in_ndims - 1) ?
- 0 : (out_rest_index % in_stride[axis]);
- int idx_index = static_cast<int>(idx[out_mid_index]);
+ const int64_t out_head_index = (axis == 0) ? 0 : (i / out_stride[axis -
1]);
+ const int64_t out_rest_index = (axis == 0) ? i : (i % out_stride[axis -
1]);
+ const int64_t out_mid_index = out_rest_index / in_stride[axis];
+ const int64_t out_tail_index = (axis == in_ndims - 1) ?
+ 0 : (out_rest_index % in_stride[axis]);
+ int64_t idx_index = static_cast<int64_t>(idx[out_mid_index]);
if (clip) {
idx_index = (idx_index < 0) ? 0 : idx_index;
idx_index = (idx_index > axis_dim - 1) ? (axis_dim - 1) : idx_index;
}
idx_index %= axis_dim;
idx_index += (idx_index < 0) ? axis_dim : 0;
- const int in_tail_index = out_tail_index;
- const int in_head_index = out_head_index;
- int in_src_index = in_tail_index + idx_index * in_stride[axis];
+ const int64_t in_tail_index = out_tail_index;
+ const int64_t in_head_index = out_head_index;
+ int64_t in_src_index = in_tail_index + idx_index * in_stride[axis];
in_src_index += (axis == 0) ? 0 : in_head_index * in_stride[axis - 1];
out_data[i] = in_data[in_src_index];
}
@@ -365,24 +346,7 @@ void EmbeddingOpForwardDnsImpl(mshadow::Stream<xpu>* s,
const TBlob& data,
const TBlob& weight,
const OpReqType req,
- const TBlob& output) {
- using namespace mxnet_op;
- const TShape& ishape = data.shape_;
- const TShape& oshape = output.shape_;
-
- MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
- MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
- Tensor<xpu, 1, IType> idx = data.get_with_shape<xpu, 1, IType>(
- Shape1(ishape.ProdShape(0, ishape.ndim())), s);
- Tensor<xpu, 2, DType> wmat = weight.get<xpu, 2, DType>(s);
- Tensor<xpu, 2, DType> out = output.get_with_shape<xpu, 2, DType>(
- Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]),
s);
- Kernel<Take<true>, xpu>::Launch(s, oshape.Size(), out.dptr_, wmat.dptr_,
- idx.dptr_, wmat.shape_[1], wmat.shape_[0]);
- });
- });
-}
-
+ const TBlob& output);
template<int req>
struct TakeRspKernel {
@@ -825,66 +789,7 @@ void TakeOpForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
- const std::vector<TBlob>& outputs) {
- using namespace mxnet_op;
- if (req[take_::kOut] == kNullOp) return;
- const TakeParam& param = nnvm::get<TakeParam>(attrs.parsed);
- CHECK_EQ(inputs.size(), 2U);
- CHECK_EQ(outputs.size(), 1U);
-
- const TShape& idxshape = inputs[take_::kIdx].shape_;
- const TShape& arrshape = inputs[take_::kArr].shape_;
- const TShape& oshape = outputs[take_::kOut].shape_;
-
- Stream<xpu> *s = ctx.get_stream<xpu>();
- const int actual_axis = param.axis + ((param.axis < 0) ? arrshape.ndim() :
0);
-
- MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { // output data type
- MSHADOW_TYPE_SWITCH(inputs[1].type_flag_, IType, { // index data type
- if (actual_axis == 0) {
- if (param.mode == take_::kClip) {
- Kernel<Take<true>, xpu>::Launch(s, oshape.Size(),
- outputs[take_::kOut].dptr<DType>(),
- inputs[take_::kArr].dptr<DType>(),
- inputs[take_::kIdx].dptr<IType>(),
- oshape.Size()/idxshape.Size(),
arrshape[0]);
- } else {
- Kernel<Take<false>, xpu>::Launch(s, oshape.Size(),
- outputs[take_::kOut].dptr<DType>(),
- inputs[take_::kArr].dptr<DType>(),
- inputs[take_::kIdx].dptr<IType>(),
- oshape.Size()/idxshape.Size(),
arrshape[0]);
- }
- } else {
- mshadow::Shape<10> in_strides;
- int stride = 1;
- for (int i = arrshape.ndim() - 1; i >= 0; stride *= arrshape[i], --i) {
- in_strides[i] = stride;
- }
- mshadow::Shape<10> out_strides;
- stride = 1;
- for (int i = oshape.ndim() - 1; i >= 0; stride *= oshape[i], --i) {
- out_strides[i] = stride;
- }
- if (param.mode == take_::kClip) {
- Kernel<Take<true>, xpu>::Launch(s, oshape.Size(),
- outputs[take_::kOut].dptr<DType>(),
- inputs[take_::kArr].dptr<DType>(),
- inputs[take_::kIdx].dptr<IType>(),
- in_strides, out_strides,
arrshape.ndim(), oshape.ndim(),
- idxshape.ndim(),
arrshape[actual_axis], actual_axis);
- } else if (param.mode == take_::kWrap) {
- Kernel<Take<false>, xpu>::Launch(s, oshape.Size(),
- outputs[take_::kOut].dptr<DType>(),
- inputs[take_::kArr].dptr<DType>(),
- inputs[take_::kIdx].dptr<IType>(),
- in_strides, out_strides,
arrshape.ndim(), oshape.ndim(),
- idxshape.ndim(),
arrshape[actual_axis], actual_axis);
- }
- }
- });
- });
-}
+ const std::vector<TBlob>& outputs);
struct TakeGradGeneralKernel {
/*!
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services