rongzha1 closed pull request #12865: Official master
URL: https://github.com/apache/incubator-mxnet/pull/12865
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), GitHub does not display the
original diff after merge, so it is reproduced below:

diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index e77569671eb..bf5452a801b 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -506,7 +506,7 @@ struct Kernel<OP, cpu> {
   inline static bool Launch(mshadow::Stream<cpu> *, const int N, Args... args) 
{
 #ifdef _OPENMP
     const int omp_threads = 
engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-    if (omp_threads < 2) {
+    if (omp_threads < 2 || N < 2) {
       for (int i = 0; i < N; ++i) {
         OP::Map(i, args...);
       }
diff --git a/src/operator/tensor/indexing_op.cc 
b/src/operator/tensor/indexing_op.cc
index b663ef0179d..e3225902f70 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -39,10 +39,11 @@ template<typename DType>
 bool CheckIndexOutOfBound(const DType* data_ptr, size_t data_size,
                           const DType min, const DType max) {
   bool is_valid = true;
+  int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  #pragma omp parallel for num_threads(omp_threads) if (data_size > 2000)
   for (size_t i = 0; i < data_size; i++) {
     if (data_ptr[i] > max || data_ptr[i] < min) {
       is_valid = false;
-      break;
     }
   }
   return is_valid;
diff --git a/src/operator/tensor/indexing_op.h 
b/src/operator/tensor/indexing_op.h
index 1daf0a2cb18..113a571c8ec 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -320,6 +320,46 @@ struct Take {
     out_data[i] = in_data[j * M + i % M];
   }
 
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int overload, int N, DType* out_data, const 
DType* in_data,
+                                  const IType* idx, const int M, const int K) {
+    int row = N / M;
+    int col = N % M;
+    int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+    #pragma omp parallel for num_threads(omp_threads) if (N > 2000)
+    for (int i = 0; i < row; i++) {
+      int j = static_cast<int>(idx[i]);
+      if (clip) {
+        if (j <= 0) j = 0;
+        else if (j >= K) j = K - 1;
+      } else {
+        j = j % K;
+        j += (j < 0) ? K : 0;
+      }
+      const int jM = j*M;
+      const int iM = i*M;
+      for (int n = 0; n < M; n++) {
+        out_data[iM + n] = in_data[jM + n];
+      }
+    }
+    if (col != 0) {
+      int j = static_cast<int>(idx[row]);
+      if (clip) {
+        if (j <= 0) j = 0;
+        else if (j >= K) j = K - 1;
+      } else {
+        j = j % K;
+        j += (j < 0) ? K : 0;
+      }
+      const int jM = j*M;
+      const int iM = row*M;
+      #pragma omp parallel for num_threads(omp_threads) if (col > 2000)
+      for (int n = 0; n < col; n++) {
+        out_data[iM + n] = in_data[jM + n];
+      }
+    }
+  }
+
   /*!
    * \brief Map function for take operator
    * \param i           global thread id
@@ -369,7 +409,6 @@ void EmbeddingOpForwardDnsImpl(mshadow::Stream<xpu>* s,
   using namespace mxnet_op;
   const TShape& ishape = data.shape_;
   const TShape& oshape = output.shape_;
-
   MSHADOW_TYPE_SWITCH(output.type_flag_, DType, {
     MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
       Tensor<xpu, 1, IType> idx = data.get_with_shape<xpu, 1, IType>(
@@ -377,8 +416,16 @@ void EmbeddingOpForwardDnsImpl(mshadow::Stream<xpu>* s,
       Tensor<xpu, 2, DType> wmat = weight.get<xpu, 2, DType>(s);
       Tensor<xpu, 2, DType> out = output.get_with_shape<xpu, 2, DType>(
         Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), 
s);
-      Kernel<Take<true>, xpu>::Launch(s, oshape.Size(), out.dptr_, wmat.dptr_,
+      if (std::is_same<xpu, cpu>::value) {
+        // 1: loop count; calls the overloaded Take::Map function, which is
+        // more CPU-friendly. With this overload, the embedding op achieves
+        // more than a 3x speedup when oshape.Size() is large.
+        Kernel<Take<true>, xpu>::Launch(s, 1, oshape.Size(), out.dptr_, 
wmat.dptr_,
+                                idx.dptr_, wmat.shape_[1], wmat.shape_[0]);
+      } else {
+        Kernel<Take<true>, xpu>::Launch(s,  oshape.Size(), out.dptr_, 
wmat.dptr_,
                                 idx.dptr_, wmat.shape_[1], wmat.shape_[0]);
+      }
     });
   });
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to