This is an automated email from the ASF dual-hosted git repository.

anirudh2290 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 6fd4384  parallelize NDArray::Copy<cpu, cpu> when data size is large 
(#12926)
6fd4384 is described below

commit 6fd4384fa9b5a1fe8ae294cd790b4872667310e4
Author: XiaotaoChen <[email protected]>
AuthorDate: Fri Nov 30 05:12:40 2018 +0800

    parallelize NDArray::Copy<cpu, cpu> when data size is large (#12926)
    
    * parallelize NDArray::Copy<cpu, cpu> by OpenMP when data size > 
MXNET_CPU_PARALLEL_COPY_SIZE
    
    * code specification according to reviewer's suggestions
    
    * align with std::memcpy api
    
    * add descriptive error message
    
    * update MXNET_CPU_PARALLEL_COPY_SIZE doc
    
    * update MXNET_CPU_PARALLEL_COPY_SIZE doc again
---
 docs/faq/env_var.md             |  6 ++++++
 src/common/utils.h              | 17 +++++++++++++++++
 src/ndarray/ndarray_function.cc | 14 ++++++++------
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index c7d3b28..8d08e32 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -206,6 +206,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the 
following environments ca
   If no such algorithm exists given other constraints, MXNet will error out. 
This variable affects the choice
   of CUDNN convolution algorithms. Please see [CUDNN developer 
guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html)
 for more details.
 
+* MXNET_CPU_PARALLEL_COPY_SIZE
+  - Values: Int ```(default=200000)```
+  - The minimum size to call parallel copy by OpenMP in CPU2CPU mode.
+  - When the array size is bigger than or equal to this threshold, 
NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP 
Thread Count.
+  - When the array size is less than this threshold, NDArray::Copy(from, to) 
is implemented by memcpy in a single thread.
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
diff --git a/src/common/utils.h b/src/common/utils.h
index 92b7c20..b902b38 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -717,6 +717,23 @@ inline void EmplaceBackZeros(const NDArrayStorageType 
stype, const TShape &shape
   }
 }
 
+
+/*!
+ * \brief parallelize copy by OpenMP.
+ */
+template<typename DType>
+inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
+  static index_t copy_block_size = 
dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000);
+  if (size >= copy_block_size) {
+    #pragma omp parallel for 
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] = src[i];
+    }
+  } else {
+    std::memcpy(dst, src, sizeof(DType) * size);
+  }
+}
+
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_UTILS_H_
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index 43295d6..a613d5a 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -38,13 +38,15 @@ void Copy<cpu, cpu>(const TBlob &from, TBlob *to,
                     RunContext ctx) {
   MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
     if (to->type_flag_ == from.type_flag_) {
-        mshadow::Copy(to->FlatTo1D<cpu, DType>(),
-                      from.FlatTo1D<cpu, DType>());
+      const index_t size = from.Size();
+      CHECK_EQ(size, to->Size()) << "copying size mismatch, from: " << size * 
sizeof(DType)
+               << " bytes, to: " << to->Size() * sizeof(DType) << " bytes.";
+      common::ParallelCopy(to->dptr<DType>(), from.dptr<DType>(), size);
     } else {
-        MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
-            to->FlatTo1D<cpu, DType>() =
-                mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
-        })
+      MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
+          to->FlatTo1D<cpu, DType>() =
+              mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
+      })
     }
   })
 }

Reply via email to