anirudh2290 closed pull request #12926: parallelize NDArray::Copy<cpu, cpu> 
when data size is large
URL: https://github.com/apache/incubator-mxnet/pull/12926
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index e373377ee8d..06e9da50ded 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -202,6 +202,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the 
following environments ca
   If no such algorithm exists given other constraints, MXNet will error out. 
This variable affects the choice
   of CUDNN convolution algorithms. Please see [CUDNN developer 
guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html)
 for more details.
 
+* MXNET_CPU_PARALLEL_COPY_SIZE
+  - Values: Int ```(default=200000)```
+  - The minimum array size required to use an OpenMP parallel copy in CPU-to-CPU mode.
+  - When the array size is bigger than or equal to this threshold, 
NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP 
Thread Count.
+  - When the array size is less than this threshold, NDArray::Copy(from, to) 
is implemented by memcpy in a single thread.
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
diff --git a/src/common/utils.h b/src/common/utils.h
index 26889792e53..a60253547fc 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -713,6 +713,23 @@ inline void EmplaceBackZeros(const NDArrayStorageType 
stype, const TShape &shape
   }
 }
 
+
+/*!
+ * \brief parallelize copy by OpenMP.
+ */
+template<typename DType>
+inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
+  static index_t copy_block_size = 
dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000);
+  if (size >= copy_block_size) {
+    #pragma omp parallel for 
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] = src[i];
+    }
+  } else {
+    std::memcpy(dst, src, sizeof(DType) * size);
+  }
+}
+
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_UTILS_H_
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index 43295d6e101..a613d5a3dec 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -38,13 +38,15 @@ void Copy<cpu, cpu>(const TBlob &from, TBlob *to,
                     RunContext ctx) {
   MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
     if (to->type_flag_ == from.type_flag_) {
-        mshadow::Copy(to->FlatTo1D<cpu, DType>(),
-                      from.FlatTo1D<cpu, DType>());
+      const index_t size = from.Size();
+      CHECK_EQ(size, to->Size()) << "copying size mismatch, from: " << size * 
sizeof(DType)
+               << " bytes, to: " << to->Size() * sizeof(DType) << " bytes.";
+      common::ParallelCopy(to->dptr<DType>(), from.dptr<DType>(), size);
     } else {
-        MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
-            to->FlatTo1D<cpu, DType>() =
-                mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
-        })
+      MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
+          to->FlatTo1D<cpu, DType>() =
+              mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
+      })
     }
   })
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to