This is an automated email from the ASF dual-hosted git repository.
anirudh2290 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 6fd4384  parallelize NDArray::Copy<cpu, cpu> when data size is large (#12926)
6fd4384 is described below
commit 6fd4384fa9b5a1fe8ae294cd790b4872667310e4
Author: XiaotaoChen <[email protected]>
AuthorDate: Fri Nov 30 05:12:40 2018 +0800
parallelize NDArray::Copy<cpu, cpu> when data size is large (#12926)
* parallelize NDArray::Copy<cpu, cpu> by OpenMP when data size > MXNET_CPU_PARALLEL_COPY_SIZE
* code specification according to reviewer's suggestions
* align with std::memcpy api
* add descriptive error message
* update MXNET_CPU_PARALLEL_COPY_SIZE doc
* update MXNET_CPU_PARALLEL_COPY_SIZE doc again
---
docs/faq/env_var.md | 6 ++++++
src/common/utils.h | 17 +++++++++++++++++
src/ndarray/ndarray_function.cc | 14 ++++++++------
3 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index c7d3b28..8d08e32 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -206,6 +206,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details.
+* MXNET_CPU_PARALLEL_COPY_SIZE
+ - Values: Int ```(default=200000)```
+ - The minimum size to call parallel copy by OpenMP in CPU2CPU mode.
+ - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count.
+ - When the array size is less than this threshold, NDArray::Copy(from, to) is implemented by memcpy in single thread.
+
Settings for Minimum Memory Usage
---------------------------------
- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
diff --git a/src/common/utils.h b/src/common/utils.h
index 92b7c20..b902b38 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -717,6 +717,23 @@ inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape
}
}
+
+/*!
+ * \brief parallelize copy by OpenMP.
+ */
+template<typename DType>
+inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
+ static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000);
+ if (size >= copy_block_size) {
+ #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+ for (index_t i = 0; i < size; ++i) {
+ dst[i] = src[i];
+ }
+ } else {
+ std::memcpy(dst, src, sizeof(DType) * size);
+ }
+}
+
} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_UTILS_H_
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index 43295d6..a613d5a 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -38,13 +38,15 @@ void Copy<cpu, cpu>(const TBlob &from, TBlob *to, RunContext ctx) {
MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
if (to->type_flag_ == from.type_flag_) {
- mshadow::Copy(to->FlatTo1D<cpu, DType>(),
- from.FlatTo1D<cpu, DType>());
+ const index_t size = from.Size();
+ CHECK_EQ(size, to->Size()) << "copying size mismatch, from: " << size * sizeof(DType)
+ << " bytes, to: " << to->Size() * sizeof(DType) << " bytes.";
+ common::ParallelCopy(to->dptr<DType>(), from.dptr<DType>(), size);
} else {
- MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
- to->FlatTo1D<cpu, DType>() =
- mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
- })
+ MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
+ to->FlatTo1D<cpu, DType>() =
+ mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
+ })
}
})
}