anirudh2290 closed pull request #12926: parallelize NDArray::Copy<cpu, cpu> when data size is large URL: https://github.com/apache/incubator-mxnet/pull/12926
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index e373377ee8d..06e9da50ded 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -202,6 +202,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details. +* MXNET_CPU_PARALLEL_COPY_SIZE + - Values: Int ```(default=200000)``` + - The minimum size to call parallel copy by OpenMP in CPU2CPU mode. + - When the array size is greater than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count. + - When the array size is less than this threshold, NDArray::Copy(from, to) is implemented by memcpy in a single thread. + Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/src/common/utils.h b/src/common/utils.h index 26889792e53..a60253547fc 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -713,6 +713,23 @@ inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape } } + +/*! + * \brief parallelize copy by OpenMP. 
+ */ +template<typename DType> +inline void ParallelCopy(DType* dst, const DType* src, index_t size) { + static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000); + if (size >= copy_block_size) { + #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) + for (index_t i = 0; i < size; ++i) { + dst[i] = src[i]; + } + } else { + std::memcpy(dst, src, sizeof(DType) * size); + } +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index 43295d6e101..a613d5a3dec 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -38,13 +38,15 @@ void Copy<cpu, cpu>(const TBlob &from, TBlob *to, RunContext ctx) { MSHADOW_TYPE_SWITCH(to->type_flag_, DType, { if (to->type_flag_ == from.type_flag_) { - mshadow::Copy(to->FlatTo1D<cpu, DType>(), - from.FlatTo1D<cpu, DType>()); + const index_t size = from.Size(); + CHECK_EQ(size, to->Size()) << "copying size mismatch, from: " << size * sizeof(DType) + << " bytes, to: " << to->Size() * sizeof(DType) << " bytes."; + common::ParallelCopy(to->dptr<DType>(), from.dptr<DType>(), size); } else { - MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, { - to->FlatTo1D<cpu, DType>() = - mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>()); - }) + MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, { + to->FlatTo1D<cpu, DType>() = + mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>()); + }) } }) } ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
