cjolivier01 closed pull request #9002: Misc performance changes
URL: https://github.com/apache/incubator-mxnet/pull/9002
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/mxnet/io.py b/python/mxnet/io.py
index 25a95be787..295c8db146 100644
--- a/python/mxnet/io.py
+++ b/python/mxnet/io.py
@@ -34,7 +34,7 @@
from .base import mx_real_t
from .base import check_call, build_param_doc as _build_param_doc
from .ndarray import NDArray
-from .ndarray.sparse import CSRNDArray
+from .ndarray.sparse import CSRNDArray, RowSparseNDArray
from .ndarray.sparse import array as sparse_array
from .ndarray import _ndarray_cls
from .ndarray import array
@@ -537,8 +537,10 @@ def _shuffle(data, idx):
shuffle_data.append((k, v))
elif isinstance(v, CSRNDArray):
shuffle_data.append((k, sparse_array(v.asscipy()[idx], v.context)))
- else:
+ elif isinstance(v, RowSparseNDArray):
shuffle_data.append((k, array(v.asnumpy()[idx], v.context)))
+ else:
+ shuffle_data.append((k, array(v[idx], v.context)))
return shuffle_data
diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h
index d544aec88d..d0030b9163 100644
--- a/src/operator/random/sampler.h
+++ b/src/operator/random/sampler.h
@@ -25,6 +25,8 @@
#ifndef MXNET_OPERATOR_RANDOM_SAMPLER_H_
#define MXNET_OPERATOR_RANDOM_SAMPLER_H_
+#include "../../engine/openmp.h"
+
#ifdef __CUDACC__
#include <curand.h>
#include <curand_kernel.h>
@@ -89,8 +91,8 @@ class RandGenerator<gpu, double> {
template<typename xpu>
MSHADOW_XINLINE index_t OptSampleSeedNum(index_t N);
template<>
-MSHADOW_XINLINE index_t OptSampleSeedNum<cpu>(index_t N) {
- return omp_get_num_threads();
+MSHADOW_CINLINE index_t OptSampleSeedNum<cpu>(index_t N) {
+ return engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
}
template<>
MSHADOW_XINLINE index_t OptSampleSeedNum<gpu>(index_t N) {
diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h
index 3f5014d8ca..9218a9cd18 100644
--- a/src/operator/tensor/init_op.h
+++ b/src/operator/tensor/init_op.h
@@ -209,7 +209,7 @@ void Fill(mshadow::Stream<xpu> *s, const TBlob& b, const
OpReqType req, ValueTyp
const size_t size = b.Size();
if (val == 0) {
if (req != kAddTo) {
- if (b.dev_mask() == cpu::kDevMask) {
+ if (b.dev_mask() == cpu::kDevMask && size < 50000) {
MSHADOW_TYPE_SWITCH(b.type_flag_, DType, {
memset(b.dptr_, 0, size * sizeof(DType));
});
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index ce40daa904..dd2bdfe625 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -47,7 +47,6 @@ class StorageImpl : public Storage {
private:
static constexpr size_t kMaxNumberOfDevices = Context::kMaxDevType + 1;
- static constexpr size_t kMaxNumberOfDeviceIDs = Context::kMaxDevID + 1;
#if MXNET_USE_CUDA
static int num_gpu_device;
#endif // MXNET_USE_CUDA
diff --git a/tests/cpp/misc/memory_test.cc b/tests/cpp/misc/memory_test.cc
index 8f4e8c25e8..7c83e461bd 100644
--- a/tests/cpp/misc/memory_test.cc
+++ b/tests/cpp/misc/memory_test.cc
@@ -64,40 +64,40 @@ TEST(MEMORY_TEST, MemsetAndMemcopyPerformance) {
std::cout << "====================================" << std::endl
<< "Data size: " << test::pretty_num(test_size) << std::endl <<
std::flush;
- std::unique_ptr<uint8_t> buffer_1(new uint8_t[test_size]), buffer_2(new
uint8_t[test_size]);
- uint8_t *src = buffer_1.get(), *dest = buffer_2.get();
+ std::unique_ptr<float> buffer_1(new float[test_size]), buffer_2(new
float[test_size]);
+ float *src = buffer_1.get(), *dest = buffer_2.get();
for (size_t x = 0; x < 5; ++x) {
// Init memory with different values
- memset(src, 3, test_size);
- memset(dest, 255, test_size); // wipe out some/all of src cache
+ memset(src, 3, test_size * sizeof(float));
+ memset(dest, 255, test_size * sizeof(float)); // wipe out some/all of
src cache
// memset
- uint64_t start = test::perf::getNannoTickCount();
- memset(src, 123, test_size);
- const uint64_t memset_time = test::perf::getNannoTickCount() - start;
+ uint64_t start = mxnet::test::perf::getNannoTickCount();
+ memset(src, 0, test_size * sizeof(float));
+ const uint64_t memset_time = mxnet::test::perf::getNannoTickCount() -
start;
- start = test::perf::getNannoTickCount();
+ start = mxnet::test::perf::getNannoTickCount();
#pragma omp parallel for num_threads(GetOMPThreadCount())
for (int i = 0; i < static_cast<int>(test_size); ++i) {
- src[i] = 42;
+ src[i] = 42.0f;
}
- const uint64_t omp_set_time = test::perf::getNannoTickCount() - start;
+ const uint64_t omp_set_time = mxnet::test::perf::getNannoTickCount() -
start;
- start = test::perf::getNannoTickCount();
- memcpy(dest, src, test_size);
- const uint64_t memcpy_time = test::perf::getNannoTickCount() - start;
+ start = mxnet::test::perf::getNannoTickCount();
+ memcpy(dest, src, test_size * sizeof(float));
+ const uint64_t memcpy_time = mxnet::test::perf::getNannoTickCount() -
start;
// bounce the cache and dirty logic
- memset(src, 6, test_size);
- memset(dest, 200, test_size);
+ memset(src, 6, test_size * sizeof(float));
+ memset(dest, 200, test_size * sizeof(float));
- start = test::perf::getNannoTickCount();
+ start = mxnet::test::perf::getNannoTickCount();
#pragma omp parallel for num_threads(GetOMPThreadCount())
for (int i = 0; i < static_cast<int>(test_size); ++i) {
dest[i] = src[i];
}
- const uint64_t omp_copy_time = test::perf::getNannoTickCount() - start;
+ const uint64_t omp_copy_time = mxnet::test::perf::getNannoTickCount() -
start;
memset_times.push_back(memset_time);
omp_set_times.push_back(omp_set_time);
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services