This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d5fe8e5 ARROW-4234: [C++] Improve memory bandwidth test
d5fe8e5 is described below
commit d5fe8e5c6789e1eac484d6a6d4d8c487ec89e126
Author: François Saint-Jacques <[email protected]>
AuthorDate: Wed Jan 23 16:30:29 2019 +0100
ARROW-4234: [C++] Improve memory bandwidth test
- Kept the existing memcopy benchmark, but made the number of threads a
benchmark variable.
- Added 3 explicit pure bandwidth tests: Read, Write, ReadWrite.
Author: François Saint-Jacques <[email protected]>
Closes #3378 from fsaintjacques/ARROW-4234-memory-bandwidth and squashes
the following commits:
576e1161 <François Saint-Jacques> Cast
16807177 <François Saint-Jacques> Make clang happy, take 2
7495f2af <François Saint-Jacques> Fix for windows and CLANG warning
58064ecb <François Saint-Jacques> Fix lint issues.
9a15cc55 <François Saint-Jacques> Add Read/Write/ReadWrite test
d8c0b491 <François Saint-Jacques> ARROW-4234: Improve memory bandwidth test
---
cpp/src/arrow/io/memory-benchmark.cc | 99 +++++++++++++++++++++++++++---------
1 file changed, 75 insertions(+), 24 deletions(-)
diff --git a/cpp/src/arrow/io/memory-benchmark.cc
b/cpp/src/arrow/io/memory-benchmark.cc
index 72a5dc8..b36be4d 100644
--- a/cpp/src/arrow/io/memory-benchmark.cc
+++ b/cpp/src/arrow/io/memory-benchmark.cc
@@ -15,50 +15,101 @@
// specific language governing permissions and limitations
// under the License.
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <immintrin.h>
+#endif
+
+#include <iostream>
+
#include "arrow/api.h"
#include "arrow/io/memory.h"
#include "arrow/test-util.h"
+#include "arrow/util/cpu-info.h"
#include "benchmark/benchmark.h"
-#include <iostream>
-
namespace arrow {
-static void BM_SerialMemcopy(benchmark::State& state) { // NOLINT non-const
reference
- constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB
+static const int kNumCores = internal::CpuInfo::GetInstance()->num_cores();  // Upper bound of the thread sweeps registered by the benchmarks in this file.
+constexpr size_t kMemoryPerCore = 32 * 1024 * 1024;  // 32 MiB: size of each src/dst buffer a benchmark allocates.
+using BufferPtr = std::shared_ptr<Buffer>;
+
+using VectorType = __m128i;  // 128-bit SSE integer vector; the bandwidth kernels move one per loop iteration.
+
+// See http://codearcana.com/posts/2013/05/18/achieving-maximum-memory-bandwidth.html
+// for the usage of streaming (non-temporal) loads and stores, or section 6.1
+// (page 47) of https://akkadia.org/drepper/cpumemory.pdf.
+
+static void Read(void* src, void* dst, size_t size) {  // Read-only kernel: streaming-loads `size` bytes from `src`; `dst` is unused.
+ auto simd = static_cast<VectorType*>(src);
+ (void)dst;  // Parameter kept so the signature matches Write/ReadWrite (see ApplyFn).
+
+ for (size_t i = 0; i < size / sizeof(VectorType); i++)  // Whole vectors only: a tail of size % sizeof(VectorType) bytes is not read.
+ benchmark::DoNotOptimize(_mm_stream_load_si128(&simd[i]));  // DoNotOptimize keeps the otherwise-unused load alive. NOTE(review): assumes `src` is 16-byte aligned and SSE4.1 is available -- confirm.
+}
+
+static void Write(void* src, void* dst, size_t size) {  // Write-only kernel: streaming-stores a constant vector over `size` bytes of `dst`; `src` is unused.
+ auto simd = static_cast<VectorType*>(dst);
+ const VectorType ones = _mm_set1_epi32(1);  // Fill pattern: every 32-bit lane holds the value 1.
+ (void)src;  // Parameter kept so the signature matches Read/ReadWrite (see ApplyFn).
- std::shared_ptr<Buffer> buffer1, buffer2;
- ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer1));
- ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer2));
- random_bytes(kTotalSize, 0, buffer2->mutable_data());
+ for (size_t i = 0; i < size / sizeof(VectorType); i++)  // Whole vectors only: a tail of size % sizeof(VectorType) bytes is not written.
_mm_stream_si128(&simd[i], ones);  // NOTE(review): no _mm_sfence() follows the streaming stores, and `dst` is assumed 16-byte aligned -- confirm both are intended.
+}
+
+static void ReadWrite(void* src, void* dst, size_t size) {  // Copy kernel: streaming-loads each vector from `src` and streaming-stores it to the same index of `dst`.
+ auto src_simd = static_cast<VectorType*>(src);
+ auto dst_simd = static_cast<VectorType*>(dst);
+
+ for (size_t i = 0; i < size / sizeof(VectorType); i++)  // Whole vectors only: a tail of size % sizeof(VectorType) bytes is not copied.
+ _mm_stream_si128(&dst_simd[i], _mm_stream_load_si128(&src_simd[i]));  // NOTE(review): assumes both buffers are 16-byte aligned and SSE4.1 is available; no _mm_sfence() follows -- confirm.
+}
+
+using ApplyFn = decltype(Read);  // Function type shared by Read/Write/ReadWrite: void(void*, void*, size_t).
+
+template <ApplyFn Apply>  // Benchmark body: times repeated Apply(src, dst, kMemoryPerCore) calls on buffers allocated per invocation.
+static void MemoryBandwidth(benchmark::State& state) { // NOLINT non-const
reference
+ const size_t buffer_size = kMemoryPerCore;
+ BufferPtr src, dst;
+
+ ABORT_NOT_OK(AllocateBuffer(buffer_size, &src));
+ ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst));
+ random_bytes(buffer_size, 0, src->mutable_data());  // Only src is filled; dst is left as allocated.
while (state.KeepRunning()) {
- io::FixedSizeBufferWriter writer(buffer1);
- ABORT_NOT_OK(writer.Write(buffer2->data(), buffer2->size()));
+ Apply(src->mutable_data(), dst->mutable_data(), buffer_size);  // One benchmark iteration processes the whole buffer once.
}
- state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize);
+
+ state.SetBytesProcessed(state.iterations() * buffer_size);  // Reports buffer_size bytes per iteration. NOTE(review): ReadWrite both reads and writes buffer_size bytes, so this counts bytes copied, not total traffic -- confirm intended.
}
-static void BM_ParallelMemcopy(benchmark::State& state) { // NOLINT non-const
reference
- constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB
+// `UseRealTime` is required because of the threads: otherwise the cumulative
+// CPU time is used, which skews the results by the number of threads.
+BENCHMARK_TEMPLATE(MemoryBandwidth, Read)->ThreadRange(1,
kNumCores)->UseRealTime();
+BENCHMARK_TEMPLATE(MemoryBandwidth, Write)->ThreadRange(1,
kNumCores)->UseRealTime();
+BENCHMARK_TEMPLATE(MemoryBandwidth, ReadWrite)->ThreadRange(1,
kNumCores)->UseRealTime();
+
+static void ParallelMemoryCopy(benchmark::State& state) { // NOLINT non-const
reference
+ const int64_t n_threads = state.range(0);  // Benchmark argument 0: the thread count handed to the writer's memcopy below.
+ const int64_t buffer_size = kMemoryPerCore;  // Bytes copied from src to dst on every iteration.
- std::shared_ptr<Buffer> buffer1, buffer2;
- ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer1));
- ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer2));
+ std::shared_ptr<Buffer> src, dst;
+ ABORT_NOT_OK(AllocateBuffer(buffer_size, &src));
+ ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst));
- random_bytes(kTotalSize, 0, buffer2->mutable_data());
+ random_bytes(buffer_size, 0, src->mutable_data());  // Only src is filled; dst is overwritten by the copy.
while (state.KeepRunning()) {
- io::FixedSizeBufferWriter writer(buffer1);
- writer.set_memcopy_threads(4);
- ABORT_NOT_OK(writer.Write(buffer2->data(), buffer2->size()));
+ io::FixedSizeBufferWriter writer(dst);  // A new writer over dst is constructed inside the timed loop on every iteration.
+ writer.set_memcopy_threads(static_cast<int>(n_threads));  // Replaces the previously hard-coded thread count of 4.
+ ABORT_NOT_OK(writer.Write(src->data(), src->size()));  // The measured operation: write all of src into dst.
}
- state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize);
-}
-BENCHMARK(BM_SerialMemcopy)->MinTime(1.0)->Repetitions(2)->UseRealTime();
+ state.SetBytesProcessed(int64_t(state.iterations()) * buffer_size);  // Reports buffer_size bytes per iteration.
+ state.counters["threads"] = static_cast<double>(n_threads);  // Exposes the thread count as a user counter in the benchmark output.
+}
-BENCHMARK(BM_ParallelMemcopy)->MinTime(1.0)->Repetitions(2)->UseRealTime();
+BENCHMARK(ParallelMemoryCopy)->RangeMultiplier(2)->Range(1,
kNumCores)->UseRealTime();  // Argument 0 (read as n_threads) ranges from 1 to kNumCores with multiplier 2; timed by wall clock.
} // namespace arrow