This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ed355945d2 GH-48897: [C++] Benchmark and optimize CountSetBits (#48898)
ed355945d2 is described below
commit ed355945d223575649f7e77e65453320d8bb771a
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jan 21 11:35:07 2026 +0100
GH-48897: [C++] Benchmark and optimize CountSetBits (#48898)
### Rationale for this change
Counting the set bits in a null bitmap is an operation that comes often, it
can be useful to get a more precise idea of its performance.
### What changes are included in this PR?
1. Add a benchmark for `CountSetBits`.
2. Hand-unroll its inner loop for better performance as otherwise the
compiler may not respect the nested loop hint.
Local results (AMD Zen 2):
```
------------------------------------------------------------------------------
Benchmark Time CPU Iterations
UserCounters...
------------------------------------------------------------------------------
CountSetBits/16 5.27 ns 5.27 ns 133267114
bytes_per_second=2.82991Gi/s
CountSetBits/1024 35.1 ns 35.1 ns 19960309
bytes_per_second=27.178Gi/s
CountSetBits/131072 3703 ns 3702 ns 184743
bytes_per_second=32.9698Gi/s
```
Local results (Intel(R) Core(TM) Ultra 7 255H):
```
------------------------------------------------------------------------------
Benchmark Time CPU Iterations
UserCounters...
------------------------------------------------------------------------------
CountSetBits/16 2.45 ns 2.45 ns 285392946
bytes_per_second=6.08012Gi/s
CountSetBits/1024 28.9 ns 28.9 ns 23618777
bytes_per_second=33.0086Gi/s
CountSetBits/131072 3490 ns 3489 ns 198472
bytes_per_second=34.9862Gi/s
```
### Are these changes tested?
By running said benchmark manually (and by Continuous Benchmarking).
### Are there any user-facing changes?
No.
* GitHub Issue: #48897
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/util/bit_util_benchmark.cc | 12 ++++++++++++
cpp/src/arrow/util/bitmap_ops.cc | 11 +++++++----
2 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc
b/cpp/src/arrow/util/bit_util_benchmark.cc
index ec90fb453f..da624bec19 100644
--- a/cpp/src/arrow/util/bit_util_benchmark.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark.cc
@@ -439,6 +439,17 @@ static void SetBitsTo(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * nbytes);
}
+static void CountSetBits(benchmark::State& state) {
+ int64_t nbytes = state.range(0);
+ std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
+
+ for (auto _ : state) {
+ auto count = internal::CountSetBits(buffer->data(), /*bit_offset=*/0,
nbytes * 8);
+ benchmark::DoNotOptimize(count);
+ }
+ state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
template <int64_t OffsetSrc, int64_t OffsetDest = 0>
static void CopyBitmap(benchmark::State& state) { // NOLINT non-const
reference
const int64_t buffer_size = state.range(0);
@@ -519,6 +530,7 @@
BENCHMARK(ReverseSetBitRunReader)->Apply(SetBitRunReaderPercentageArg);
BENCHMARK(VisitBits)->Arg(kBufferSize);
BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize);
BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
+BENCHMARK(CountSetBits)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
#ifdef ARROW_WITH_BENCHMARKS_REFERENCE
static void ReferenceNaiveBitmapWriter(benchmark::State& state) {
diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index 6246656ef2..ce2224f2f6 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -17,6 +17,7 @@
#include "arrow/util/bitmap_ops.h"
+#include <array>
#include <cstdint>
#include <cstring>
#include <functional>
@@ -55,13 +56,15 @@ int64_t CountSetBits(const uint8_t* data, int64_t
bit_offset, int64_t length) {
constexpr int64_t kCountUnrollFactor = 4;
const int64_t words_rounded =
bit_util::RoundDown(p.aligned_words, kCountUnrollFactor);
- int64_t count_unroll[kCountUnrollFactor] = {0};
+ std::array<int64_t, kCountUnrollFactor> count_unroll{};
// Unroll the loop for better performance
for (int64_t i = 0; i < words_rounded; i += kCountUnrollFactor) {
- for (int64_t k = 0; k < kCountUnrollFactor; k++) {
- count_unroll[k] += bit_util::PopCount(u64_data[k]);
- }
+ // (hand-unrolled as some gcc versions would unnest a nested `for` loop)
+ count_unroll[0] += bit_util::PopCount(u64_data[0]);
+ count_unroll[1] += bit_util::PopCount(u64_data[1]);
+ count_unroll[2] += bit_util::PopCount(u64_data[2]);
+ count_unroll[3] += bit_util::PopCount(u64_data[3]);
u64_data += kCountUnrollFactor;
}
for (int64_t k = 0; k < kCountUnrollFactor; k++) {