This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new ed355945d2 GH-48897: [C++] Benchmark and optimize CountSetBits (#48898)
ed355945d2 is described below

commit ed355945d223575649f7e77e65453320d8bb771a
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jan 21 11:35:07 2026 +0100

    GH-48897: [C++] Benchmark and optimize CountSetBits (#48898)
    
    ### Rationale for this change
    
    Counting the set bits in a null bitmap is an operation that comes often, it 
can be useful to get a more precise idea of its performance.
    
    ### What changes are included in this PR?
    
    1. Add a benchmark for `CountSetBits`.
    2. Hand-unroll its inner loop for better performance as otherwise the 
compiler may not respect the nested loop hint.
    
    Local results (AMD Zen 2):
    ```
    
------------------------------------------------------------------------------
    Benchmark                    Time             CPU   Iterations 
UserCounters...
    
------------------------------------------------------------------------------
    CountSetBits/16           5.27 ns         5.27 ns    133267114 
bytes_per_second=2.82991Gi/s
    CountSetBits/1024         35.1 ns         35.1 ns     19960309 
bytes_per_second=27.178Gi/s
    CountSetBits/131072       3703 ns         3702 ns       184743 
bytes_per_second=32.9698Gi/s
    ```
    
    Local results (Intel(R) Core(TM) Ultra 7 255H):
    ```
    
------------------------------------------------------------------------------
    Benchmark                    Time             CPU   Iterations 
UserCounters...
    
------------------------------------------------------------------------------
    CountSetBits/16           2.45 ns         2.45 ns    285392946 
bytes_per_second=6.08012Gi/s
    CountSetBits/1024         28.9 ns         28.9 ns     23618777 
bytes_per_second=33.0086Gi/s
    CountSetBits/131072       3490 ns         3489 ns       198472 
bytes_per_second=34.9862Gi/s
    ```
    
    ### Are these changes tested?
    
    By running said benchmark manually (and by Continuous Benchmarking).
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #48897
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/util/bit_util_benchmark.cc | 12 ++++++++++++
 cpp/src/arrow/util/bitmap_ops.cc         | 11 +++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util_benchmark.cc 
b/cpp/src/arrow/util/bit_util_benchmark.cc
index ec90fb453f..da624bec19 100644
--- a/cpp/src/arrow/util/bit_util_benchmark.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark.cc
@@ -439,6 +439,17 @@ static void SetBitsTo(benchmark::State& state) {
   state.SetBytesProcessed(state.iterations() * nbytes);
 }
 
+static void CountSetBits(benchmark::State& state) {
+  int64_t nbytes = state.range(0);
+  std::shared_ptr<Buffer> buffer = CreateRandomBuffer(nbytes);
+
+  for (auto _ : state) {
+    auto count = internal::CountSetBits(buffer->data(), /*bit_offset=*/0, 
nbytes * 8);
+    benchmark::DoNotOptimize(count);
+  }
+  state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
 template <int64_t OffsetSrc, int64_t OffsetDest = 0>
 static void CopyBitmap(benchmark::State& state) {  // NOLINT non-const 
reference
   const int64_t buffer_size = state.range(0);
@@ -519,6 +530,7 @@ 
BENCHMARK(ReverseSetBitRunReader)->Apply(SetBitRunReaderPercentageArg);
 BENCHMARK(VisitBits)->Arg(kBufferSize);
 BENCHMARK(VisitBitsUnrolled)->Arg(kBufferSize);
 BENCHMARK(SetBitsTo)->Arg(2)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
+BENCHMARK(CountSetBits)->Arg(1 << 4)->Arg(1 << 10)->Arg(1 << 17);
 
 #ifdef ARROW_WITH_BENCHMARKS_REFERENCE
 static void ReferenceNaiveBitmapWriter(benchmark::State& state) {
diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index 6246656ef2..ce2224f2f6 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -17,6 +17,7 @@
 
 #include "arrow/util/bitmap_ops.h"
 
+#include <array>
 #include <cstdint>
 #include <cstring>
 #include <functional>
@@ -55,13 +56,15 @@ int64_t CountSetBits(const uint8_t* data, int64_t 
bit_offset, int64_t length) {
     constexpr int64_t kCountUnrollFactor = 4;
     const int64_t words_rounded =
         bit_util::RoundDown(p.aligned_words, kCountUnrollFactor);
-    int64_t count_unroll[kCountUnrollFactor] = {0};
+    std::array<int64_t, kCountUnrollFactor> count_unroll{};
 
     // Unroll the loop for better performance
     for (int64_t i = 0; i < words_rounded; i += kCountUnrollFactor) {
-      for (int64_t k = 0; k < kCountUnrollFactor; k++) {
-        count_unroll[k] += bit_util::PopCount(u64_data[k]);
-      }
+      // (hand-unrolled as some gcc versions would unnest a nested `for` loop)
+      count_unroll[0] += bit_util::PopCount(u64_data[0]);
+      count_unroll[1] += bit_util::PopCount(u64_data[1]);
+      count_unroll[2] += bit_util::PopCount(u64_data[2]);
+      count_unroll[3] += bit_util::PopCount(u64_data[3]);
       u64_data += kCountUnrollFactor;
     }
     for (int64_t k = 0; k < kCountUnrollFactor; k++) {

Reply via email to