(arrow-rs) branch main updated: bench(parquet): add Sbbf check/insert benchmarks (#10041)

etseidl Sat, 30 May 2026 20:18:25 -0700

This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new 511ad068ae bench(parquet): add Sbbf check/insert benchmarks (#10041)
511ad068ae is described below

commit 511ad068ae2b7f511e16215d9d3c96fb0f334f2c
Author: Dan Mattheiss <[email protected]>
AuthorDate: Sat May 30 23:18:11 2026 -0400

    bench(parquet): add Sbbf check/insert benchmarks (#10041)
    
    Adds `bench_check` and `bench_insert` benchmarks
    for`Sbbf::{check,insert}`. Originally benchmarks were part of #10011 but
    were split out to follow Contributing guidelines
    
    # Are these changes tested?
    
    Benchmarks compiled and ran using `cargo bench -p parquet --bench
    bloom_filter`.
    
    # Are there any user-facing changes?
    
    No.
---
 parquet/benches/bloom_filter.rs | 123 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 3 deletions(-)

diff --git a/parquet/benches/bloom_filter.rs b/parquet/benches/bloom_filter.rs
index ca4f900067..cb2bfb6b4a 100644
--- a/parquet/benches/bloom_filter.rs
+++ b/parquet/benches/bloom_filter.rs
@@ -15,8 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, 
criterion_main};
+use std::hint;
+
+use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, 
criterion_group, criterion_main};
 use parquet::bloom_filter::Sbbf;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
 
 /// Build a bloom filter sized for `initial_ndv` at `fpp`, insert `num_values` 
distinct values,
 /// and return it ready for folding.
@@ -46,7 +50,7 @@ fn bench_fold_to_target_fpp(c: &mut Criterion) {
                     f.fold_to_target_fpp(fpp);
                     f
                 },
-                criterion::BatchSize::SmallInput,
+                BatchSize::SmallInput,
             );
         });
     }
@@ -104,10 +108,123 @@ fn bench_insert_only(c: &mut Criterion) {
     group.finish();
 }
 
+/// Benchmark `Sbbf::insert` across the same three cache regimes as
+/// `bench_check`. The filter is allocated once per regime and reused
+/// across iterations — bloom inserts are idempotent on identical
+/// input, so this measures the pure insert kernel cost (hash + mask +
+/// load + OR + store) without per-iteration allocation noise.
+fn bench_insert(c: &mut Criterion) {
+    let regimes: [(&str, usize); 3] = [
+        ("s_128KiB", 128 * 1024),
+        ("m_2MiB", 2 * 1024 * 1024),
+        ("l_32MiB", 32 * 1024 * 1024),
+    ];
+    const NUM_INSERTS: usize = 50_000;
+
+    let mut group = c.benchmark_group("insert");
+
+    for (label, num_bytes) in regimes {
+        let mut rng = StdRng::seed_from_u64(0xC0FFEE);
+        let keys: Vec<[u8; 16]> = (0..NUM_INSERTS)
+            .map(|_| {
+                let mut k = [0u8; 16];
+                rng.fill(&mut k);
+                k
+            })
+            .collect();
+
+        let mut filter = Sbbf::new_with_num_of_bytes(num_bytes);
+
+        group.throughput(Throughput::Elements(NUM_INSERTS as u64));
+        group.bench_with_input(BenchmarkId::new("ins", label), &keys, |b, 
keys| {
+            b.iter(|| {
+                for k in keys {
+                    filter.insert(hint::black_box(k.as_slice()));
+                }
+            });
+        });
+    }
+    group.finish();
+}
+
+/// Benchmark `Sbbf::check` across three cache regimes and both miss-heavy
+/// (the common case for Parquet row-group skipping) and hit-heavy workloads.
+///
+/// The three filter sizes span the cache hierarchy on a typical server:
+/// 128 KB fits in L2, 2 MB lives in L3, 32 MB spills out of L3.
+fn bench_check(c: &mut Criterion) {
+    let regimes: [(&str, usize, usize); 3] = [
+        ("s_128KiB", 128 * 1024, 10_000),
+        ("m_2MiB", 2 * 1024 * 1024, 200_000),
+        ("l_32MiB", 32 * 1024 * 1024, 3_000_000),
+    ];
+    const NUM_QUERIES: usize = 50_000;
+
+    let mut group = c.benchmark_group("check");
+
+    for (label, num_bytes, ndv) in regimes {
+        let mut rng = StdRng::seed_from_u64(0xC0FFEE);
+
+        // Clear the high bit of byte 0 on inserted keys so the miss set below
+        // — which forces the same bit to 1 — is genuinely disjoint by
+        // construction (not just disjoint at birthday-paradox probability).
+        let mut filter = Sbbf::new_with_num_of_bytes(num_bytes);
+        let mut inserted: Vec<[u8; 16]> = Vec::with_capacity(ndv);
+        for _ in 0..ndv {
+            let mut k = [0u8; 16];
+            rng.fill(&mut k);
+            k[0] &= 0x7f;
+            filter.insert(k.as_slice());
+            inserted.push(k);
+        }
+
+        // Disjoint miss set: high bit of byte 0 is 1 here and 0 in `inserted`,
+        // so no miss key can equal an inserted key.
+        let miss_keys: Vec<[u8; 16]> = (0..NUM_QUERIES)
+            .map(|_| {
+                let mut k = [0u8; 16];
+                rng.fill(&mut k);
+                k[0] |= 0x80;
+                k
+            })
+            .collect();
+        let hit_keys: Vec<[u8; 16]> = (0..NUM_QUERIES)
+            .map(|_| inserted[rng.random_range(0..inserted.len())])
+            .collect();
+
+        group.throughput(Throughput::Elements(NUM_QUERIES as u64));
+        group.bench_with_input(BenchmarkId::new("miss", label), &miss_keys, 
|b, keys| {
+            b.iter(|| {
+                let mut found = 0u64;
+                for k in keys {
+                    if hint::black_box(filter.check(k.as_slice())) {
+                        found += 1;
+                    }
+                }
+                hint::black_box(found)
+            });
+        });
+        group.bench_with_input(BenchmarkId::new("hit", label), &hit_keys, |b, 
keys| {
+            b.iter(|| {
+                let mut found = 0u64;
+                for k in keys {
+                    if hint::black_box(filter.check(k.as_slice())) {
+                        found += 1;
+                    }
+                }
+                hint::black_box(found)
+            });
+        });
+    }
+    group.finish();
+}
+
 criterion_group!(
     benches,
     bench_fold_to_target_fpp,
     bench_insert_and_fold,
-    bench_insert_only
+    bench_insert_only,
+    bench_insert,
+    bench_check,
 );
 criterion_main!(benches);

(arrow-rs) branch main updated: bench(parquet): add Sbbf check/insert benchmarks (#10041)

Reply via email to