This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch feat/sbbf/size
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit f274bfe958e8fbb6135debd8326d9a159915bba6
Author: Jiayu Liu <[email protected]>
AuthorDate: Sat Jan 17 09:36:12 2026 +0800

    adjust sbbf sizing for better cache efficiency
---
 parquet/src/bloom_filter/mod.rs | 87 +++++++++++++++++++++++++++++------------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs
index 1f77e492cc..b260ef32a0 100644
--- a/parquet/src/bloom_filter/mod.rs
+++ b/parquet/src/bloom_filter/mod.rs
@@ -242,14 +242,38 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
     num_bytes.next_power_of_two()
 }
 
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of 
distinct values
+/// Linear interpolation helper
+#[inline]
+fn interpolate(x: f64, x1: f64, x2: f64, y1: f64, y2: f64) -> f64 {
+    y1 + (x - x1) * (y2 - y1) / (x2 - x1)
+}
+
+// See http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
+// The SBBF uses a block structure which introduces overhead due to Poisson 
distribution
+// of elements across blocks. We use the paper's compensation table (Table I) 
to adjust
+// the theoretical bits-per-element to account for this overhead.
 #[inline]
 fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
-    let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
-    num_bits as usize
+    // 1. Calculate the theoretical 'c' (bits per element) for an IDEAL filter.
+    // The paper uses c = m/n. For an ideal filter, f = (1/2)^k if k is 
optimal.
+    // With k=8 fixed, we use the rearrangement: c = -k / ln(1 - fpp^(1/k))
+    let k = 8.0;
+    let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
+
+    // 2. Adjust 'c' to 'c_prime' using the paper's compensation table (Table 
I).
+    // This accounts for the Poisson distribution of elements across blocks.
+    let compensated_c = match theoretical_c {
+        c if c <= 6.0 => interpolate(c, 5.0, 6.0, 6.0, 7.0),
+        c if c <= 8.0 => interpolate(c, 7.0, 8.0, 8.0, 9.0),
+        c if c <= 13.0 => interpolate(c, 10.0, 13.0, 11.0, 14.0),
+        c if c <= 20.0 => interpolate(c, 14.0, 20.0, 16.0, 25.0),
+        c if c <= 26.0 => interpolate(c, 21.0, 26.0, 26.0, 38.0),
+        c if c <= 30.0 => interpolate(c, 27.0, 30.0, 40.0, 51.0),
+        c if c <= 34.0 => interpolate(c, 31.0, 34.0, 58.0, 90.0),
+        _ => theoretical_c * 2.65, // Beyond c=34, SBBF efficiency drops off a 
cliff
+    };
+
+    (ndv as f64 * compensated_c).ceil() as usize
 }
 
 impl Sbbf {
@@ -575,26 +599,39 @@ mod tests {
 
     #[test]
     fn test_num_of_bits_from_ndv_fpp() {
+        // Expected values calculated using the SBBF compensation table from 
the paper
+        // 
http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
         for (fpp, ndv, num_bits) in &[
-            (0.1, 10, 57),
-            (0.01, 10, 96),
-            (0.001, 10, 146),
-            (0.1, 100, 577),
-            (0.01, 100, 968),
-            (0.001, 100, 1460),
-            (0.1, 1000, 5772),
-            (0.01, 1000, 9681),
-            (0.001, 1000, 14607),
-            (0.1, 10000, 57725),
-            (0.01, 10000, 96815),
-            (0.001, 10000, 146076),
-            (0.1, 100000, 577254),
-            (0.01, 100000, 968152),
-            (0.001, 100000, 1460769),
-            (0.1, 1000000, 5772541),
-            (0.01, 1000000, 9681526),
-            (0.001, 1000000, 14607697),
-            (1e-50, 1_000_000_000_000, 14226231280773240832),
+            (0.1, 10, 68),
+            (0.01, 10, 107),
+            (0.001, 10, 170),
+            (0.0001, 10, 262),
+            (0.00001, 10, 494),
+            (0.1, 100, 678),
+            (0.01, 100, 1069),
+            (0.001, 100, 1692),
+            (0.0001, 100, 2611),
+            (0.00001, 100, 4938),
+            (0.1, 1000, 6773),
+            (0.01, 1000, 10682),
+            (0.001, 1000, 16912),
+            (0.0001, 1000, 26109),
+            (0.00001, 1000, 49371),
+            (0.1, 10000, 67726),
+            (0.01, 10000, 106816),
+            (0.001, 10000, 169116),
+            (0.0001, 10000, 261090),
+            (0.00001, 10000, 493702),
+            (0.1, 100000, 677255),
+            (0.01, 100000, 1068153),
+            (0.001, 100000, 1691155),
+            (0.0001, 100000, 2610899),
+            (0.00001, 100000, 4937013),
+            (0.1, 1000000, 6772542),
+            (0.01, 1000000, 10681527),
+            (0.001, 1000000, 16911547),
+            (0.0001, 1000000, 26108983),
+            (0.00001, 1000000, 49370126),
         ] {
             assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
         }

Reply via email to