This is an automated email from the ASF dual-hosted git repository. jiayuliu pushed a commit to branch feat/sbbf/size in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit f274bfe958e8fbb6135debd8326d9a159915bba6 Author: Jiayu Liu <[email protected]> AuthorDate: Sat Jan 17 09:36:12 2026 +0800 adjust sbbf sizing for better cache efficiency --- parquet/src/bloom_filter/mod.rs | 87 +++++++++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs index 1f77e492cc..b260ef32a0 100644 --- a/parquet/src/bloom_filter/mod.rs +++ b/parquet/src/bloom_filter/mod.rs @@ -242,14 +242,38 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize { num_bytes.next_power_of_two() } -// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf -// given fpp = (1 - e^(-k * n / m)) ^ k -// we have m = - k * n / ln(1 - fpp ^ (1 / k)) -// where k = number of hash functions, m = number of bits, n = number of distinct values +/// Linear interpolation helper +#[inline] +fn interpolate(x: f64, x1: f64, x2: f64, y1: f64, y2: f64) -> f64 { + y1 + (x - x1) * (y2 - y1) / (x2 - x1) +} + +// See http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf +// The SBBF uses a block structure which introduces overhead due to Poisson distribution +// of elements across blocks. We use the paper's compensation table (Table I) to adjust +// the theoretical bits-per-element to account for this overhead. #[inline] fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize { - let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln(); - num_bits as usize + // 1. Calculate the theoretical 'c' (bits per element) for an IDEAL filter. + // The paper uses c = m/n. For an ideal filter, f = (1/2)^k if k is optimal. + // With k=8 fixed, we use the rearrangement: c = -k / ln(1 - fpp^(1/k)) + let k = 8.0; + let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln(); + + // 2. Adjust 'c' to 'c_prime' using the paper's compensation table (Table I). + // This accounts for the Poisson distribution of elements across blocks. + let compensated_c = match theoretical_c { + c if c <= 6.0 => interpolate(c, 5.0, 6.0, 6.0, 7.0), + c if c <= 8.0 => interpolate(c, 7.0, 8.0, 8.0, 9.0), + c if c <= 13.0 => interpolate(c, 10.0, 13.0, 11.0, 14.0), + c if c <= 20.0 => interpolate(c, 14.0, 20.0, 16.0, 25.0), + c if c <= 26.0 => interpolate(c, 21.0, 26.0, 26.0, 38.0), + c if c <= 30.0 => interpolate(c, 27.0, 30.0, 40.0, 51.0), + c if c <= 34.0 => interpolate(c, 31.0, 34.0, 58.0, 90.0), + _ => theoretical_c * 2.65, // Beyond c=34, SBBF efficiency drops off a cliff + }; + + (ndv as f64 * compensated_c).ceil() as usize } impl Sbbf { @@ -575,26 +599,39 @@ mod tests { #[test] fn test_num_of_bits_from_ndv_fpp() { + // Expected values calculated using the SBBF compensation table from the paper + // http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf for (fpp, ndv, num_bits) in &[ - (0.1, 10, 57), - (0.01, 10, 96), - (0.001, 10, 146), - (0.1, 100, 577), - (0.01, 100, 968), - (0.001, 100, 1460), - (0.1, 1000, 5772), - (0.01, 1000, 9681), - (0.001, 1000, 14607), - (0.1, 10000, 57725), - (0.01, 10000, 96815), - (0.001, 10000, 146076), - (0.1, 100000, 577254), - (0.01, 100000, 968152), - (0.001, 100000, 1460769), - (0.1, 1000000, 5772541), - (0.01, 1000000, 9681526), - (0.001, 1000000, 14607697), - (1e-50, 1_000_000_000_000, 14226231280773240832), + (0.1, 10, 68), + (0.01, 10, 107), + (0.001, 10, 170), + (0.0001, 10, 262), + (0.00001, 10, 494), + (0.1, 100, 678), + (0.01, 100, 1069), + (0.001, 100, 1692), + (0.0001, 100, 2611), + (0.00001, 100, 4938), + (0.1, 1000, 6773), + (0.01, 1000, 10682), + (0.001, 1000, 16912), + (0.0001, 1000, 26109), + (0.00001, 1000, 49371), + (0.1, 10000, 67726), + (0.01, 10000, 106816), + (0.001, 10000, 169116), + (0.0001, 10000, 261090), + (0.00001, 10000, 493702), + (0.1, 100000, 677255), + (0.01, 100000, 1068153), + (0.001, 100000, 1691155), + (0.0001, 100000, 2610899), + (0.00001, 100000, 4937013), + (0.1, 1000000, 6772542), + (0.01, 1000000, 10681527), + (0.001, 1000000, 16911547), + (0.0001, 1000000, 26108983), + (0.00001, 1000000, 49370126), ] { assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64); }
