Copilot commented on code in PR #9206:
URL: https://github.com/apache/arrow-rs/pull/9206#discussion_r2700559502
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -576,25 +658,36 @@ mod tests {
#[test]
fn test_num_of_bits_from_ndv_fpp() {
for (fpp, ndv, num_bits) in &[
- (0.1, 10, 57),
- (0.01, 10, 96),
- (0.001, 10, 146),
- (0.1, 100, 577),
- (0.01, 100, 968),
- (0.001, 100, 1460),
- (0.1, 1000, 5772),
- (0.01, 1000, 9681),
- (0.001, 1000, 14607),
- (0.1, 10000, 57725),
- (0.01, 10000, 96815),
- (0.001, 10000, 146076),
- (0.1, 100000, 577254),
- (0.01, 100000, 968152),
- (0.001, 100000, 1460769),
- (0.1, 1000000, 5772541),
- (0.01, 1000000, 9681526),
- (0.001, 1000000, 14607697),
- (1e-50, 1_000_000_000_000, 14226231280773240832),
+ (0.1, 10, 68),
+ (0.01, 10, 107),
+ (0.001, 10, 167),
+ (0.0001, 10, 261),
+ (0.00001, 10, 497),
+ (0.1, 100, 678),
+ (0.01, 100, 1069),
+ (0.001, 100, 1661),
+ (0.0001, 100, 2610),
+ (0.00001, 100, 4967),
+ (0.1, 1000, 6773),
+ (0.01, 1000, 10682),
+ (0.001, 1000, 16608),
+ (0.0001, 1000, 26091),
+ (0.00001, 1000, 49667),
+ (0.1, 10000, 67726),
+ (0.01, 10000, 106816),
+ (0.001, 10000, 166077),
+ (0.0001, 10000, 260909),
+ (0.00001, 10000, 496665),
+ (0.1, 100000, 677255),
+ (0.01, 100000, 1068153),
+ (0.001, 100000, 1660770),
+ (0.0001, 100000, 2609082),
+ (0.00001, 100000, 4966647),
+ (0.1, 1000000, 6772542),
+ (0.01, 1000000, 10681527),
+ (0.001, 1000000, 16607698),
+ (0.0001, 1000000, 26090819),
+ (0.00001, 1000000, 49666467),
] {
assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
}
Review Comment:
The extreme edge case `(1e-50, 1_000_000_000_000,
14226231280773240832)` was removed from this test. While the new cases provide
better coverage of typical FPP values (0.1 down to 0.00001), consider verifying
that the new compensation logic handles very low FPP values (e.g., 1e-50)
correctly, or document why such extreme values are not supported if that is
intentional.
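If extreme inputs are still meant to be supported, a regression check along these lines could be added (a sketch only; it deliberately avoids asserting an exact bit count, since FPP values this low fall past the end of COMPENSATION_TABLE and take the `theoretical_c * 2.65` branch, and the result may saturate `usize`):
```rust
#[test]
fn test_num_of_bits_extreme_fpp() {
    // Illustrative only: assert monotonicity rather than exact values.
    // A lower FPP must never require fewer bits than a higher FPP.
    let bits_very_low_fpp = num_of_bits_from_ndv_fpp(1_000_000_000_000, 1e-50);
    let bits_low_fpp = num_of_bits_from_ndv_fpp(1_000_000_000_000, 1e-5);
    assert!(bits_very_low_fpp >= bits_low_fpp);
}
```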
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
num_bytes.next_power_of_two()
}
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of distinct values
+/// Compensation table from Table I of the paper referenced in the module documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c') for B=512.
+const COMPENSATION_TABLE: &[(f64, f64)] = &[
+ (5.0, 6.0),
+ (6.0, 7.0),
+ (7.0, 8.0),
+ (8.0, 9.0),
+ (9.0, 10.0),
+ (10.0, 11.0),
+ (11.0, 12.0),
+ (12.0, 13.0),
+ (13.0, 14.0),
+ (14.0, 16.0),
+ (15.0, 17.0),
+ (16.0, 18.0),
+ (17.0, 20.0),
+ (18.0, 21.0),
+ (19.0, 23.0),
+ (20.0, 25.0),
+ (21.0, 26.0),
+ (22.0, 28.0),
+ (23.0, 30.0),
+ (24.0, 32.0),
+ (25.0, 35.0),
+ (26.0, 38.0),
+ (27.0, 40.0),
+ (28.0, 44.0),
+ (29.0, 48.0),
+ (30.0, 51.0),
+ (31.0, 58.0),
+ (32.0, 64.0),
+ (33.0, 74.0),
+ (34.0, 90.0),
+];
+
+/// Calculates the number of bits needed for a Split Block Bloom Filter given NDV and FPP.
+///
+/// Uses the compensation approach described in the module documentation to account for
+/// block structure overhead. The calculation:
+/// 1. Determines the theoretical bits-per-element (c) for an ideal Bloom filter with k=8
+/// 2. Applies compensation factors using linear interpolation between table entries
+/// 3. Multiplies by NDV to get the total number of bits needed
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
- let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
- num_bits as usize
+ // Calculate the theoretical 'c' (bits per element) for an IDEAL filter.
+ // With k=8 fixed: c = -k / ln(1 - fpp^(1/k))
+ let k = 8.0;
+ let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
+
+ // Apply compensation using linear interpolation between table entries
+ let compensated_c = if theoretical_c <= COMPENSATION_TABLE[0].0 {
+ // Below table range, use first entry
+ COMPENSATION_TABLE[0].1
+ } else if theoretical_c >= COMPENSATION_TABLE[COMPENSATION_TABLE.len() - 1].0 {
+ // Beyond c=34, SBBF efficiency drops off a cliff
+ theoretical_c * 2.65
Review Comment:
The magic number 2.65 should be documented or defined as a named constant.
It appears to be an empirically determined compensation factor for
theoretical c values beyond 34, but the origin and rationale for this specific
value are not clear from the code or comments. Consider adding a comment
explaining where this value comes from, or defining it as a named constant with
documentation.
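For example, something along these lines (a sketch; the constant name and wording are illustrative, not part of the PR):
```rust
/// Empirical compensation factor applied when the theoretical bits-per-element
/// exceeds the last entry of COMPENSATION_TABLE (c = 34), where SBBF efficiency
/// degrades sharply. TODO: document the source of this value.
const OUT_OF_TABLE_COMPENSATION_FACTOR: f64 = 2.65;
```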
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -638,4 +731,109 @@ mod tests {
);
}
}
+
+ /// Test that the actual false positive probability matches the expected FPP
+ /// for various configurations of NDV and FPP.
+ #[test]
+ fn test_actual_fpp_matches_expected() {
+ // Test configurations: (expected_fpp, ndv, num_tests)
+ let test_cases = [
+ (0.01, 10_000, 100_000),
+ (0.001, 50_000, 200_000),
+ (0.01, 100_000, 200_000),
+ ];
+
+ for (expected_fpp, ndv, num_tests) in test_cases {
+ println!("Testing FPP={expected_fpp}, NDV={ndv}");
+
+ // Create bloom filter with specified NDV and FPP
+ let mut sbbf = Sbbf::new_with_ndv_fpp(ndv, expected_fpp).unwrap();
+
+ // Insert exactly NDV elements (using format "inserted_{i}")
+ for i in 0..ndv {
+ let value = format!("inserted_{i}");
+ sbbf.insert(value.as_str());
+ }
+
+ // Verify inserted elements are all found (no false negatives)
+ for i in 0..ndv.min(1000) {
+ let value = format!("inserted_{i}");
+ assert!(
+ sbbf.check(&value.as_str()),
+ "False negative detected for inserted value: {value}"
+ );
+ }
+
+ // Test non-inserted elements to measure actual FPP
+ let mut false_positives = 0;
+ for i in 0..num_tests {
+ let value = format!("not_inserted_{i}");
+ if sbbf.check(&value.as_str()) {
+ false_positives += 1;
+ }
+ }
+
+ let actual_fpp = false_positives as f64 / num_tests as f64;
+ println!(" Expected FPP: {expected_fpp}, Actual FPP: {actual_fpp}");
+
+ // Verify actual FPP is not worse than expected by a large margin
+ // The compensation table often results in better (lower) FPP than theoretical,
+ // so we mainly check that we don't exceed the expected FPP by too much
+ let upper_tolerance = 3.0;
+ let upper_bound = expected_fpp * upper_tolerance;
Review Comment:
The test allows the actual FPP to be up to 3x higher than expected
(upper_tolerance = 3.0). This is quite lenient: a requested FPP of 0.01 still
passes at 0.03. Consider whether this tolerance is appropriate, or whether it
should be tightened to better validate that the compensation table produces
bloom filters with FPPs close to the requested values. Alternatively, add a
comment explaining why such a high tolerance is acceptable.
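For context, a rough binomial estimate (illustrative arithmetic only, assuming the filter's true FPP equals the requested one) suggests the sampling noise at these test sizes supports a much tighter bound:
```rust
// With p = 0.01 and n = 100_000 membership probes, false positives follow
// a Binomial(n, p): mean = n * p = 1000, standard deviation
// sqrt(n * p * (1 - p)) ~= 31.5. Even a 5-sigma outlier (~1158 hits, i.e.
// actual_fpp ~= 0.0116) stays far below the 3x bound (3000 hits), so a
// tolerance around 1.2x-1.5x would still be statistically stable.
let (p, n) = (0.01_f64, 100_000.0_f64);
let mean = n * p;
let sigma = (n * p * (1.0 - p)).sqrt();
let five_sigma_fpp = (mean + 5.0 * sigma) / n;
assert!(five_sigma_fpp < 1.5 * p);
```
The remaining question is how far the compensation table itself can overshoot the requested FPP, which is what the tolerance should really be sized for.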
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
num_bytes.next_power_of_two()
}
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of distinct values
+/// Compensation table from Table I of the paper referenced in the module documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c') for B=512.
Review Comment:
The documentation states the compensation table is "for B=512" (line 260),
but this implementation uses 256-bit blocks as documented on lines 43 and 131.
In the referenced paper "Cache-, hash-, and space-efficient bloom filters", B
represents the block size in bits. Using a compensation table for B=512 with an
implementation that has B=256 could result in incorrect FPP calculations.
Please verify that the compensation values in the table are appropriate for
256-bit blocks, or update the documentation if B refers to a different
parameter.
```suggestion
/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c') as reported in
/// Table I of [cache-efficient-bf].
```
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -242,14 +256,82 @@ fn optimal_num_of_bytes(num_bytes: usize) -> usize {
num_bytes.next_power_of_two()
}
-// see http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf
-// given fpp = (1 - e^(-k * n / m)) ^ k
-// we have m = - k * n / ln(1 - fpp ^ (1 / k))
-// where k = number of hash functions, m = number of bits, n = number of distinct values
+/// Compensation table from Table I of the paper referenced in the module documentation.
+/// Maps theoretical bits-per-element (c) to compensated bits-per-element (c') for B=512.
+const COMPENSATION_TABLE: &[(f64, f64)] = &[
+ (5.0, 6.0),
+ (6.0, 7.0),
+ (7.0, 8.0),
+ (8.0, 9.0),
+ (9.0, 10.0),
+ (10.0, 11.0),
+ (11.0, 12.0),
+ (12.0, 13.0),
+ (13.0, 14.0),
+ (14.0, 16.0),
+ (15.0, 17.0),
+ (16.0, 18.0),
+ (17.0, 20.0),
+ (18.0, 21.0),
+ (19.0, 23.0),
+ (20.0, 25.0),
+ (21.0, 26.0),
+ (22.0, 28.0),
+ (23.0, 30.0),
+ (24.0, 32.0),
+ (25.0, 35.0),
+ (26.0, 38.0),
+ (27.0, 40.0),
+ (28.0, 44.0),
+ (29.0, 48.0),
+ (30.0, 51.0),
+ (31.0, 58.0),
+ (32.0, 64.0),
+ (33.0, 74.0),
+ (34.0, 90.0),
+];
+
+/// Calculates the number of bits needed for a Split Block Bloom Filter given NDV and FPP.
+///
+/// Uses the compensation approach described in the module documentation to account for
+/// block structure overhead. The calculation:
+/// 1. Determines the theoretical bits-per-element (c) for an ideal Bloom filter with k=8
+/// 2. Applies compensation factors using linear interpolation between table entries
+/// 3. Multiplies by NDV to get the total number of bits needed
#[inline]
fn num_of_bits_from_ndv_fpp(ndv: u64, fpp: f64) -> usize {
- let num_bits = -8.0 * ndv as f64 / (1.0 - fpp.powf(1.0 / 8.0)).ln();
- num_bits as usize
+ // Calculate the theoretical 'c' (bits per element) for an IDEAL filter.
+ // With k=8 fixed: c = -k / ln(1 - fpp^(1/k))
+ let k = 8.0;
+ let theoretical_c = -k / (1.0 - fpp.powf(1.0 / k)).ln();
+
+ // Apply compensation using linear interpolation between table entries
+ let compensated_c = if theoretical_c <= COMPENSATION_TABLE[0].0 {
+ // Below table range, use first entry
+ COMPENSATION_TABLE[0].1
+ } else if theoretical_c >= COMPENSATION_TABLE[COMPENSATION_TABLE.len() - 1].0 {
+ // Beyond c=34, SBBF efficiency drops off a cliff
+ theoretical_c * 2.65
+ } else {
+ // Find the two table entries to interpolate between
+ let idx = COMPENSATION_TABLE
+ .iter()
+ .position(|(c, _)| *c >= theoretical_c)
+ .unwrap(); // Safe because we checked bounds above
+
+ if idx == 0 || COMPENSATION_TABLE[idx].0 == theoretical_c {
+ // Exact match or at start
Review Comment:
The condition `idx == 0` on this line is unreachable. When theoretical_c is
in the range handled by the else block (lines 315-332), the earliest index that
`position` can return is 1. This is because:
1. Values where theoretical_c <= 5.0 are handled by the first if branch (lines 309-311)
2. The position function looks for the first entry where c >= theoretical_c
3. For values > 5.0, this will always be at index 1 or later
Consider removing the `idx == 0` check to simplify the logic.
```suggestion
if COMPENSATION_TABLE[idx].0 == theoretical_c {
// Exact match
```
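A small illustrative check of that claim (not part of the PR):
```rust
#[test]
fn test_compensation_lookup_never_returns_index_zero() {
    // Any theoretical_c reaching the interpolation branch satisfies
    // theoretical_c > COMPENSATION_TABLE[0].0 (= 5.0), so the first entry
    // with c >= theoretical_c is always at index 1 or later.
    for c in [5.1, 6.0, 17.5, 33.9] {
        let idx = COMPENSATION_TABLE
            .iter()
            .position(|(table_c, _)| *table_c >= c)
            .unwrap();
        assert!(idx >= 1, "unexpected index 0 for c = {c}");
    }
}
```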
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]