adriangb commented on code in PR #9628:
URL: https://github.com/apache/arrow-rs/pull/9628#discussion_r3037380644


##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -431,6 +485,132 @@ impl Sbbf {
         self.0.capacity() * std::mem::size_of::<Block>()
     }
 
+    /// Returns the number of blocks in this bloom filter.
+    pub fn num_blocks(&self) -> usize {
+        self.0.len()
+    }
+
+    /// Fold the bloom filter down to the smallest size that still meets the target FPP
+    /// (False Positive Percentage).
+    ///
+    /// Repeatedly halves the filter by merging adjacent block pairs via bitwise OR,
+    /// stopping when the next fold would cause the estimated FPP to exceed `target_fpp`, or
+    /// when the filter reaches the minimum size of 1 block (32 bytes).
+    ///
+    /// ## How it works
+    ///
+    /// SBBFs use multiplicative hashing for block selection:
+    ///
+    /// ```text
+    /// block_index = ((hash >> 32) * num_blocks) >> 32
+    /// ```
+    ///
+    /// When `num_blocks` is halved, the new index becomes `floor(original_index / 2)`, so
+    /// blocks `2i` and `2i+1` map to the same position. Each fold merges **adjacent** pairs:
+    ///
+    /// ```text
+    /// folded[i] = blocks[2*i] | blocks[2*i + 1]
+    /// ```
+    ///
+    /// This differs from standard Bloom filter folding, which merges the two halves
+    /// (`B[i] | B[i + m/2]`) because standard filters use modular hashing where
+    /// `h(x) mod (m/2)` maps indices `i` and `i + m/2` to the same position.
+    ///
+    /// ## Correctness
+    ///
+    /// Folding **never introduces false negatives**. Every bit that was set in the original
+    /// filter remains set in the folded filter (via bitwise OR). The only effect is a controlled
+    /// increase in FPP as set bits from different blocks are merged together.
+    ///
+    /// ## References
+    ///
+    /// - Sailhan, F. & Stehr, M-O. "Folding and Unfolding Bloom Filters",
+    ///   IEEE iThings 2012. <https://doi.org/10.1109/GreenCom.2012.16>
+    pub fn fold_to_target_fpp(&mut self, target_fpp: f64) {
+        let num_folds = self.num_folds_for_target_fpp(target_fpp);
+        if num_folds > 0 {
+            self.fold_n(num_folds);
+        }
+    }
+
+    /// Determine how many folds can be applied without exceeding `target_fpp`.
+    ///
+    /// Computes the average per-block fill rate in a single pass (no allocation),
+    /// then analytically estimates the FPP at each fold level.
+    ///
+    /// When two blocks with independent fill rate `f` are OR'd, the expected fill
+    /// of the merged block is `1 - (1-f)^2`. After `k` folds (merging `2^k` blocks):
+    ///
+    /// ```text
+    /// f_k = 1 - (1 - f)^(2^k)
+    /// ```
+    ///
+    /// SBBF membership checks perform `k=8` bit checks within one 256-bit block,
+    /// so the estimated FPP at fold level k is `f_k^8`.
+    fn num_folds_for_target_fpp(&self, target_fpp: f64) -> u32 {
+        let len = self.0.len();
+        if len < 2 {
+            return 0;
+        }
+
+        // Single pass: compute average per-block fill rate.
+        let total_set_bits: u64 = self.0.iter().map(|b| u64::from(b.count_ones())).sum();
+        let avg_fill = total_set_bits as f64 / (len as f64 * 256.0);
+
+        // Empty filter: can fold all the way down.
+        if avg_fill == 0.0 {
+            return len.trailing_zeros();
+        }
+
+        // Find max folds where estimated FPP stays within target.
+        // f_k = 1 - (1 - avg_fill)^(2^k), FPP_k = f_k^8
+        let max_folds = len.trailing_zeros(); // log2(len) since len is power of 2

Review Comment:
   e8e8b8b4aaaffe3bc13fe70c7ee1055728c39785



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to