viirya commented on code in PR #9628:
URL: https://github.com/apache/arrow-rs/pull/9628#discussion_r3012777595
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -641,4 +897,197 @@ mod tests {
);
}
}
+
+ /*
+ Ok, so the following is trying to prove in simple terms that folding
an SBBF and
+ building a fresh smaller SBBF from scratch prodcues the exact same bits
+
+ If you insert the same values into a 512-block filter and fold it to
256 blocks,
+ you get a bit-for-bit identical result to just inserting those values
into a
+ 256-block filter directly. The fold doesn't lose information or
scramble anything,
+ it's like you had known the right size all along
+
+ This works because of the 2 lemmas:
+ 1. when you half the filter, each hash's block index divides cleanly
by 2
+ so the hash that went to block `i` in the big filter goes to block
`i/2` in the small one
+ which is exactly where the fold puts it
+ > this is trivial since floor(x/2) == floor(floor(x) / 2) is a basic
math fact
+
+ 2. the bit pattern set _within_ a block depends only on the lower 32
bits of the hash,
+ which doesn't change with filter size. So the same bits get set
regardless!
+ > structually trivial, mask() takes a u32 and uses only the SALT
constants.
+
+
+ When you combine it together, every hash sets the same bits in the
same destination block
+ whether you fold or build fresh. Therefore the filters are
bit-identical
+ */
+ #[test]
+ fn test_sbbf_folded_equals_fresh() {
+ let values = (0..5000).map(|i|
format!("elem_{i}")).collect::<Vec<_>>();
+ let hashes = values
+ .iter()
+ .map(|v| hash_as_bytes(v.as_str()))
+ .collect::<Vec<_>>();
+
+ for num_blocks in [64, 256, 1024] {
+ let half = num_blocks / 2;
+
+ // original filter
+ let mut original = Sbbf::new_with_num_of_bytes(num_blocks * 32);
+ assert_eq!(original.num_blocks(), num_blocks);
+ for &h in &hashes {
+ original.insert_hash(h);
+ }
+
+ for &h in hashes.iter() {
+ let mask = Block::mask(h as u32);
+
+ // step 1: element's block in original
+ let orig_idx = original.hash_to_block_index(h);
+ assert!(orig_idx < num_blocks);
+
+ // step 2 (lemma 1): destination in N/2 filter
+ let fresh_idx = {
+ let tmp = Sbbf(vec![Block::ZERO; half]);
+ tmp.hash_to_block_index(h)
+ };
+
+ let folded_idx = orig_idx / 2;
+ assert_eq!(fresh_idx, folded_idx,);
+
+ // step 3 (lemma 2): mask is the same
+ for w in 0..8 {
+ assert_ne!(original.0[orig_idx].0[w] & mask.0[w], 0,);
+ }
+ }
+
+ // verify the actual blocks match
+ let mut folded = original.clone();
+ folded.fold_once();
+ assert_eq!(folded.num_blocks(), half);
+
+ let mut fresh = Sbbf::new_with_num_of_bytes(half * 32);
+ for &h in &hashes {
+ fresh.insert_hash(h);
+ }
+
+ for j in 0..half {
+ assert_eq!(
+ folded.0[j].0, fresh.0[j].0,
+ "Step 4 failed: block {j} differs (N={num_blocks}→{half})"
+ );
+ }
+ }
+ }
+
+ /// show multi-step folding.
+ ///
+ /// You can apply the above inductively, folding k times from N blocks
prodcues a filter bit-identical to a fresh N/2^k filter
+ #[test]
+ fn test_multi_step_fold() {
+ let values = (0..3000).map(|i| format!("x_{i}")).collect::<Vec<_>>();
+
+ let mut filter = Sbbf::new_with_num_of_bytes(512 * 32);
+ for v in &values {
+ filter.insert(v.as_str());
+ }
+
+ for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
+ filter.fold_once();
+ assert_eq!(filter.num_blocks(), expected_blocks);
+
+ let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
+ for v in &values {
+ fresh.insert(v.as_str());
+ }
+ for (fb, rb) in filter.0.iter().zip(fresh.0.iter()) {
+ assert_eq!(fb.0, rb.0,);
+ }
+ }
+ }
+
+ /// test that the fpp estimator's overestimation doesn't cause
fold_to_target_fpp
+ /// to produce significantly oversized filters
+ ///
+ /// compare the final size after folding agains tthe theoretical optimal
size
Review Comment:
```suggestion
/// compare the final size after folding against the theoretical optimal
size
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]