viirya commented on code in PR #9628:
URL: https://github.com/apache/arrow-rs/pull/9628#discussion_r3012777595


##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -641,4 +897,197 @@ mod tests {
             );
         }
     }
+
+    /*
+        Ok, so the following is trying to prove in simple terms that folding 
an SBBF and
+        building a fresh smaller SBBF from scratch produces the exact same bits
+
+        If you insert the same values into a 512-block filter and fold it to 
256 blocks,
+        you get a bit-for-bit identical result to just inserting those values 
into a
+        256-block filter directly. The fold doesn't lose information or 
scramble anything,
+        it's like you had known the right size all along
+
+        This works because of the 2 lemmas:
+        1. when you halve the filter, each hash's block index divides cleanly 
by 2
+        so the hash that went to block `i` in the big filter goes to block 
`i/2` in the small one
+        which is exactly where the fold puts it
+        > this is trivial since floor(x/2) == floor(floor(x) / 2) is a basic 
math fact
+
+        2. the bit pattern set _within_ a block depends only on the lower 32 
bits of the hash,
+        which doesn't change with filter size. So the same bits get set 
regardless!
+        > structurally trivial, mask() takes a u32 and uses only the SALT 
constants.
+
+
+        When you combine it together, every hash sets the same bits in the 
same destination block
+        whether you fold or build fresh. Therefore the filters are 
bit-identical
+    */
+    #[test]
+    fn test_sbbf_folded_equals_fresh() {
+        let values = (0..5000).map(|i| 
format!("elem_{i}")).collect::<Vec<_>>();
+        let hashes = values
+            .iter()
+            .map(|v| hash_as_bytes(v.as_str()))
+            .collect::<Vec<_>>();
+
+        for num_blocks in [64, 256, 1024] {
+            let half = num_blocks / 2;
+
+            // original filter
+            let mut original = Sbbf::new_with_num_of_bytes(num_blocks * 32);
+            assert_eq!(original.num_blocks(), num_blocks);
+            for &h in &hashes {
+                original.insert_hash(h);
+            }
+
+            for &h in hashes.iter() {
+                let mask = Block::mask(h as u32);
+
+                // step 1: element's block in original
+                let orig_idx = original.hash_to_block_index(h);
+                assert!(orig_idx < num_blocks);
+
+                // step 2 (lemma 1): destination in N/2 filter
+                let fresh_idx = {
+                    let tmp = Sbbf(vec![Block::ZERO; half]);
+                    tmp.hash_to_block_index(h)
+                };
+
+                let folded_idx = orig_idx / 2;
+                assert_eq!(fresh_idx, folded_idx,);
+
+                // step 3 (lemma 2): mask is the same
+                for w in 0..8 {
+                    assert_ne!(original.0[orig_idx].0[w] & mask.0[w], 0,);
+                }
+            }
+
+            // verify the actual blocks match
+            let mut folded = original.clone();
+            folded.fold_once();
+            assert_eq!(folded.num_blocks(), half);
+
+            let mut fresh = Sbbf::new_with_num_of_bytes(half * 32);
+            for &h in &hashes {
+                fresh.insert_hash(h);
+            }
+
+            for j in 0..half {
+                assert_eq!(
+                    folded.0[j].0, fresh.0[j].0,
+                    "Step 4 failed: block {j} differs (N={num_blocks}→{half})"
+                );
+            }
+        }
+    }
+
+    /// show multi-step folding.
+    ///
+    /// You can apply the above inductively, folding k times from N blocks 
produces a filter bit-identical to a fresh N/2^k filter
+    #[test]
+    fn test_multi_step_fold() {
+        let values = (0..3000).map(|i| format!("x_{i}")).collect::<Vec<_>>();
+
+        let mut filter = Sbbf::new_with_num_of_bytes(512 * 32);
+        for v in &values {
+            filter.insert(v.as_str());
+        }
+
+        for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
+            filter.fold_once();
+            assert_eq!(filter.num_blocks(), expected_blocks);
+
+            let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
+            for v in &values {
+                fresh.insert(v.as_str());
+            }
+            for (fb, rb) in filter.0.iter().zip(fresh.0.iter()) {
+                assert_eq!(fb.0, rb.0,);
+            }
+        }
+    }
+
+    /// test that the fpp estimator's overestimation doesn't cause 
fold_to_target_fpp
+    /// to produce significantly oversized filters
+    ///
+    /// compare the final size after folding agains tthe theoretical optimal 
size

Review Comment:
   ```suggestion
       /// compare the final size after folding against the theoretical optimal 
size
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to