ODukhno commented on code in PR #8762:
URL: https://github.com/apache/arrow-rs/pull/8762#discussion_r2621241936
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -541,4 +599,49 @@ mod tests {
assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
}
}
+
+ #[test]
+ fn test_sbbf_write_round_trip() {
+ // Create a bloom filter with a 32-byte bitset (minimum size)
+ let bitset_bytes = vec![0u8; 32];
+ let mut original = Sbbf::new(&bitset_bytes);
+
+ // Insert some test values
+ let test_values = ["hello", "world", "rust", "parquet", "bloom",
"filter"];
+ for value in &test_values {
+ original.insert(value);
+ }
+
+ // Serialize to bytes
+ let mut output = Vec::new();
+ original.write(&mut output).unwrap();
+
+ // Validate header was written correctly
+ let mut protocol = ThriftSliceInputProtocol::new(&output);
+ let header = BloomFilterHeader::read_thrift(&mut protocol).unwrap();
+ assert_eq!(header.num_bytes, bitset_bytes.len() as i32);
+ assert_eq!(header.algorithm, BloomFilterAlgorithm::BLOCK);
+ assert_eq!(header.hash, BloomFilterHash::XXHASH);
+ assert_eq!(header.compression, BloomFilterCompression::UNCOMPRESSED);
+
+ // Deserialize using from_bytes
+ let reconstructed = Sbbf::from_bytes(&output).unwrap();
+
+ // Most importantly: verify the bloom filter WORKS correctly after
round-trip
+ for value in &test_values {
+ assert!(
+ reconstructed.check(value),
+ "Value '{}' should be present after round-trip",
+ value
+ );
+ }
+
+ // Verify false negative check (values not inserted should not be
found)
+ let missing_values = ["missing", "absent", "nothere"];
+ for value in &missing_values {
+ // Note: bloom filters can have false positives, but should never
have false negatives
+ // So we can't assert !check(), but we should verify inserted
values are found
+ let _ = reconstructed.check(value); // Just exercise the code path
Review Comment:
I just pushed another change that applies this suggestion along with all the
other suggestions above.
Thanks!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]