etseidl commented on code in PR #8762:
URL: https://github.com/apache/arrow-rs/pull/8762#discussion_r2620481961
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -417,6 +427,54 @@ impl Sbbf {
     pub(crate) fn estimated_memory_size(&self) -> usize {
         self.0.capacity() * std::mem::size_of::<Block>()
     }
+
+    /// reads a Sbff from thrift encoded bytes
Review Comment:
```suggestion
/// Reads a Sbff from Thrift encoded bytes
```
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -283,7 +292,8 @@ impl Sbbf {
     /// Write the bloom filter data (header and then bitset) to the output. This doesn't
     /// flush the writer in order to boost performance of bulk writing all blocks. Caller
     /// must remember to flush the writer.
-    pub(crate) fn write<W: Write>(&self, mut writer: W) -> Result<(), ParquetError> {
+    /// This method usually is used in conjunction with from_bytes for serialization/deserialization.
Review Comment:
```suggestion
/// This method usually is used in conjunction with [`Self::from_bytes`] for serialization/deserialization.
```
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -215,9 +215,18 @@ pub(crate) fn chunk_read_bloom_filter_header_and_offset(
 #[inline]
 pub(crate) fn read_bloom_filter_header_and_length(
     buffer: Bytes,
+) -> Result<(BloomFilterHeader, u64), ParquetError> {
+    read_bloom_filter_header_and_length_from_bytes(buffer.as_ref())
+}
+
+/// given a byte slice, try to read out a bloom filter header and return both the header and
Review Comment:
```suggestion
/// Given a byte slice, try to read out a bloom filter header and return both the header and
```
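For context on the split above: callers that already hold a plain byte slice can call the new slice-based helper directly and skip the `Bytes` wrapper. A rough, crate-internal usage sketch (the `peek_header` name is hypothetical, and it assumes the `_from_bytes` variant keeps the same return type as the wrapper shown in the diff):
```rust
// Hypothetical helper: read a bloom filter header and its encoded length from a
// raw buffer without first wrapping it in `Bytes`.
fn peek_header(raw: &[u8]) -> Result<(BloomFilterHeader, u64), ParquetError> {
    read_bloom_filter_header_and_length_from_bytes(raw)
}
```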
##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -541,4 +599,49 @@ mod tests {
             assert_eq!(*num_bits, num_of_bits_from_ndv_fpp(*ndv, *fpp) as u64);
         }
     }
+
+    #[test]
+    fn test_sbbf_write_round_trip() {
+        // Create a bloom filter with a 32-byte bitset (minimum size)
+        let bitset_bytes = vec![0u8; 32];
+        let mut original = Sbbf::new(&bitset_bytes);
+
+        // Insert some test values
+        let test_values = ["hello", "world", "rust", "parquet", "bloom", "filter"];
+        for value in &test_values {
+            original.insert(value);
+        }
+
+        // Serialize to bytes
+        let mut output = Vec::new();
+        original.write(&mut output).unwrap();
+
+        // Validate header was written correctly
+        let mut protocol = ThriftSliceInputProtocol::new(&output);
+        let header = BloomFilterHeader::read_thrift(&mut protocol).unwrap();
+        assert_eq!(header.num_bytes, bitset_bytes.len() as i32);
+        assert_eq!(header.algorithm, BloomFilterAlgorithm::BLOCK);
+        assert_eq!(header.hash, BloomFilterHash::XXHASH);
+        assert_eq!(header.compression, BloomFilterCompression::UNCOMPRESSED);
+
+        // Deserialize using from_bytes
+        let reconstructed = Sbbf::from_bytes(&output).unwrap();
+
+        // Most importantly: verify the bloom filter WORKS correctly after round-trip
+        for value in &test_values {
+            assert!(
+                reconstructed.check(value),
+                "Value '{}' should be present after round-trip",
+                value
+            );
+        }
+
+        // Verify false negative check (values not inserted should not be found)
+        let missing_values = ["missing", "absent", "nothere"];
+        for value in &missing_values {
+            // Note: bloom filters can have false positives, but should never have false negatives
+            // So we can't assert !check(), but we should verify inserted values are found
+            let _ = reconstructed.check(value); // Just exercise the code path
Review Comment:
Since we can't verify the negative test, why not just move these comments up and skip this loop?
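
A rough sketch of the tail of the test after that change (illustrative only, not a ready-to-apply suggestion):
```rust
        // Deserialize using from_bytes
        let reconstructed = Sbbf::from_bytes(&output).unwrap();

        // Bloom filters can have false positives but never false negatives, so the
        // only reliable assertion after the round-trip is that every inserted value
        // is still reported as present; there is nothing meaningful to assert for
        // values that were never inserted.
        for value in &test_values {
            assert!(
                reconstructed.check(value),
                "Value '{}' should be present after round-trip",
                value
            );
        }
    }
```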