Jimexist commented on code in PR #3102:
URL: https://github.com/apache/arrow-rs/pull/3102#discussion_r1020917157


##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -79,6 +81,40 @@ fn block_check(block: &Block, hash: u32) -> bool {
 /// A split block Bloom filter
 pub struct Sbbf(Vec<Block>);
 
+// this size should not be too large, so as not to hit a short read too early (although
+// unlikely), but also not too small, to ensure cache efficiency; this is essentially a
+// "guess" of the header size
+const STEP_SIZE: usize = 32;
+
+/// Reads the bloom filter header that begins at `offset` in `reader`.
+///
+/// The serialized length of the header is not known in advance, so bytes are fetched in
+/// batches of `STEP_SIZE` and appended to a buffer until the header can be parsed from the
+/// start of that buffer.
+///
+/// Returns the parsed header together with the absolute offset of the first byte after it
+/// (i.e. where the bitset begins).
+///
+/// # Errors
+///
+/// Propagates any error returned by `reader.get_bytes`.
+fn chunk_read_bloom_filter_header_and_offset<R: ChunkReader>(
+    offset: usize,
+    reader: Arc<R>,
+) -> Result<(BloomFilterHeader, usize), ParquetError> {
+    // because we do not know in advance what the TCompactInputProtocol will read, we have to
+    // loop read until we can parse the header. Allocate at least 128 bytes to start with
+    let mut buffer = BytesMut::with_capacity(128);
+    // absolute position of the next byte to fetch; always `offset + buffer.len()`
+    let mut start = offset;
+    loop {
+        // fetch the *next* batch: reading from the fixed `offset` here would append the same
+        // bytes over and over and a header longer than one batch could never be parsed
+        let batch = reader.get_bytes(start as u64, STEP_SIZE)?;
+        start += batch.len();
+        buffer.put(batch);
+        // need to clone as we read from the very beginning of the buffer each time
+        let mut buf_reader = buffer.clone().reader();
+        // try to deserialize header
+        let mut prot = TCompactInputProtocol::new(&mut buf_reader);
+        if let Ok(h) = BloomFilterHeader::read_from_in_protocol(&mut prot) {
+            // whatever the parser left unread already belongs to the bitset
+            let unread = buf_reader.into_inner().remaining();
+            return Ok((h, start - unread));
+        }
+        // NOTE(review): any parse failure is treated as "need more data", so for bytes that
+        // never form a valid header the loop only ends once `get_bytes` returns an error —
+        // confirm that every `ChunkReader` errors on reads past the end.
+    }
+}

Review Comment:
   @alamb et al. - sanity check before I move on to implement more API and test cases



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to