alamb commented on code in PR #8714:
URL: https://github.com/apache/arrow-rs/pull/8714#discussion_r2466874803


##########
parquet/src/file/metadata/thrift/mod.rs:
##########
@@ -1306,6 +1436,32 @@ impl<'a> WriteThrift for FileMeta<'a> {
                 .write_thrift_field(writer, 9, last_field_id)?;
         }
 
+        let mut index = Vec::<u8>::new();
+        {
+            let mut w = ThriftCompactOutputProtocol::new(&mut index);
+            if let Some(meta_index) = writer.take_index() {
+                let idx = MetaIndex {
+                    schema_offset: meta_index.schema_range.start,
+                    schema_length: meta_index.schema_range.end - 
meta_index.schema_range.start,
+                    row_group_offsets: meta_index.row_group_offsets,
+                    column_offsets: meta_index.col_chunk_offsets,
+                    column_meta_lengths: meta_index.col_meta_lengths,
+                };
+                idx.write_thrift(&mut w)?;
+            }
+        }
+
+        if !index.is_empty() {
+            // write footer for index
+            // TODO(ets): this should use UUID rather than simple string, but this works for prototype
+            let idx_len = index.len() as u64;
+            index.extend_from_slice(idx_len.as_bytes());
+            index.extend_from_slice("PARI".as_bytes());

Review Comment:
   PARI -- love it



##########
parquet/src/file/metadata/thrift/mod.rs:
##########
@@ -669,9 +696,46 @@ fn read_row_group(
     Ok(row_group)
 }
 
+/// Extract the metadata index from the footer bytes. `buf` should contain the entire footer.
+pub(crate) fn get_metadata_index(buf: &[u8]) -> Result<Option<MetaIndex>> {
+    // TODO(ets): need constants to get rid of magic numbers
+    if buf.len() < 13 {
+        return Ok(None);
+    }
+    // check the last 4 bytes to see if we have the full footer or not
+    let magic = &buf[buf.len() - 4..];
+    let buf = if magic == "PAR1".as_bytes() {
+        &buf[0..buf.len() - 8]
+    } else {
+        buf
+    };
+
+    // check for PARI followed by 0.
+    if buf[buf.len() - 1] != 0 {
+        return Ok(None);
+    }
+    let magic = &buf[buf.len() - 5..buf.len() - 1];
+    if magic != "PARI".as_bytes() {

Review Comment:
   Do I read this right that the format would look like:
   
   ```
   (.. data pages ..)
   (.. metadata ..)
   (.. current footer - PAR1 w/ len)
   (.. MetaIndex ..)
   (.. new footer - PARI w/ len)
   ```
   
   It is clever, but it would not be backwards compatible with existing readers.
   Still, it is a nice way to get a sense of how much faster Thrift parsing
   could be without having to implement an entirely new parsing system.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to