alamb commented on code in PR #8111: URL: https://github.com/apache/arrow-rs/pull/8111#discussion_r2274286608
########## parquet/src/file/metadata/reader.rs: ########## @@ -1102,10 +1218,388 @@ fn get_file_decryptor( } } +// temp structs used to construct RowGroupMetaData Review Comment: Is there some way to avoid reading into a temp structure and directly construct a RowGroupMetaData? ########## parquet/src/file/metadata/reader.rs: ########## @@ -1040,6 +1055,107 @@ impl ParquetMetaDataReader { Ok(ParquetMetaData::new(file_metadata, row_groups)) } + /// create meta data from thrift encoded bytes + pub fn decode_file_metadata(buf: &[u8]) -> Result<ParquetMetaData> { + let mut prot = ThriftCompactInputProtocol::new(buf); + + // components of the FileMetaData + let mut version: Option<i32> = None; + let mut schema_descr: Option<Arc<SchemaDescriptor>> = None; + let mut num_rows: Option<i64> = None; + let mut row_groups: Option<Vec<RowGroup>> = None; + let mut key_value_metadata: Option<Vec<KeyValue>> = None; + let mut created_by: Option<String> = None; + let mut column_orders: Option<Vec<ColumnOrder>> = None; + + // begin decoding to intermediates + prot.read_struct_begin()?; + loop { + let field_ident = prot.read_field_begin()?; + if field_ident.field_type == FieldType::Stop { + break; + } + let prot = &mut prot; + + match field_ident.id { + 1 => { Review Comment: Is there some way to use one of the magic `thrift_struct!` macro here rather than hard coding the field ids? It isn't clear to me why this code is different ########## parquet/src/basic.rs: ########## @@ -227,84 +227,33 @@ struct TimestampType { // they are identical use TimestampType as TimeType; -thrift_private_struct!( +thrift_struct!( struct IntType { 1: required i8 bit_width 2: required bool is_signed } ); -thrift_private_struct!( +thrift_struct!( struct VariantType { // The version of the variant specification that the variant was // written with. 1: optional i8 specification_version } ); -// TODO need macro for structs that need lifetime annotation +thrift_struct!( struct GeometryType<'a> { - crs: Option<&'a str>, -} - -impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for GeometryType<'a> { - type Error = ParquetError; - fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<Self> { - let mut crs: Option<&str> = None; - prot.read_struct_begin()?; - loop { - let field_ident = prot.read_field_begin()?; - if field_ident.field_type == FieldType::Stop { - break; - } - match field_ident.id { - 1 => { - let val = prot.read_string()?; - crs = Some(val); - } - _ => { - prot.skip(field_ident.field_type)?; - } - }; - } - Ok(Self { crs }) - } + 1: optional string<'a> crs; } +); +thrift_struct!( Review Comment: this is a pretty sweet way of generating structs BTW ########## parquet/src/file/metadata/mod.rs: ########## @@ -2203,9 +2247,9 @@ mod tests { .build(); #[cfg(not(feature = "encryption"))] - let base_expected_size = 2312; + let base_expected_size = 2280; Review Comment: do you know why this changes? It isn't clear to me why the metadata size gets smaller ########## parquet/src/file/metadata/reader.rs: ########## @@ -1040,6 +1055,107 @@ impl ParquetMetaDataReader { Ok(ParquetMetaData::new(file_metadata, row_groups)) } + /// create meta data from thrift encoded bytes + pub fn decode_file_metadata(buf: &[u8]) -> Result<ParquetMetaData> { + let mut prot = ThriftCompactInputProtocol::new(buf); + + // components of the FileMetaData Review Comment: I wonder if this would be better / easier to find if it were a method on ParquetMetaData ? ```rust impl ParquetMetaData { fn from_thrift_bytes(&[u8]) -> Result<Self> { ... } } ``` 🤔 ########## parquet/src/file/metadata/reader.rs: ########## @@ -1102,10 +1218,388 @@ fn get_file_decryptor( } } +// temp structs used to construct RowGroupMetaData +thrift_struct!( +struct RowGroup<'a> { + 1: required list<'a><ColumnChunk> columns + 2: required i64 total_byte_size + 3: required i64 num_rows + 4: optional list<SortingColumn> sorting_columns + 5: optional i64 file_offset + // we don't expose total_compressed_size so skip + //6: optional i64 total_compressed_size + 7: optional i16 ordinal +} +); + +#[cfg(feature = "encryption")] Review Comment: Also, I suggest moving the code that does the thrift decoding into a different module from the ParquetMetaDataReader (which is basically a state machine for reading parts of the file into local buffers and decoding it) Maybe parquet/src/file/metadata/from_thrift.rs 🤔 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org