etseidl commented on code in PR #8574:
URL: https://github.com/apache/arrow-rs/pull/8574#discussion_r2420255840
##########
parquet/src/file/metadata/thrift_gen.rs:
##########
@@ -837,62 +697,575 @@ fn get_file_decryptor(
}
}
-/// Create ParquetMetaData from thrift input. Note that this only decodes the file metadata in
-/// the Parquet footer. Page indexes will need to be added later.
-impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for ParquetMetaData {
- fn read_thrift(prot: &mut R) -> Result<Self> {
- let file_meta = FileMetaData::read_thrift(prot)?;
-
- let version = file_meta.version;
- let num_rows = file_meta.num_rows;
- let row_groups = file_meta.row_groups;
- let created_by = file_meta.created_by.map(|c| c.to_owned());
- let key_value_metadata = file_meta.key_value_metadata;
-
- let val = parquet_schema_from_array(file_meta.schema)?;
- let schema_descr = Arc::new(SchemaDescriptor::new(val));
-
- // need schema_descr to get final RowGroupMetaData
- let row_groups = convert_row_groups(row_groups, schema_descr.clone())?;
-
- // need to map read column orders to actual values based on the schema
- if file_meta
- .column_orders
- .as_ref()
- .is_some_and(|cos| cos.len() != schema_descr.num_columns())
- {
- return Err(general_err!("Column order length mismatch"));
+// using ThriftSliceInputProtocol rather than the ThriftCompactInputProtocol trait because
+// these are all internal and operate on slices.
+fn read_column_chunk<'a>(
+ prot: &mut ThriftSliceInputProtocol<'a>,
+ column_descr: &Arc<ColumnDescriptor>,
+) -> Result<ColumnChunkMetaData> {
+ // ColumnChunk fields
+ let mut file_path: Option<&str> = None;
+ let mut file_offset: Option<i64> = None;
+ let mut offset_index_offset: Option<i64> = None;
+ let mut offset_index_length: Option<i32> = None;
+ let mut column_index_offset: Option<i64> = None;
+ let mut column_index_length: Option<i32> = None;
+ #[cfg(feature = "encryption")]
+ let mut column_crypto_metadata: Option<Box<ColumnCryptoMetaData>> = None;
+ #[cfg(feature = "encryption")]
+ let mut encrypted_column_metadata: Option<&[u8]> = None;
+
+ // ColumnMetaData
+ let mut encodings: Option<Vec<Encoding>> = None;
+ let mut codec: Option<Compression> = None;
+ let mut num_values: Option<i64> = None;
+ let mut total_uncompressed_size: Option<i64> = None;
+ let mut total_compressed_size: Option<i64> = None;
+ let mut data_page_offset: Option<i64> = None;
+ let mut index_page_offset: Option<i64> = None;
+ let mut dictionary_page_offset: Option<i64> = None;
+ let mut statistics: Option<Statistics> = None;
+ let mut encoding_stats: Option<Vec<PageEncodingStats>> = None;
+ let mut bloom_filter_offset: Option<i64> = None;
+ let mut bloom_filter_length: Option<i32> = None;
+ let mut size_statistics: Option<SizeStatistics> = None;
+ let mut geospatial_statistics: Option<GeospatialStatistics> = None;
+
+ // struct ColumnChunk {
+ // 1: optional string file_path
+ // 2: required i64 file_offset = 0
+ // 3: optional ColumnMetaData meta_data
+ // 4: optional i64 offset_index_offset
+ // 5: optional i32 offset_index_length
+ // 6: optional i64 column_index_offset
+ // 7: optional i32 column_index_length
+ // 8: optional ColumnCryptoMetaData crypto_metadata
+ // 9: optional binary encrypted_column_metadata
+ // }
+ let mut last_field_id = 0i16;
+ loop {
+ let field_ident = prot.read_field_begin(last_field_id)?;
+ if field_ident.field_type == FieldType::Stop {
+ break;
}
-
- let column_orders = file_meta.column_orders.map(|cos| {
- let mut res = Vec::with_capacity(cos.len());
- for (i, column) in schema_descr.columns().iter().enumerate() {
- match cos[i] {
- ColumnOrder::TYPE_DEFINED_ORDER(_) => {
- let sort_order = ColumnOrder::get_sort_order(
- column.logical_type(),
- column.converted_type(),
- column.physical_type(),
- );
- res.push(ColumnOrder::TYPE_DEFINED_ORDER(sort_order));
+ match field_ident.id {
+ 1 => {
+ file_path = Some(<&str>::read_thrift(&mut *prot)?);
+ }
+ 2 => {
+ file_offset = Some(i64::read_thrift(&mut *prot)?);
+ }
+ 3 => {
+ // `ColumnMetaData`. Read inline for performance sake.
Review Comment:
It was hard to tell from profiling exactly what was adding to the overhead, but splitting this into a separate call that decodes and returns a `ColumnMetaData` object, which is then moved into the `ColumnChunkMetaData`, added about 5% to execution time IIRC. As mentioned above, my latest code passes the `ColumnChunkMetaData` in, so no temporary object is created. That way I get some code reuse and this becomes easier to read 😄
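
For illustration, here is a minimal, self-contained sketch of the two decoding shapes being compared. It does not use the real parquet-rs types or the actual PR code; the names (`ColumnMetaDataStub`, `ColumnChunkStub`, `decode_column_metadata_owned`, `decode_column_metadata_into`) are hypothetical stand-ins, and only the structural difference (returning a temporary vs. filling the destination in place) reflects the discussion above.

```rust
/// Hypothetical stand-in for the decoded per-column metadata.
#[derive(Default)]
struct ColumnMetaDataStub {
    num_values: i64,
    total_compressed_size: i64,
}

/// Hypothetical stand-in for the chunk-level metadata being built.
#[derive(Default)]
struct ColumnChunkStub {
    file_offset: i64,
    num_values: i64,
    total_compressed_size: i64,
}

/// Shape 1: decode into a temporary object, then move its fields across.
/// Per the comment above, this extra construction/move cost roughly 5%.
fn decode_column_metadata_owned(encoded: &[i64]) -> ColumnMetaDataStub {
    ColumnMetaDataStub {
        num_values: encoded[0],
        total_compressed_size: encoded[1],
    }
}

/// Shape 2: pass the destination in and fill it directly, avoiding the temporary.
fn decode_column_metadata_into(encoded: &[i64], chunk: &mut ColumnChunkStub) {
    chunk.num_values = encoded[0];
    chunk.total_compressed_size = encoded[1];
}

fn main() {
    let encoded = [100_i64, 4096];

    // Temporary-object path: decode, then move fields into the chunk.
    let tmp = decode_column_metadata_owned(&encoded);
    let chunk_a = ColumnChunkStub {
        file_offset: 0,
        num_values: tmp.num_values,
        total_compressed_size: tmp.total_compressed_size,
    };

    // Fill-in-place path: no intermediate object is created.
    let mut chunk_b = ColumnChunkStub::default();
    decode_column_metadata_into(&encoded, &mut chunk_b);

    assert_eq!(chunk_a.num_values, chunk_b.num_values);
    assert_eq!(chunk_a.total_compressed_size, chunk_b.total_compressed_size);
}
```

Both shapes decode the same fields; the second simply hands the caller-owned destination to the decoder, which is the pattern the latest code adopts according to the comment.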