etseidl commented on code in PR #8190: URL: https://github.com/apache/arrow-rs/pull/8190#discussion_r2296076837
########## parquet/src/file/page_index/offset_index.rs: ########## @@ -37,7 +37,67 @@ pub struct PageLocation { /// (repetition_level = 0). 3: required i64 first_row_index } -); +);*/ + +// hand coding this one because it is very time critical + +/// Page location information for [`OffsetIndexMetaData`] +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PageLocation { + /// Offset of the page in the file + pub offset: i64, + /// Size of the page, including header. Sum of compressed_page_size and header + pub compressed_page_size: i32, + /// Index within the RowGroup of the first row of the page. When an + /// OffsetIndex is present, pages must begin on row boundaries + /// (repetition_level = 0). + pub first_row_index: i64, +} + +// Note: this will fail if the fields are either out of order, or if a suboptimal +// encoder doesn't use field deltas. If that ever occurs, remove this code and +// revert to the commented out thrift_struct!() implementation above. +impl<'a> TryFrom<&mut ThriftCompactInputProtocol<'a>> for PageLocation { + type Error = ParquetError; + fn try_from(prot: &mut ThriftCompactInputProtocol<'a>) -> Result<Self> { + // there are 3 fields, all mandatory, so all field deltas should be 1 + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I64 as u8 { + return Err(general_err!("error reading PageLocation::offset")); + } + let offset = prot.read_i64()?; + + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I32 as u8 { + return Err(general_err!( + "error reading PageLocation::compressed_page_size" + )); + } + let compressed_page_size = prot.read_i32()?; + + let (field_type, delta) = prot.read_field_header()?; + if delta != 1 || field_type != FieldType::I64 as u8 { + return Err(general_err!("error reading PageLocation::first_row_index")); + } + let first_row_index = prot.read_i64()?; + + // This loop slows things down a bit, but it's an acceptible price to allow + // forwards compatibility. We could instead assert the next field is Stop. Review Comment: This pr definitely needs more benchmarking to justify it. On a xeon the speed up isn't as dramatic, so i might shelve this one for now. I do like the idea of keeping both paths, especially with the push decoder on the horizon. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org