alamb commented on code in PR #8160: URL: https://github.com/apache/arrow-rs/pull/8160#discussion_r2283054774
########## parquet/src/file/page_index/index.rs: ########## @@ -306,6 +308,75 @@ impl<T: ParquetValueType> NativeIndex<T> { definition_level_histograms, ) } + + /// Creates a new [`NativeIndex`] + pub(crate) fn try_new_local(index: ColumnIndex) -> Result<Self, ParquetError> { + let len = index.min_values.len(); + + // turn Option<Vec<i64>> into Vec<Option<i64>> + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::<Vec<_>>()) + .unwrap_or_else(|| vec![None; len]); + + // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. + let to_page_histograms = |opt_hist: Option<Vec<i64>>| { + if let Some(hist) = opt_hist { + // TODO: should we assert (hist.len() % len) == 0? + let num_levels = hist.len() / len; + let mut res = Vec::with_capacity(len); + for i in 0..len { + let page_idx = i * num_levels; + let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); + res.push(Some(LevelHistogram::from(page_hist))); + } + res + } else { + vec![None; len] + } + }; + + // turn Option<Vec<i64>> into Vec<Option<i64>> Review Comment: 😢 ########## parquet/src/file/page_index/offset_index.rs: ########## @@ -55,24 +59,26 @@ impl From<&PageLocation> for crate::format::PageLocation { } } +thrift_struct!( /// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page /// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns. /// /// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -#[derive(Debug, Clone, PartialEq)] pub struct OffsetIndexMetaData { - /// Vector of [`PageLocation`] objects, one per page in the chunk. - pub page_locations: Vec<PageLocation>, - /// Optional vector of unencoded page sizes, one per page in the chunk. - /// Only defined for BYTE_ARRAY columns. - pub unencoded_byte_array_data_bytes: Option<Vec<i64>>, + /// Vector of [`PageLocation`] objects, one per page in the chunk. 
+ 1: required list<PageLocation> page_locations Review Comment: I will admit these macros are quite neat and are growing on me ########## parquet/src/file/page_index/index_reader.rs: ########## @@ -129,25 +130,37 @@ pub fn read_offset_indexes<R: ChunkReader>( } pub(crate) fn decode_offset_index(data: &[u8]) -> Result<OffsetIndexMetaData, ParquetError> { - let mut prot = TCompactSliceInputProtocol::new(data); - let offset = crate::format::OffsetIndex::read_from_in_protocol(&mut prot)?; - OffsetIndexMetaData::try_new(offset) + let mut prot = ThriftCompactInputProtocol::new(data); + OffsetIndexMetaData::try_from(&mut prot) } -pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> Result<Index, ParquetError> { - let mut prot = TCompactSliceInputProtocol::new(data); +thrift_struct!( +pub(crate) struct ColumnIndex<'a> { + 1: required list<bool> null_pages + 2: required list<'a><binary> min_values Review Comment: This is so sad -- the structure actually starts out column oriented and then we pivot it to rows (just to have to pivot it back to columns to use in DataFusion) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org