alamb commented on code in PR #8160:
URL: https://github.com/apache/arrow-rs/pull/8160#discussion_r2283054774


##########
parquet/src/file/page_index/index.rs:
##########
@@ -306,6 +308,75 @@ impl<T: ParquetValueType> NativeIndex<T> {
             definition_level_histograms,
         )
     }
+
+    /// Creates a new [`NativeIndex`]
+    pub(crate) fn try_new_local(index: ColumnIndex) -> Result<Self, 
ParquetError> {
+        let len = index.min_values.len();
+
+        // turn Option<Vec<i64>> into Vec<Option<i64>>
+        let null_counts = index
+            .null_counts
+            .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
+            .unwrap_or_else(|| vec![None; len]);
+
+        // histograms are a 1D array encoding a 2D num_pages X num_levels 
matrix.
+        let to_page_histograms = |opt_hist: Option<Vec<i64>>| {
+            if let Some(hist) = opt_hist {
+                // TODO: should we assert (hist.len() % len) == 0?
+                let num_levels = hist.len() / len;
+                let mut res = Vec::with_capacity(len);
+                for i in 0..len {
+                    let page_idx = i * num_levels;
+                    let page_hist = hist[page_idx..page_idx + 
num_levels].to_vec();
+                    res.push(Some(LevelHistogram::from(page_hist)));
+                }
+                res
+            } else {
+                vec![None; len]
+            }
+        };
+
+        // turn Option<Vec<i64>> into Vec<Option<i64>>

Review Comment:
   😢 



##########
parquet/src/file/page_index/offset_index.rs:
##########
@@ -55,24 +59,26 @@ impl From<&PageLocation> for crate::format::PageLocation {
     }
 }
 
+thrift_struct!(
 /// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes 
for each page
 /// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY 
columns.
 ///
 /// [`OffsetIndex`]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
-#[derive(Debug, Clone, PartialEq)]
 pub struct OffsetIndexMetaData {
-    /// Vector of [`PageLocation`] objects, one per page in the chunk.
-    pub page_locations: Vec<PageLocation>,
-    /// Optional vector of unencoded page sizes, one per page in the chunk.
-    /// Only defined for BYTE_ARRAY columns.
-    pub unencoded_byte_array_data_bytes: Option<Vec<i64>>,
+  /// Vector of [`PageLocation`] objects, one per page in the chunk.
+  1: required list<PageLocation> page_locations

Review Comment:
   I will admit these macros are quite neat and are growing on me



##########
parquet/src/file/page_index/index_reader.rs:
##########
@@ -129,25 +130,37 @@ pub fn read_offset_indexes<R: ChunkReader>(
 }
 
 pub(crate) fn decode_offset_index(data: &[u8]) -> Result<OffsetIndexMetaData, 
ParquetError> {
-    let mut prot = TCompactSliceInputProtocol::new(data);
-    let offset = crate::format::OffsetIndex::read_from_in_protocol(&mut prot)?;
-    OffsetIndexMetaData::try_new(offset)
+    let mut prot = ThriftCompactInputProtocol::new(data);
+    OffsetIndexMetaData::try_from(&mut prot)
 }
 
-pub(crate) fn decode_column_index(data: &[u8], column_type: Type) -> 
Result<Index, ParquetError> {
-    let mut prot = TCompactSliceInputProtocol::new(data);
+thrift_struct!(
+pub(crate) struct ColumnIndex<'a> {
+  1: required list<bool> null_pages
+  2: required list<'a><binary> min_values

Review Comment:
   This is so sad -- the structure actually starts out column oriented and then 
we pivot it to rows (just to have to pivot it back to columns to use in 
DataFusion)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to