etseidl commented on code in PR #8191:
URL: https://github.com/apache/arrow-rs/pull/8191#discussion_r2289285508
##########
parquet/src/file/page_index/index_reader.rs:
##########
@@ -146,22 +149,232 @@ pub(crate) struct ColumnIndex<'a> {
 }
 );
 
+/// column index
+pub struct NativeColumnIndex<T: ParquetValueType> {
+    phantom_data: PhantomData<T>,
+    null_pages: Vec<bool>,
+    boundary_order: BoundaryOrder,
+    null_counts: Option<Vec<i64>>,
+    repetition_level_histograms: Option<Vec<i64>>,
+    definition_level_histograms: Option<Vec<i64>>,
+    // raw bytes for min and max values
+    min_bytes: Vec<u8>,
+    min_offsets: Vec<usize>, // offsets are really only needed for BYTE_ARRAY
+    max_bytes: Vec<u8>,
+    max_offsets: Vec<usize>,
+}
+
+impl<T: ParquetValueType> NativeColumnIndex<T> {
+    fn try_new(index: ColumnIndex) -> Result<Self> {
+        let len = index.null_pages.len();
+
+        let min_len = index.min_values.iter().map(|&v| v.len()).sum();
+        let max_len = index.max_values.iter().map(|&v| v.len()).sum();
+        let mut min_bytes = vec![0u8; min_len];
+        let mut max_bytes = vec![0u8; max_len];
+
+        let mut min_offsets = vec![0usize; len + 1];
+        let mut max_offsets = vec![0usize; len + 1];
+
+        let mut min_pos = 0;
+        let mut max_pos = 0;
+
+        for (i, is_null) in index.null_pages.iter().enumerate().take(len) {
+            if !is_null {
+                let min = index.min_values[i];
+                let dst = &mut min_bytes[min_pos..min_pos + min.len()];
+                dst.copy_from_slice(min);
+                min_offsets[i] = min_pos;
+                min_pos += min.len();
+
+                let max = index.max_values[i];
+                let dst = &mut max_bytes[max_pos..max_pos + max.len()];
+                dst.copy_from_slice(max);
+                max_offsets[i] = max_pos;
+                max_pos += max.len();
+            } else {
+                min_offsets[i] = min_pos;
+                max_offsets[i] = max_pos;
+            }
+        }
+
+        min_offsets[len] = min_pos;
+        max_offsets[len] = max_pos;
+
+        Ok(Self {
+            phantom_data: PhantomData,
+            null_pages: index.null_pages,
+            boundary_order: index.boundary_order,
+            null_counts: index.null_counts,
+            repetition_level_histograms: index.repetition_level_histograms,
+            definition_level_histograms: index.definition_level_histograms,
+            min_bytes,
+            min_offsets,
+            max_bytes,
+            max_offsets,
+        })
+    }
+
+    /// Returns the number of pages
+    pub fn num_pages(&self) -> u64 {
+        self.null_pages.len() as u64
+    }
+
+    /// Returns the number of null values in the page indexed by `idx`
+    pub fn null_count(&self, idx: usize) -> Option<i64> {
+        self.null_counts.as_ref().map(|nc| nc[idx])
+    }
+
+    /// Returns the repetition level histogram for the page indexed by `idx`
+    pub fn repetition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+        if let Some(rep_hists) = self.repetition_level_histograms.as_ref() {
+            let num_lvls = rep_hists.len() / self.num_pages() as usize;
+            let start = num_lvls * idx;
+            Some(&rep_hists[start..start + num_lvls])
+        } else {
+            None
+        }
+    }
+
+    /// Returns the definition level histogram for the page indexed by `idx`
+    pub fn definition_level_histogram(&self, idx: usize) -> Option<&[i64]> {
+        if let Some(def_hists) = self.definition_level_histograms.as_ref() {
+            let num_lvls = def_hists.len() / self.num_pages() as usize;
+            let start = num_lvls * idx;
+            Some(&def_hists[start..start + num_lvls])
+        } else {
+            None
+        }
+    }
+
+    /// Returns whether this is an all null page
+    pub fn is_null_page(&self, idx: usize) -> bool {
+        self.null_pages[idx]
+    }
+
+    /// Returns the minimum value in the page indexed by `idx` as raw bytes
+    ///
+    /// It is `None` when all values are null
+    pub fn min_value_bytes(&self, idx: usize) -> Option<&[u8]> {
+        if self.null_pages[idx] {
+            None
+        } else {
+            let start = self.min_offsets[idx];
+            let end = self.min_offsets[idx + 1];
+            Some(&self.min_bytes[start..end])
+        }
+    }
+
+    /// Returns the maximum value in the page indexed by `idx` as raw bytes
+    ///
+    /// It is `None` when all values are null
+    pub fn max_value_bytes(&self, idx: usize) -> Option<&[u8]> {
+        if self.null_pages[idx] {
+            None
+        } else {
+            let start = self.max_offsets[idx];
+            let end = self.max_offsets[idx + 1];
+            Some(&self.max_bytes[start..end])
+        }
+    }
+}
+
+macro_rules! min_max_values {
+    ($ty: ty) => {
+        impl NativeColumnIndex<$ty> {
+            /// Returns the minimum value in the page indexed by `idx`
+            ///
+            /// It is `None` when all values are null
+            pub fn min_value(&self, idx: usize) -> Option<$ty> {
+                <$ty>::try_from_le_slice(self.min_value_bytes(idx)?).ok()
+            }
+
+            /// Returns the maximum value in the page indexed by `idx`
+            ///
+            /// It is `None` when all values are null
+            pub fn max_value(&self, idx: usize) -> Option<$ty> {
+                <$ty>::try_from_le_slice(self.max_value_bytes(idx)?).ok()
+            }
+        }
+    };
+}
+
+min_max_values!(bool);
+min_max_values!(i32);
+min_max_values!(i64);
+min_max_values!(f32);
+min_max_values!(f64);
+min_max_values!(Int96);
+
+/// index
+#[allow(non_camel_case_types)]
+pub enum ColumnIndexMetaData {

Review Comment:
   I followed what was done with the `Index` enum, ~but I'll admit I don't love this. We could instead drop the type bounds on `NativeColumnIndex` and rename that `ColumnIndexMetaData` (this name was picked to mirror `OffsetIndexMetaData`). Then we could add functions to convert the min/max values to the proper types.~ Refactored since I made this comment. Translating the stats for primitive types speeds things up even further.
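For readers skimming the diff: the storage `try_new` builds is a single flattened byte buffer plus a `num_pages + 1` offset array, so page `idx` owns `bytes[offsets[idx]..offsets[idx + 1]]` and an all-null page just repeats the previous offset. Below is a minimal, self-contained sketch of that layout; the `PackedValues` type and the sample data are illustrative only, not part of this PR.

```rust
/// Illustrative flattened-buffer layout: one contiguous byte buffer plus
/// `num_pages + 1` offsets, mirroring `min_bytes`/`min_offsets` above.
struct PackedValues {
    bytes: Vec<u8>,
    offsets: Vec<usize>, // offsets.len() == num_pages + 1
}

impl PackedValues {
    fn new(pages: &[Option<&[u8]>]) -> Self {
        let mut bytes = Vec::new();
        let mut offsets = Vec::with_capacity(pages.len() + 1);
        offsets.push(0);
        for page in pages {
            if let Some(v) = page {
                bytes.extend_from_slice(v); // a null page contributes no bytes
            }
            offsets.push(bytes.len());
        }
        Self { bytes, offsets }
    }

    /// Analogue of `min_value_bytes`/`max_value_bytes`: slice out page `idx`.
    fn value_bytes(&self, idx: usize) -> &[u8] {
        &self.bytes[self.offsets[idx]..self.offsets[idx + 1]]
    }
}

fn main() {
    // Three pages of i32 stats: values 1 and 7 (little-endian), middle page all null.
    let pages = [
        Some(1i32.to_le_bytes().to_vec()),
        None,
        Some(7i32.to_le_bytes().to_vec()),
    ];
    let refs: Vec<Option<&[u8]>> = pages.iter().map(|p| p.as_deref()).collect();
    let packed = PackedValues::new(&refs);

    assert_eq!(packed.value_bytes(0), &1i32.to_le_bytes()[..]);
    assert!(packed.value_bytes(1).is_empty()); // null page: empty slice
    let raw: [u8; 4] = packed.value_bytes(2).try_into().unwrap();
    assert_eq!(i32::from_le_bytes(raw), 7);
}
```

Because the offsets array has `len + 1` entries, no per-page length bookkeeping is needed and null pages cost nothing beyond a repeated offset.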