This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9be0eb530d Minor: Improve parquet PageIndex documentation (#6042)
9be0eb530d is described below

commit 9be0eb530de85a68358dc31c146f71ccb97739ca
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jul 17 16:07:47 2024 -0400

    Minor: Improve parquet PageIndex documentation (#6042)
    
    * Minor: Improve parquet PageIndex documentation
    
    * More improvements
    
    * Add reasons for data page being without null
    
    * Apply suggestions from code review
    
    Co-authored-by: Val Lorentz <[email protected]>
    
    * Update parquet/src/file/page_index/index.rs
    
    ---------
    
    Co-authored-by: Val Lorentz <[email protected]>
---
 parquet/src/file/metadata/mod.rs     | 37 +++++++++++++++++++++++++++++++-----
 parquet/src/file/page_index/index.rs | 34 ++++++++++++++++++++-------------
 parquet/src/file/statistics.rs       | 16 ++++++++++++++--
 3 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 46ebf5d0f2..3722f2cbe4 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -54,7 +54,12 @@ use crate::schema::types::{
     Type as SchemaType,
 };
 
-/// [`Index`] for each row group of each column.
+/// Page level statistics for each column chunk of each row group.
+///
+/// This structure is an in-memory representation of multiple [`ColumnIndex`]
+/// structures in a parquet file footer, as described in the Parquet [PageIndex
+/// documentation]. Each [`Index`] holds statistics about all the pages in a
+/// particular column chunk.
 ///
 /// `column_index[row_group_number][column_number]` holds the
 /// [`Index`] corresponding to column `column_number` of row group
@@ -62,9 +67,14 @@ use crate::schema::types::{
 ///
 /// For example `column_index[2][3]` holds the [`Index`] for the forth
 /// column in the third row group of the parquet file.
+///
+/// [PageIndex documentation]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub type ParquetColumnIndex = Vec<Vec<Index>>;
 
-/// [`PageLocation`] for each data page of each row group of each column.
+/// [`PageLocation`] for each data page of each row group of each column
+///
+/// This structure is the parsed representation of the [`OffsetIndex`] from the
+/// Parquet file footer, as described in the Parquet [PageIndex documentation].
 ///
 /// `offset_index[row_group_number][column_number][page_number]` holds
 /// the [`PageLocation`] corresponding to page `page_number` of column
@@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
 /// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
 /// the fifth page of the forth column in the third row group of the
 /// parquet file.
+///
+/// [PageIndex documentation]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
 
 /// Parsed metadata for a single Parquet file
@@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder {
     }
 }
 
-/// Builder for column index
+/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
+///
+/// [PageIndex]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub struct ColumnIndexBuilder {
     null_pages: Vec<bool>,
     min_values: Vec<Vec<u8>>,
     max_values: Vec<Vec<u8>>,
     null_counts: Vec<i64>,
     boundary_order: BoundaryOrder,
-    // If one page can't get build index, need to ignore all index in this 
column
+    /// Is the information in the builder valid?
+    ///
+    /// Set to `false` if any entry in the page doesn't have statistics for
+    /// some reason, so statistics for that page won't be written to the file.
+    /// This might happen if the page is entirely null, or
+    /// is a floating point column without any non-nan values
+    /// e.g. <https://github.com/apache/parquet-format/pull/196>
     valid: bool,
 }
 
@@ -975,6 +995,7 @@ impl ColumnIndexBuilder {
         }
     }
 
+    /// Append statistics for the next page
     pub fn append(
         &mut self,
         null_page: bool,
@@ -992,15 +1013,19 @@ impl ColumnIndexBuilder {
         self.boundary_order = boundary_order;
     }
 
+    /// Mark this column index as invalid
     pub fn to_invalid(&mut self) {
         self.valid = false;
     }
 
+    /// Is the information in the builder valid?
     pub fn valid(&self) -> bool {
         self.valid
     }
 
     /// Build and get the thrift metadata of column index
+    ///
+    /// Note: callers should check [`Self::valid`] before calling this method
     pub fn build_to_thrift(self) -> ColumnIndex {
         ColumnIndex::new(
             self.null_pages,
@@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder {
     }
 }
 
-/// Builder for offset index
+/// Builder for offset index, part of the Parquet [PageIndex].
+///
+/// [PageIndex]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub struct OffsetIndexBuilder {
     offset_array: Vec<i64>,
     compressed_page_size_array: Vec<i32>,
diff --git a/parquet/src/file/page_index/index.rs 
b/parquet/src/file/page_index/index.rs
index 71fb47afa9..7eba949042 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -25,14 +25,9 @@ use crate::format::{BoundaryOrder, ColumnIndex};
 use crate::util::bit_util::from_le_slice;
 use std::fmt::Debug;
 
-/// PageIndex Statistics for one data page, as described in [Column Index].
+/// Typed statistics for one data page
 ///
-/// One significant difference from the row group level
-/// [`Statistics`](crate::format::Statistics) is that page level
-/// statistics may not store actual column values as min and max
-/// (e.g. they may store truncated strings to save space)
-///
-/// [Column Index]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+/// See [`NativeIndex`] for more details
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct PageIndex<T> {
     /// The minimum value, It is None when all values are null
@@ -70,11 +65,9 @@ where
 
 #[derive(Debug, Clone, PartialEq)]
 #[allow(non_camel_case_types)]
-/// Typed statistics for a data page in a column chunk.
+/// Statistics for data pages in a column chunk.
 ///
-/// This structure is part of the "Page Index" and is optionally part of
-/// [ColumnIndex] in the parquet file and can be used to skip decoding pages
-/// while reading the file data.
+/// See [`NativeIndex`] for more information
 pub enum Index {
     /// Sometimes reading page index from parquet file
     /// will only return pageLocations without min_max index,
@@ -117,10 +110,25 @@ impl Index {
     }
 }
 
-/// Stores the [`PageIndex`] for each page of a column
+/// Strongly typed statistics for data pages in a column chunk.
+///
+/// This structure is a natively typed, in memory representation of the
+/// [`ColumnIndex`] structure in a parquet file footer, as described in the
+/// Parquet [PageIndex documentation]. The statistics stored in this structure
+/// can be used by query engines to skip decoding pages while reading parquet
+/// data.
+///
+/// # Differences with Row Group Level Statistics
+///
+/// One significant difference between `NativeIndex` and row group level
+/// [`Statistics`] is that page level statistics may not store actual column
+/// values as min and max (e.g. they may store truncated strings to save space)
+///
+/// [PageIndex documentation]: 
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+/// [`Statistics`]: crate::file::statistics::Statistics
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct NativeIndex<T: ParquetValueType> {
-    /// The indexes, one item per page
+    /// The actual column indexes, one item per page
     pub indexes: Vec<PageIndex<T>>,
     /// If the min/max elements are ordered, and if so in which
     /// direction. See [source] for details.
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 7d704cc138..dded9c9c11 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -287,7 +287,17 @@ pub fn to_thrift(stats: Option<&Statistics>) -> 
Option<TStatistics> {
     Some(thrift_stats)
 }
 
-/// Statistics for a column chunk and data page.
+/// Strongly typed statistics for a column chunk within a row group.
+///
+/// This structure is a natively typed, in memory representation of the
+/// [`Statistics`] structure in a parquet file footer. The statistics stored in
+/// this structure can be used by query engines to skip decoding pages while
+/// reading parquet data.
+///
+/// Page level statistics are stored separately, in [NativeIndex].
+///
+/// [`Statistics`]: crate::format::Statistics
+/// [NativeIndex]: crate::file::page_index::index::NativeIndex
 #[derive(Debug, Clone, PartialEq)]
 pub enum Statistics {
     Boolean(ValueStatistics<bool>),
@@ -445,7 +455,9 @@ impl fmt::Display for Statistics {
 /// Typed implementation for [`Statistics`].
 pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
 
-/// Statistics for a particular `ParquetValueType`
+/// Typed statistics for one column chunk
+///
+/// See [`Statistics`] for more details
 #[derive(Clone, Eq, PartialEq)]
 pub struct ValueStatistics<T> {
     min: Option<T>,

Reply via email to