This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9be0eb530d Minor: Improve parquet PageIndex documentation (#6042)
9be0eb530d is described below
commit 9be0eb530de85a68358dc31c146f71ccb97739ca
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jul 17 16:07:47 2024 -0400
Minor: Improve parquet PageIndex documentation (#6042)
* Minor: Improve parquet PageIndex documentation
* More improvements
* Add reasons for data page being without null
* Apply suggestions from code review
Co-authored-by: Val Lorentz <[email protected]>
* Update parquet/src/file/page_index/index.rs
---------
Co-authored-by: Val Lorentz <[email protected]>
---
parquet/src/file/metadata/mod.rs | 37 +++++++++++++++++++++++++++++++-----
parquet/src/file/page_index/index.rs | 34 ++++++++++++++++++++-------------
parquet/src/file/statistics.rs | 16 ++++++++++++++--
3 files changed, 67 insertions(+), 20 deletions(-)
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 46ebf5d0f2..3722f2cbe4 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -54,7 +54,12 @@ use crate::schema::types::{
Type as SchemaType,
};
-/// [`Index`] for each row group of each column.
+/// Page level statistics for each column chunk of each row group.
+///
+/// This structure is an in-memory representation of multiple [`ColumnIndex`]
+/// structures in a parquet file footer, as described in the Parquet [PageIndex
+/// documentation]. Each [`Index`] holds statistics about all the pages in a
+/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
@@ -62,9 +67,14 @@ use crate::schema::types::{
///
/// For example `column_index[2][3]` holds the [`Index`] for the forth
/// column in the third row group of the parquet file.
+///
+/// [PageIndex documentation]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
-/// [`PageLocation`] for each data page of each row group of each column.
+/// [`PageLocation`] for each data page of each row group of each column
+///
+/// This structure is the parsed representation of the [`OffsetIndex`] from the
+/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
@@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
/// the fifth page of the forth column in the third row group of the
/// parquet file.
+///
+/// [PageIndex documentation]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
/// Parsed metadata for a single Parquet file
@@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder {
}
}
-/// Builder for column index
+/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
+///
+/// [PageIndex]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
null_pages: Vec<bool>,
min_values: Vec<Vec<u8>>,
max_values: Vec<Vec<u8>>,
null_counts: Vec<i64>,
boundary_order: BoundaryOrder,
- // If one page can't get build index, need to ignore all index in this
column
+ /// Is the information in the builder valid?
+ ///
+ /// Set to `false` if any entry in the page doesn't have statistics for
+ /// some reason, so statistics for that page won't be written to the file.
+ /// This might happen if the page is entirely null, or
+ /// is a floating point column without any non-nan values
+ /// e.g. <https://github.com/apache/parquet-format/pull/196>
valid: bool,
}
@@ -975,6 +995,7 @@ impl ColumnIndexBuilder {
}
}
+ /// Append statistics for the next page
pub fn append(
&mut self,
null_page: bool,
@@ -992,15 +1013,19 @@ impl ColumnIndexBuilder {
self.boundary_order = boundary_order;
}
+ /// Mark this column index as invalid
pub fn to_invalid(&mut self) {
self.valid = false;
}
+ /// Is the information in the builder valid?
pub fn valid(&self) -> bool {
self.valid
}
/// Build and get the thrift metadata of column index
+ ///
+ /// Note: callers should check [`Self::valid`] before calling this method
pub fn build_to_thrift(self) -> ColumnIndex {
ColumnIndex::new(
self.null_pages,
@@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder {
}
}
-/// Builder for offset index
+/// Builder for offset index, part of the Parquet [PageIndex].
+///
+/// [PageIndex]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
offset_array: Vec<i64>,
compressed_page_size_array: Vec<i32>,
diff --git a/parquet/src/file/page_index/index.rs
b/parquet/src/file/page_index/index.rs
index 71fb47afa9..7eba949042 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -25,14 +25,9 @@ use crate::format::{BoundaryOrder, ColumnIndex};
use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;
-/// PageIndex Statistics for one data page, as described in [Column Index].
+/// Typed statistics for one data page
///
-/// One significant difference from the row group level
-/// [`Statistics`](crate::format::Statistics) is that page level
-/// statistics may not store actual column values as min and max
-/// (e.g. they may store truncated strings to save space)
-///
-/// [Column Index]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+/// See [`NativeIndex`] for more details
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PageIndex<T> {
/// The minimum value, It is None when all values are null
@@ -70,11 +65,9 @@ where
#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
-/// Typed statistics for a data page in a column chunk.
+/// Statistics for data pages in a column chunk.
///
-/// This structure is part of the "Page Index" and is optionally part of
-/// [ColumnIndex] in the parquet file and can be used to skip decoding pages
-/// while reading the file data.
+/// See [`NativeIndex`] for more information
pub enum Index {
/// Sometimes reading page index from parquet file
/// will only return pageLocations without min_max index,
@@ -117,10 +110,25 @@ impl Index {
}
}
-/// Stores the [`PageIndex`] for each page of a column
+/// Strongly typed statistics for data pages in a column chunk.
+///
+/// This structure is a natively typed, in memory representation of the
+/// [`ColumnIndex`] structure in a parquet file footer, as described in the
+/// Parquet [PageIndex documentation]. The statistics stored in this structure
+/// can be used by query engines to skip decoding pages while reading parquet
+/// data.
+///
+/// # Differences with Row Group Level Statistics
+///
+/// One significant difference between `NativeIndex` and row group level
+/// [`Statistics`] is that page level statistics may not store actual column
+/// values as min and max (e.g. they may store truncated strings to save space)
+///
+/// [PageIndex documentation]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+/// [`Statistics`]: crate::file::statistics::Statistics
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NativeIndex<T: ParquetValueType> {
- /// The indexes, one item per page
+ /// The actual column indexes, one item per page
pub indexes: Vec<PageIndex<T>>,
/// If the min/max elements are ordered, and if so in which
/// direction. See [source] for details.
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 7d704cc138..dded9c9c11 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -287,7 +287,17 @@ pub fn to_thrift(stats: Option<&Statistics>) ->
Option<TStatistics> {
Some(thrift_stats)
}
-/// Statistics for a column chunk and data page.
+/// Strongly typed statistics for a column chunk within a row group.
+///
+/// This structure is a natively typed, in memory representation of the
+/// [`Statistics`] structure in a parquet file footer. The statistics stored in
+/// this structure can be used by query engines to skip decoding pages while
+/// reading parquet data.
+///
+/// Page level statistics are stored separately, in [NativeIndex].
+///
+/// [`Statistics`]: crate::format::Statistics
+/// [NativeIndex]: crate::file::page_index::index::NativeIndex
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
Boolean(ValueStatistics<bool>),
@@ -445,7 +455,9 @@ impl fmt::Display for Statistics {
/// Typed implementation for [`Statistics`].
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;
-/// Statistics for a particular `ParquetValueType`
+/// Typed statistics for one column chunk
+///
+/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
min: Option<T>,