This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ed018a34d99 Minor: Clarify when page index structures are read (#5886)
ed018a34d99 is described below
commit ed018a34d996590544fe5e833cb601bf46e9758e
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Jun 15 13:07:22 2024 -0400
Minor: Clarify when page index structures are read (#5886)
* Minor: Clarify when page index structures are read
* fix link
---
parquet/src/arrow/arrow_reader/mod.rs | 8 +++++++-
parquet/src/file/metadata.rs | 27 ++++++++++++++++++---------
parquet/src/file/page_index/index_reader.rs | 4 ++--
3 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index fd9cbf2039b..070dda97c59 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -273,12 +273,18 @@ impl ArrowReaderOptions {
}
}
- /// Enable decoding of the [`PageIndex`], if present (defaults to `false`)
+ /// Enable reading [`PageIndex`], if present (defaults to `false`)
///
/// The `PageIndex` can be used to push down predicates to the parquet scan,
/// potentially eliminating unnecessary IO, by some query engines.
///
+ /// If this is enabled, [`ParquetMetaData::column_index`] and
+ /// [`ParquetMetaData::offset_index`] will be populated if the corresponding
+ /// information is present in the file.
+ ///
/// [`PageIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+ /// [`ParquetMetaData::column_index`]: crate::file::metadata::ParquetMetaData::column_index
+ /// [`ParquetMetaData::offset_index`]: crate::file::metadata::ParquetMetaData::offset_index
pub fn with_page_index(self, page_index: bool) -> Self {
Self { page_index, ..self }
}
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 853d5ffec8b..fb8f798fd3a 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -29,7 +29,6 @@
//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
//! within a Row Group including encoding and compression information,
//! number of values, statistics, etc.
-
use std::ops::Range;
use std::sync::Arc;
@@ -69,20 +68,20 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
-/// Global Parquet metadata, including [`FileMetaData`], [`RowGroupMetaData`].
+/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
-/// defined by [`parquet.thrift`]. It contains:
-///
-/// * File level metadata: [`FileMetaData`]
-/// * Row Group level metadata: [`RowGroupMetaData`]
-/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]
+/// defined by [`parquet.thrift`].
///
-/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
+/// # Overview
+/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
+/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
+/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`parse_metadata`] function.
///
+/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
/// [`parse_metadata`]: crate::file::footer::parse_metadata
#[derive(Debug, Clone)]
pub struct ParquetMetaData {
@@ -152,6 +151,11 @@ impl ParquetMetaData {
}
/// Returns the column index for this file if loaded
+ ///
+ /// Returns `None` if the parquet file does not have a `ColumnIndex` or
+ /// [ArrowReaderOptions::with_page_index] was set to false.
+ ///
+ /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
self.column_index.as_ref()
}
@@ -162,7 +166,12 @@ impl ParquetMetaData {
self.offset_index.as_ref()
}
- /// Returns offset indexes in this file.
+ /// Returns offset indexes in this file, if loaded
+ ///
+ /// Returns `None` if the parquet file does not have an `OffsetIndex` or
+ /// [ArrowReaderOptions::with_page_index] was set to false.
+ ///
+ /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
self.offset_index.as_ref()
}
diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index f298601f5d5..2ddf826fb02 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -73,8 +73,8 @@ pub fn read_columns_indexes<R: ChunkReader>(
.collect()
}
-/// Reads per-page [`PageLocation`] for all columns of a row group by
-/// decoding the [`OffsetIndex`].
+/// Reads the [`OffsetIndex`], i.e. the per-page [`PageLocation`]s, for all
+/// columns of a row group.
///
/// Returns a vector of `location[column_number][page_number]`
///