This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new ed018a34d99 Minor: Clarify when page index structures are read (#5886)
ed018a34d99 is described below

commit ed018a34d996590544fe5e833cb601bf46e9758e
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Jun 15 13:07:22 2024 -0400

    Minor: Clarify when page index structures are read (#5886)
    
    * Minor: Clarify when page index structures are read
    
    * fix link
---
 parquet/src/arrow/arrow_reader/mod.rs       |  8 +++++++-
 parquet/src/file/metadata.rs                | 27 ++++++++++++++++++---------
 parquet/src/file/page_index/index_reader.rs |  4 ++--
 3 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index fd9cbf2039b..070dda97c59 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -273,12 +273,18 @@ impl ArrowReaderOptions {
         }
     }
 
-    /// Enable decoding of the [`PageIndex`], if present (defaults to `false`)
+    /// Enable reading [`PageIndex`], if present (defaults to `false`)
     ///
     /// The `PageIndex` can be used to push down predicates to the parquet scan,
     /// potentially eliminating unnecessary IO, by some query engines.
     ///
+    /// If this is enabled, [`ParquetMetaData::column_index`] and
+    /// [`ParquetMetaData::offset_index`] will be populated if the corresponding
+    /// information is present in the file.
+    ///
     /// [`PageIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+    /// [`ParquetMetaData::column_index`]: crate::file::metadata::ParquetMetaData::column_index
+    /// [`ParquetMetaData::offset_index`]: crate::file::metadata::ParquetMetaData::offset_index
     pub fn with_page_index(self, page_index: bool) -> Self {
         Self { page_index, ..self }
     }
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 853d5ffec8b..fb8f798fd3a 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -29,7 +29,6 @@
 //! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
 //!   within a Row Group including encoding and compression information,
 //!   number of values, statistics, etc.
-
 use std::ops::Range;
 use std::sync::Arc;
 
@@ -69,20 +68,20 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
 /// parquet file.
 pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
 
-/// Global Parquet metadata, including [`FileMetaData`], [`RowGroupMetaData`].
+/// Parsed metadata for a single Parquet file
 ///
 /// This structure is stored in the footer of Parquet files, in the format
-/// defined by [`parquet.thrift`]. It contains:
-///
-/// * File level metadata: [`FileMetaData`]
-/// * Row Group level metadata: [`RowGroupMetaData`]
-/// * (Optional) "Page Index" structures: [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]
+/// defined by [`parquet.thrift`].
 ///
-/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
+/// # Overview
+/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
+/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
+/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
 ///
 /// This structure is read by the various readers in this crate or can be read
 /// directly from a file using the [`parse_metadata`] function.
 ///
+/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
 /// [`parse_metadata`]: crate::file::footer::parse_metadata
 #[derive(Debug, Clone)]
 pub struct ParquetMetaData {
@@ -152,6 +151,11 @@ impl ParquetMetaData {
     }
 
     /// Returns the column index for this file if loaded
+    ///
+    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
+    /// [ArrowReaderOptions::with_page_index] was set to false.
+    ///
+    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
     pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
         self.column_index.as_ref()
     }
@@ -162,7 +166,12 @@ impl ParquetMetaData {
         self.offset_index.as_ref()
     }
 
-    /// Returns offset indexes in this file.
+    /// Returns offset indexes in this file, if loaded
+    ///
+    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
+    /// [ArrowReaderOptions::with_page_index] was set to false.
+    ///
+    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
     pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
         self.offset_index.as_ref()
     }
diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index f298601f5d5..2ddf826fb02 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -73,8 +73,8 @@ pub fn read_columns_indexes<R: ChunkReader>(
         .collect()
 }
 
-/// Reads per-page [`PageLocation`] for all columns of a row group by
-/// decoding the [`OffsetIndex`].
+/// Reads [`OffsetIndex`],  per-page [`PageLocation`] for all columns of a row
+/// group.
 ///
 /// Returns a vector of `location[column_number][page_number]`
 ///

Reply via email to