This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c1912948268 Document when the ParquetRecordBatchReader will re-read metadata (#5887)
c1912948268 is described below
commit c19129482680559c812fbb21ddc24fc33e1d95f5
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Jun 15 13:06:28 2024 -0400
Document when the ParquetRecordBatchReader will re-read metadata (#5887)
---
parquet/src/arrow/arrow_reader/mod.rs | 50 +++++++++++++++++++++++++++++------
parquet/src/arrow/async_reader/mod.rs | 6 +++++
2 files changed, 48 insertions(+), 8 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 6b95324bee3..fd9cbf2039b 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -248,7 +248,9 @@ impl<T> ArrowReaderBuilder<T> {
/// is then read from the file, including projection and filter pushdown
#[derive(Debug, Clone, Default)]
pub struct ArrowReaderOptions {
+ /// Should the reader strip any user defined metadata from the Arrow schema
skip_arrow_metadata: bool,
+ /// If true, attempt to read `OffsetIndex` and `ColumnIndex`
pub(crate) page_index: bool,
}
@@ -282,23 +284,41 @@ impl ArrowReaderOptions {
}
}
-/// The cheaply clone-able metadata necessary to construct a [`ArrowReaderBuilder`]
+/// The metadata necessary to construct a [`ArrowReaderBuilder`]
///
-/// This allows loading the metadata for a file once and then using this to construct
-/// multiple separate readers, for example, to distribute readers across multiple threads
+/// Note this structure is cheaply clone-able as it consists of several `Arc`s.
+///
+/// This structure allows
+///
+/// 1. Loading metadata for a file once and then using that same metadata to
+/// construct multiple separate readers, for example, to distribute readers
+/// across multiple threads
+///
+/// 2. Using a cached copy of the [`ParquetMetadata`] rather than reading it
+/// from the file each time a reader is constructed.
+///
+/// [`ParquetMetadata`]: crate::file::metadata::ParquetMetaData
#[derive(Debug, Clone)]
pub struct ArrowReaderMetadata {
+ /// The Parquet Metadata, if known a priori
pub(crate) metadata: Arc<ParquetMetaData>,
-
+ /// The Arrow Schema
pub(crate) schema: SchemaRef,
pub(crate) fields: Option<Arc<ParquetField>>,
}
impl ArrowReaderMetadata {
- /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`]
+ /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`], if necessary
+ ///
+ /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for an
+ /// example of how this can be used
+ ///
+ /// # Notes
///
- /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for how this can be used
+ /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but
+ /// `Self::metadata` is missing the page index, this function will attempt
+ /// to load the page index by making an object store request.
pub fn load<T: ChunkReader>(reader: &T, options: ArrowReaderOptions) -> Result<Self> {
let mut metadata = footer::parse_metadata(reader)?;
if options.page_index {
@@ -320,6 +340,12 @@ impl ArrowReaderMetadata {
Self::try_new(Arc::new(metadata), options)
}
+ /// Create a new [`ArrowReaderMetadata`]
+ ///
+ /// # Notes
+ ///
+ /// This function does not attempt to load the PageIndex if not present in the metadata.
+ /// See [`Self::load`] for more details.
pub fn try_new(metadata: Arc<ParquetMetaData>, options: ArrowReaderOptions) -> Result<Self> {
let kv_metadata = match options.skip_arrow_metadata {
true => None,
@@ -407,9 +433,17 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
/// Create a [`ParquetRecordBatchReaderBuilder`] from the provided [`ArrowReaderMetadata`]
///
- /// This allows loading metadata once and using it to create multiple builders with
- /// potentially different settings
+ /// This interface allows:
+ ///
+ /// 1. Loading metadata once and using it to create multiple builders with
+ /// potentially different settings or run on different threads
///
+ /// 2. Using a cached copy of the metadata rather than re-reading it from the
+ /// file each time a reader is constructed.
+ ///
+ /// See the docs on [`ArrowReaderMetadata`] for more details
+ ///
+ /// # Example
/// ```
/// # use std::fs::metadata;
/// # use std::sync::Arc;
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index 1e298c65497..0a72583b90d 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -202,6 +202,12 @@ impl ArrowReaderMetadata {
/// Returns a new [`ArrowReaderMetadata`] for this builder
///
/// See [`ParquetRecordBatchStreamBuilder::new_with_metadata`] for how this can be used
+ ///
+ /// # Notes
+ ///
+ /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but
+ /// `Self::metadata` is missing the page index, this function will attempt
+ /// to load the page index by making an object store request.
pub async fn load_async<T: AsyncFileReader>(
input: &mut T,
options: ArrowReaderOptions,