This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new cfb9807646 Deprecate `ArrowReaderOptions::with_page_index` and update
API (#9199)
cfb9807646 is described below
commit cfb9807646eaa3e577b406fcfe18312960a59f99
Author: Matthew Kim <[email protected]>
AuthorDate: Sat Jan 17 06:12:29 2026 -1000
Deprecate `ArrowReaderOptions::with_page_index` and update API (#9199)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/9197
# Rationale for this change
This PR deprecates `ArrowReaderOptions::with_page_index(bool)` in favor
of `with_page_index_policy(PageIndexPolicy)` to align with the
`ParquetMetaDataReader` API. The underlying implementation continues to
use separate `column_index` and `offset_index` fields.
# Are there any user-facing changes?
Yes, some methods are deprecated.
---
parquet/benches/arrow_reader_clickbench.rs | 3 +-
parquet/benches/arrow_statistics.rs | 8 ++-
parquet/examples/external_metadata.rs | 2 +-
parquet/src/arrow/arrow_reader/mod.rs | 100 +++++++++++++++++++-------
parquet/src/arrow/async_reader/mod.rs | 50 +++++++------
parquet/src/arrow/async_reader/store.rs | 20 +++---
parquet/src/file/metadata/reader.rs | 4 +-
parquet/tests/arrow_reader/io/async_reader.rs | 7 +-
parquet/tests/arrow_reader/io/mod.rs | 5 +-
parquet/tests/arrow_reader/io/sync_reader.rs | 3 +-
parquet/tests/arrow_reader/predicate_cache.rs | 13 ++--
parquet/tests/arrow_reader/statistics.rs | 5 +-
parquet/tests/arrow_writer_layout.rs | 4 +-
parquet/tests/encryption/encryption.rs | 7 +-
parquet/tests/encryption/encryption_async.rs | 3 +-
15 files changed, 154 insertions(+), 80 deletions(-)
diff --git a/parquet/benches/arrow_reader_clickbench.rs
b/parquet/benches/arrow_reader_clickbench.rs
index e737a4cad1..32035c772b 100644
--- a/parquet/benches/arrow_reader_clickbench.rs
+++ b/parquet/benches/arrow_reader_clickbench.rs
@@ -42,6 +42,7 @@ use parquet::arrow::arrow_reader::{
ParquetRecordBatchReaderBuilder, RowFilter,
};
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+use parquet::file::metadata::PageIndexPolicy;
use parquet::schema::types::SchemaDescriptor;
use std::fmt::{Display, Formatter};
use std::path::{Path, PathBuf};
@@ -847,7 +848,7 @@ fn column_indices(schema: &SchemaDescriptor, column_names:
&Vec<&str>) -> Vec<us
/// Loads Parquet metadata from the given path, including page indexes
fn load_metadata(path: &Path) -> ArrowReaderMetadata {
let file = std::fs::File::open(path).unwrap();
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::from(true));
let orig_metadata =
ArrowReaderMetadata::load(&file,
options.clone()).expect("parquet-metadata loading failed");
diff --git a/parquet/benches/arrow_statistics.rs
b/parquet/benches/arrow_statistics.rs
index f825883e32..a4aa9d137e 100644
--- a/parquet/benches/arrow_statistics.rs
+++ b/parquet/benches/arrow_statistics.rs
@@ -24,7 +24,10 @@ use arrow_schema::{
Field, Schema,
};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use parquet::{arrow::arrow_reader::ArrowReaderOptions,
file::properties::WriterProperties};
+use parquet::{
+ arrow::arrow_reader::ArrowReaderOptions,
+ file::{metadata::PageIndexPolicy, properties::WriterProperties},
+};
use parquet::{
arrow::{ArrowWriter, arrow_reader::ArrowReaderBuilder},
file::properties::EnabledStatistics,
@@ -195,7 +198,8 @@ fn criterion_benchmark(c: &mut Criterion) {
for data_page_row_count_limit in &data_page_row_count_limits {
let file = create_parquet_file(dtype.clone(), row_groups,
data_page_row_count_limit);
let file = file.reopen().unwrap();
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::from(true));
let reader = ArrowReaderBuilder::try_new_with_options(file,
options).unwrap();
let metadata = reader.metadata();
let row_groups = metadata.row_groups();
diff --git a/parquet/examples/external_metadata.rs
b/parquet/examples/external_metadata.rs
index 9370016049..eeb1d90d3c 100644
--- a/parquet/examples/external_metadata.rs
+++ b/parquet/examples/external_metadata.rs
@@ -189,7 +189,7 @@ async fn read_remote_parquet_file_with_metadata(
) -> Vec<RecordBatch> {
let options = ArrowReaderOptions::new()
// tell the reader to read the page index
- .with_page_index(true);
+ .with_page_index_policy(PageIndexPolicy::from(true));
// create a reader with pre-existing metadata
let arrow_reader_metadata = ArrowReaderMetadata::try_new(metadata.into(),
options).unwrap();
let reader =
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index cb172e1e38..29346f0b27 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -455,8 +455,10 @@ pub struct ArrowReaderOptions {
///
/// [ARROW_SCHEMA_META_KEY]: crate::arrow::ARROW_SCHEMA_META_KEY
supplied_schema: Option<SchemaRef>,
- /// Policy for reading offset and column indexes.
- pub(crate) page_index_policy: PageIndexPolicy,
+
+ pub(crate) column_index: PageIndexPolicy,
+ pub(crate) offset_index: PageIndexPolicy,
+
/// Options to control reading of Parquet metadata
metadata_options: ParquetMetaDataOptions,
/// If encryption is enabled, the file decryption properties can be
provided
@@ -601,6 +603,7 @@ impl ArrowReaderOptions {
}
}
+ #[deprecated(since = "57.2.0", note = "Use `with_page_index_policy`
instead")]
/// Enable reading the [`PageIndex`] from the metadata, if present
(defaults to `false`)
///
/// The `PageIndex` can be used to push down predicates to the parquet
scan,
@@ -614,22 +617,41 @@ impl ArrowReaderOptions {
/// [`ParquetMetaData::column_index`]:
crate::file::metadata::ParquetMetaData::column_index
/// [`ParquetMetaData::offset_index`]:
crate::file::metadata::ParquetMetaData::offset_index
pub fn with_page_index(self, page_index: bool) -> Self {
- let page_index_policy = PageIndexPolicy::from(page_index);
-
- Self {
- page_index_policy,
- ..self
- }
+ self.with_page_index_policy(PageIndexPolicy::from(page_index))
}
- /// Set the [`PageIndexPolicy`] to determine how page indexes should be
read.
+ /// Sets the [`PageIndexPolicy`] for both the column and offset indexes.
+ ///
+ /// The `PageIndex` consists of two structures: the `ColumnIndex` and
`OffsetIndex`.
+ /// This method sets the same policy for both. For fine-grained control,
use
+ /// [`Self::with_column_index_policy`] and
[`Self::with_offset_index_policy`].
///
- /// See [`Self::with_page_index`] for more details.
+ /// See [`Self::with_page_index`] for more details on page indexes.
pub fn with_page_index_policy(self, policy: PageIndexPolicy) -> Self {
- Self {
- page_index_policy: policy,
- ..self
- }
+ self.with_column_index_policy(policy)
+ .with_offset_index_policy(policy)
+ }
+
+ /// Sets the [`PageIndexPolicy`] for the Parquet [ColumnIndex] structure.
+ ///
+ /// The `ColumnIndex` contains min/max statistics for each page, which can
be used
+ /// for predicate pushdown and page-level pruning.
+ ///
+ /// [ColumnIndex]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+ pub fn with_column_index_policy(mut self, policy: PageIndexPolicy) -> Self
{
+ self.column_index = policy;
+ self
+ }
+
+ /// Sets the [`PageIndexPolicy`] for the Parquet [OffsetIndex] structure.
+ ///
+ /// The `OffsetIndex` contains the locations and sizes of each page, which
enables
+ /// efficient page-level skipping and random access within column chunks.
+ ///
+ /// [OffsetIndex]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+ pub fn with_offset_index_policy(mut self, policy: PageIndexPolicy) -> Self
{
+ self.offset_index = policy;
+ self
}
/// Provide a Parquet schema to use when decoding the metadata. The schema
in the Parquet
@@ -766,11 +788,34 @@ impl ArrowReaderOptions {
})
}
- /// Retrieve the currently set page index behavior.
+ #[deprecated(
+ since = "57.2.0",
+ note = "Use `column_index_policy` or `offset_index_policy` instead"
+ )]
+ /// Returns whether page index reading is enabled.
+ ///
+ /// This returns `true` if both the column index and offset index policies
are not [`PageIndexPolicy::Skip`].
///
- /// This can be set via [`with_page_index`][Self::with_page_index].
+ /// This can be set via [`with_page_index`][Self::with_page_index] or
+ /// [`with_page_index_policy`][Self::with_page_index_policy].
pub fn page_index(&self) -> bool {
- self.page_index_policy != PageIndexPolicy::Skip
+ self.offset_index != PageIndexPolicy::Skip && self.column_index !=
PageIndexPolicy::Skip
+ }
+
+ /// Retrieve the currently set [`PageIndexPolicy`] for the offset index.
+ ///
+ /// This can be set via
[`with_offset_index_policy`][Self::with_offset_index_policy]
+ /// or [`with_page_index_policy`][Self::with_page_index_policy].
+ pub fn offset_index_policy(&self) -> PageIndexPolicy {
+ self.offset_index
+ }
+
+ /// Retrieve the currently set [`PageIndexPolicy`] for the column index.
+ ///
+ /// This can be set via
[`with_column_index_policy`][Self::with_column_index_policy]
+ /// or [`with_page_index_policy`][Self::with_page_index_policy].
+ pub fn column_index_policy(&self) -> PageIndexPolicy {
+ self.column_index
}
/// Retrieve the currently set metadata decoding options.
@@ -826,7 +871,8 @@ impl ArrowReaderMetadata {
/// to load the page index by making an object store request.
pub fn load<T: ChunkReader>(reader: &T, options: ArrowReaderOptions) ->
Result<Self> {
let metadata = ParquetMetaDataReader::new()
- .with_page_index_policy(options.page_index_policy)
+ .with_column_index_policy(options.column_index)
+ .with_offset_index_policy(options.offset_index)
.with_metadata_options(Some(options.metadata_options.clone()));
#[cfg(feature = "encryption")]
let metadata = metadata.with_decryption_properties(
@@ -1551,7 +1597,7 @@ pub(crate) mod tests {
FloatType, Int32Type, Int64Type, Int96, Int96Type,
};
use crate::errors::Result;
- use crate::file::metadata::{ParquetMetaData, ParquetStatisticsPolicy};
+ use crate::file::metadata::{PageIndexPolicy, ParquetMetaData,
ParquetStatisticsPolicy};
use crate::file::properties::{EnabledStatistics, WriterProperties,
WriterVersion};
use crate::file::writer::SerializedFileWriter;
use crate::schema::parser::parse_message_type;
@@ -3348,8 +3394,9 @@ pub(crate) mod tests {
file.rewind().unwrap();
- let options = ArrowReaderOptions::new()
- .with_page_index(opts.enabled_statistics ==
EnabledStatistics::Page);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::from(
+ opts.enabled_statistics == EnabledStatistics::Page,
+ ));
let mut builder =
ParquetRecordBatchReaderBuilder::try_new_with_options(file,
options).unwrap();
@@ -4757,7 +4804,8 @@ pub(crate) mod tests {
batch_size: usize,
selections: RowSelection,
) -> ParquetRecordBatchReader {
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let file = test_file.try_clone().unwrap();
ParquetRecordBatchReaderBuilder::try_new_with_options(file,
options)
.unwrap()
@@ -4796,7 +4844,7 @@ pub(crate) mod tests {
let test_file = File::open(path).unwrap();
let builder =
ParquetRecordBatchReaderBuilder::try_new_with_options(
test_file,
- ArrowReaderOptions::new().with_page_index(true),
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required),
)
.unwrap();
assert!(!builder.metadata().offset_index().unwrap()[0].is_empty());
@@ -4811,7 +4859,7 @@ pub(crate) mod tests {
let test_file = File::open(path).unwrap();
let builder =
ParquetRecordBatchReaderBuilder::try_new_with_options(
test_file,
- ArrowReaderOptions::new().with_page_index(true),
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required),
)
.unwrap();
// Although `Vec<Vec<PageLoacation>>` of each row group is empty,
@@ -5583,7 +5631,7 @@ pub(crate) mod tests {
writer.close().unwrap();
let data = Bytes::from(buffer);
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchReaderBuilder::try_new_with_options(data.clone(),
options).unwrap();
let schema = builder.parquet_schema().clone();
@@ -5598,7 +5646,7 @@ pub(crate) mod tests {
})
};
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let predicate = make_predicate(filter_mask.clone());
// The batch size is set to 12 to read all rows in one go after
filtering
diff --git a/parquet/src/arrow/async_reader/mod.rs
b/parquet/src/arrow/async_reader/mod.rs
index 60f2ca1615..38eef7343e 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -45,7 +45,7 @@ use crate::bloom_filter::{
SBBF_HEADER_SIZE_ESTIMATE, Sbbf, chunk_read_bloom_filter_header_and_offset,
};
use crate::errors::{ParquetError, Result};
-use crate::file::metadata::{PageIndexPolicy, ParquetMetaData,
ParquetMetaDataReader};
+use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
mod metadata;
pub use metadata::*;
@@ -165,11 +165,14 @@ impl<T: AsyncRead + AsyncSeek + Unpin + Send>
AsyncFileReader for T {
) -> BoxFuture<'a, Result<Arc<ParquetMetaData>>> {
async move {
let metadata_opts = options.map(|o| o.metadata_options().clone());
- let metadata_reader = ParquetMetaDataReader::new()
- .with_page_index_policy(PageIndexPolicy::from(
- options.is_some_and(|o| o.page_index()),
- ))
- .with_metadata_options(metadata_opts);
+ let mut metadata_reader =
+
ParquetMetaDataReader::new().with_metadata_options(metadata_opts);
+
+ if let Some(opts) = options {
+ metadata_reader = metadata_reader
+ .with_column_index_policy(opts.column_index_policy())
+ .with_offset_index_policy(opts.offset_index_policy());
+ }
#[cfg(feature = "encryption")]
let metadata_reader = metadata_reader.with_decryption_properties(
@@ -775,6 +778,7 @@ mod tests {
use crate::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
use crate::arrow::schema::virtual_type::RowNumber;
use crate::arrow::{ArrowWriter, AsyncArrowWriter, ProjectionMask};
+ use crate::file::metadata::PageIndexPolicy;
use crate::file::metadata::ParquetMetaDataReader;
use crate::file::properties::WriterProperties;
use arrow::compute::kernels::cmp::eq;
@@ -829,9 +833,12 @@ mod tests {
&'a mut self,
options: Option<&'a ArrowReaderOptions>,
) -> BoxFuture<'a, Result<Arc<ParquetMetaData>>> {
- let metadata_reader =
ParquetMetaDataReader::new().with_page_index_policy(
- PageIndexPolicy::from(options.is_some_and(|o| o.page_index())),
- );
+ let mut metadata_reader = ParquetMetaDataReader::new();
+ if let Some(opts) = options {
+ metadata_reader = metadata_reader
+ .with_column_index_policy(opts.column_index_policy())
+ .with_offset_index_policy(opts.offset_index_policy());
+ }
self.metadata = Some(Arc::new(
metadata_reader.parse_and_finish(&self.data).unwrap(),
));
@@ -953,7 +960,7 @@ mod tests {
let async_reader = TestReader::new(data.clone());
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
.await
.unwrap();
@@ -1055,7 +1062,7 @@ mod tests {
let async_reader = TestReader::new(data.clone());
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
.await
.unwrap();
@@ -1129,7 +1136,8 @@ mod tests {
let async_reader = TestReader::new(data.clone());
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
.await
.unwrap();
@@ -1191,7 +1199,7 @@ mod tests {
let async_reader = TestReader::new(data.clone());
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
.await
.unwrap();
@@ -1252,7 +1260,7 @@ mod tests {
let builder = ParquetRecordBatchStreamBuilder::new_with_options(
TestReader::new(data.clone()),
- ArrowReaderOptions::new().with_page_index(true),
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required),
)
.await
.unwrap();
@@ -1274,7 +1282,7 @@ mod tests {
// If the Reader chooses mask to handle filter, it might cause panic
because the mid 4 pages may not be decoded.
let stream = ParquetRecordBatchStreamBuilder::new_with_options(
TestReader::new(data.clone()),
- ArrowReaderOptions::new().with_page_index(true),
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required),
)
.await
.unwrap()
@@ -1557,7 +1565,7 @@ mod tests {
let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 2]);
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let stream =
ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
.await
.unwrap()
@@ -1835,7 +1843,7 @@ mod tests {
// Read data
let mut reader = ParquetRecordBatchStreamBuilder::new_with_options(
tokio::fs::File::from_std(file.try_clone().unwrap()),
- ArrowReaderOptions::new().with_page_index(true),
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required),
)
.await
.unwrap();
@@ -1957,7 +1965,7 @@ mod tests {
.unwrap();
metadata.set_offset_index(Some(vec![]));
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let arrow_reader_metadata =
ArrowReaderMetadata::try_new(metadata.into(), options).unwrap();
let reader =
ParquetRecordBatchStreamBuilder::new_with_metadata(file,
arrow_reader_metadata)
@@ -1982,7 +1990,7 @@ mod tests {
.await
.unwrap();
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let arrow_reader_metadata =
ArrowReaderMetadata::try_new(metadata.into(), options).unwrap();
let reader =
ParquetRecordBatchStreamBuilder::new_with_metadata(file,
arrow_reader_metadata)
@@ -2034,7 +2042,7 @@ mod tests {
write_metadata_to_local_file(metadata, &metadata_path);
let metadata = read_metadata_from_local_file(&metadata_path);
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let arrow_reader_metadata =
ArrowReaderMetadata::try_new(metadata.into(), options).unwrap();
let reader =
ParquetRecordBatchStreamBuilder::new_with_metadata(file,
arrow_reader_metadata)
@@ -2060,7 +2068,7 @@ mod tests {
let async_reader = TestReader::new(data);
// Enable page index so the fetch logic loads only required pages
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required);
let builder =
ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
.await
.unwrap();
diff --git a/parquet/src/arrow/async_reader/store.rs
b/parquet/src/arrow/async_reader/store.rs
index f1e987081d..59b161bbc6 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -226,8 +226,12 @@ impl AsyncFileReader for ParquetObjectReader {
// When page_index_policy is Optional or Required, override the
preload flags
// to ensure the specified policy takes precedence.
if let Some(options) = options {
- if options.page_index_policy != PageIndexPolicy::Skip {
- metadata =
metadata.with_page_index_policy(options.page_index_policy);
+ if options.column_index_policy() != PageIndexPolicy::Skip
+ || options.offset_index_policy() != PageIndexPolicy::Skip
+ {
+ metadata = metadata
+
.with_column_index_policy(options.column_index_policy())
+
.with_offset_index_policy(options.offset_index_policy());
}
}
@@ -426,8 +430,7 @@ mod tests {
.with_preload_offset_index(true);
// Create options with page_index_policy set to Skip (default)
- let mut options = ArrowReaderOptions::new();
- options.page_index_policy = PageIndexPolicy::Skip;
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Skip);
// Get metadata - Skip means use reader's preload flags (true)
let metadata = reader.get_metadata(Some(&options)).await.unwrap();
@@ -447,8 +450,7 @@ mod tests {
.with_preload_offset_index(false);
// Create options with page_index_policy set to Optional
- let mut options = ArrowReaderOptions::new();
- options.page_index_policy = PageIndexPolicy::Optional;
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Optional);
// Get metadata - Optional overrides preload flags and attempts to
load indexes
let metadata = reader.get_metadata(Some(&options)).await.unwrap();
@@ -468,8 +470,7 @@ mod tests {
.with_preload_column_index(false)
.with_preload_offset_index(false);
- let mut options1 = ArrowReaderOptions::new();
- options1.page_index_policy = PageIndexPolicy::Skip;
+ let options1 =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Skip);
let metadata1 = reader1.get_metadata(Some(&options1)).await.unwrap();
// Test 2: preload=false + Optional policy -> overrides to try loading
@@ -478,8 +479,7 @@ mod tests {
.with_preload_column_index(false)
.with_preload_offset_index(false);
- let mut options2 = ArrowReaderOptions::new();
- options2.page_index_policy = PageIndexPolicy::Optional;
+ let options2 =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Optional);
let metadata2 = reader2.get_metadata(Some(&options2)).await.unwrap();
// Both should succeed (no panic/error)
diff --git a/parquet/src/file/metadata/reader.rs
b/parquet/src/file/metadata/reader.rs
index a18a5e68a9..2be9dcbd4b 100644
--- a/parquet/src/file/metadata/reader.rs
+++ b/parquet/src/file/metadata/reader.rs
@@ -123,9 +123,7 @@ impl ParquetMetaDataReader {
/// [Parquet page index]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
#[deprecated(since = "56.1.0", note = "Use `with_page_index_policy`
instead")]
pub fn with_page_indexes(self, val: bool) -> Self {
- let policy = PageIndexPolicy::from(val);
- self.with_column_index_policy(policy)
- .with_offset_index_policy(policy)
+ self.with_page_index_policy(PageIndexPolicy::from(val))
}
/// Enable or disable reading the Parquet [ColumnIndex] structure.
diff --git a/parquet/tests/arrow_reader/io/async_reader.rs
b/parquet/tests/arrow_reader/io/async_reader.rs
index 2f49de8a38..8022335da0 100644
--- a/parquet/tests/arrow_reader/io/async_reader.rs
+++ b/parquet/tests/arrow_reader/io/async_reader.rs
@@ -28,6 +28,7 @@ use parquet::arrow::arrow_reader::{ArrowReaderOptions,
RowSelection, RowSelector
use parquet::arrow::async_reader::AsyncFileReader;
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
use parquet::errors::Result;
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::metadata::ParquetMetaData;
use std::ops::Range;
use std::sync::Arc;
@@ -206,7 +207,7 @@ async fn test_read_single_row_filter_no_page_index() {
// Apply a filter "b" > 575 and <less> than 625
// (last data page in Row Group 0 and first DataPage in Row Group 1)
let test_file = test_file();
- let options = test_options().with_page_index(false);
+ let options =
test_options().with_page_index_policy(PageIndexPolicy::from(false));
let builder = async_builder(&test_file, options).await;
let schema_descr = builder.metadata().file_metadata().schema_descr_ptr();
@@ -318,7 +319,9 @@ async fn async_builder(
test_file: &TestParquetFile,
options: ArrowReaderOptions,
) -> ParquetRecordBatchStreamBuilder<RecordingAsyncFileReader> {
- let parquet_meta_data = if options.page_index() {
+ let parquet_meta_data = if options.offset_index_policy() !=
PageIndexPolicy::Skip
+ || options.column_index_policy() != PageIndexPolicy::Skip
+ {
Arc::clone(test_file.parquet_metadata())
} else {
// strip out the page index from the metadata
diff --git a/parquet/tests/arrow_reader/io/mod.rs
b/parquet/tests/arrow_reader/io/mod.rs
index 86b7674121..3b11429be4 100644
--- a/parquet/tests/arrow_reader/io/mod.rs
+++ b/parquet/tests/arrow_reader/io/mod.rs
@@ -47,6 +47,7 @@ use parquet::arrow::arrow_reader::{
use parquet::arrow::{ArrowWriter, ProjectionMask};
use parquet::data_type::AsBytes;
use parquet::file::FOOTER_SIZE;
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::metadata::{FooterTail, ParquetMetaData, ParquetOffsetIndex};
use parquet::file::page_index::offset_index::PageLocation;
use parquet::file::properties::WriterProperties;
@@ -73,7 +74,7 @@ fn test_file() -> TestParquetFile {
///
/// Note these tests use the PageIndex to reduce IO
fn test_options() -> ArrowReaderOptions {
- ArrowReaderOptions::default().with_page_index(true)
+
ArrowReaderOptions::default().with_page_index_policy(PageIndexPolicy::from(true))
}
/// Return a row filter that evaluates "b > 575" AND "b < 625"
@@ -189,7 +190,7 @@ impl TestParquetFile {
// Read the parquet file to determine its layout
let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
bytes.clone(),
- ArrowReaderOptions::default().with_page_index(true),
+
ArrowReaderOptions::default().with_page_index_policy(PageIndexPolicy::from(true)),
)
.unwrap();
diff --git a/parquet/tests/arrow_reader/io/sync_reader.rs
b/parquet/tests/arrow_reader/io/sync_reader.rs
index 77c200fa86..835029d5c6 100644
--- a/parquet/tests/arrow_reader/io/sync_reader.rs
+++ b/parquet/tests/arrow_reader/io/sync_reader.rs
@@ -27,6 +27,7 @@ use parquet::arrow::ProjectionMask;
use parquet::arrow::arrow_reader::{
ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection,
RowSelector,
};
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::reader::{ChunkReader, Length};
use std::io::Read;
use std::sync::Arc;
@@ -122,7 +123,7 @@ fn test_read_single_column() {
#[test]
fn test_read_single_column_no_page_index() {
let test_file = test_file();
- let options = test_options().with_page_index(false);
+ let options =
test_options().with_page_index_policy(PageIndexPolicy::from(false));
let builder = sync_builder(&test_file, options);
let schema_descr = builder.metadata().file_metadata().schema_descr_ptr();
let builder =
builder.with_projection(ProjectionMask::columns(&schema_descr, ["b"]));
diff --git a/parquet/tests/arrow_reader/predicate_cache.rs
b/parquet/tests/arrow_reader/predicate_cache.rs
index b419c37158..bf3412dd4d 100644
--- a/parquet/tests/arrow_reader/predicate_cache.rs
+++ b/parquet/tests/arrow_reader/predicate_cache.rs
@@ -33,7 +33,7 @@ use parquet::arrow::arrow_reader::{ArrowPredicateFn,
ArrowReaderOptions, RowFilt
use parquet::arrow::arrow_reader::{ArrowReaderBuilder,
ParquetRecordBatchReaderBuilder};
use parquet::arrow::async_reader::AsyncFileReader;
use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder,
ProjectionMask};
-use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData,
ParquetMetaDataReader};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
use parquet::file::properties::WriterProperties;
use std::ops::Range;
use std::sync::Arc;
@@ -356,9 +356,14 @@ impl AsyncFileReader for TestReader {
&'a mut self,
options: Option<&'a ArrowReaderOptions>,
) -> BoxFuture<'a, parquet::errors::Result<Arc<ParquetMetaData>>> {
- let metadata_reader =
ParquetMetaDataReader::new().with_page_index_policy(
- PageIndexPolicy::from(options.is_some_and(|o| o.page_index())),
- );
+ let mut metadata_reader = ParquetMetaDataReader::new();
+
+ if let Some(options) = options {
+ metadata_reader = metadata_reader
+ .with_column_index_policy(options.column_index_policy())
+ .with_offset_index_policy(options.offset_index_policy());
+ }
+
self.metadata = Some(Arc::new(
metadata_reader.parse_and_finish(&self.data).unwrap(),
));
diff --git a/parquet/tests/arrow_reader/statistics.rs
b/parquet/tests/arrow_reader/statistics.rs
index aef473fa84..07a9bcc578 100644
--- a/parquet/tests/arrow_reader/statistics.rs
+++ b/parquet/tests/arrow_reader/statistics.rs
@@ -46,6 +46,7 @@ use
parquet::arrow::arrow_reader::statistics::StatisticsConverter;
use parquet::arrow::arrow_reader::{
ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
};
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
use parquet::file::properties::{EnabledStatistics, WriterProperties};
use parquet::file::statistics::{Statistics, ValueStatistics};
@@ -145,7 +146,7 @@ fn build_parquet_file(
let _file_meta = writer.close().unwrap();
let file = output_file.reopen().unwrap();
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::from(true));
ArrowReaderBuilder::try_new_with_options(file, options).unwrap()
}
@@ -170,7 +171,7 @@ impl TestReader {
// open the file & get the reader
let file = file.reopen().unwrap();
- let options = ArrowReaderOptions::new().with_page_index(true);
+ let options =
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::from(true));
ArrowReaderBuilder::try_new_with_options(file, options).unwrap()
}
}
diff --git a/parquet/tests/arrow_writer_layout.rs
b/parquet/tests/arrow_writer_layout.rs
index f78370ca8d..ca6f89cab4 100644
--- a/parquet/tests/arrow_writer_layout.rs
+++ b/parquet/tests/arrow_writer_layout.rs
@@ -24,6 +24,7 @@ use bytes::Bytes;
use parquet::arrow::ArrowWriter;
use parquet::arrow::arrow_reader::{ArrowReaderOptions,
ParquetRecordBatchReaderBuilder};
use parquet::basic::{Encoding, PageType};
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::metadata::ParquetMetaData;
use parquet::file::properties::{ReaderProperties, WriterProperties};
use parquet::file::reader::SerializedPageReader;
@@ -68,7 +69,8 @@ fn do_test(test: LayoutTest) {
let b = Bytes::from(buf);
// Re-read file to decode column index
- let read_options = ArrowReaderOptions::new().with_page_index(true);
+ let read_options =
+
ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::from(true));
let reader =
ParquetRecordBatchReaderBuilder::try_new_with_options(b.clone(),
read_options).unwrap();
diff --git a/parquet/tests/encryption/encryption.rs
b/parquet/tests/encryption/encryption.rs
index f999abab95..b642af040e 100644
--- a/parquet/tests/encryption/encryption.rs
+++ b/parquet/tests/encryption/encryption.rs
@@ -34,6 +34,7 @@ use parquet::data_type::{ByteArray, ByteArrayType};
use parquet::encryption::decrypt::FileDecryptionProperties;
use parquet::encryption::encrypt::FileEncryptionProperties;
use parquet::errors::ParquetError;
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::metadata::ParquetMetaData;
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
@@ -453,7 +454,7 @@ fn uniform_encryption_roundtrip(
let options = ArrowReaderOptions::new()
.with_file_decryption_properties(decryption_properties)
- .with_page_index(page_index);
+ .with_page_index_policy(PageIndexPolicy::from(page_index));
let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file,
options)?;
assert_eq!(&row_group_sizes(builder.metadata()), &[50, 50, 50]);
@@ -557,7 +558,7 @@ fn uniform_encryption_page_skipping(page_index: bool) ->
parquet::errors::Result
let options = ArrowReaderOptions::new()
.with_file_decryption_properties(decryption_properties)
- .with_page_index(page_index);
+ .with_page_index_policy(PageIndexPolicy::from(page_index));
let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file,
options)?;
@@ -1041,7 +1042,7 @@ fn test_decrypt_page_index(
let file = File::open(path)?;
let options = ArrowReaderOptions::default()
.with_file_decryption_properties(decryption_properties)
- .with_page_index(true);
+ .with_page_index_policy(PageIndexPolicy::from(true));
let arrow_metadata = ArrowReaderMetadata::load(&file, options)?;
diff --git a/parquet/tests/encryption/encryption_async.rs
b/parquet/tests/encryption/encryption_async.rs
index 51acd73748..dc57ecd50d 100644
--- a/parquet/tests/encryption/encryption_async.rs
+++ b/parquet/tests/encryption/encryption_async.rs
@@ -35,6 +35,7 @@ use parquet::arrow::{
use parquet::encryption::decrypt::FileDecryptionProperties;
use parquet::encryption::encrypt::FileEncryptionProperties;
use parquet::errors::ParquetError;
+use parquet::file::metadata::PageIndexPolicy;
use parquet::file::metadata::ParquetMetaData;
use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder};
use parquet::file::writer::SerializedFileWriter;
@@ -439,7 +440,7 @@ async fn test_decrypt_page_index(
let options = ArrowReaderOptions::new()
.with_file_decryption_properties(decryption_properties)
- .with_page_index(true);
+ .with_page_index_policy(PageIndexPolicy::from(true));
let arrow_metadata = ArrowReaderMetadata::load_async(&mut file,
options).await?;