This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new e4fdefb415 Respect page index policy option for ParquetObjectReader
when it's not skip (#8857)
e4fdefb415 is described below
commit e4fdefb415519ca88ad347f38e912f1dd08122b3
Author: Qi Zhu <[email protected]>
AuthorDate: Tue Nov 18 19:57:30 2025 +0800
Respect page index policy option for ParquetObjectReader when it's not skip
(#8857)
# Which issue does this PR close?
When i try to wrapper ParquetObjectReader and implement our internal
metadata cache, and i will pass the option to the inner
ParquetObjectReader, but it does not respect the index policy option
even it's not skip, and it always be false and will not load page index
which i want to prefetch and cache.
- Closes #[8856](https://github.com/apache/arrow-rs/issues/8856)
cc @alamb
# Rationale for this change
Support option with page index if it's not skip.
# Are these changes tested?
Yes
# Are there any user-facing changes?
No
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/arrow/async_reader/store.rs | 129 +++++++++++++++++++++++++++++++-
1 file changed, 127 insertions(+), 2 deletions(-)
diff --git a/parquet/src/arrow/async_reader/store.rs
b/parquet/src/arrow/async_reader/store.rs
index efb3de0f22..f1e987081d 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -101,7 +101,11 @@ impl ParquetObjectReader {
}
}
- /// Load the Column Index as part of [`Self::get_metadata`]
+ /// Whether to load the Column Index as part of [`Self::get_metadata`]
+ ///
+ /// Note: This setting may be overridden by [`ArrowReaderOptions`]
`page_index_policy`.
+ /// If `page_index_policy` is `Optional` or `Required`, it will take
precedence
+ /// over this preload flag. When it is `Skip` (default), this flag is used.
pub fn with_preload_column_index(self, preload_column_index: bool) -> Self
{
Self {
preload_column_index,
@@ -109,7 +113,11 @@ impl ParquetObjectReader {
}
}
- /// Load the Offset Index as part of [`Self::get_metadata`]
+ /// Whether to load the Offset Index as part of [`Self::get_metadata`]
+ ///
+ /// Note: This setting may be overridden by [`ArrowReaderOptions`]
`page_index_policy`.
+ /// If `page_index_policy` is `Optional` or `Required`, it will take
precedence
+ /// over this preload flag. When it is `Skip` (default), this flag is used.
pub fn with_preload_offset_index(self, preload_offset_index: bool) -> Self
{
Self {
preload_offset_index,
@@ -213,6 +221,16 @@ impl AsyncFileReader for ParquetObjectReader {
);
}
+ // Override page index policies from ArrowReaderOptions if
specified and not Skip.
+ // When page_index_policy is Skip (default), use the reader's
preload flags.
+ // When page_index_policy is Optional or Required, override the
preload flags
+ // to ensure the specified policy takes precedence.
+ if let Some(options) = options {
+ if options.page_index_policy != PageIndexPolicy::Skip {
+ metadata =
metadata.with_page_index_policy(options.page_index_policy);
+ }
+ }
+
let metadata = if let Some(file_size) = self.file_size {
metadata.load_and_finish(self, file_size).await?
} else {
@@ -226,6 +244,8 @@ impl AsyncFileReader for ParquetObjectReader {
#[cfg(test)]
mod tests {
+ use crate::arrow::async_reader::ArrowReaderOptions;
+ use crate::file::metadata::PageIndexPolicy;
use std::sync::{
Arc,
atomic::{AtomicUsize, Ordering},
@@ -254,6 +274,18 @@ mod tests {
(meta, Arc::new(store) as Arc<dyn ObjectStore>)
}
+ async fn get_meta_store_with_page_index() -> (ObjectMeta, Arc<dyn
ObjectStore>) {
+ let res = parquet_test_data();
+ let store = LocalFileSystem::new_with_prefix(res).unwrap();
+
+ let meta = store
+ .head(&Path::from("alltypes_tiny_pages_plain.parquet"))
+ .await
+ .unwrap();
+
+ (meta, Arc::new(store) as Arc<dyn ObjectStore>)
+ }
+
#[tokio::test]
async fn test_simple() {
let (meta, store) = get_meta_store().await;
@@ -382,4 +414,97 @@ mod tests {
assert!(err.to_string().contains("was cancelled"));
}
+
+ #[tokio::test]
+ async fn test_page_index_policy_skip_uses_preload_true() {
+ let (meta, store) = get_meta_store_with_page_index().await;
+
+ // Create reader with preload flags set to true
+ let mut reader = ParquetObjectReader::new(store.clone(),
meta.location.clone())
+ .with_file_size(meta.size)
+ .with_preload_column_index(true)
+ .with_preload_offset_index(true);
+
+ // Create options with page_index_policy set to Skip (default)
+ let mut options = ArrowReaderOptions::new();
+ options.page_index_policy = PageIndexPolicy::Skip;
+
+ // Get metadata - Skip means use reader's preload flags (true)
+ let metadata = reader.get_metadata(Some(&options)).await.unwrap();
+
+ // With preload=true, indexes should be loaded since the test file has
them
+ assert!(metadata.column_index().is_some());
+ }
+
+ #[tokio::test]
+ async fn test_page_index_policy_optional_overrides_preload_false() {
+ let (meta, store) = get_meta_store_with_page_index().await;
+
+ // Create reader with preload flags set to false
+ let mut reader = ParquetObjectReader::new(store.clone(),
meta.location.clone())
+ .with_file_size(meta.size)
+ .with_preload_column_index(false)
+ .with_preload_offset_index(false);
+
+ // Create options with page_index_policy set to Optional
+ let mut options = ArrowReaderOptions::new();
+ options.page_index_policy = PageIndexPolicy::Optional;
+
+ // Get metadata - Optional overrides preload flags and attempts to
load indexes
+ let metadata = reader.get_metadata(Some(&options)).await.unwrap();
+
+ // With Optional policy, it will TRY to load indexes but won't fail if
they don't exist
+ // The test file has page indexes, so they will be some
+ assert!(metadata.column_index().is_some());
+ }
+
+ #[tokio::test]
+ async fn test_page_index_policy_optional_vs_skip() {
+ let (meta, store) = get_meta_store_with_page_index().await;
+
+ // Test 1: preload=false + Skip policy -> uses preload flags (false)
+ let mut reader1 = ParquetObjectReader::new(store.clone(),
meta.location.clone())
+ .with_file_size(meta.size)
+ .with_preload_column_index(false)
+ .with_preload_offset_index(false);
+
+ let mut options1 = ArrowReaderOptions::new();
+ options1.page_index_policy = PageIndexPolicy::Skip;
+ let metadata1 = reader1.get_metadata(Some(&options1)).await.unwrap();
+
+ // Test 2: preload=false + Optional policy -> overrides to try loading
+ let mut reader2 = ParquetObjectReader::new(store.clone(),
meta.location.clone())
+ .with_file_size(meta.size)
+ .with_preload_column_index(false)
+ .with_preload_offset_index(false);
+
+ let mut options2 = ArrowReaderOptions::new();
+ options2.page_index_policy = PageIndexPolicy::Optional;
+ let metadata2 = reader2.get_metadata(Some(&options2)).await.unwrap();
+
+ // Both should succeed (no panic/error)
+ // metadata1 (Skip) uses preload=false -> Skip policy
+ // metadata2 (Optional) overrides preload=false -> Optional policy
+ assert!(metadata1.column_index().is_none());
+ assert!(metadata2.column_index().is_some());
+ }
+
+ #[tokio::test]
+ async fn test_page_index_policy_no_options_uses_preload() {
+ let (meta, store) = get_meta_store_with_page_index().await;
+
+ // Create reader with preload flags set to true
+ let mut reader = ParquetObjectReader::new(store, meta.location)
+ .with_file_size(meta.size)
+ .with_preload_column_index(true)
+ .with_preload_offset_index(true);
+
+ // Get metadata without options - should use reader's preload flags
+ let metadata = reader.get_metadata(None).await.unwrap();
+
+ // With no options provided, preload flags (true) should be respected
+ // and converted to Optional policy internally (preload=true ->
Optional)
+ // The test file has page indexes, so they will be some
+ assert!(metadata.column_index().is_some() &&
metadata.column_index().is_some());
+ }
}