This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new d48010e571 Workaround for missing Parquet page indexes in
`ParquetMetadaReader` (#6450)
d48010e571 is described below
commit d48010e5717a1fb54822ad3a8c41f084cff05c92
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Sep 25 16:55:21 2024 -0700
Workaround for missing Parquet page indexes in `ParquetMetadaReader` (#6450)
* workaround for missing page indexes
* remove empty line
* Apply suggestions from code review
Co-authored-by: Andrew Lamb <[email protected]>
* fmt
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/arrow/arrow_reader/mod.rs | 26 ++++++--------------------
parquet/src/file/metadata/reader.rs | 28 ++++++++++++++++++++++++----
2 files changed, 30 insertions(+), 24 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 253625117c..a109851f72 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -34,9 +34,7 @@ use
crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
use crate::column::page::{PageIterator, PageReader};
use crate::errors::{ParquetError, Result};
-use crate::file::footer;
-use crate::file::metadata::ParquetMetaData;
-use crate::file::page_index::index_reader;
+use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
use crate::file::reader::{ChunkReader, SerializedPageReader};
use crate::schema::types::SchemaDescriptor;
@@ -382,23 +380,9 @@ impl ArrowReaderMetadata {
/// `Self::metadata` is missing the page index, this function will attempt
/// to load the page index by making an object store request.
pub fn load<T: ChunkReader>(reader: &T, options: ArrowReaderOptions) ->
Result<Self> {
- let mut metadata = footer::parse_metadata(reader)?;
- if options.page_index {
- let column_index = metadata
- .row_groups()
- .iter()
- .map(|rg| index_reader::read_columns_indexes(reader,
rg.columns()))
- .collect::<Result<Vec<_>>>()?;
- metadata.set_column_index(Some(column_index));
-
- let offset_index = metadata
- .row_groups()
- .iter()
- .map(|rg| index_reader::read_offset_indexes(reader,
rg.columns()))
- .collect::<Result<Vec<_>>>()?;
-
- metadata.set_offset_index(Some(offset_index))
- }
+ let metadata = ParquetMetaDataReader::new()
+ .with_page_indexes(options.page_index)
+ .parse_and_finish(reader)?;
Self::try_new(Arc::new(metadata), options)
}
@@ -3498,6 +3482,8 @@ mod tests {
.unwrap();
// Although `Vec<Vec<PageLoacation>>` of each row group is empty,
// we should read the file successfully.
+ // FIXME: this test will fail when metadata parsing returns `None`
for missing page
+ // indexes. https://github.com/apache/arrow-rs/issues/6447
assert!(builder.metadata().offset_index().unwrap()[0].is_empty());
let reader = builder.build().unwrap();
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
diff --git a/parquet/src/file/metadata/reader.rs
b/parquet/src/file/metadata/reader.rs
index 333e0fd6e9..4a5a1bc93c 100644
--- a/parquet/src/file/metadata/reader.rs
+++ b/parquet/src/file/metadata/reader.rs
@@ -201,6 +201,7 @@ impl ParquetMetaDataReader {
// need for more data. This is not it's intended use. The plan is
to add a NeedMoreData
// value to the enum, but this would be a breaking change. This
will be done as
// 54.0.0 draws nearer.
+ // https://github.com/apache/arrow-rs/issues/6447
Err(ParquetError::IndexOutOfBound(needed, _)) => {
// If reader is the same length as `file_size` then presumably
there is no more to
// read, so return an EOF error.
@@ -247,13 +248,18 @@ impl ParquetMetaDataReader {
));
}
- // TODO(ets): what is the correct behavior for missing page indexes?
MetadataLoader would
- // leave them as `None`, while the parser in
`index_reader::read_columns_indexes` returns a
- // vector of empty vectors.
- // I think it's best to leave them as `None`.
+ // FIXME: there are differing implementations in the case where page
indexes are missing
+ // from the file. `MetadataLoader` will leave them as `None`, while
the parser in
+ // `index_reader::read_columns_indexes` returns a vector of empty
vectors.
+ // It is best for this function to replicate the latter behavior for
now, but in a future
+ // breaking release, the two paths to retrieve metadata should be made
consistent. Note that this is only
+ // an issue if the user requested page indexes, so there is no need to
provide empty
+ // vectors in `try_parse_sized()`.
+ // https://github.com/apache/arrow-rs/issues/6447
// Get bounds needed for page indexes (if any are present in the file).
let Some(range) = self.range_for_page_index() else {
+ self.empty_page_indexes();
return Ok(());
};
@@ -412,6 +418,20 @@ impl ParquetMetaDataReader {
Ok(())
}
+ /// Set the column_index and offset_indexes to empty `Vec` for backwards
compatibility
+ ///
+ /// See <https://github.com/apache/arrow-rs/pull/6451> for details
+ fn empty_page_indexes(&mut self) {
+ let metadata = self.metadata.as_mut().unwrap();
+ let num_row_groups = metadata.num_row_groups();
+ if self.column_index {
+ metadata.set_column_index(Some(vec![vec![]; num_row_groups]));
+ }
+ if self.offset_index {
+ metadata.set_offset_index(Some(vec![vec![]; num_row_groups]));
+ }
+ }
+
fn range_for_page_index(&self) -> Option<Range<usize>> {
// sanity check
self.metadata.as_ref()?;