This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 73a0c2661 Return `None` when Parquet page indexes are not present in
file (#6639)
73a0c2661 is described below
commit 73a0c26615623dc3b5e63166e9ba3e6eeb2001a5
Author: Ed Seidl <[email protected]>
AuthorDate: Sun Nov 24 15:08:20 2024 -0800
Return `None` when Parquet page indexes are not present in file (#6639)
* return none for missing page indexes
* return option from page index read functions
* update docs
---
parquet/src/arrow/arrow_reader/mod.rs | 4 +--
parquet/src/arrow/arrow_writer/mod.rs | 2 +-
parquet/src/arrow/async_reader/mod.rs | 6 ++--
parquet/src/file/metadata/reader.rs | 15 ---------
parquet/src/file/page_index/index_reader.rs | 48 ++++++++++++++++-------------
parquet/src/file/serialized_reader.rs | 22 +++++++------
6 files changed, 42 insertions(+), 55 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index d3709c03e..e25b42fb3 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3584,9 +3584,7 @@ mod tests {
.unwrap();
// Although `Vec<Vec<PageLoacation>>` of each row group is empty,
// we should read the file successfully.
- // FIXME: this test will fail when metadata parsing returns `None`
for missing page
- // indexes. https://github.com/apache/arrow-rs/issues/6447
- assert!(builder.metadata().offset_index().unwrap()[0].is_empty());
+ assert!(builder.metadata().offset_index().is_none());
let reader = builder.build().unwrap();
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
assert_eq!(batches.len(), 1);
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index 99d54eef3..115b9a2f1 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1743,7 +1743,7 @@ mod tests {
"Expected a dictionary page"
);
- let offset_indexes = read_offset_indexes(&file, column).unwrap();
+ let offset_indexes = read_offset_indexes(&file,
column).unwrap().unwrap();
let page_locations = offset_indexes[0].page_locations.clone();
diff --git a/parquet/src/arrow/async_reader/mod.rs
b/parquet/src/arrow/async_reader/mod.rs
index cee5f11dd..e1b8b6a11 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -928,7 +928,6 @@ mod tests {
use crate::arrow::schema::parquet_to_arrow_schema_and_fields;
use crate::arrow::ArrowWriter;
use crate::file::metadata::ParquetMetaDataReader;
- use crate::file::page_index::index_reader;
use crate::file::properties::WriterProperties;
use arrow::compute::kernels::cmp::eq;
use arrow::error::Result as ArrowResult;
@@ -1566,12 +1565,11 @@ mod tests {
let data = Bytes::from(std::fs::read(path).unwrap());
let metadata = ParquetMetaDataReader::new()
+ .with_page_indexes(true)
.parse_and_finish(&data)
.unwrap();
- let offset_index =
- index_reader::read_offset_indexes(&data,
metadata.row_group(0).columns())
- .expect("reading offset index");
+ let offset_index = metadata.offset_index().expect("reading offset
index")[0].clone();
let mut metadata_builder = metadata.into_builder();
let mut row_groups = metadata_builder.take_row_groups();
diff --git a/parquet/src/file/metadata/reader.rs
b/parquet/src/file/metadata/reader.rs
index 9f8be2299..ec2cd1094 100644
--- a/parquet/src/file/metadata/reader.rs
+++ b/parquet/src/file/metadata/reader.rs
@@ -303,7 +303,6 @@ impl ParquetMetaDataReader {
// Get bounds needed for page indexes (if any are present in the file).
let Some(range) = self.range_for_page_index() else {
- self.empty_page_indexes();
return Ok(());
};
@@ -477,20 +476,6 @@ impl ParquetMetaDataReader {
Ok(())
}
- /// Set the column_index and offset_indexes to empty `Vec` for backwards
compatibility
- ///
- /// See <https://github.com/apache/arrow-rs/pull/6451> for details
- fn empty_page_indexes(&mut self) {
- let metadata = self.metadata.as_mut().unwrap();
- let num_row_groups = metadata.num_row_groups();
- if self.column_index {
- metadata.set_column_index(Some(vec![vec![]; num_row_groups]));
- }
- if self.offset_index {
- metadata.set_offset_index(Some(vec![vec![]; num_row_groups]));
- }
- }
-
fn range_for_page_index(&self) -> Option<Range<usize>> {
// sanity check
self.metadata.as_ref()?;
diff --git a/parquet/src/file/page_index/index_reader.rs
b/parquet/src/file/page_index/index_reader.rs
index 395e9afe1..fd3639ac3 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -43,8 +43,7 @@ pub(crate) fn acc_range(a: Option<Range<usize>>, b:
Option<Range<usize>>) -> Opt
///
/// Returns a vector of `index[column_number]`.
///
-/// Returns an empty vector if this row group does not contain a
-/// [`ColumnIndex`].
+/// Returns `None` if this row group does not contain a [`ColumnIndex`].
///
/// See [Page Index Documentation] for more details.
///
@@ -52,26 +51,29 @@ pub(crate) fn acc_range(a: Option<Range<usize>>, b:
Option<Range<usize>>) -> Opt
pub fn read_columns_indexes<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
-) -> Result<Vec<Index>, ParquetError> {
+) -> Result<Option<Vec<Index>>, ParquetError> {
let fetch = chunks
.iter()
.fold(None, |range, c| acc_range(range, c.column_index_range()));
let fetch = match fetch {
Some(r) => r,
- None => return Ok(vec![Index::NONE; chunks.len()]),
+ None => return Ok(None),
};
let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?;
let get = |r: Range<usize>| &bytes[(r.start - fetch.start)..(r.end -
fetch.start)];
- chunks
- .iter()
- .map(|c| match c.column_index_range() {
- Some(r) => decode_column_index(get(r), c.column_type()),
- None => Ok(Index::NONE),
- })
- .collect()
+ Some(
+ chunks
+ .iter()
+ .map(|c| match c.column_index_range() {
+ Some(r) => decode_column_index(get(r), c.column_type()),
+ None => Ok(Index::NONE),
+ })
+ .collect(),
+ )
+ .transpose()
}
/// Reads [`OffsetIndex`], per-page [`PageLocation`] for all columns of a row
@@ -116,8 +118,7 @@ pub fn read_pages_locations<R: ChunkReader>(
///
/// Returns a vector of `offset_index[column_number]`.
///
-/// Returns an empty vector if this row group does not contain an
-/// [`OffsetIndex`].
+/// Returns `None` if this row group does not contain an [`OffsetIndex`].
///
/// See [Page Index Documentation] for more details.
///
@@ -125,26 +126,29 @@ pub fn read_pages_locations<R: ChunkReader>(
pub fn read_offset_indexes<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
-) -> Result<Vec<OffsetIndexMetaData>, ParquetError> {
+) -> Result<Option<Vec<OffsetIndexMetaData>>, ParquetError> {
let fetch = chunks
.iter()
.fold(None, |range, c| acc_range(range, c.offset_index_range()));
let fetch = match fetch {
Some(r) => r,
- None => return Ok(vec![]),
+ None => return Ok(None),
};
let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?;
let get = |r: Range<usize>| &bytes[(r.start - fetch.start)..(r.end -
fetch.start)];
- chunks
- .iter()
- .map(|c| match c.offset_index_range() {
- Some(r) => decode_offset_index(get(r)),
- None => Err(general_err!("missing offset index")),
- })
- .collect()
+ Some(
+ chunks
+ .iter()
+ .map(|c| match c.offset_index_range() {
+ Some(r) => decode_offset_index(get(r)),
+ None => Err(general_err!("missing offset index")),
+ })
+ .collect(),
+ )
+ .transpose()
}
pub(crate) fn decode_offset_index(data: &[u8]) -> Result<OffsetIndexMetaData,
ParquetError> {
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index 3262d1fba..06f3cf9fb 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1223,8 +1223,8 @@ mod tests {
let reader = SerializedFileReader::new_with_options(test_file,
read_options)?;
let metadata = reader.metadata();
assert_eq!(metadata.num_row_groups(), 0);
- assert_eq!(metadata.column_index().unwrap().len(), 0);
- assert_eq!(metadata.offset_index().unwrap().len(), 0);
+ assert!(metadata.column_index().is_none());
+ assert!(metadata.offset_index().is_none());
// false, true predicate
let test_file = get_test_file("alltypes_tiny_pages.parquet");
@@ -1236,8 +1236,8 @@ mod tests {
let reader = SerializedFileReader::new_with_options(test_file,
read_options)?;
let metadata = reader.metadata();
assert_eq!(metadata.num_row_groups(), 0);
- assert_eq!(metadata.column_index().unwrap().len(), 0);
- assert_eq!(metadata.offset_index().unwrap().len(), 0);
+ assert!(metadata.column_index().is_none());
+ assert!(metadata.offset_index().is_none());
// false, false predicate
let test_file = get_test_file("alltypes_tiny_pages.parquet");
@@ -1249,8 +1249,8 @@ mod tests {
let reader = SerializedFileReader::new_with_options(test_file,
read_options)?;
let metadata = reader.metadata();
assert_eq!(metadata.num_row_groups(), 0);
- assert_eq!(metadata.column_index().unwrap().len(), 0);
- assert_eq!(metadata.offset_index().unwrap().len(), 0);
+ assert!(metadata.column_index().is_none());
+ assert!(metadata.offset_index().is_none());
Ok(())
}
@@ -1340,13 +1340,15 @@ mod tests {
let columns = metadata.row_group(0).columns();
let reversed: Vec<_> = columns.iter().cloned().rev().collect();
- let a = read_columns_indexes(&test_file, columns).unwrap();
- let mut b = read_columns_indexes(&test_file, &reversed).unwrap();
+ let a = read_columns_indexes(&test_file, columns).unwrap().unwrap();
+ let mut b = read_columns_indexes(&test_file, &reversed)
+ .unwrap()
+ .unwrap();
b.reverse();
assert_eq!(a, b);
- let a = read_offset_indexes(&test_file, columns).unwrap();
- let mut b = read_offset_indexes(&test_file, &reversed).unwrap();
+ let a = read_offset_indexes(&test_file, columns).unwrap().unwrap();
+ let mut b = read_offset_indexes(&test_file,
&reversed).unwrap().unwrap();
b.reverse();
assert_eq!(a, b);
}