This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new d8a3b1c95 Don't Duplicate Offset Index on RowGroupMetadata (#4142)
d8a3b1c95 is described below
commit d8a3b1c95d88292985d4f2dc306836b3143b8a6b
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Apr 27 07:29:44 2023 -0400
Don't Duplicate Offset Index on RowGroupMetadata (#4142)
* Remove offset index from RowGroupMetadata
* Rename index accessors
* Update layout test
---
parquet/src/arrow/async_reader/mod.rs | 90 +++++++++++++++--------------------
parquet/src/bin/parquet-index.rs | 4 +-
parquet/src/file/metadata.rs | 60 +++++++++--------------
parquet/src/file/serialized_reader.rs | 85 ++++++++++++++++-----------------
parquet/tests/arrow_writer_layout.rs | 10 ++--
5 files changed, 112 insertions(+), 137 deletions(-)
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index 2d39284c7..a0e7ff72a 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -84,7 +84,7 @@ use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
-use crate::format::OffsetIndex;
+use crate::format::{OffsetIndex, PageLocation};
use bytes::{Buf, Bytes};
use futures::future::{BoxFuture, FutureExt};
use futures::ready;
@@ -237,10 +237,8 @@ impl<T: AsyncFileReader + Send + 'static>
ArrowReaderBuilder<AsyncReader<T>> {
let mut metadata = input.get_metadata().await?;
if options.page_index
- && metadata
- .page_indexes()
- .zip(metadata.offset_indexes())
- .is_none()
+ && metadata.column_index().is_none()
+ && metadata.offset_index().is_none()
{
let mut fetch_ranges = vec![];
let mut index_lengths: Vec<Vec<usize>> = vec![];
@@ -284,7 +282,6 @@ impl<T: AsyncFileReader + Send + 'static>
ArrowReaderBuilder<AsyncReader<T>> {
offset_index.push(offset.page_locations);
}
- rg.set_page_offset(offset_index.clone());
offset_indexes.push(offset_index);
let index_data = chunks.next().unwrap();
@@ -399,11 +396,17 @@ where
// TODO: calling build_array multiple times is wasteful
let meta = self.metadata.row_group(row_group_idx);
+ let page_locations = self
+ .metadata
+ .offset_index()
+ .map(|x| x[row_group_idx].as_slice());
+
let mut row_group = InMemoryRowGroup {
metadata: meta,
// schema: meta.schema_descr_ptr(),
row_count: meta.num_rows() as usize,
column_chunks: vec![None; meta.columns().len()],
+ page_locations,
};
if let Some(filter) = self.filter.as_mut() {
@@ -614,6 +617,7 @@ where
/// An in-memory collection of column chunks
struct InMemoryRowGroup<'a> {
metadata: &'a RowGroupMetaData,
+ page_locations: Option<&'a [Vec<PageLocation>]>,
column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
row_count: usize,
}
@@ -626,9 +630,7 @@ impl<'a> InMemoryRowGroup<'a> {
projection: &ProjectionMask,
selection: Option<&RowSelection>,
) -> Result<()> {
- if let Some((selection, page_locations)) =
- selection.zip(self.metadata.page_offset_index().as_ref())
- {
+ if let Some((selection, page_locations)) = selection.zip(self.page_locations) {
// If we have a `RowSelection` and an `OffsetIndex` then only
fetch pages required for the
// `RowSelection`
let mut page_start_offsets: Vec<Vec<usize>> = vec![];
@@ -730,11 +732,7 @@ impl<'a> RowGroupCollection for InMemoryRowGroup<'a> {
"Invalid column index {i}, column was not fetched"
))),
Some(data) => {
- let page_locations = self
- .metadata
- .page_offset_index()
- .as_ref()
- .map(|index| index[i].clone());
+ let page_locations = self.page_locations.map(|index| index[i].clone());
let page_reader: Box<dyn PageReader> =
Box::new(SerializedPageReader::new(
data.clone(),
@@ -947,19 +945,24 @@ mod tests {
let metadata_with_index = builder.metadata();
// Check offset indexes are present for all columns
- for rg in metadata_with_index.row_groups() {
- let page_locations =
- rg.page_offset_index().expect("expected page offset index");
- assert_eq!(page_locations.len(), rg.columns().len())
- }
+ let offset_index = metadata_with_index.offset_index().unwrap();
+ let column_index = metadata_with_index.column_index().unwrap();
+
+ assert_eq!(offset_index.len(), metadata_with_index.num_row_groups());
+ assert_eq!(column_index.len(), metadata_with_index.num_row_groups());
+
+ let num_columns = metadata_with_index
+ .file_metadata()
+ .schema_descr()
+ .num_columns();
// Check page indexes are present for all columns
- let page_indexes = metadata_with_index
- .page_indexes()
- .expect("expected page indexes");
- for (idx, rg) in metadata_with_index.row_groups().iter().enumerate() {
- assert_eq!(page_indexes[idx].len(), rg.columns().len())
- }
+ offset_index
+ .iter()
+ .for_each(|x| assert_eq!(x.len(), num_columns));
+ column_index
+ .iter()
+ .for_each(|x| assert_eq!(x.len(), num_columns));
let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1,
2]);
let stream = builder
@@ -999,29 +1002,9 @@ mod tests {
requests: Default::default(),
};
- let options = ArrowReaderOptions::new().with_page_index(true);
- let builder =
- ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
- .await
- .unwrap();
-
- // The builder should have page and offset indexes loaded now
- let metadata_with_index = builder.metadata();
-
- // Check offset indexes are present for all columns
- for rg in metadata_with_index.row_groups() {
- let page_locations =
- rg.page_offset_index().expect("expected page offset index");
- assert_eq!(page_locations.len(), rg.columns().len())
- }
-
- // Check page indexes are present for all columns
- let page_indexes = metadata_with_index
- .page_indexes()
- .expect("expected page indexes");
- for (idx, rg) in metadata_with_index.row_groups().iter().enumerate() {
- assert_eq!(page_indexes[idx].len(), rg.columns().len())
- }
+ let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
+ .await
+ .unwrap();
let mask = ProjectionMask::leaves(builder.parquet_schema(), vec![1,
2]);
let stream = builder
@@ -1473,10 +1456,13 @@ mod tests {
index_reader::read_pages_locations(&data,
metadata.row_group(0).columns())
.expect("reading offset index");
- let mut row_group_meta = metadata.row_group(0).clone();
- row_group_meta.set_page_offset(offset_index.clone());
- let metadata =
- ParquetMetaData::new(metadata.file_metadata().clone(), vec![row_group_meta]);
+ let row_group_meta = metadata.row_group(0).clone();
+ let metadata = ParquetMetaData::new_with_page_index(
+ metadata.file_metadata().clone(),
+ vec![row_group_meta],
+ None,
+ Some(vec![offset_index.clone()]),
+ );
let metadata = Arc::new(metadata);
diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs
index d8a72dd79..4b82c2196 100644
--- a/parquet/src/bin/parquet-index.rs
+++ b/parquet/src/bin/parquet-index.rs
@@ -70,13 +70,13 @@ impl Args {
// Column index data for all row groups and columns
let column_index = reader
.metadata()
- .page_indexes()
+ .column_index()
.ok_or_else(|| ParquetError::General("Column index not
found".to_string()))?;
// Offset index data for all row groups and columns
let offset_index = reader
.metadata()
- .offset_indexes()
+ .offset_index()
.ok_or_else(|| ParquetError::General("Offset index not
found".to_string()))?;
// Iterate through each row group
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index a83f02dfd..41097e107 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -77,9 +77,9 @@ pub struct ParquetMetaData {
file_metadata: FileMetaData,
row_groups: Vec<RowGroupMetaData>,
/// Page index for all pages in each column chunk
- page_indexes: Option<ParquetColumnIndex>,
+ column_index: Option<ParquetColumnIndex>,
/// Offset index for all pages in each column chunk
- offset_indexes: Option<ParquetOffsetIndex>,
+ offset_index: Option<ParquetOffsetIndex>,
}
impl ParquetMetaData {
@@ -89,8 +89,8 @@ impl ParquetMetaData {
ParquetMetaData {
file_metadata,
row_groups,
- page_indexes: None,
- offset_indexes: None,
+ column_index: None,
+ offset_index: None,
}
}
@@ -99,14 +99,14 @@ impl ParquetMetaData {
pub fn new_with_page_index(
file_metadata: FileMetaData,
row_groups: Vec<RowGroupMetaData>,
- page_indexes: Option<ParquetColumnIndex>,
- offset_indexes: Option<ParquetOffsetIndex>,
+ column_index: Option<ParquetColumnIndex>,
+ offset_index: Option<ParquetOffsetIndex>,
) -> Self {
ParquetMetaData {
file_metadata,
row_groups,
- page_indexes,
- offset_indexes,
+ column_index,
+ offset_index,
}
}
@@ -132,13 +132,25 @@ impl ParquetMetaData {
}
/// Returns page indexes in this file.
+ #[deprecated(note = "Use Self::column_index")]
pub fn page_indexes(&self) -> Option<&ParquetColumnIndex> {
- self.page_indexes.as_ref()
+ self.column_index.as_ref()
}
- /// Returns offset indexes in this file.
+ /// Returns the column index for this file if loaded
+ pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
+ self.column_index.as_ref()
+ }
+
+ /// Returns the offset index for this file if loaded
+ #[deprecated(note = "Use Self::offset_index")]
pub fn offset_indexes(&self) -> Option<&ParquetOffsetIndex> {
- self.offset_indexes.as_ref()
+ self.offset_index.as_ref()
+ }
+
+ /// Returns offset indexes in this file.
+ pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
+ self.offset_index.as_ref()
}
}
@@ -252,8 +264,6 @@ pub struct RowGroupMetaData {
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
- /// `page_offset_index[column_number][page_number]`
- page_offset_index: Option<Vec<Vec<PageLocation>>>,
}
impl RowGroupMetaData {
@@ -297,13 +307,6 @@ impl RowGroupMetaData {
self.columns.iter().map(|c| c.total_compressed_size).sum()
}
- /// Returns reference of page offset index of all column in this row group.
- ///
- /// The returned vector contains `page_offset[column_number][page_number]`
- pub fn page_offset_index(&self) -> Option<&Vec<Vec<PageLocation>>> {
- self.page_offset_index.as_ref()
- }
-
/// Returns reference to a schema descriptor.
pub fn schema_descr(&self) -> &SchemaDescriptor {
self.schema_descr.as_ref()
@@ -314,13 +317,6 @@ impl RowGroupMetaData {
self.schema_descr.clone()
}
- /// Sets page offset index for this row group.
- ///
- /// The vector represents `page_offset[column_number][page_number]`
- pub fn set_page_offset(&mut self, page_offset: Vec<Vec<PageLocation>>) {
- self.page_offset_index = Some(page_offset);
- }
-
/// Method to convert from Thrift.
pub fn from_thrift(
schema_descr: SchemaDescPtr,
@@ -341,7 +337,6 @@ impl RowGroupMetaData {
sorting_columns,
total_byte_size,
schema_descr,
- page_offset_index: None,
})
}
@@ -366,7 +361,6 @@ pub struct RowGroupMetaDataBuilder {
num_rows: i64,
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
- page_offset_index: Option<Vec<Vec<PageLocation>>>,
}
impl RowGroupMetaDataBuilder {
@@ -378,7 +372,6 @@ impl RowGroupMetaDataBuilder {
num_rows: 0,
sorting_columns: None,
total_byte_size: 0,
- page_offset_index: None,
}
}
@@ -406,12 +399,6 @@ impl RowGroupMetaDataBuilder {
self
}
- /// Sets page offset index for this row group.
- pub fn set_page_offset(mut self, page_offset: Vec<Vec<PageLocation>>) -> Self {
- self.page_offset_index = Some(page_offset);
- self
- }
-
/// Builds row group metadata.
pub fn build(self) -> Result<RowGroupMetaData> {
if self.schema_descr.num_columns() != self.columns.len() {
@@ -428,7 +415,6 @@ impl RowGroupMetaDataBuilder {
sorting_columns: self.sorting_columns,
total_byte_size: self.total_byte_size,
schema_descr: self.schema_descr,
- page_offset_index: self.page_offset_index,
})
}
}
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 2ddbf0f7c..7346b1a12 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -269,7 +269,6 @@ impl<R: 'static + ChunkReader> SerializedFileReader<R> {
index_reader::read_columns_indexes(&chunk_reader,
rg.columns())?;
let offset_index =
index_reader::read_pages_locations(&chunk_reader,
rg.columns())?;
- rg.set_page_offset(offset_index.clone());
columns_indexes.push(column_index);
offset_indexes.push(offset_index);
}
@@ -328,9 +327,10 @@ impl<R: 'static + ChunkReader> FileReader for
SerializedFileReader<R> {
// Row groups should be processed sequentially.
let props = Arc::clone(&self.props);
let f = Arc::clone(&self.chunk_reader);
- Ok(Box::new(SerializedRowGroupReader::new_with_properties(
+ Ok(Box::new(SerializedRowGroupReader::new(
f,
row_group_metadata,
+ self.metadata.offset_index().map(|x| x[i].as_slice()),
props,
)?))
}
@@ -344,15 +344,17 @@ impl<R: 'static + ChunkReader> FileReader for
SerializedFileReader<R> {
pub struct SerializedRowGroupReader<'a, R: ChunkReader> {
chunk_reader: Arc<R>,
metadata: &'a RowGroupMetaData,
+ page_locations: Option<&'a [Vec<PageLocation>]>,
props: ReaderPropertiesPtr,
bloom_filters: Vec<Option<Sbbf>>,
}
impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> {
/// Creates new row group reader from a file, row group metadata and
custom config.
- fn new_with_properties(
+ fn new(
chunk_reader: Arc<R>,
metadata: &'a RowGroupMetaData,
+ page_locations: Option<&'a [Vec<PageLocation>]>,
props: ReaderPropertiesPtr,
) -> Result<Self> {
let bloom_filters = if props.read_bloom_filter() {
@@ -367,6 +369,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> {
Ok(Self {
chunk_reader,
metadata,
+ page_locations,
props,
bloom_filters,
})
@@ -386,11 +389,7 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for
SerializedRowGroupReader<'
fn get_column_page_reader(&self, i: usize) -> Result<Box<dyn PageReader>> {
let col = self.metadata.column(i);
- let page_locations = self
- .metadata
- .page_offset_index()
- .as_ref()
- .map(|x| x[i].clone());
+ let page_locations = self.page_locations.map(|x| x[i].clone());
let props = Arc::clone(&self.props);
Ok(Box::new(SerializedPageReader::new_with_properties(
@@ -1350,11 +1349,11 @@ mod tests {
let metadata = reader.metadata();
assert_eq!(metadata.num_row_groups(), 1);
- let page_indexes = metadata.page_indexes().unwrap();
+ let column_index = metadata.column_index().unwrap();
// only one row group
- assert_eq!(page_indexes.len(), 1);
- let index = if let Index::BYTE_ARRAY(index) = &page_indexes[0][0] {
+ assert_eq!(column_index.len(), 1);
+ let index = if let Index::BYTE_ARRAY(index) = &column_index[0][0] {
index
} else {
unreachable!()
@@ -1372,7 +1371,7 @@ mod tests {
assert_eq!(b"Hello", min.as_bytes());
assert_eq!(b"today", max.as_bytes());
- let offset_indexes = metadata.offset_indexes().unwrap();
+ let offset_indexes = metadata.offset_index().unwrap();
// only one row group
assert_eq!(offset_indexes.len(), 1);
let offset_index = &offset_indexes[0];
@@ -1396,19 +1395,19 @@ mod tests {
let metadata = reader.metadata();
assert_eq!(metadata.num_row_groups(), 1);
- let page_indexes = metadata.page_indexes().unwrap();
- let row_group_offset_indexes = &metadata.offset_indexes().unwrap()[0];
+ let column_index = metadata.column_index().unwrap();
+ let row_group_offset_indexes = &metadata.offset_index().unwrap()[0];
// only one row group
- assert_eq!(page_indexes.len(), 1);
+ assert_eq!(column_index.len(), 1);
let row_group_metadata = metadata.row_group(0);
//col0->id: INT32 UNCOMPRESSED DO:0 FPO:4 SZ:37325/37325/1.00 VC:7300
ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 7299, num_nulls: 0]
- assert!(!&page_indexes[0][0].is_sorted());
- let boundary_order = &page_indexes[0][0].get_boundary_order();
+ assert!(!&column_index[0][0].is_sorted());
+ let boundary_order = &column_index[0][0].get_boundary_order();
assert!(boundary_order.is_some());
matches!(boundary_order.unwrap(), BoundaryOrder::UNORDERED);
- if let Index::INT32(index) = &page_indexes[0][0] {
+ if let Index::INT32(index) = &column_index[0][0] {
check_native_page_index(
index,
325,
@@ -1420,16 +1419,16 @@ mod tests {
unreachable!()
};
//col1->bool_col:BOOLEAN UNCOMPRESSED DO:0 FPO:37329 SZ:3022/3022/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: false, max: true, num_nulls: 0]
- assert!(&page_indexes[0][1].is_sorted());
- if let Index::BOOLEAN(index) = &page_indexes[0][1] {
+ assert!(&column_index[0][1].is_sorted());
+ if let Index::BOOLEAN(index) = &column_index[0][1] {
assert_eq!(index.indexes.len(), 82);
assert_eq!(row_group_offset_indexes[1].len(), 82);
} else {
unreachable!()
};
//col2->tinyint_col: INT32 UNCOMPRESSED DO:0 FPO:40351
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
- assert!(&page_indexes[0][2].is_sorted());
- if let Index::INT32(index) = &page_indexes[0][2] {
+ assert!(&column_index[0][2].is_sorted());
+ if let Index::INT32(index) = &column_index[0][2] {
check_native_page_index(
index,
325,
@@ -1441,8 +1440,8 @@ mod tests {
unreachable!()
};
//col4->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
- assert!(&page_indexes[0][3].is_sorted());
- if let Index::INT32(index) = &page_indexes[0][3] {
+ assert!(&column_index[0][3].is_sorted());
+ if let Index::INT32(index) = &column_index[0][3] {
check_native_page_index(
index,
325,
@@ -1454,8 +1453,8 @@ mod tests {
unreachable!()
};
//col5->smallint_col: INT32 UNCOMPRESSED DO:0 FPO:77676
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
- assert!(&page_indexes[0][4].is_sorted());
- if let Index::INT32(index) = &page_indexes[0][4] {
+ assert!(&column_index[0][4].is_sorted());
+ if let Index::INT32(index) = &column_index[0][4] {
check_native_page_index(
index,
325,
@@ -1467,8 +1466,8 @@ mod tests {
unreachable!()
};
//col6->bigint_col: INT64 UNCOMPRESSED DO:0 FPO:152326
SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 90,
num_nulls: 0]
- assert!(!&page_indexes[0][5].is_sorted());
- if let Index::INT64(index) = &page_indexes[0][5] {
+ assert!(!&column_index[0][5].is_sorted());
+ if let Index::INT64(index) = &column_index[0][5] {
check_native_page_index(
index,
528,
@@ -1480,8 +1479,8 @@ mod tests {
unreachable!()
};
//col7->float_col: FLOAT UNCOMPRESSED DO:0 FPO:223924
SZ:37325/37325/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max: 9.9,
num_nulls: 0]
- assert!(&page_indexes[0][6].is_sorted());
- if let Index::FLOAT(index) = &page_indexes[0][6] {
+ assert!(&column_index[0][6].is_sorted());
+ if let Index::FLOAT(index) = &column_index[0][6] {
check_native_page_index(
index,
325,
@@ -1493,8 +1492,8 @@ mod tests {
unreachable!()
};
//col8->double_col: DOUBLE UNCOMPRESSED DO:0 FPO:261249
SZ:71598/71598/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: -0.0, max:
90.89999999999999, num_nulls: 0]
- assert!(!&page_indexes[0][7].is_sorted());
- if let Index::DOUBLE(index) = &page_indexes[0][7] {
+ assert!(!&column_index[0][7].is_sorted());
+ if let Index::DOUBLE(index) = &column_index[0][7] {
check_native_page_index(
index,
528,
@@ -1506,8 +1505,8 @@ mod tests {
unreachable!()
};
//col9->date_string_col: BINARY UNCOMPRESSED DO:0 FPO:332847
SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 01/01/09, max:
12/31/10, num_nulls: 0]
- assert!(!&page_indexes[0][8].is_sorted());
- if let Index::BYTE_ARRAY(index) = &page_indexes[0][8] {
+ assert!(!&column_index[0][8].is_sorted());
+ if let Index::BYTE_ARRAY(index) = &column_index[0][8] {
check_native_page_index(
index,
974,
@@ -1519,8 +1518,8 @@ mod tests {
unreachable!()
};
//col10->string_col: BINARY UNCOMPRESSED DO:0 FPO:444795
SZ:45298/45298/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 0, max: 9,
num_nulls: 0]
- assert!(&page_indexes[0][9].is_sorted());
- if let Index::BYTE_ARRAY(index) = &page_indexes[0][9] {
+ assert!(&column_index[0][9].is_sorted());
+ if let Index::BYTE_ARRAY(index) = &column_index[0][9] {
check_native_page_index(
index,
352,
@@ -1533,15 +1532,15 @@ mod tests {
};
//col11->timestamp_col: INT96 UNCOMPRESSED DO:0 FPO:490093
SZ:111948/111948/1.00 VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 0,
min/max not defined]
//Notice: min_max values for each page for this col not exits.
- assert!(!&page_indexes[0][10].is_sorted());
- if let Index::NONE = &page_indexes[0][10] {
+ assert!(!&column_index[0][10].is_sorted());
+ if let Index::NONE = &column_index[0][10] {
assert_eq!(row_group_offset_indexes[10].len(), 974);
} else {
unreachable!()
};
//col12->year: INT32 UNCOMPRESSED DO:0 FPO:602041 SZ:37325/37325/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 2009, max: 2010, num_nulls: 0]
- assert!(&page_indexes[0][11].is_sorted());
- if let Index::INT32(index) = &page_indexes[0][11] {
+ assert!(&column_index[0][11].is_sorted());
+ if let Index::INT32(index) = &column_index[0][11] {
check_native_page_index(
index,
325,
@@ -1553,8 +1552,8 @@ mod tests {
unreachable!()
};
//col13->month: INT32 UNCOMPRESSED DO:0 FPO:639366 SZ:37325/37325/1.00
VC:7300 ENC:BIT_PACKED,RLE,PLAIN ST:[min: 1, max: 12, num_nulls: 0]
- assert!(!&page_indexes[0][12].is_sorted());
- if let Index::INT32(index) = &page_indexes[0][12] {
+ assert!(!&column_index[0][12].is_sorted());
+ if let Index::INT32(index) = &column_index[0][12] {
check_native_page_index(
index,
325,
@@ -1768,7 +1767,7 @@ mod tests {
let b = Bytes::from(out);
let options = ReadOptionsBuilder::new().with_page_index().build();
let reader = SerializedFileReader::new_with_options(b,
options).unwrap();
- let index = reader.metadata().page_indexes().unwrap();
+ let index = reader.metadata().column_index().unwrap();
// 1 row group
assert_eq!(index.len(), 1);
diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs
index 0c66fcd10..4bf649f24 100644
--- a/parquet/tests/arrow_writer_layout.rs
+++ b/parquet/tests/arrow_writer_layout.rs
@@ -78,10 +78,14 @@ fn do_test(test: LayoutTest) {
fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout)
{
assert_eq!(meta.row_groups().len(), layout.row_groups.len());
- for (row_group, row_group_layout) in meta.row_groups().iter().zip(&layout.row_groups)
- {
+ let iter = meta
+ .row_groups()
+ .iter()
+ .zip(&layout.row_groups)
+ .zip(meta.offset_index().unwrap());
+
+ for ((row_group, row_group_layout), offset_index) in iter {
// Check against offset index
- let offset_index = row_group.page_offset_index().unwrap();
assert_eq!(offset_index.len(), row_group_layout.columns.len());
for (column_index, column_layout) in