This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ee5694078c fix panic in `ParquetMetadata::memory_size`: check
has_min_max_set before invoking min()/max() (#6092)
ee5694078c is described below
commit ee5694078c86c8201549654246900a4232d531a9
Author: Fischer <[email protected]>
AuthorDate: Sat Jul 20 19:01:01 2024 +0800
fix panic in `ParquetMetadata::memory_size`: check has_min_max_set before
invoking min()/max() (#6092)
* fix: check has_min_max_set before invoking min()/max()
* chore: add unit test for statistics heap size
* Fixup test
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/file/metadata/memory.rs | 9 ++++++++-
parquet/src/file/metadata/mod.rs | 32 ++++++++++++++++++++++++++++----
2 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs
index 57b2f7eec0..57d5aaa2dd 100644
--- a/parquet/src/file/metadata/memory.rs
+++ b/parquet/src/file/metadata/memory.rs
@@ -173,7 +173,14 @@ impl<T: ParquetValueType> HeapSize for PageIndex<T> {
impl<T: ParquetValueType> HeapSize for ValueStatistics<T> {
fn heap_size(&self) -> usize {
- self.min().heap_size() + self.max().heap_size()
+ if self.has_min_max_set() {
+ return self.min().heap_size() + self.max().heap_size();
+ } else if self.min_is_exact() {
+ return self.min().heap_size();
+ } else if self.max_is_exact() {
+ return self.max().heap_size();
+ }
+ 0
}
}
impl HeapSize for bool {
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index 1d338bb8ee..9f2084a35d 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -1291,7 +1291,11 @@ mod tests {
let columns = schema_descr
.columns()
.iter()
- .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
+ .map(|column_descr| {
+ ColumnChunkMetaData::builder(column_descr.clone())
+ .set_statistics(Statistics::new::<i32>(None, None, None, 0, false))
+ .build()
+ })
.collect::<Result<Vec<_>>>()
.unwrap();
let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
@@ -1317,11 +1321,31 @@ mod tests {
num_rows,
created_by,
key_value_metadata,
- schema_descr,
+ schema_descr.clone(),
column_orders,
);
- let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta.clone());
- let base_expected_size = 1320;
+
+ // Now, add in Exact Statistics
+ let columns_with_stats = schema_descr
+ .columns()
+ .iter()
+ .map(|column_descr| {
+ ColumnChunkMetaData::builder(column_descr.clone())
+ .set_statistics(Statistics::new::<i32>(Some(0), Some(100), None, 0, false))
+ .build()
+ })
+ .collect::<Result<Vec<_>>>()
+ .unwrap();
+
+ let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
+ .set_num_rows(1000)
+ .set_column_metadata(columns_with_stats)
+ .build()
+ .unwrap();
+ let row_group_meta_with_stats = vec![row_group_meta_with_stats];
+
+ let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats);
+ let base_expected_size = 2024;
assert_eq!(parquet_meta.memory_size(), base_expected_size);
let mut column_index = ColumnIndexBuilder::new();