This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7f1bae205 Make it clear that StatisticsConverter can not panic (#6187)
7f1bae205 is described below
commit 7f1bae205d94827ec79b7c35bcdc32d1763fb3ef
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Aug 8 07:10:13 2024 -0400
Make it clear that StatisticsConverter can not panic (#6187)
---
parquet/src/arrow/arrow_reader/statistics.rs | 102 ++++++++++++---------------
1 file changed, 45 insertions(+), 57 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs
index 369ea4a47..d487967c2 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -758,7 +758,7 @@ macro_rules! get_data_page_statistics {
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
paste! {
match $data_type {
- Some(DataType::Boolean) => {
+ DataType::Boolean => {
let iterator = [<$stat_type_prefix
BooleanDataPageStatsIterator>]::new($iterator);
let mut builder = BooleanBuilder::new();
for x in iterator {
@@ -772,7 +772,7 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
- Some(DataType::UInt8) => Ok(Arc::new(
+ DataType::UInt8 => Ok(Arc::new(
UInt8Array::from_iter(
[<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -783,7 +783,7 @@ macro_rules! get_data_page_statistics {
.flatten()
)
)),
- Some(DataType::UInt16) => Ok(Arc::new(
+ DataType::UInt16 => Ok(Arc::new(
UInt16Array::from_iter(
[<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -794,7 +794,7 @@ macro_rules! get_data_page_statistics {
.flatten()
)
)),
- Some(DataType::UInt32) => Ok(Arc::new(
+ DataType::UInt32 => Ok(Arc::new(
UInt32Array::from_iter(
[<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -804,7 +804,7 @@ macro_rules! get_data_page_statistics {
})
.flatten()
))),
- Some(DataType::UInt64) => Ok(Arc::new(
+ DataType::UInt64 => Ok(Arc::new(
UInt64Array::from_iter(
[<$stat_type_prefix
Int64DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -814,7 +814,7 @@ macro_rules! get_data_page_statistics {
})
.flatten()
))),
- Some(DataType::Int8) => Ok(Arc::new(
+ DataType::Int8 => Ok(Arc::new(
Int8Array::from_iter(
[<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -825,7 +825,7 @@ macro_rules! get_data_page_statistics {
.flatten()
)
)),
- Some(DataType::Int16) => Ok(Arc::new(
+ DataType::Int16 => Ok(Arc::new(
Int16Array::from_iter(
[<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -836,9 +836,9 @@ macro_rules! get_data_page_statistics {
.flatten()
)
)),
- Some(DataType::Int32) =>
Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::Int64) =>
Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix
Int64DataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::Float16) => Ok(Arc::new(
+ DataType::Int32 =>
Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::Int64 =>
Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix
Int64DataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::Float16 => Ok(Arc::new(
Float16Array::from_iter(
[<$stat_type_prefix
Float16DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -849,11 +849,11 @@ macro_rules! get_data_page_statistics {
.flatten()
)
)),
- Some(DataType::Float32) =>
Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix
Float32DataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::Float64) =>
Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix
Float64DataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::Binary) =>
Ok(Arc::new(BinaryArray::from_iter([<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::LargeBinary) =>
Ok(Arc::new(LargeBinaryArray::from_iter([<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::Utf8) => {
+ DataType::Float32 =>
Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix
Float32DataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::Float64 =>
Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix
Float64DataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::Binary =>
Ok(Arc::new(BinaryArray::from_iter([<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::LargeBinary =>
Ok(Arc::new(LargeBinaryArray::from_iter([<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::Utf8 => {
let mut builder = StringBuilder::new();
let iterator = [<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -873,7 +873,7 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
- Some(DataType::LargeUtf8) => {
+ DataType::LargeUtf8 => {
let mut builder = LargeStringBuilder::new();
let iterator = [<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -893,10 +893,10 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
- Some(DataType::Dictionary(_, value_type)) => {
- [<$stat_type_prefix:lower _ page_statistics>](Some(value_type), $iterator)
+ DataType::Dictionary(_, value_type) => {
+ [<$stat_type_prefix:lower _ page_statistics>](value_type, $iterator)
},
- Some(DataType::Timestamp(unit, timezone)) => {
+ DataType::Timestamp(unit, timezone) => {
let iter = [<$stat_type_prefix
Int64DataPageStatsIterator>]::new($iterator).flatten();
Ok(match unit {
TimeUnit::Second =>
Arc::new(TimestampSecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
@@ -905,8 +905,8 @@ macro_rules! get_data_page_statistics {
TimeUnit::Nanosecond =>
Arc::new(TimestampNanosecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
})
},
- Some(DataType::Date32) =>
Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
- Some(DataType::Date64) => Ok(
+ DataType::Date32 =>
Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
+ DataType::Date64 => Ok(
Arc::new(
Date64Array::from_iter([<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator)
.map(|x| {
@@ -919,11 +919,11 @@ macro_rules! get_data_page_statistics {
)
)
),
- Some(DataType::Decimal128(precision, scale)) => Ok(Arc::new(
+ DataType::Decimal128(precision, scale) => Ok(Arc::new(
Decimal128Array::from_iter([<$stat_type_prefix
Decimal128DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision,
*scale)?)),
- Some(DataType::Decimal256(precision, scale)) => Ok(Arc::new(
+ DataType::Decimal256(precision, scale) => Ok(Arc::new(
Decimal256Array::from_iter([<$stat_type_prefix
Decimal256DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision,
*scale)?)),
- Some(DataType::Time32(unit)) => {
+ DataType::Time32(unit) => {
Ok(match unit {
TimeUnit::Second =>
Arc::new(Time32SecondArray::from_iter(
[<$stat_type_prefix
Int32DataPageStatsIterator>]::new($iterator).flatten(),
@@ -937,7 +937,7 @@ macro_rules! get_data_page_statistics {
}
})
}
- Some(DataType::Time64(unit)) => {
+ DataType::Time64(unit) => {
Ok(match unit {
TimeUnit::Microsecond =>
Arc::new(Time64MicrosecondArray::from_iter(
[<$stat_type_prefix
Int64DataPageStatsIterator>]::new($iterator).flatten(),
@@ -951,7 +951,7 @@ macro_rules! get_data_page_statistics {
}
})
},
- Some(DataType::FixedSizeBinary(size)) => {
+ DataType::FixedSizeBinary(size) => {
let mut builder = FixedSizeBinaryBuilder::new(*size);
let iterator = [<$stat_type_prefix
FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -964,18 +964,13 @@ macro_rules! get_data_page_statistics {
if x.len() == *size as usize {
let _ = builder.append_value(x.data());
} else {
- // log::debug!(
- // "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
- // size,
- // x.len(),
- // );
builder.append_null();
}
}
}
Ok(Arc::new(builder.finish()))
},
- Some(DataType::Utf8View) => {
+ DataType::Utf8View => {
let mut builder = StringViewBuilder::new();
let iterator = [<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -995,7 +990,7 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
- Some(DataType::BinaryView) => {
+ DataType::BinaryView => {
let mut builder = BinaryViewBuilder::new();
let iterator = [<$stat_type_prefix
ByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -1010,23 +1005,22 @@ macro_rules! get_data_page_statistics {
}
Ok(Arc::new(builder.finish()))
},
- Some(DataType::Null) |
- Some(DataType::Duration(_)) |
- Some(DataType::Interval(_)) |
- Some(DataType::List(_)) |
- Some(DataType::ListView(_)) |
- Some(DataType::FixedSizeList(_, _)) |
- Some(DataType::LargeList(_)) |
- Some(DataType::LargeListView(_)) |
- Some(DataType::Struct(_)) |
- Some(DataType::Union(_, _)) |
- Some(DataType::Map(_, _)) |
- Some(DataType::RunEndEncoded(_, _)) => {
+ DataType::Null |
+ DataType::Duration(_) |
+ DataType::Interval(_) |
+ DataType::List(_) |
+ DataType::ListView(_) |
+ DataType::FixedSizeList(_, _) |
+ DataType::LargeList(_) |
+ DataType::LargeListView(_) |
+ DataType::Struct(_) |
+ DataType::Union(_, _) |
+ DataType::Map(_, _) |
+ DataType::RunEndEncoded(_, _) => {
let len = $iterator.count();
// don't know how to extract statistics, so return a null array
- Ok(new_null_array($data_type.unwrap(), len))
+ Ok(new_null_array($data_type, len))
},
- None => unimplemented!() // not sure how to handle this
}
}
}
@@ -1054,10 +1048,7 @@ fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
/// Extracts the min statistics from an iterator
/// of parquet page [`Index`]'es to an [`ArrayRef`]
-pub(crate) fn min_page_statistics<'a, I>(
- data_type: Option<&DataType>,
- iterator: I,
-) -> Result<ArrayRef>
+pub(crate) fn min_page_statistics<'a, I>(data_type: &DataType, iterator: I) -> Result<ArrayRef>
where
I: Iterator<Item = (usize, &'a Index)>,
{
@@ -1066,10 +1057,7 @@ where
/// Extracts the max statistics from an iterator
/// of parquet page [`Index`]'es to an [`ArrayRef`]
-pub(crate) fn max_page_statistics<'a, I>(
- data_type: Option<&DataType>,
- iterator: I,
-) -> Result<ArrayRef>
+pub(crate) fn max_page_statistics<'a, I>(data_type: &DataType, iterator: I) -> Result<ArrayRef>
where
I: Iterator<Item = (usize, &'a Index)>,
{
@@ -1439,7 +1427,7 @@ impl<'a> StatisticsConverter<'a> {
(*num_data_pages, column_page_index_per_row_group_per_column)
});
- min_page_statistics(Some(data_type), iter)
+ min_page_statistics(data_type, iter)
}
/// Extract the maximum values from Data Page statistics.
@@ -1470,7 +1458,7 @@ impl<'a> StatisticsConverter<'a> {
(*num_data_pages, column_page_index_per_row_group_per_column)
});
- max_page_statistics(Some(data_type), iter)
+ max_page_statistics(data_type, iter)
}
/// Returns a [`UInt64Array`] with null counts for each data page.