Dandandan commented on code in PR #9303:
URL: https://github.com/apache/arrow-rs/pull/9303#discussion_r2749489138
##########
parquet/src/arrow/arrow_reader/statistics.rs:
##########
@@ -596,473 +600,572 @@ macro_rules! get_statistics {
}}}
}
-macro_rules! make_data_page_stats_iterator {
- ($iterator_type: ident, $func: ident, $stat_value_type: ty) => {
- struct $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- iter: I,
- }
-
- impl<'a, I> $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- fn new(iter: I) -> Self {
- Self { iter }
- }
- }
-
- impl<'a, I> Iterator for $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- type Item = Vec<Option<$stat_value_type>>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let next = self.iter.next();
- match next {
- Some((len, index)) => match index {
- // No matching `Index` found;
- // thus no statistics that can be extracted.
- // We return vec![None; len] to effectively
- // create an arrow null-array with the length
- // corresponding to the number of entries in
- // `ParquetOffsetIndex` per row group per column.
- ColumnIndexMetaData::NONE => Some(vec![None; len]),
- _ =>
Some(<$stat_value_type>::$func(&index).collect::<Vec<_>>()),
- },
- _ => None,
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
- }
- };
-}
-
-make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator,
min_values_iter, bool);
-make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator,
max_values_iter, bool);
-make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter,
i32);
-make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter,
i32);
-make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter,
i64);
-make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter,
i64);
-make_data_page_stats_iterator!(
- MinFloat16DataPageStatsIterator,
- min_values_iter,
- FixedLenByteArray
-);
-make_data_page_stats_iterator!(
- MaxFloat16DataPageStatsIterator,
- max_values_iter,
- FixedLenByteArray
-);
-make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator,
min_values_iter, f32);
-make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator,
max_values_iter, f32);
-make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator,
min_values_iter, f64);
-make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator,
max_values_iter, f64);
-make_data_page_stats_iterator!(
- MinByteArrayDataPageStatsIterator,
- min_values_iter,
- ByteArray
-);
-make_data_page_stats_iterator!(
- MaxByteArrayDataPageStatsIterator,
- max_values_iter,
- ByteArray
-);
-make_data_page_stats_iterator!(
- MaxFixedLenByteArrayDataPageStatsIterator,
- max_values_iter,
- FixedLenByteArray
-);
-
-make_data_page_stats_iterator!(
- MinFixedLenByteArrayDataPageStatsIterator,
- min_values_iter,
- FixedLenByteArray
-);
-
-macro_rules! get_decimal_page_stats_iterator {
- ($iterator_type: ident, $func: ident, $stat_value_type: ident,
$convert_func: ident) => {
- struct $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- iter: I,
- }
-
- impl<'a, I> $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- fn new(iter: I) -> Self {
- Self { iter }
- }
- }
-
- impl<'a, I> Iterator for $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- type Item = Vec<Option<$stat_value_type>>;
-
- // Some(native_index.$func().map(|v|
v.map($conv)).collect::<Vec<_>>())
- fn next(&mut self) -> Option<Self::Item> {
- let next = self.iter.next();
- match next {
- Some((len, index)) => match index {
- ColumnIndexMetaData::INT32(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $stat_value_type::from(*x)))
- .collect::<Vec<_>>(),
- ),
- ColumnIndexMetaData::INT64(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x|
$stat_value_type::try_from(*x).unwrap()))
- .collect::<Vec<_>>(),
- ),
- ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $convert_func(x)))
- .collect::<Vec<_>>(),
- ),
-
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $convert_func(x)))
- .collect::<Vec<_>>(),
- ),
- _ => Some(vec![None; len]),
- },
- _ => None,
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
- }
- };
-}
-
-get_decimal_page_stats_iterator!(
- MinDecimal32DataPageStatsIterator,
- min_values_iter,
- i32,
- from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal32DataPageStatsIterator,
- max_values_iter,
- i32,
- from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
- MinDecimal64DataPageStatsIterator,
- min_values_iter,
- i64,
- from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal64DataPageStatsIterator,
- max_values_iter,
- i64,
- from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
- MinDecimal128DataPageStatsIterator,
- min_values_iter,
- i128,
- from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal128DataPageStatsIterator,
- max_values_iter,
- i128,
- from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
- MinDecimal256DataPageStatsIterator,
- min_values_iter,
- i256,
- from_bytes_to_i256
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal256DataPageStatsIterator,
- max_values_iter,
- i256,
- from_bytes_to_i256
-);
-
macro_rules! get_data_page_statistics {
($stat_type_prefix: ident, $data_type: ident, $iterator: ident,
$physical_type: ident) => {
- paste! {
- match $data_type {
+ {
+ let chunks: Vec<(usize, &ColumnIndexMetaData)> =
$iterator.collect();
+ let capacity: usize = chunks.iter().map(|c| c.0).sum();
+ paste! {
+ match $data_type {
DataType::Boolean => {
- let iterator = [<$stat_type_prefix
BooleanDataPageStatsIterator>]::new($iterator);
- let mut builder = BooleanBuilder::new();
- for x in iterator {
- for x in x.into_iter() {
- let Some(x) = x else {
- builder.append_null(); // no statistics value
- continue;
- };
- builder.append_value(x);
+ let mut b = BooleanBuilder::with_capacity(capacity);
+ for (len, index) in chunks {
+ match index {
+ ColumnIndexMetaData::BOOLEAN(index) => {
+ for val in index.[<$stat_type_prefix:lower
_values_iter>]() {
+ b.append_option(val.copied());
+ }
+ }
+ _ => b.append_nulls(len),
+ }
+ }
+ Ok(Arc::new(b.finish()))
+ },
+ DataType::UInt8 => {
+ let mut b = UInt8Builder::with_capacity(capacity);
+ for (len, index) in chunks {
+ match index {
+ ColumnIndexMetaData::INT32(index) => {
+ for val in index.[<$stat_type_prefix:lower
_values_iter>]() {
+ b.append_option(val.and_then(|&x|
u8::try_from(x).ok()));
+ }
+ }
+ _ => b.append_nulls(len),
+ }
+ }
+ Ok(Arc::new(b.finish()))
Review Comment:
Yes the latter would be best (and preferably also do validation directly on
read)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]