alamb commented on code in PR #9303:
URL: https://github.com/apache/arrow-rs/pull/9303#discussion_r2743375673
##########
parquet/src/arrow/arrow_reader/statistics.rs:
##########
@@ -1142,14 +1245,22 @@ pub(crate) fn null_counts_page_statistics<'a,
I>(iterator: I) -> Result<UInt64Ar
where
I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
{
- let iter = iterator.flat_map(|(len, index)| match index {
- ColumnIndexMetaData::NONE => vec![None; len],
- column_index => column_index.null_counts().map_or(vec![None; len], |v|
{
- v.iter().map(|i| Some(*i as u64)).collect::<Vec<_>>()
- }),
- });
-
- Ok(UInt64Array::from_iter(iter))
+ let chunks: Vec<_> = iterator.collect();
+ let total_capacity: usize = chunks.iter().map(|(len, _)| *len).sum();
+ let mut builder = UInt64Builder::with_capacity(total_capacity);
Review Comment:
This looks like it saves an allocation too.
##########
parquet/src/arrow/arrow_reader/statistics.rs:
##########
@@ -596,473 +600,572 @@ macro_rules! get_statistics {
}}}
}
-macro_rules! make_data_page_stats_iterator {
- ($iterator_type: ident, $func: ident, $stat_value_type: ty) => {
- struct $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- iter: I,
- }
-
- impl<'a, I> $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- fn new(iter: I) -> Self {
- Self { iter }
- }
- }
-
- impl<'a, I> Iterator for $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- type Item = Vec<Option<$stat_value_type>>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let next = self.iter.next();
- match next {
- Some((len, index)) => match index {
- // No matching `Index` found;
- // thus no statistics that can be extracted.
- // We return vec![None; len] to effectively
- // create an arrow null-array with the length
- // corresponding to the number of entries in
- // `ParquetOffsetIndex` per row group per column.
- ColumnIndexMetaData::NONE => Some(vec![None; len]),
- _ =>
Some(<$stat_value_type>::$func(&index).collect::<Vec<_>>()),
- },
- _ => None,
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
- }
- };
-}
-
-make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator,
min_values_iter, bool);
-make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator,
max_values_iter, bool);
-make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter,
i32);
-make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter,
i32);
-make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter,
i64);
-make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter,
i64);
-make_data_page_stats_iterator!(
- MinFloat16DataPageStatsIterator,
- min_values_iter,
- FixedLenByteArray
-);
-make_data_page_stats_iterator!(
- MaxFloat16DataPageStatsIterator,
- max_values_iter,
- FixedLenByteArray
-);
-make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator,
min_values_iter, f32);
-make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator,
max_values_iter, f32);
-make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator,
min_values_iter, f64);
-make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator,
max_values_iter, f64);
-make_data_page_stats_iterator!(
- MinByteArrayDataPageStatsIterator,
- min_values_iter,
- ByteArray
-);
-make_data_page_stats_iterator!(
- MaxByteArrayDataPageStatsIterator,
- max_values_iter,
- ByteArray
-);
-make_data_page_stats_iterator!(
- MaxFixedLenByteArrayDataPageStatsIterator,
- max_values_iter,
- FixedLenByteArray
-);
-
-make_data_page_stats_iterator!(
- MinFixedLenByteArrayDataPageStatsIterator,
- min_values_iter,
- FixedLenByteArray
-);
-
-macro_rules! get_decimal_page_stats_iterator {
- ($iterator_type: ident, $func: ident, $stat_value_type: ident,
$convert_func: ident) => {
- struct $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- iter: I,
- }
-
- impl<'a, I> $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- fn new(iter: I) -> Self {
- Self { iter }
- }
- }
-
- impl<'a, I> Iterator for $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- type Item = Vec<Option<$stat_value_type>>;
-
- // Some(native_index.$func().map(|v|
v.map($conv)).collect::<Vec<_>>())
- fn next(&mut self) -> Option<Self::Item> {
- let next = self.iter.next();
- match next {
- Some((len, index)) => match index {
- ColumnIndexMetaData::INT32(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $stat_value_type::from(*x)))
- .collect::<Vec<_>>(),
- ),
- ColumnIndexMetaData::INT64(native_index) => Some(
Review Comment:
The decimal statistics currently handle INT32/INT64/BYTE_ARRAY/FIXED_LEN_BYTE_ARRAY,
including for Decimal32 and Decimal64. However, the new code seems to handle only
INT32 stats for Decimal32, and only INT32/INT64 for Decimal64 🤔
##########
parquet/src/arrow/arrow_reader/statistics.rs:
##########
@@ -596,473 +600,572 @@ macro_rules! get_statistics {
}}}
}
-macro_rules! make_data_page_stats_iterator {
- ($iterator_type: ident, $func: ident, $stat_value_type: ty) => {
- struct $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- iter: I,
- }
-
- impl<'a, I> $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- fn new(iter: I) -> Self {
- Self { iter }
- }
- }
-
- impl<'a, I> Iterator for $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- type Item = Vec<Option<$stat_value_type>>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let next = self.iter.next();
- match next {
- Some((len, index)) => match index {
- // No matching `Index` found;
- // thus no statistics that can be extracted.
- // We return vec![None; len] to effectively
- // create an arrow null-array with the length
- // corresponding to the number of entries in
- // `ParquetOffsetIndex` per row group per column.
- ColumnIndexMetaData::NONE => Some(vec![None; len]),
- _ =>
Some(<$stat_value_type>::$func(&index).collect::<Vec<_>>()),
- },
- _ => None,
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
- }
- };
-}
-
-make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator,
min_values_iter, bool);
-make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator,
max_values_iter, bool);
-make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter,
i32);
-make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter,
i32);
-make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter,
i64);
-make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter,
i64);
-make_data_page_stats_iterator!(
- MinFloat16DataPageStatsIterator,
- min_values_iter,
- FixedLenByteArray
-);
-make_data_page_stats_iterator!(
- MaxFloat16DataPageStatsIterator,
- max_values_iter,
- FixedLenByteArray
-);
-make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator,
min_values_iter, f32);
-make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator,
max_values_iter, f32);
-make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator,
min_values_iter, f64);
-make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator,
max_values_iter, f64);
-make_data_page_stats_iterator!(
- MinByteArrayDataPageStatsIterator,
- min_values_iter,
- ByteArray
-);
-make_data_page_stats_iterator!(
- MaxByteArrayDataPageStatsIterator,
- max_values_iter,
- ByteArray
-);
-make_data_page_stats_iterator!(
- MaxFixedLenByteArrayDataPageStatsIterator,
- max_values_iter,
- FixedLenByteArray
-);
-
-make_data_page_stats_iterator!(
- MinFixedLenByteArrayDataPageStatsIterator,
- min_values_iter,
- FixedLenByteArray
-);
-
-macro_rules! get_decimal_page_stats_iterator {
- ($iterator_type: ident, $func: ident, $stat_value_type: ident,
$convert_func: ident) => {
- struct $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- iter: I,
- }
-
- impl<'a, I> $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- fn new(iter: I) -> Self {
- Self { iter }
- }
- }
-
- impl<'a, I> Iterator for $iterator_type<'a, I>
- where
- I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
- {
- type Item = Vec<Option<$stat_value_type>>;
-
- // Some(native_index.$func().map(|v|
v.map($conv)).collect::<Vec<_>>())
- fn next(&mut self) -> Option<Self::Item> {
- let next = self.iter.next();
- match next {
- Some((len, index)) => match index {
- ColumnIndexMetaData::INT32(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $stat_value_type::from(*x)))
- .collect::<Vec<_>>(),
- ),
- ColumnIndexMetaData::INT64(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x|
$stat_value_type::try_from(*x).unwrap()))
- .collect::<Vec<_>>(),
- ),
- ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $convert_func(x)))
- .collect::<Vec<_>>(),
- ),
-
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some(
- native_index
- .$func()
- .map(|x| x.map(|x| $convert_func(x)))
- .collect::<Vec<_>>(),
- ),
- _ => Some(vec![None; len]),
- },
- _ => None,
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- self.iter.size_hint()
- }
- }
- };
-}
-
-get_decimal_page_stats_iterator!(
- MinDecimal32DataPageStatsIterator,
- min_values_iter,
- i32,
- from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal32DataPageStatsIterator,
- max_values_iter,
- i32,
- from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
- MinDecimal64DataPageStatsIterator,
- min_values_iter,
- i64,
- from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal64DataPageStatsIterator,
- max_values_iter,
- i64,
- from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
- MinDecimal128DataPageStatsIterator,
- min_values_iter,
- i128,
- from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal128DataPageStatsIterator,
- max_values_iter,
- i128,
- from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
- MinDecimal256DataPageStatsIterator,
- min_values_iter,
- i256,
- from_bytes_to_i256
-);
-
-get_decimal_page_stats_iterator!(
- MaxDecimal256DataPageStatsIterator,
- max_values_iter,
- i256,
- from_bytes_to_i256
-);
-
macro_rules! get_data_page_statistics {
($stat_type_prefix: ident, $data_type: ident, $iterator: ident,
$physical_type: ident) => {
- paste! {
- match $data_type {
+ {
+ let chunks: Vec<(usize, &ColumnIndexMetaData)> =
$iterator.collect();
+ let capacity: usize = chunks.iter().map(|c| c.0).sum();
+ paste! {
+ match $data_type {
DataType::Boolean => {
- let iterator = [<$stat_type_prefix
BooleanDataPageStatsIterator>]::new($iterator);
- let mut builder = BooleanBuilder::new();
- for x in iterator {
- for x in x.into_iter() {
- let Some(x) = x else {
- builder.append_null(); // no statistics value
- continue;
- };
- builder.append_value(x);
+ let mut b = BooleanBuilder::with_capacity(capacity);
+ for (len, index) in chunks {
+ match index {
+ ColumnIndexMetaData::BOOLEAN(index) => {
+ for val in index.[<$stat_type_prefix:lower
_values_iter>]() {
+ b.append_option(val.copied());
+ }
+ }
+ _ => b.append_nulls(len),
+ }
+ }
+ Ok(Arc::new(b.finish()))
+ },
+ DataType::UInt8 => {
+ let mut b = UInt8Builder::with_capacity(capacity);
+ for (len, index) in chunks {
+ match index {
+ ColumnIndexMetaData::INT32(index) => {
+ for val in index.[<$stat_type_prefix:lower
_values_iter>]() {
+ b.append_option(val.and_then(|&x|
u8::try_from(x).ok()));
+ }
+ }
+ _ => b.append_nulls(len),
+ }
+ }
+ Ok(Arc::new(b.finish()))
Review Comment:
Seeing all these calls to `finish` on a builder that is never reused looks
wasteful to me -- I also put together a PR that tries to make this faster (both
by speeding up `finish` and by adding a new `build`):
- https://github.com/apache/arrow-rs/pull/9305
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]