Dandandan commented on code in PR #9303:
URL: https://github.com/apache/arrow-rs/pull/9303#discussion_r2749489138


##########
parquet/src/arrow/arrow_reader/statistics.rs:
##########
@@ -596,473 +600,572 @@ macro_rules! get_statistics {
         }}}
 }
 
-macro_rules! make_data_page_stats_iterator {
-    ($iterator_type: ident, $func: ident, $stat_value_type: ty) => {
-        struct $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            iter: I,
-        }
-
-        impl<'a, I> $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            fn new(iter: I) -> Self {
-                Self { iter }
-            }
-        }
-
-        impl<'a, I> Iterator for $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            type Item = Vec<Option<$stat_value_type>>;
-
-            fn next(&mut self) -> Option<Self::Item> {
-                let next = self.iter.next();
-                match next {
-                    Some((len, index)) => match index {
-                        // No matching `Index` found;
-                        // thus no statistics that can be extracted.
-                        // We return vec![None; len] to effectively
-                        // create an arrow null-array with the length
-                        // corresponding to the number of entries in
-                        // `ParquetOffsetIndex` per row group per column.
-                        ColumnIndexMetaData::NONE => Some(vec![None; len]),
-                        _ => 
Some(<$stat_value_type>::$func(&index).collect::<Vec<_>>()),
-                    },
-                    _ => None,
-                }
-            }
-
-            fn size_hint(&self) -> (usize, Option<usize>) {
-                self.iter.size_hint()
-            }
-        }
-    };
-}
-
-make_data_page_stats_iterator!(MinBooleanDataPageStatsIterator, 
min_values_iter, bool);
-make_data_page_stats_iterator!(MaxBooleanDataPageStatsIterator, 
max_values_iter, bool);
-make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min_values_iter, 
i32);
-make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max_values_iter, 
i32);
-make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min_values_iter, 
i64);
-make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max_values_iter, 
i64);
-make_data_page_stats_iterator!(
-    MinFloat16DataPageStatsIterator,
-    min_values_iter,
-    FixedLenByteArray
-);
-make_data_page_stats_iterator!(
-    MaxFloat16DataPageStatsIterator,
-    max_values_iter,
-    FixedLenByteArray
-);
-make_data_page_stats_iterator!(MinFloat32DataPageStatsIterator, 
min_values_iter, f32);
-make_data_page_stats_iterator!(MaxFloat32DataPageStatsIterator, 
max_values_iter, f32);
-make_data_page_stats_iterator!(MinFloat64DataPageStatsIterator, 
min_values_iter, f64);
-make_data_page_stats_iterator!(MaxFloat64DataPageStatsIterator, 
max_values_iter, f64);
-make_data_page_stats_iterator!(
-    MinByteArrayDataPageStatsIterator,
-    min_values_iter,
-    ByteArray
-);
-make_data_page_stats_iterator!(
-    MaxByteArrayDataPageStatsIterator,
-    max_values_iter,
-    ByteArray
-);
-make_data_page_stats_iterator!(
-    MaxFixedLenByteArrayDataPageStatsIterator,
-    max_values_iter,
-    FixedLenByteArray
-);
-
-make_data_page_stats_iterator!(
-    MinFixedLenByteArrayDataPageStatsIterator,
-    min_values_iter,
-    FixedLenByteArray
-);
-
-macro_rules! get_decimal_page_stats_iterator {
-    ($iterator_type: ident, $func: ident, $stat_value_type: ident, 
$convert_func: ident) => {
-        struct $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            iter: I,
-        }
-
-        impl<'a, I> $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            fn new(iter: I) -> Self {
-                Self { iter }
-            }
-        }
-
-        impl<'a, I> Iterator for $iterator_type<'a, I>
-        where
-            I: Iterator<Item = (usize, &'a ColumnIndexMetaData)>,
-        {
-            type Item = Vec<Option<$stat_value_type>>;
-
-            // Some(native_index.$func().map(|v| 
v.map($conv)).collect::<Vec<_>>())
-            fn next(&mut self) -> Option<Self::Item> {
-                let next = self.iter.next();
-                match next {
-                    Some((len, index)) => match index {
-                        ColumnIndexMetaData::INT32(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| $stat_value_type::from(*x)))
-                                .collect::<Vec<_>>(),
-                        ),
-                        ColumnIndexMetaData::INT64(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| 
$stat_value_type::try_from(*x).unwrap()))
-                                .collect::<Vec<_>>(),
-                        ),
-                        ColumnIndexMetaData::BYTE_ARRAY(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| $convert_func(x)))
-                                .collect::<Vec<_>>(),
-                        ),
-                        
ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(native_index) => Some(
-                            native_index
-                                .$func()
-                                .map(|x| x.map(|x| $convert_func(x)))
-                                .collect::<Vec<_>>(),
-                        ),
-                        _ => Some(vec![None; len]),
-                    },
-                    _ => None,
-                }
-            }
-
-            fn size_hint(&self) -> (usize, Option<usize>) {
-                self.iter.size_hint()
-            }
-        }
-    };
-}
-
-get_decimal_page_stats_iterator!(
-    MinDecimal32DataPageStatsIterator,
-    min_values_iter,
-    i32,
-    from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal32DataPageStatsIterator,
-    max_values_iter,
-    i32,
-    from_bytes_to_i32
-);
-
-get_decimal_page_stats_iterator!(
-    MinDecimal64DataPageStatsIterator,
-    min_values_iter,
-    i64,
-    from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal64DataPageStatsIterator,
-    max_values_iter,
-    i64,
-    from_bytes_to_i64
-);
-
-get_decimal_page_stats_iterator!(
-    MinDecimal128DataPageStatsIterator,
-    min_values_iter,
-    i128,
-    from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal128DataPageStatsIterator,
-    max_values_iter,
-    i128,
-    from_bytes_to_i128
-);
-
-get_decimal_page_stats_iterator!(
-    MinDecimal256DataPageStatsIterator,
-    min_values_iter,
-    i256,
-    from_bytes_to_i256
-);
-
-get_decimal_page_stats_iterator!(
-    MaxDecimal256DataPageStatsIterator,
-    max_values_iter,
-    i256,
-    from_bytes_to_i256
-);
-
 macro_rules! get_data_page_statistics {
     ($stat_type_prefix: ident, $data_type: ident, $iterator: ident, 
$physical_type: ident) => {
-        paste! {
-            match $data_type {
+        {
+            let chunks: Vec<(usize, &ColumnIndexMetaData)> = 
$iterator.collect();
+            let capacity: usize = chunks.iter().map(|c| c.0).sum();
+            paste! {
+                match $data_type {
                 DataType::Boolean => {
-                    let iterator = [<$stat_type_prefix 
BooleanDataPageStatsIterator>]::new($iterator);
-                    let mut builder = BooleanBuilder::new();
-                    for x in iterator {
-                        for x in x.into_iter() {
-                            let Some(x) = x else {
-                                builder.append_null(); // no statistics value
-                                continue;
-                            };
-                            builder.append_value(x);
+                    let mut b = BooleanBuilder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::BOOLEAN(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    b.append_option(val.copied());
+                                }
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))
+                },
+                DataType::UInt8 => {
+                    let mut b = UInt8Builder::with_capacity(capacity);
+                    for (len, index) in chunks {
+                        match index {
+                            ColumnIndexMetaData::INT32(index) => {
+                                for val in index.[<$stat_type_prefix:lower 
_values_iter>]() {
+                                    b.append_option(val.and_then(|&x| 
u8::try_from(x).ok()));
+                                }
+                            }
+                            _ => b.append_nulls(len),
+                        }
+                    }
+                    Ok(Arc::new(b.finish()))

Review Comment:
   Yes the latter would be best (and preferably also do validation directly on 
read)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to