alamb commented on code in PR #10711:
URL: https://github.com/apache/datafusion/pull/10711#discussion_r1618800964


##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -52,131 +55,311 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
-/// Extract a single min/max statistics from a [`ParquetStatistics`] object
-///
-/// * `$column_statistics` is the `ParquetStatistics` object
-/// * `$func is the function` (`min`/`max`) to call to get the value
-/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get 
the value as bytes
-/// * `$target_arrow_type` is the [`DataType`] of the target statistics
-macro_rules! get_statistic {
-    ($column_statistics:expr, $func:ident, $bytes_func:ident, 
$target_arrow_type:expr) => {{
-        if !$column_statistics.has_min_max_set() {
-            return None;
-        }
-        match $column_statistics {
-            ParquetStatistics::Boolean(s) => 
Some(ScalarValue::Boolean(Some(*s.$func()))),
-            ParquetStatistics::Int32(s) => {
-                match $target_arrow_type {
-                    // int32 to decimal with the precision and scale
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(*s.$func() as i128),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::Int8) => {
-                        
Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::Int16) => {
-                        
Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt8) => {
-                        
Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt16) => {
-                        
Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt32) => {
-                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
-                    }
-                    Some(DataType::Date32) => {
-                        Some(ScalarValue::Date32(Some(*s.$func())))
-                    }
-                    Some(DataType::Date64) => {
-                        Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 
24 * 60 * 60 * 1000)))
-                    }
-                    _ => Some(ScalarValue::Int32(Some(*s.$func()))),
-                }
+macro_rules! get_statistics_iter {
+    ($column_statistics_iter:expr, $func:ident, $bytes_func:ident, 
$target_arrow_type:expr) => {{
+        match $target_arrow_type {
+            Some(DataType::Boolean) => Some(Arc::new(BooleanArray::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Boolean(s) => Some(*s.$func()),
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Int8) => Some(Arc::new(Int8Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int32(s) => {
+                                Some((*s.$func()).try_into().unwrap())

Review Comment:
   I think these unwraps are not great (though I see you didn't add them) -- as 
they may panic on invalid parquet statistics



##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -52,131 +55,311 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
-/// Extract a single min/max statistics from a [`ParquetStatistics`] object
-///
-/// * `$column_statistics` is the `ParquetStatistics` object
-/// * `$func is the function` (`min`/`max`) to call to get the value
-/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get 
the value as bytes
-/// * `$target_arrow_type` is the [`DataType`] of the target statistics
-macro_rules! get_statistic {
-    ($column_statistics:expr, $func:ident, $bytes_func:ident, 
$target_arrow_type:expr) => {{
-        if !$column_statistics.has_min_max_set() {
-            return None;
-        }
-        match $column_statistics {
-            ParquetStatistics::Boolean(s) => 
Some(ScalarValue::Boolean(Some(*s.$func()))),
-            ParquetStatistics::Int32(s) => {
-                match $target_arrow_type {
-                    // int32 to decimal with the precision and scale
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(*s.$func() as i128),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::Int8) => {
-                        
Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::Int16) => {
-                        
Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt8) => {
-                        
Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt16) => {
-                        
Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt32) => {
-                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
-                    }
-                    Some(DataType::Date32) => {
-                        Some(ScalarValue::Date32(Some(*s.$func())))
-                    }
-                    Some(DataType::Date64) => {
-                        Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 
24 * 60 * 60 * 1000)))
-                    }
-                    _ => Some(ScalarValue::Int32(Some(*s.$func()))),
-                }
+macro_rules! get_statistics_iter {

Review Comment:
   Could you please update the doc strings to explain this macro a little more 
-- I think especially without type infomration doc strings on macros makes them 
easier to understand



##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -211,32 +394,25 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = 
Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    let scalars = iterator
-        .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, 
Some(data_type))));
-    collect_scalars(data_type, scalars)
+    match get_statistics_iter!(iterator, min, min_bytes, Some(data_type)) {

Review Comment:
   This is pretty neat @xinlifoobar 
   
   As this is currently implemented, it has quite a bit of repetition (e.g. the 
clauses for                             `ParquetStatistics::ByteArray(s)` in 
the macro above
   
   What do you think of a slightly different mode where the iteration over the 
statistics values and the array creation looked different
   
   Something like
   ```rust
   match data_type {
     DataType::Int8 => {
       // get an iterator over int32 stored in 
       let int32_iter = get_statistics_iter!(iterator, min, 
ParquetStatistics::Int32)
       // convert int32 to int8, ignoring 
       let int8_iter = int32_iter.map(|v| if let Ok(v) = v.try_into:<i8>() { 
Some(v) } else { None });
       Int8Array::from_iter(int8_iter)
     } 
     // similarly for other types
    ....
     DataType::Utf8 => {
       // get an iterator over bytes stored
       let byte_iter = get_statistics_iter!(iterator, min_bytes, 
ParquetStatistics::ByteArray)
       // convert bytes to utf8 strings
       let string_array = byte_iter.map(|v| if let Ok(v) = str::from_utf8....);
       StringArray::from_iter(int8_iter)
     } 
    ...
   ```
   
   Another reason I suggest this structure is that I think it would make it 
easier to support extracting statistics from multiple files in the future more 
easily



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to