alamb commented on code in PR #10711:
URL: https://github.com/apache/datafusion/pull/10711#discussion_r1618800964
##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -52,131 +55,311 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
result
}
-/// Extract a single min/max statistics from a [`ParquetStatistics`] object
-///
-/// * `$column_statistics` is the `ParquetStatistics` object
-/// * `$func is the function` (`min`/`max`) to call to get the value
-/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get
the value as bytes
-/// * `$target_arrow_type` is the [`DataType`] of the target statistics
-macro_rules! get_statistic {
- ($column_statistics:expr, $func:ident, $bytes_func:ident,
$target_arrow_type:expr) => {{
- if !$column_statistics.has_min_max_set() {
- return None;
- }
- match $column_statistics {
- ParquetStatistics::Boolean(s) =>
Some(ScalarValue::Boolean(Some(*s.$func()))),
- ParquetStatistics::Int32(s) => {
- match $target_arrow_type {
- // int32 to decimal with the precision and scale
- Some(DataType::Decimal128(precision, scale)) => {
- Some(ScalarValue::Decimal128(
- Some(*s.$func() as i128),
- *precision,
- *scale,
- ))
- }
- Some(DataType::Int8) => {
-
Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::Int16) => {
-
Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::UInt8) => {
-
Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::UInt16) => {
-
Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::UInt32) => {
- Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
- }
- Some(DataType::Date32) => {
- Some(ScalarValue::Date32(Some(*s.$func())))
- }
- Some(DataType::Date64) => {
- Some(ScalarValue::Date64(Some(i64::from(*s.$func()) *
24 * 60 * 60 * 1000)))
- }
- _ => Some(ScalarValue::Int32(Some(*s.$func()))),
- }
+macro_rules! get_statistics_iter {
+ ($column_statistics_iter:expr, $func:ident, $bytes_func:ident,
$target_arrow_type:expr) => {{
+ match $target_arrow_type {
+ Some(DataType::Boolean) => Some(Arc::new(BooleanArray::from_iter(
+ $column_statistics_iter.map(|statistic| {
+ statistic.and_then(|x| {
+ if !x.has_min_max_set() {
+ return None;
+ }
+ match x {
+ ParquetStatistics::Boolean(s) => Some(*s.$func()),
+ _ => None,
+ }
+ })
+ }),
+ )) as ArrayRef),
+ Some(DataType::Int8) => Some(Arc::new(Int8Array::from_iter(
+ $column_statistics_iter.map(|statistic| {
+ statistic.and_then(|x| {
+ if !x.has_min_max_set() {
+ return None;
+ }
+ match x {
+ ParquetStatistics::Int32(s) => {
+ Some((*s.$func()).try_into().unwrap())
Review Comment:
I think these unwraps are not great (though I see you didn't add them) -- as
they may panic on invalid parquet statistics
##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -52,131 +55,311 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
result
}
-/// Extract a single min/max statistics from a [`ParquetStatistics`] object
-///
-/// * `$column_statistics` is the `ParquetStatistics` object
-/// * `$func is the function` (`min`/`max`) to call to get the value
-/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get
the value as bytes
-/// * `$target_arrow_type` is the [`DataType`] of the target statistics
-macro_rules! get_statistic {
- ($column_statistics:expr, $func:ident, $bytes_func:ident,
$target_arrow_type:expr) => {{
- if !$column_statistics.has_min_max_set() {
- return None;
- }
- match $column_statistics {
- ParquetStatistics::Boolean(s) =>
Some(ScalarValue::Boolean(Some(*s.$func()))),
- ParquetStatistics::Int32(s) => {
- match $target_arrow_type {
- // int32 to decimal with the precision and scale
- Some(DataType::Decimal128(precision, scale)) => {
- Some(ScalarValue::Decimal128(
- Some(*s.$func() as i128),
- *precision,
- *scale,
- ))
- }
- Some(DataType::Int8) => {
-
Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::Int16) => {
-
Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::UInt8) => {
-
Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::UInt16) => {
-
Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
- }
- Some(DataType::UInt32) => {
- Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
- }
- Some(DataType::Date32) => {
- Some(ScalarValue::Date32(Some(*s.$func())))
- }
- Some(DataType::Date64) => {
- Some(ScalarValue::Date64(Some(i64::from(*s.$func()) *
24 * 60 * 60 * 1000)))
- }
- _ => Some(ScalarValue::Int32(Some(*s.$func()))),
- }
+macro_rules! get_statistics_iter {
Review Comment:
Could you please update the doc strings to explain this macro a little more
-- I think especially without type infomration doc strings on macros makes them
easier to understand
##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -211,32 +394,25 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item =
Option<&'a ParquetStatistics
data_type: &DataType,
iterator: I,
) -> Result<ArrayRef> {
- let scalars = iterator
- .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes,
Some(data_type))));
- collect_scalars(data_type, scalars)
+ match get_statistics_iter!(iterator, min, min_bytes, Some(data_type)) {
Review Comment:
This is pretty neat @xinlifoobar
As this is currently implemented, it has quite a bit of repetition (e.g. the
clauses for `ParquetStatistics::ByteArray(s)` in
the macro above
What do you think of a slightly different mode where the iteration over the
statistics values and the array creation looked different
Something like
```rust
match data_type {
DataType::Int8 => {
// get an iterator over int32 stored in
let int32_iter = get_statistics_iter!(iterator, min,
ParquetStatistics::Int32)
// convert int32 to int8, ignoring
let int8_iter = int32_iter.map(|v| if let Ok(v) = v.try_into:<i8>() {
Some(v) } else { None });
Int8Array::from_iter(int8_iter)
}
// similarly for other types
....
DataType::Utf8 => {
// get an iterator over bytes stored
let byte_iter = get_statistics_iter!(iterator, min_bytes,
ParquetStatistics::ByteArray)
// convert bytes to utf8 strings
let string_array = byte_iter.map(|v| if let Ok(v) = str::from_utf8....);
StringArray::from_iter(int8_iter)
}
...
```
Another reason I suggest this structure is that I think it would make it
easier to support extracting statistics from multiple files in the future more
easily
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]