[GitHub] [arrow-datafusion] alamb commented on a change in pull request #1407: support decimal for min/max agg

GitBox Thu, 09 Dec 2021 06:39:06 -0800


alamb commented on a change in pull request #1407:
URL: https://github.com/apache/arrow-datafusion/pull/1407#discussion_r765844134




##########
File path: datafusion/src/physical_plan/expressions/min_max.rs
##########
@@ -129,11 +131,49 @@ macro_rules! typed_min_max_batch {
     }};
 }
 
+// TODO implement this in arrow-rs with simd
+// https://github.com/apache/arrow-rs/issues/1010
+// Statically-typed version of min/max(array) -> ScalarValue for decimal types.
+macro_rules! typed_min_max_batch_decimal128 {
+    ($VALUES:expr, $PRECISION:ident, $SCALE:ident, $OP:ident) => {{
+        let null_count = $VALUES.null_count();
+        if null_count == $VALUES.len() {
+            ScalarValue::Decimal128(None, *$PRECISION, *$SCALE)
+        } else {
+            let array = 
$VALUES.as_any().downcast_ref::<DecimalArray>().unwrap();
+            if null_count == 0 {
+                // there is no null value
+                let mut result = array.value(0);
+                for i in 1..array.len() {
+                    result = result.$OP(array.value(i));
+                }
+                ScalarValue::Decimal128(Some(result), *$PRECISION, *$SCALE)
+            } else {
+                let mut result = 0_i128;
+                let mut has_value = false;
+                for i in 0..array.len() {
+                    if !has_value && array.is_valid(i) {
+                        has_value = true;
+                        result = array.value(i);
+                    }
+                    if array.is_valid(i) {
+                        result = result.$OP(array.value(i));
+                    }
+                }
+                ScalarValue::Decimal128(Some(result), *$PRECISION, *$SCALE)
+            }
+        }
+    }};
+}
+
 // Statically-typed version of min/max(array) -> ScalarValue  for non-string 
types.
 // this is a macro to support both operations (min and max).
 macro_rules! min_max_batch {
     ($VALUES:expr, $OP:ident) => {{
         match $VALUES.data_type() {
+            DataType::Decimal(precision, scale) => {
+                typed_min_max_batch_decimal128!($VALUES, precision, scale, $OP)

Review comment:
       I think if you added `DecimalArray` support to 
`arrow::compute::kernels::min` and `arrow::compute::kernels::max` you might be 
able to use `typed_min_max_batch!` here. Work for the future perhaps

##########
File path: datafusion/src/physical_plan/expressions/min_max.rs
##########
@@ -129,11 +131,49 @@ macro_rules! typed_min_max_batch {
     }};
 }
 
+// TODO implement this in arrow-rs with simd
+// https://github.com/apache/arrow-rs/issues/1010

Review comment:
       I have not reviewed the code in the datafusion aggregate functions for a 
while, so I am not familiar with how much they do / don't use the arrow compute 
kernels, but I think the more we can leverage / reuse those kernels (and their 
SIMD specializations, if they exist), the better

##########
File path: datafusion/src/physical_plan/expressions/min_max.rs
##########
@@ -129,11 +131,49 @@ macro_rules! typed_min_max_batch {
     }};
 }
 
+// TODO implement this in arrow-rs with simd
+// https://github.com/apache/arrow-rs/issues/1010
+// Statically-typed version of min/max(array) -> ScalarValue for decimal types.
+macro_rules! typed_min_max_batch_decimal128 {
+    ($VALUES:expr, $PRECISION:ident, $SCALE:ident, $OP:ident) => {{
+        let null_count = $VALUES.null_count();
+        if null_count == $VALUES.len() {
+            ScalarValue::Decimal128(None, *$PRECISION, *$SCALE)
+        } else {
+            let array = 
$VALUES.as_any().downcast_ref::<DecimalArray>().unwrap();
+            if null_count == 0 {
+                // there is no null value
+                let mut result = array.value(0);
+                for i in 1..array.len() {
+                    result = result.$OP(array.value(i));
+                }
+                ScalarValue::Decimal128(Some(result), *$PRECISION, *$SCALE)
+            } else {
+                let mut result = 0_i128;

Review comment:
       It might be more idomatic to use `let mut rust: Option<i128> = 0;` (aka 
use an `Option` rather than explicit flag)
   
   Then instead of code like
   
   ```rust
                       if !has_value && array.is_valid(i) {
                           has_value = true;
                           result = array.value(i);
                       }
                       if array.is_valid(i) {
                           result = result.$OP(array.value(i));
                       }
   ```
   
   You could write code like the following (which I think saves at least one 
check of `is_valid()`):
   
   ```rust
                       if array.is_valid(i) {
                           let value = array.value(i);
                           result = result.$OP(result.unwrap_or(value)
                       }
   ```
   
   This is just a style suggestion, it is not needed I don't think
   
   

##########
File path: datafusion/src/execution/context.rs
##########
@@ -1842,6 +1842,46 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn aggregate_decimal_min() -> Result<()> {
+        let mut ctx = ExecutionContext::new();
+        ctx.register_table("d_table", test::table_with_decimal())
+            .unwrap();
+
+        let result = plan_and_collect(&mut ctx, "select min(c1) from d_table")
+            .await
+            .unwrap();
+        let expected = vec![
+            "+-----------------+",
+            "| MIN(d_table.c1) |",
+            "+-----------------+",
+            "| -100.009        |",
+            "+-----------------+",
+        ];
+        assert_batches_sorted_eq!(expected, &result);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn aggregate_decimal_max() -> Result<()> {

Review comment:
       👍 

##########
File path: datafusion/src/physical_plan/expressions/min_max.rs
##########
@@ -237,6 +291,16 @@ macro_rules! typed_min_max_string {
 macro_rules! min_max {
     ($VALUE:expr, $DELTA:expr, $OP:ident) => {{
         Ok(match ($VALUE, $DELTA) {
+            (ScalarValue::Decimal128(lhsv,lhsp,lhss), 
ScalarValue::Decimal128(rhsv,rhsp,rhss)) => {
+                if lhsp.eq(rhsp) && lhss.eq(rhss) {
+                    typed_min_max_decimal!(lhsv, rhsv, lhsp, lhss, Decimal128, 
$OP)
+                } else {
+                    return Err(DataFusionError::Internal(format!(
+                    "MIN/MAX is not expected to receive scalars of 
incompatible types {:?}",

Review comment:
       👍 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-datafusion] alamb commented on a change in pull request #1407: support decimal for min/max agg

Reply via email to