alamb commented on code in PR #8266:
URL: https://github.com/apache/arrow-datafusion/pull/8266#discussion_r1402126714


##########
datafusion/optimizer/src/single_distinct_to_groupby.rs:
##########
@@ -64,22 +67,32 @@ fn is_single_distinct_agg(plan: &LogicalPlan) -> 
Result<bool> {
     match plan {
         LogicalPlan::Aggregate(Aggregate { aggr_expr, .. }) => {
             let mut fields_set = HashSet::new();
-            let mut distinct_count = 0;
+            let mut aggregate_count = 0;
             for expr in aggr_expr {
                 if let Expr::AggregateFunction(AggregateFunction {
-                    distinct, args, ..
+                    fun,
+                    distinct,
+                    args,
+                    filter,
+                    ..
                 }) = expr
                 {
-                    if *distinct {
-                        distinct_count += 1;
-                    }
-                    for e in args {
-                        fields_set.insert(e.canonical_name());
+                    match filter {
+                        Some(_) => return Ok(false),
+                        None => {
+                            aggregate_count += 1;
+                            if *distinct {
+                                for e in args {
+                                    fields_set.insert(e.canonical_name());
+                                }
+                            } else if !matches!(fun, Sum | Min | Max) {

Review Comment:
   Can you please add a comment here explaining why this is checking for only 
`Sum`, `Min` and `Max`
   
   I think it is because these functions have the property that they can be 
used to combine themselves -- so like `SUM(SUM(x))` is the same as `SUM(x)`
   
   I think this transformation could be made more general by using a different 
combination. For example we could support `COUNT` by using `SUM(COUNT(x))`
   
   
   ```sql
   SELECT a, COUNT(DINSTINCT b), COUNT(c)
   FROM t
   GROUP BY a
   ```
   
   ```sql
   SELECT a, COUNT(alias1), SUM(alias2) -- <-- This is SUM, not COUNT
   FROM (
     SELECT a, b as alias1, COUNT(c) as alias2
     FROM t
     GROUP BY a, b
   )
   GROUP BY a
   ```
   
   We can do this as a follow on PR ( I can file a ticket)



##########
datafusion/optimizer/src/single_distinct_to_groupby.rs:
##########
@@ -478,4 +520,52 @@ mod tests {
 
         assert_optimized_plan_equal(&plan, expected)
     }
+
+    #[test]
+    fn two_distinct_and_one_common() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(
+                vec![col("a")],
+                vec![
+                    sum(col("c")),
+                    count_distinct(col("b")),
+                    Expr::AggregateFunction(expr::AggregateFunction::new(
+                        AggregateFunction::Max,
+                        vec![col("b")],
+                        true,
+                        None,
+                        None,
+                    )),
+                ],
+            )?
+            .build()?;
+        // Should work
+        let expected = "Projection: test.a, SUM(alias2) AS SUM(test.c), 
COUNT(alias1) AS COUNT(DISTINCT test.b), MAX(alias1) AS MAX(DISTINCT test.b) 
[a:UInt32, SUM(test.c):UInt64;N, COUNT(DISTINCT test.b):Int64;N, MAX(DISTINCT 
test.b):UInt32;N]\
+                            \n  Aggregate: groupBy=[[test.a]], 
aggr=[[SUM(alias2), COUNT(alias1), MAX(alias1)]] [a:UInt32, 
SUM(alias2):UInt64;N, COUNT(alias1):Int64;N, MAX(alias1):UInt32;N]\
+                            \n    Aggregate: groupBy=[[test.a, test.b AS 
alias1]], aggr=[[SUM(test.c) AS alias2]] [a:UInt32, alias1:UInt32, 
alias2:UInt64;N]\
+                            \n      TableScan: test [a:UInt32, b:UInt32, 
c:UInt32]";
+
+        assert_optimized_plan_equal(&plan, expected)
+    }
+
+    #[test]

Review Comment:
   Can you please also add tests for:
   
   * MIN(A), COUNT(DISTINCT b) -- positive case
   * COUNT(A), COUNT(DISTINCT b)  -- negative case
   * SUM(A filter A > 5), COUNT DISTINCT(B) -- negative case



##########
datafusion/optimizer/src/single_distinct_to_groupby.rs:
##########
@@ -64,22 +67,32 @@ fn is_single_distinct_agg(plan: &LogicalPlan) -> 
Result<bool> {
     match plan {
         LogicalPlan::Aggregate(Aggregate { aggr_expr, .. }) => {
             let mut fields_set = HashSet::new();
-            let mut distinct_count = 0;
+            let mut aggregate_count = 0;
             for expr in aggr_expr {
                 if let Expr::AggregateFunction(AggregateFunction {
-                    distinct, args, ..
+                    fun,
+                    distinct,
+                    args,
+                    filter,
+                    ..
                 }) = expr
                 {
-                    if *distinct {
-                        distinct_count += 1;
-                    }
-                    for e in args {
-                        fields_set.insert(e.canonical_name());
+                    match filter {

Review Comment:
   Minor suggestion here is that using `if` could reduce the indent level:
   
   ```rust
                       if filter.is_some() || order_by.is_some() {
                           return Ok(false);
                       }
                       aggregate_count += 1;
   ```



##########
datafusion/optimizer/src/single_distinct_to_groupby.rs:
##########
@@ -64,22 +67,32 @@ fn is_single_distinct_agg(plan: &LogicalPlan) -> 
Result<bool> {
     match plan {
         LogicalPlan::Aggregate(Aggregate { aggr_expr, .. }) => {
             let mut fields_set = HashSet::new();
-            let mut distinct_count = 0;
+            let mut aggregate_count = 0;
             for expr in aggr_expr {
                 if let Expr::AggregateFunction(AggregateFunction {
-                    distinct, args, ..
+                    fun,
+                    distinct,
+                    args,
+                    filter,
+                    ..

Review Comment:
   Can you also please check that `order_by` is null as well?
   
   In general, I think it would make sense to avoid the `..` to make sure we 
don't forget to handle new fields if/when they are added
   
   ```rust
                 if let Expr::AggregateFunction(AggregateFunction {
                       fun,
                       distinct,
                       args,
                       filter,
                       order_by
   }) = expr 
   ```
   
   However, the code on main doesn't seem to handle this case either so I don't 
think the change is required in this PR



##########
datafusion/optimizer/src/single_distinct_to_groupby.rs:
##########
@@ -478,4 +520,52 @@ mod tests {
 
         assert_optimized_plan_equal(&plan, expected)
     }
+
+    #[test]
+    fn two_distinct_and_one_common() -> Result<()> {
+        let table_scan = test_table_scan()?;
+
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(
+                vec![col("a")],
+                vec![
+                    sum(col("c")),
+                    count_distinct(col("b")),
+                    Expr::AggregateFunction(expr::AggregateFunction::new(

Review Comment:
   Perhaps this could use 
https://docs.rs/datafusion/latest/datafusion/logical_expr/expr_fn/fn.max.html ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to