alamb commented on code in PR #8266: URL: https://github.com/apache/arrow-datafusion/pull/8266#discussion_r1402126714
########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -64,22 +67,32 @@ fn is_single_distinct_agg(plan: &LogicalPlan) -> Result<bool> { match plan { LogicalPlan::Aggregate(Aggregate { aggr_expr, .. }) => { let mut fields_set = HashSet::new(); - let mut distinct_count = 0; + let mut aggregate_count = 0; for expr in aggr_expr { if let Expr::AggregateFunction(AggregateFunction { - distinct, args, .. + fun, + distinct, + args, + filter, + .. }) = expr { - if *distinct { - distinct_count += 1; - } - for e in args { - fields_set.insert(e.canonical_name()); + match filter { + Some(_) => return Ok(false), + None => { + aggregate_count += 1; + if *distinct { + for e in args { + fields_set.insert(e.canonical_name()); + } + } else if !matches!(fun, Sum | Min | Max) { Review Comment: Can you please add a comment here explaining why this is checking for only `Sum`, `Min` and `Max` I think it is because these functions have the property that they can be used to combine themselves -- so like `SUM(SUM(x))` is the same as `SUM(x)` I think this transformation could be made more general by using a different combination. For example we could support `COUNT` by using `SUM(COUNT(x))` ```sql SELECT a, COUNT(DINSTINCT b), COUNT(c) FROM t GROUP BY a ``` ```sql SELECT a, COUNT(alias1), SUM(alias2) -- <-- This is SUM, not COUNT FROM ( SELECT a, b as alias1, COUNT(c) as alias2 FROM t GROUP BY a, b ) GROUP BY a ``` We can do this as a follow on PR ( I can file a ticket) ########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -478,4 +520,52 @@ mod tests { assert_optimized_plan_equal(&plan, expected) } + + #[test] + fn two_distinct_and_one_common() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate( + vec![col("a")], + vec![ + sum(col("c")), + count_distinct(col("b")), + Expr::AggregateFunction(expr::AggregateFunction::new( + AggregateFunction::Max, + vec![col("b")], + true, + None, + None, + )), + ], + )? + .build()?; + // Should work + let expected = "Projection: test.a, SUM(alias2) AS SUM(test.c), COUNT(alias1) AS COUNT(DISTINCT test.b), MAX(alias1) AS MAX(DISTINCT test.b) [a:UInt32, SUM(test.c):UInt64;N, COUNT(DISTINCT test.b):Int64;N, MAX(DISTINCT test.b):UInt32;N]\ + \n Aggregate: groupBy=[[test.a]], aggr=[[SUM(alias2), COUNT(alias1), MAX(alias1)]] [a:UInt32, SUM(alias2):UInt64;N, COUNT(alias1):Int64;N, MAX(alias1):UInt32;N]\ + \n Aggregate: groupBy=[[test.a, test.b AS alias1]], aggr=[[SUM(test.c) AS alias2]] [a:UInt32, alias1:UInt32, alias2:UInt64;N]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_equal(&plan, expected) + } + + #[test] Review Comment: Can you please also add tests for: * MIN(A), COUNT(DISTINCT b) -- positive case * COUNT(A), COUNT(DISTINCT b) -- negative case * SUM(A filter A > 5), COUNT DISTINCT(B) -- negative case ########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -64,22 +67,32 @@ fn is_single_distinct_agg(plan: &LogicalPlan) -> Result<bool> { match plan { LogicalPlan::Aggregate(Aggregate { aggr_expr, .. }) => { let mut fields_set = HashSet::new(); - let mut distinct_count = 0; + let mut aggregate_count = 0; for expr in aggr_expr { if let Expr::AggregateFunction(AggregateFunction { - distinct, args, .. + fun, + distinct, + args, + filter, + .. }) = expr { - if *distinct { - distinct_count += 1; - } - for e in args { - fields_set.insert(e.canonical_name()); + match filter { Review Comment: Minor suggestion here is that using `if` could reduce the indent level: ```rust if filter.is_some() || order_by.is_some() { return Ok(false); } aggregate_count += 1; ``` ########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -64,22 +67,32 @@ fn is_single_distinct_agg(plan: &LogicalPlan) -> Result<bool> { match plan { LogicalPlan::Aggregate(Aggregate { aggr_expr, .. }) => { let mut fields_set = HashSet::new(); - let mut distinct_count = 0; + let mut aggregate_count = 0; for expr in aggr_expr { if let Expr::AggregateFunction(AggregateFunction { - distinct, args, .. + fun, + distinct, + args, + filter, + .. Review Comment: Can you also please check that `order_by` is null as well? In general, I think it would make sense to avoid the `..` to make sure we don't forget to handle new fields if/when they are added ```rust if let Expr::AggregateFunction(AggregateFunction { fun, distinct, args, filter, order_by }) = expr ``` However, the code on main doesn't seem to handle this case either so I don't think the change is required in this PR ########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -478,4 +520,52 @@ mod tests { assert_optimized_plan_equal(&plan, expected) } + + #[test] + fn two_distinct_and_one_common() -> Result<()> { + let table_scan = test_table_scan()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate( + vec![col("a")], + vec![ + sum(col("c")), + count_distinct(col("b")), + Expr::AggregateFunction(expr::AggregateFunction::new( Review Comment: Perhaps this could use https://docs.rs/datafusion/latest/datafusion/logical_expr/expr_fn/fn.max.html ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org