viirya commented on a change in pull request #27224: [SPARK-30523][SQL] -
Collapse nested aggregates
URL: https://github.com/apache/spark/pull/27224#discussion_r367163290
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -964,6 +965,155 @@ object CombineFilters extends Rule[LogicalPlan] with
PredicateHelper {
}
}
+/**
+ * Combines two adjacent [[Aggregate]] operators into one, if the first one is
not necessary.
+ *
+ * If we are referencing the outputs of aggregate functions in the inner
aggregate from the outer
+ * one, check if they are being used in outer aggregates in a way that can be
collapsed into a
+ * single aggregate. A sum of sums, or a max of max, or min of min are all
collapsible.
+ * avg over avg will not be collapsible because a different number of raw rows
will have contributed
+ * to the partial averages of the inner aggregate.
+ *
+ * Min and Max can be folded in the case described above, or if they are
referencing
+ * the group by columns, as they can safely be computed just using the set of
+ * unique values.
+ */
+object CombineAggregates extends Rule[LogicalPlan] with PredicateHelper {
+
+ /**
+ * The aggregate expression list includes both aggregate expressions and
+ * the projected group by keys; this filters out the aggregate expressions
+ * in the list leaving just the group by keys. It also unwraps aliases to
+ * just give a list of the projected grouping expressions themselves.
+ */
+ def justProjectedGroupExprs(aggExprs: Seq[NamedExpression],
+ groupExprs: Seq[Expression]): Seq[NamedExpression] = {
+ aggExprs.filter(namedEx =>
+ groupExprs.exists(_.semanticEquals(unwrapAlias(namedEx)))
+ )
+ }
+
+ def unwrapAlias(ex: Expression): Expression = {
+ if (ex.isInstanceOf[Alias]) ex.children.head
+ else ex
+ }
Review comment:
```scala
def unwrapAlias(ex: Expression): Expression = ex match {
case Alias(c, _) => c
case _ => ex
}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]