maropu commented on a change in pull request #26420: [SPARK-27986][SQL] Support
ANSI SQL filter predicate for aggregate expression.
URL: https://github.com/apache/spark/pull/26420#discussion_r349866351
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala
##########
@@ -157,38 +180,87 @@ abstract class AggregationIterator(
inputAttributes: Seq[Attribute]): (InternalRow, InternalRow) => Unit = {
val joinedRow = new JoinedRow
if (expressions.nonEmpty) {
- val mergeExpressions = functions.zip(expressions).flatMap {
- case (ae: DeclarativeAggregate, expression) =>
- expression.mode match {
+ var isFinalOrMerge = false
+ val mergeExpressions = functions.zipWithIndex.collect {
+ case (ae: DeclarativeAggregate, i) =>
+ expressions(i).mode match {
case Partial | Complete => ae.updateExpressions
- case PartialMerge | Final => ae.mergeExpressions
+ case PartialMerge | Final =>
+ isFinalOrMerge = true
+ ae.mergeExpressions
}
case (agg: AggregateFunction, _) =>
Seq.fill(agg.aggBufferAttributes.length)(NoOp)
}
val updateFunctions = functions.zipWithIndex.collect {
case (ae: ImperativeAggregate, i) =>
expressions(i).mode match {
case Partial | Complete =>
- (buffer: InternalRow, row: InternalRow) => ae.update(buffer, row)
+ Option(predicates(i)) match {
+ case Some(predicate) =>
+ (buffer: InternalRow, row: InternalRow) =>
+ if (predicate.eval(row)) { ae.update(buffer, row) }
+ case _ => (buffer: InternalRow, row: InternalRow) =>
ae.update(buffer, row)
+ }
case PartialMerge | Final =>
(buffer: InternalRow, row: InternalRow) => ae.merge(buffer, row)
}
}.toArray
// This projection is used to merge buffer values for all
expression-based aggregates.
val aggregationBufferSchema = functions.flatMap(_.aggBufferAttributes)
- val updateProjection =
- newMutableProjection(mergeExpressions, aggregationBufferSchema ++
inputAttributes)
+ val updateProjection = newMutableProjection(
+ mergeExpressions.flatMap(_.seq), aggregationBufferSchema ++
inputAttributes)
- (currentBuffer: InternalRow, row: InternalRow) => {
- // Process all expression-based aggregate functions.
- updateProjection.target(currentBuffer)(joinedRow(currentBuffer, row))
+ val processImperative = (currentBuffer: InternalRow, row: InternalRow)
=> {
// Process all imperative aggregate functions.
var i = 0
while (i < updateFunctions.length) {
updateFunctions(i)(currentBuffer, row)
i += 1
}
}
+
+ // The following two situations will adopt a common implementation:
+ // First, no filter predicate is specified for any aggregate expression.
+ // Second, aggregate expressions are in merge or final mode.
+ if (predicates.isEmpty || isFinalOrMerge) {
+ (currentBuffer: InternalRow, row: InternalRow) => {
+ updateProjection.target(currentBuffer)(joinedRow(currentBuffer, row))
+ processImperative(currentBuffer, row)
+ }
+ } else {
+ // In the list of aggregate expressions, if a filter predicate is
specified for at least one
+ // aggregate expression and aggregate expressions are in partial or
complete mode,
+ // then the filter will be used.
+ // Suppose there is a list of aggregate expressions, such as exprA
with filterA, exprB,
+ // exprC with filterC, then the specific implementation process is as
follows:
+ // 1. Accept data row.
+ // 2. Execute multiple aggregate expressions in sequence.
+ // 2-1. Filter the data row using filter predicate filterA. If the
filter predicate
+ // filterA is met, then calculate using aggregate expression
exprA.
+ // 2-2. Calculate using aggregate expression exprB.
+ // 2-3. Filter the data row using filter predicate filterC. If the
filter predicate
+ // filterC is met, then calculate using aggregate expression
exprC.
+ (currentBuffer: InternalRow, row: InternalRow) => {
+ val dynamicMergeExpressions = new mutable.ArrayBuffer[Expression]
Review comment:
Can you move the `predicate` handling for expression-based agg functions
outside this row-by-row loop? The current code can cause excessive overhead
when processing rows, because it runs for every input row.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]