This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new 7aea21eae377 [SPARK-47895][SQL] group by all should be idempotent 7aea21eae377 is described below commit 7aea21eae377321633d3eeeeddd34898e9a5ea43 Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Thu Apr 18 16:33:47 2024 +0800 [SPARK-47895][SQL] group by all should be idempotent ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/43797 . GROUP BY ALL has the same bug and this PR applies the same fix to GROUP BY ALL ### Why are the changes needed? For advanced users or Spark plugins, they may manipulate the logical plans directly. We need to make the framework more reliable. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? no Closes #46113 from cloud-fan/group-all. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit b5bb75ca240a98ae5651e5cb429fd4bd31b7bb8a) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../analysis/ResolveReferencesInAggregate.scala | 16 ++++++++++++++-- .../analysis/SubstituteUnresolvedOrdinalsSuite.scala | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala index 09ae87b071fd..a03d5438ff6a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SQLConfHelper -import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, Expression, IntegerLiteral, Literal, NamedExpression} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, AppendColumns, LogicalPlan} import org.apache.spark.sql.catalyst.trees.TreePattern.{LATERAL_COLUMN_ALIAS_REFERENCE, UNRESOLVED_ATTRIBUTE} @@ -134,7 +134,19 @@ object ResolveReferencesInAggregate extends SQLConfHelper groupExprs } else { // This is a valid GROUP BY ALL aggregate. - expandedGroupExprs.get + expandedGroupExprs.get.zipWithIndex.map { case (expr, index) => + trimAliases(expr) match { + // HACK ALERT: If the expanded grouping expression is an integer literal, don't use it + // but use an integer literal of the index. The reason is we may repeatedly + // analyze the plan, and the original integer literal may cause failures + // with a later GROUP BY ordinal resolution. GROUP BY constant is + // meaningless so whatever value does not matter here. + case IntegerLiteral(_) => + // GROUP BY ordinal uses 1-based index. + Literal(index + 1) + case _ => expr + } + } } } else { groupExprs diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 953b2c8bb101..39cf298aec43 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -86,4 +86,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) ) } + + test("SPARK-47895: group by all repeated analysis") { + val plan = testRelation.groupBy($"all")(Literal(100).as("a")).analyze + comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) + ) + + val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any)))) + // Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. + val copiedPlan = plan.transform { + case _: LocalRelation => testRelationWithData + } + comparePlans( + copiedPlan.analyze, // repeated analysis + testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) + ) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org