(spark) branch master updated: [SPARK-47895][SQL] group by all should be idempotent

wenchen Thu, 18 Apr 2024 01:34:05 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new b5bb75ca240a [SPARK-47895][SQL] group by all should be idempotent
b5bb75ca240a is described below

commit b5bb75ca240a98ae5651e5cb429fd4bd31b7bb8a
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Thu Apr 18 16:33:47 2024 +0800

    [SPARK-47895][SQL] group by all should be idempotent
    
    ### What changes were proposed in this pull request?
    
    This is a followup of https://github.com/apache/spark/pull/43797 . GROUP BY 
ALL has the same bug and this PR applies the same fix to GROUP BY ALL
    
    ### Why are the changes needed?
    
    For advanced users or Spark plugins, they may manipulate the logical plans 
directly. We need to make the framework more reliable.
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    new test
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #46113 from cloud-fan/group-all.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../analysis/ResolveReferencesInAggregate.scala        | 16 ++++++++++++++--
 .../analysis/SubstituteUnresolvedOrdinalsSuite.scala   | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
index 4f5a11835c33..7ea90854932e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.SQLConfHelper
-import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, 
Expression, NamedExpression}
+import org.apache.spark.sql.catalyst.expressions.{AliasHelper, Attribute, 
Expression, IntegerLiteral, Literal, NamedExpression}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, AppendColumns, 
LogicalPlan}
 import 
org.apache.spark.sql.catalyst.trees.TreePattern.{LATERAL_COLUMN_ALIAS_REFERENCE,
 UNRESOLVED_ATTRIBUTE}
@@ -136,7 +136,19 @@ class ResolveReferencesInAggregate(val catalogManager: 
CatalogManager) extends S
         groupExprs
       } else {
         // This is a valid GROUP BY ALL aggregate.
-        expandedGroupExprs.get
+        expandedGroupExprs.get.zipWithIndex.map { case (expr, index) =>
+          trimAliases(expr) match {
+            // HACK ALERT: If the expanded grouping expression is an integer 
literal, don't use it
+            //             but use an integer literal of the index. The reason 
is we may repeatedly
+            //             analyze the plan, and the original integer literal 
may cause failures
+            //             with a later GROUP BY ordinal resolution. GROUP BY 
constant is
+            //             meaningless so whatever value does not matter here.
+            case IntegerLiteral(_) =>
+              // GROUP BY ordinal uses 1-based index.
+              Literal(index + 1)
+            case _ => expr
+          }
+        }
       }
     } else {
       groupExprs
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
index 953b2c8bb101..39cf298aec43 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
@@ -86,4 +86,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest 
{
       testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
     )
   }
+
+  test("SPARK-47895: group by all repeated analysis") {
+    val plan = testRelation.groupBy($"all")(Literal(100).as("a")).analyze
+    comparePlans(
+      plan,
+      testRelation.groupBy(Literal(1))(Literal(100).as("a"))
+    )
+
+    val testRelationWithData = testRelation.copy(data = Seq(new 
GenericInternalRow(Array(1: Any))))
+    // Copy the plan to reset its `analyzed` flag, so that analyzer rules will 
re-apply.
+    val copiedPlan = plan.transform {
+      case _: LocalRelation => testRelationWithData
+    }
+    comparePlans(
+      copiedPlan.analyze, // repeated analysis
+      testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
+    )
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-47895][SQL] group by all should be idempotent

Reply via email to