This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new dc1db95  [SPARK-36867][SQL] Fix error message with GROUP BY alias
dc1db95 is described below

commit dc1db950adb9a210acfe4a0a77988955a5f35e5e
Author: Wenchen Fan <[email protected]>
AuthorDate: Tue Oct 12 22:47:31 2021 +0800

    [SPARK-36867][SQL] Fix error message with GROUP BY alias
    
    ### What changes were proposed in this pull request?
    
    When checking unresolved attributes, we should check 
`Aggregate.aggregateExpressions` before `Aggregate.groupingExpressions`, 
because the latter may rely on the former, due to the GROUP BY alias feature.
    
    ### Why are the changes needed?
    
    improve error message
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    new test
    
    Closes #34244 from cloud-fan/bug.
    
    Authored-by: Wenchen Fan <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../sql/catalyst/analysis/CheckAnalysis.scala      | 28 +++++++++++++---------
 .../test/resources/sql-tests/inputs/group-by.sql   |  3 +++
 .../resources/sql-tests/results/group-by.sql.out   | 11 ++++++++-
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index bdd7ffb..5bf37a2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -165,7 +165,14 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
             }
         }
 
-        operator transformExpressionsUp {
+        val exprs = operator match {
+          // `groupingExpressions` may rely on `aggregateExpressions`, due to 
the GROUP BY alias
+          // feature. We should check errors in `aggregateExpressions` first.
+          case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions
+          case _ => operator.expressions
+        }
+
+        exprs.foreach(_.foreachUp {
           case a: Attribute if !a.resolved =>
             val missingCol = a.sql
             val candidates = operator.inputSet.toSeq.map(_.qualifiedName)
@@ -209,27 +216,26 @@ trait CheckAnalysis extends PredicateHelper with 
LookupCatalog {
             failAnalysis(s"${wf.prettyName} function can only be evaluated in 
an ordered " +
               s"row-based window frame with a single offset: $w")
 
-          case w @ WindowExpression(e, s) =>
+          case w: WindowExpression =>
             // Only allow window functions with an aggregate expression or an 
offset window
             // function or a Pandas window UDF.
-            e match {
+            w.windowFunction match {
               case _: AggregateExpression | _: FrameLessOffsetWindowFunction |
-                  _: AggregateWindowFunction =>
-                w
-              case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) =>
-                w
-              case _ =>
-                failAnalysis(s"Expression '$e' not supported within a window 
function.")
+                  _: AggregateWindowFunction => // OK
+              case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) => // OK
+              case other =>
+                failAnalysis(s"Expression '$other' not supported within a 
window function.")
             }
 
           case s: SubqueryExpression =>
             checkSubqueryExpression(operator, s)
-            s
 
           case e: ExpressionWithRandomSeed if !e.seedExpression.foldable =>
             failAnalysis(
               s"Input argument to ${e.prettyName} must be a constant.")
-        }
+
+          case _ =>
+        })
 
         operator match {
           case etw: EventTimeWatermark =>
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql 
b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index e2c3672..039373b 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -45,6 +45,9 @@ SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 
AS a, 2 AS b, 3 AS
 SELECT a AS k, COUNT(b) FROM testData GROUP BY k;
 SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1;
 
+-- GROUP BY alias with invalid col in SELECT list
+SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k;
+
 -- Aggregate functions cannot be used in GROUP BY
 SELECT COUNT(b) AS k FROM testData GROUP BY k;
 
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out 
b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 37deb87..f598f49 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 64
+-- Number of queries: 65
 
 
 -- !query
@@ -162,6 +162,15 @@ struct<k:int,count(b):bigint>
 
 
 -- !query
+SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Column 'non_existing' does not exist. Did you mean one of the following? 
[testdata.a, testdata.b]; line 1 pos 21
+
+
+-- !query
 SELECT COUNT(b) AS k FROM testData GROUP BY k
 -- !query schema
 struct<>

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to