This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new dc1db95 [SPARK-36867][SQL] Fix error message with GROUP BY alias
dc1db95 is described below
commit dc1db950adb9a210acfe4a0a77988955a5f35e5e
Author: Wenchen Fan <[email protected]>
AuthorDate: Tue Oct 12 22:47:31 2021 +0800
[SPARK-36867][SQL] Fix error message with GROUP BY alias
### What changes were proposed in this pull request?
When checking unresolved attributes, we should check
`Aggregate.aggregateExpressions` before `Aggregate.groupingExpressions`,
because the latter may rely on the former, due to the GROUP BY alias feature.
### Why are the changes needed?
improve error message
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
new test
Closes #34244 from cloud-fan/bug.
Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/analysis/CheckAnalysis.scala | 28 +++++++++++++---------
.../test/resources/sql-tests/inputs/group-by.sql | 3 +++
.../resources/sql-tests/results/group-by.sql.out | 11 ++++++++-
3 files changed, 30 insertions(+), 12 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index bdd7ffb..5bf37a2 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -165,7 +165,14 @@ trait CheckAnalysis extends PredicateHelper with
LookupCatalog {
}
}
- operator transformExpressionsUp {
+ val exprs = operator match {
+ // `groupingExpressions` may rely on `aggregateExpressions`, due to
the GROUP BY alias
+ // feature. We should check errors in `aggregateExpressions` first.
+ case a: Aggregate => a.aggregateExpressions ++ a.groupingExpressions
+ case _ => operator.expressions
+ }
+
+ exprs.foreach(_.foreachUp {
case a: Attribute if !a.resolved =>
val missingCol = a.sql
val candidates = operator.inputSet.toSeq.map(_.qualifiedName)
@@ -209,27 +216,26 @@ trait CheckAnalysis extends PredicateHelper with
LookupCatalog {
failAnalysis(s"${wf.prettyName} function can only be evaluated in
an ordered " +
s"row-based window frame with a single offset: $w")
- case w @ WindowExpression(e, s) =>
+ case w: WindowExpression =>
// Only allow window functions with an aggregate expression or an
offset window
// function or a Pandas window UDF.
- e match {
+ w.windowFunction match {
case _: AggregateExpression | _: FrameLessOffsetWindowFunction |
- _: AggregateWindowFunction =>
- w
- case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) =>
- w
- case _ =>
- failAnalysis(s"Expression '$e' not supported within a window
function.")
+ _: AggregateWindowFunction => // OK
+ case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) => // OK
+ case other =>
+ failAnalysis(s"Expression '$other' not supported within a
window function.")
}
case s: SubqueryExpression =>
checkSubqueryExpression(operator, s)
- s
case e: ExpressionWithRandomSeed if !e.seedExpression.foldable =>
failAnalysis(
s"Input argument to ${e.prettyName} must be a constant.")
- }
+
+ case _ =>
+ })
operator match {
case etw: EventTimeWatermark =>
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index e2c3672..039373b 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -45,6 +45,9 @@ SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1
AS a, 2 AS b, 3 AS
SELECT a AS k, COUNT(b) FROM testData GROUP BY k;
SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1;
+-- GROUP BY alias with invalid col in SELECT list
+SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k;
+
-- Aggregate functions cannot be used in GROUP BY
SELECT COUNT(b) AS k FROM testData GROUP BY k;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 37deb87..f598f49 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 64
+-- Number of queries: 65
-- !query
@@ -162,6 +162,15 @@ struct<k:int,count(b):bigint>
-- !query
+SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Column 'non_existing' does not exist. Did you mean one of the following?
[testdata.a, testdata.b]; line 1 pos 21
+
+
+-- !query
SELECT COUNT(b) AS k FROM testData GROUP BY k
-- !query schema
struct<>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]