This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e8e7dcce4ac7 [SPARK-47786] SELECT DISTINCT (*) should not become
SELECT DISTINCT struct(*) (revert to previous behavior)
e8e7dcce4ac7 is described below
commit e8e7dcce4ac76f494781ccd7712634b73a6dbe14
Author: Serge Rielau <[email protected]>
AuthorDate: Wed Apr 10 10:29:32 2024 +0800
[SPARK-47786] SELECT DISTINCT (*) should not become SELECT DISTINCT
struct(*) (revert to previous behavior)
### What changes were proposed in this pull request?
We special case SELECT DISTINCT (*) to become SELECT DISTINCT *
This prevents (*) to be treated as struct(*).
We used to ignore parens around stars (*) everywhere, but that is
inconsistent with e.g. (c1, c2).
However there seems to be a reasonable number of users treating DISTINCT as
a function.
### Why are the changes needed?
Prevent regression for a common weird case.
### Does this PR introduce _any_ user-facing change?
No, It undoes a user changing change
### How was this patch tested?
Existing QA, added new tests to selectExcept.sql
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45970 from srielau/SPARK-47786-distinct-star-fix.
Authored-by: Serge Rielau <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/parser/SqlBaseParser.g4 | 7 +++-
.../spark/sql/catalyst/parser/AstBuilder.scala | 9 +++++
.../analyzer-results/selectExcept.sql.out | 41 ++++++++++++++++++++++
.../resources/sql-tests/inputs/selectExcept.sql | 12 +++++++
.../sql-tests/results/selectExcept.sql.out | 32 +++++++++++++++++
5 files changed, 100 insertions(+), 1 deletion(-)
diff --git
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 6e79d4af2f5e..8ff1d3f95301 100644
---
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -575,8 +575,13 @@ transformClause
(RECORDREADER recordReader=stringLit)?
;
+parenthesizedStar
+ : LEFT_PAREN ASTERISK RIGHT_PAREN
+ ;
+
selectClause
- : SELECT (hints+=hint)* setQuantifier? namedExpressionSeq
+ : SELECT (hints+=hint)* setQuantifier parenthesizedStar
+ | SELECT (hints+=hint)* setQuantifier? namedExpressionSeq
;
setClause
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index b4ba2c1caa22..4ffff6bf765b 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2595,6 +2595,15 @@ class AstBuilder extends DataTypeAstBuilder with
SQLConfHelper with Logging {
UnresolvedExtractValue(expression(ctx.value), expression(ctx.index))
}
+ /**
+ * Create an expression for an expression between parentheses. This is need
because the ANTLR
+ * visitor cannot automatically convert the nested context into an
expression.
+ */
+ override def visitParenthesizedStar(
+ ctx: ParenthesizedStarContext): Seq[Expression] = withOrigin(ctx) {
+ Seq(UnresolvedStar(None))
+ }
+
/**
* Create an expression for an expression between parentheses. This is need
because the ANTLR
* visitor cannot automatically convert the nested context into an
expression.
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out
index 28c6a77d2333..340712399d4f 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out
@@ -548,13 +548,54 @@ Project [c1#x, c2#x, c3#x, c4#x, c5#x]
+- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x]
+-- !query
+SELECT DISTINCT * FROM v1
+-- !query analysis
+Distinct
++- Project [c1#x, c2#x, c3#x, c4#x, c5#x]
+ +- SubqueryAlias v1
+ +- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x])
+ +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x,
cast(c3#x as void) AS c3#x, cast(c4#x as int) AS c4#x, cast(c5#x as int) AS
c5#x]
+ +- SubqueryAlias T
+ +- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x]
+
+
+-- !query
+SELECT DISTINCT(*) FROM v1
+-- !query analysis
+Distinct
++- SubqueryAlias v1
+ +- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x])
+ +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x,
cast(c3#x as void) AS c3#x, cast(c4#x as int) AS c4#x, cast(c5#x as int) AS
c5#x]
+ +- SubqueryAlias T
+ +- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x]
+
+
-- !query
SET spark.sql.legacy.ignoreParenthesesAroundStar = false
-- !query analysis
SetCommand (spark.sql.legacy.ignoreParenthesesAroundStar,Some(false))
+-- !query
+SELECT DISTINCT(*) FROM v1
+-- !query analysis
+Distinct
++- SubqueryAlias v1
+ +- View (`v1`, [c1#x, c2#x, c3#x, c4#x, c5#x])
+ +- Project [cast(c1#x as int) AS c1#x, cast(c2#x as int) AS c2#x,
cast(c3#x as void) AS c3#x, cast(c4#x as int) AS c4#x, cast(c5#x as int) AS
c5#x]
+ +- SubqueryAlias T
+ +- LocalRelation [c1#x, c2#x, c3#x, c4#x, c5#x]
+
+
-- !query
DROP VIEW v1
-- !query analysis
DropTempViewCommand v1
+
+
+-- !query
+SELECT DISTINCT(*)
+-- !query analysis
+Distinct
++- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql
b/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql
index 88b30388ca08..96b0fcc1e6ae 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql
@@ -77,5 +77,17 @@ SELECT T.* FROM v1, LATERAL (SELECT COALESCE(v1.*)) AS T(x);
-- We used to ignore () around * in the past, but now we don't
SET spark.sql.legacy.ignoreParenthesesAroundStar = true;
SELECT (*) FROM v1;
+
+SELECT DISTINCT * FROM v1;
+
+SELECT DISTINCT(*) FROM v1;
+
SET spark.sql.legacy.ignoreParenthesesAroundStar = false;
+
+SELECT DISTINCT(*) FROM v1;
+
DROP VIEW v1;
+
+-- Except for DISTINCT, where we still ignore ()
+SELECT DISTINCT(*)
+
diff --git a/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out
b/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out
index 22f68fb9e5da..ecf47fdb7a29 100644
--- a/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out
@@ -512,6 +512,22 @@ struct<c1:int,c2:int,c3:void,c4:int,c5:int>
1 2 NULL 4 5
+-- !query
+SELECT DISTINCT * FROM v1
+-- !query schema
+struct<c1:int,c2:int,c3:void,c4:int,c5:int>
+-- !query output
+1 2 NULL 4 5
+
+
+-- !query
+SELECT DISTINCT(*) FROM v1
+-- !query schema
+struct<c1:int,c2:int,c3:void,c4:int,c5:int>
+-- !query output
+1 2 NULL 4 5
+
+
-- !query
SET spark.sql.legacy.ignoreParenthesesAroundStar = false
-- !query schema
@@ -520,9 +536,25 @@ struct<key:string,value:string>
spark.sql.legacy.ignoreParenthesesAroundStar false
+-- !query
+SELECT DISTINCT(*) FROM v1
+-- !query schema
+struct<c1:int,c2:int,c3:void,c4:int,c5:int>
+-- !query output
+1 2 NULL 4 5
+
+
-- !query
DROP VIEW v1
-- !query schema
struct<>
-- !query output
+
+
+-- !query
+SELECT DISTINCT(*)
+-- !query schema
+struct<>
+-- !query output
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]