This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new 8bcbf7701388 [SPARK-47561][SQL] Fix analyzer rule order issues about
Alias
8bcbf7701388 is described below
commit 8bcbf7701388a2da06369ae9317d7707624edba0
Author: Wenchen Fan <[email protected]>
AuthorDate: Tue Mar 26 07:45:54 2024 -0700
[SPARK-47561][SQL] Fix analyzer rule order issues about Alias
### What changes were proposed in this pull request?
We found two analyzer rule execution order issues in our internal workloads:
- `CreateStruct.apply` creates `NamePlaceholder` for unresolved
`NamedExpression`. However, with certain rule execution order, the
`NamedExpression` may be removed (e.g. remove unnecessary `Alias`) before
`NamePlaceholder` is resolved, then `NamePlaceholder` can't be resolved anymore.
- UNPIVOT uses `UnresolvedAlias` to wrap `UnresolvedAttribute`. There is a
conflict about how to determine the final alias name. If `ResolveAliases` runs
first, then `UnresolvedAlias` will be removed and eventually the alias will be
`b` for nested column `a.b`. If `ResolveReferences` runs first, then we resolve
`a.b` first and then `UnresolvedAlias` will determine the alias as `a.b` not
`b`.
This PR fixes the two issues
- `CreateStruct.apply` should determine the field name immediately if the
input is `Alias`
- The parser rule for UNPIVOT should follow how we parse SELECT and return
`UnresolvedAttribute` directly without the `UnresolvedAlias` wrapper. It's a
bit risky to fix the order issue between `ResolveAliases` and
`ResolveReferences` as it can change the final query schema, we will save it
for later.
### Why are the changes needed?
fix unstable analyzer behavior with different rule execution orders.
### Does this PR introduce _any_ user-facing change?
Yes, some failed queries can run now. The issue for UNPIVOT only affects
the error message.
### How was this patch tested?
verified by our internal workloads. The repro query is quite complicated to
trigger a certain rule execution order so we won't add tests for it. The fix is
quite obvious.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #45718 from cloud-fan/rule.
Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../catalyst/expressions/complexTypeCreator.scala | 1 +
.../spark/sql/catalyst/parser/AstBuilder.scala | 2 +-
.../sql/catalyst/parser/UnpivotParserSuite.scala | 39 ++++++++++------------
3 files changed, 20 insertions(+), 22 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index 205121913121..c95a0987330d 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -372,6 +372,7 @@ object CreateStruct {
// alias name inside CreateNamedStruct.
case (u: UnresolvedAttribute, _) => Seq(Literal(u.nameParts.last), u)
case (u @ UnresolvedExtractValue(_, e: Literal), _) if e.dataType ==
StringType => Seq(e, u)
+ case (a: Alias, _) => Seq(Literal(a.name), a)
case (e: NamedExpression, _) if e.resolved => Seq(Literal(e.name), e)
case (e: NamedExpression, _) => Seq(NamePlaceholder, e)
case (e, index) => Seq(Literal(s"col${index + 1}"), e)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 90fbdd94dc38..5d68aed9245a 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -1291,7 +1291,7 @@ class AstBuilder extends DataTypeAstBuilder with
SQLConfHelper with Logging {
* Create an Unpivot column.
*/
override def visitUnpivotColumn(ctx: UnpivotColumnContext): NamedExpression
= withOrigin(ctx) {
-
UnresolvedAlias(UnresolvedAttribute(visitMultipartIdentifier(ctx.multipartIdentifier)))
+ UnresolvedAttribute(visitMultipartIdentifier(ctx.multipartIdentifier))
}
/**
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
index c680e08c1c83..3012ef6f1544 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/UnpivotParserSuite.scala
@@ -39,7 +39,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t UNPIVOT (val FOR col in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -59,7 +59,7 @@ class UnpivotParserSuite extends AnalysisTest {
sql,
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
Some(Seq(Some("A"), None)),
"col",
Seq("val"),
@@ -76,7 +76,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t UNPIVOT ((val1, val2) FOR col in ((a, b), (c, d)))",
Unpivot(
None,
- Some(Seq(Seq($"a", $"b").map(UnresolvedAlias(_)), Seq($"c",
$"d").map(UnresolvedAlias(_)))),
+ Some(Seq(Seq($"a", $"b"), Seq($"c", $"d"))),
None,
"col",
Seq("val1", "val2"),
@@ -96,10 +96,7 @@ class UnpivotParserSuite extends AnalysisTest {
sql,
Unpivot(
None,
- Some(Seq(
- Seq($"a", $"b").map(UnresolvedAlias(_)),
- Seq($"c", $"d").map(UnresolvedAlias(_))
- )),
+ Some(Seq(Seq($"a", $"b"), Seq($"c", $"d"))),
Some(Seq(Some("first"), None)),
"col",
Seq("val1", "val2"),
@@ -132,7 +129,7 @@ class UnpivotParserSuite extends AnalysisTest {
sql,
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -169,7 +166,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t UNPIVOT EXCLUDE NULLS (val FOR col in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -184,7 +181,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t UNPIVOT INCLUDE NULLS (val FOR col in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -199,7 +196,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t1 UNPIVOT (val FOR col in (a, b)) JOIN t2",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -211,7 +208,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t1 JOIN t2 UNPIVOT (val FOR col in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -224,7 +221,7 @@ class UnpivotParserSuite extends AnalysisTest {
table("t1").join(
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -239,7 +236,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t1 UNPIVOT (val FOR col in (a, b)), t2",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -251,7 +248,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t1, t2 UNPIVOT (val FOR col in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -267,7 +264,7 @@ class UnpivotParserSuite extends AnalysisTest {
table("t1").join(
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -282,7 +279,7 @@ class UnpivotParserSuite extends AnalysisTest {
table("t1").join(
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -296,7 +293,7 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t1, t2 JOIN t3 UNPIVOT (val FOR col in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -311,7 +308,7 @@ class UnpivotParserSuite extends AnalysisTest {
table("t1").join(
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
@@ -326,13 +323,13 @@ class UnpivotParserSuite extends AnalysisTest {
"SELECT * FROM t1 UNPIVOT (val FOR col in (a, b)) UNPIVOT (val FOR col
in (a, b))",
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
Unpivot(
None,
- Some(Seq(Seq(UnresolvedAlias($"a")), Seq(UnresolvedAlias($"b")))),
+ Some(Seq(Seq($"a"), Seq($"b"))),
None,
"col",
Seq("val"),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]