This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new fae2503 [SPARK-31577][SQL] Fix case-sensitivity and forward name
conflict problems when check name conflicts of CTE relations
fae2503 is described below
commit fae25030de002099f5ad3e741e4fec1281ea29ab
Author: Wenchen Fan <[email protected]>
AuthorDate: Mon Apr 27 16:47:39 2020 -0700
[SPARK-31577][SQL] Fix case-sensitivity and forward name conflict problems
when check name conflicts of CTE relations
### What changes were proposed in this pull request?
This is a followup of https://github.com/apache/spark/pull/28318, to make
the code more readable, by adding some comments to explain the trick and
simplify the code to use a boolean flag instead of 2 string sets.
This PR also fixes various problems:
1. the name check should consider case sensitivity
2. forward name conflicts like `with t as (with t2 as ...), t2 as ...` is
not a real conflict and we shouldn't fail.
### Why are the changes needed?
correct the behavior
### Does this PR introduce any user-facing change?
yes, fix the fore-mentioned behaviors.
### How was this patch tested?
new tests
Closes #28371 from cloud-fan/followup.
Authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 2f4f38b6f16cdc4bdb2f78c671707760cf429e4f)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../sql/catalyst/analysis/CTESubstitution.scala | 48 +++++++++++++--------
.../test/resources/sql-tests/inputs/cte-nested.sql | 25 +++++++++++
.../resources/sql-tests/results/cte-legacy.sql.out | 42 ++++++++++++++++++-
.../resources/sql-tests/results/cte-nested.sql.out | 49 ++++++++++++++++++++--
.../sql-tests/results/cte-nonlegacy.sql.out | 42 ++++++++++++++++++-
5 files changed, 182 insertions(+), 24 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
index d9fdb56..41e4c1f 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
@@ -17,9 +17,11 @@
package org.apache.spark.sql.catalyst.analysis
+import scala.collection.mutable
+
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan,
SubqueryAlias, With}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY,
LegacyBehaviorPolicy}
@@ -41,34 +43,44 @@ object CTESubstitution extends Rule[LogicalPlan] {
}
/**
- * Check the plan to be traversed has naming conflicts in nested CTE or not,
traverse through
- * child, innerChildren and subquery expressions for the current plan.
+ * Spark 3.0 changes the CTE relations resolution, and inner relations take
precedence. This is
+ * correct but we need to warn users about this behavior change under
EXCEPTION mode, when we see
+ * CTE relations with conflicting names.
+ *
+ * Note that, before Spark 3.0 the parser didn't support CTE in the FROM
clause. For example,
+ * `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We
should not fail for this
+ * case, as Spark versions before 3.0 can't run it anyway. The parameter
`startOfQuery` is used
+ * to indicate where we can define CTE relations before Spark 3.0, and we
should only check
+ * name conflicts when `startOfQuery` is true.
*/
private def assertNoNameConflictsInCTE(
plan: LogicalPlan,
- outerCTERelationNames: Set[String] = Set.empty,
- namesInSubqueries: Set[String] = Set.empty): Unit = {
+ outerCTERelationNames: Seq[String] = Nil,
+ startOfQuery: Boolean = true): Unit = {
+ val resolver = SQLConf.get.resolver
plan match {
- case w @ With(child, relations) =>
- val newNames = relations.map {
- case (cteName, _) =>
- if (outerCTERelationNames.contains(cteName)) {
- throw new AnalysisException(s"Name $cteName is ambiguous in
nested CTE. " +
+ case With(child, relations) =>
+ val newNames = mutable.ArrayBuffer.empty[String]
+ newNames ++= outerCTERelationNames
+ relations.foreach {
+ case (name, relation) =>
+ if (startOfQuery && outerCTERelationNames.exists(resolver(_,
name))) {
+ throw new AnalysisException(s"Name $name is ambiguous in nested
CTE. " +
s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED
so that name " +
"defined in inner CTE takes precedence. If set it to LEGACY,
outer CTE " +
"definitions will take precedence. See more details in
SPARK-28228.")
- } else {
- cteName
}
- }.toSet ++ namesInSubqueries
- assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
- w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames,
newNames))
+ // CTE relation is defined as `SubqueryAlias`. Here we skip it and
check the child
+ // directly, so that `startOfQuery` is set correctly.
+ assertNoNameConflictsInCTE(relation.child, newNames)
+ newNames += name
+ }
+ assertNoNameConflictsInCTE(child, newNames, startOfQuery = false)
case other =>
- other.subqueries.foreach(
- assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
+ other.subqueries.foreach(assertNoNameConflictsInCTE(_,
outerCTERelationNames))
other.children.foreach(
- assertNoNameConflictsInCTE(_, outerCTERelationNames,
namesInSubqueries))
+ assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery =
false))
}
}
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
index 134f88b..3b64b5d 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
@@ -111,3 +111,28 @@ WHERE c IN (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
);
+
+-- forward name conflict is not a real conflict
+WITH
+ t AS (
+ WITH t2 AS (SELECT 1)
+ SELECT * FROM t2
+ ),
+ t2 AS (SELECT 2)
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in other CTE relations
+WITH
+ abc AS (SELECT 1),
+ t AS (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+ )
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in subquery expressions
+WITH abc AS (SELECT 1)
+SELECT (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+);
\ No newline at end of file
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
index 629daf7..4d0e5ea 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
-- !query
@@ -179,3 +179,43 @@ WHERE c IN (
struct<c:int>
-- !query output
1
+
+
+-- !query
+WITH
+ t AS (
+ WITH t2 AS (SELECT 1)
+ SELECT * FROM t2
+ ),
+ t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+ abc AS (SELECT 1),
+ t AS (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+ )
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
index 34e9be0..2f736c7 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
-- !query
@@ -64,10 +64,9 @@ WITH
)
SELECT * FROM t2
-- !query schema
-struct<>
+struct<scalarsubquery():int>
-- !query output
-org.apache.spark.sql.AnalysisException
-Name t is ambiguous in nested CTE. Please set
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take
precedence. See more details in SPARK-28228.;
+2
-- !query
@@ -186,3 +185,45 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name t is ambiguous in nested CTE. Please set
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take
precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH
+ t AS (
+ WITH t2 AS (SELECT 1)
+ SELECT * FROM t2
+ ),
+ t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+ abc AS (SELECT 1),
+ t AS (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+ )
+SELECT * FROM t
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take
precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take
precedence. See more details in SPARK-28228.;
diff --git
a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
index 6eba1ad..74394ee 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
-- !query
@@ -179,3 +179,43 @@ WHERE c IN (
struct<c:int>
-- !query output
+
+
+-- !query
+WITH
+ t AS (
+ WITH t2 AS (SELECT 1)
+ SELECT * FROM t2
+ ),
+ t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+ abc AS (SELECT 1),
+ t AS (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+ )
+SELECT * FROM t
+-- !query schema
+struct<2:int>
+-- !query output
+2
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+ WITH aBc AS (SELECT 2)
+ SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+2
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]