This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new fae2503  [SPARK-31577][SQL] Fix case-sensitivity and forward name 
conflict problems when check name conflicts of CTE relations
fae2503 is described below

commit fae25030de002099f5ad3e741e4fec1281ea29ab
Author: Wenchen Fan <[email protected]>
AuthorDate: Mon Apr 27 16:47:39 2020 -0700

    [SPARK-31577][SQL] Fix case-sensitivity and forward name conflict problems 
when check name conflicts of CTE relations
    
    ### What changes were proposed in this pull request?
    
    This is a followup of https://github.com/apache/spark/pull/28318, to make 
the code more readable, by adding some comments to explain the trick and 
simplify the code to use a boolean flag instead of 2 string sets.
    
    This PR also fixes various problems:
    1. the name check should consider case sensitivity
    2. forward name conflicts like `with t as (with t2 as ...), t2 as ...` is 
not a real conflict and we shouldn't fail.
    
    ### Why are the changes needed?
    
    correct the behavior
    
    ### Does this PR introduce any user-facing change?
    
    yes, fix the fore-mentioned behaviors.
    
    ### How was this patch tested?
    
    new tests
    
    Closes #28371 from cloud-fan/followup.
    
    Authored-by: Wenchen Fan <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
    (cherry picked from commit 2f4f38b6f16cdc4bdb2f78c671707760cf429e4f)
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../sql/catalyst/analysis/CTESubstitution.scala    | 48 +++++++++++++--------
 .../test/resources/sql-tests/inputs/cte-nested.sql | 25 +++++++++++
 .../resources/sql-tests/results/cte-legacy.sql.out | 42 ++++++++++++++++++-
 .../resources/sql-tests/results/cte-nested.sql.out | 49 ++++++++++++++++++++--
 .../sql-tests/results/cte-nonlegacy.sql.out        | 42 ++++++++++++++++++-
 5 files changed, 182 insertions(+), 24 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
index d9fdb56..41e4c1f 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
@@ -17,9 +17,11 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import scala.collection.mutable
+
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, 
SubqueryAlias, With}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, 
LegacyBehaviorPolicy}
@@ -41,34 +43,44 @@ object CTESubstitution extends Rule[LogicalPlan] {
   }
 
   /**
-   * Check the plan to be traversed has naming conflicts in nested CTE or not, 
traverse through
-   * child, innerChildren and subquery expressions for the current plan.
+   * Spark 3.0 changes the CTE relations resolution, and inner relations take 
precedence. This is
+   * correct but we need to warn users about this behavior change under 
EXCEPTION mode, when we see
+   * CTE relations with conflicting names.
+   *
+   * Note that, before Spark 3.0 the parser didn't support CTE in the FROM 
clause. For example,
+   * `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We 
should not fail for this
+   * case, as Spark versions before 3.0 can't run it anyway. The parameter 
`startOfQuery` is used
+   * to indicate where we can define CTE relations before Spark 3.0, and we 
should only check
+   * name conflicts when `startOfQuery` is true.
    */
   private def assertNoNameConflictsInCTE(
       plan: LogicalPlan,
-      outerCTERelationNames: Set[String] = Set.empty,
-      namesInSubqueries: Set[String] = Set.empty): Unit = {
+      outerCTERelationNames: Seq[String] = Nil,
+      startOfQuery: Boolean = true): Unit = {
+    val resolver = SQLConf.get.resolver
     plan match {
-      case w @ With(child, relations) =>
-        val newNames = relations.map {
-          case (cteName, _) =>
-            if (outerCTERelationNames.contains(cteName)) {
-              throw new AnalysisException(s"Name $cteName is ambiguous in 
nested CTE. " +
+      case With(child, relations) =>
+        val newNames = mutable.ArrayBuffer.empty[String]
+        newNames ++= outerCTERelationNames
+        relations.foreach {
+          case (name, relation) =>
+            if (startOfQuery && outerCTERelationNames.exists(resolver(_, 
name))) {
+              throw new AnalysisException(s"Name $name is ambiguous in nested 
CTE. " +
                 s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED 
so that name " +
                 "defined in inner CTE takes precedence. If set it to LEGACY, 
outer CTE " +
                 "definitions will take precedence. See more details in 
SPARK-28228.")
-            } else {
-              cteName
             }
-        }.toSet ++ namesInSubqueries
-        assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
-        w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, 
newNames))
+            // CTE relation is defined as `SubqueryAlias`. Here we skip it and 
check the child
+            // directly, so that `startOfQuery` is set correctly.
+            assertNoNameConflictsInCTE(relation.child, newNames)
+            newNames += name
+        }
+        assertNoNameConflictsInCTE(child, newNames, startOfQuery = false)
 
       case other =>
-        other.subqueries.foreach(
-          assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
+        other.subqueries.foreach(assertNoNameConflictsInCTE(_, 
outerCTERelationNames))
         other.children.foreach(
-          assertNoNameConflictsInCTE(_, outerCTERelationNames, 
namesInSubqueries))
+          assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery = 
false))
     }
   }
 
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql 
b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
index 134f88b..3b64b5d 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
@@ -111,3 +111,28 @@ WHERE c IN (
   WITH t(c) AS (SELECT 2)
   SELECT * FROM t
 );
+
+-- forward name conflict is not a real conflict
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in other CTE relations
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in subquery expressions
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+);
\ No newline at end of file
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
index 629daf7..4d0e5ea 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
 
 
 -- !query
@@ -179,3 +179,43 @@ WHERE c IN (
 struct<c:int>
 -- !query output
 1
+
+
+-- !query
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
index 34e9be0..2f736c7 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
 
 
 -- !query
@@ -64,10 +64,9 @@ WITH
   )
 SELECT * FROM t2
 -- !query schema
-struct<>
+struct<scalarsubquery():int>
 -- !query output
-org.apache.spark.sql.AnalysisException
-Name t is ambiguous in nested CTE. Please set 
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner 
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take 
precedence. See more details in SPARK-28228.;
+2
 
 
 -- !query
@@ -186,3 +185,45 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 Name t is ambiguous in nested CTE. Please set 
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner 
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take 
precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set 
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner 
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take 
precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set 
spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner 
CTE takes precedence. If set it to LEGACY, outer CTE definitions will take 
precedence. See more details in SPARK-28228.;
diff --git 
a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
index 6eba1ad..74394ee 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
 
 
 -- !query
@@ -179,3 +179,43 @@ WHERE c IN (
 struct<c:int>
 -- !query output
 
+
+
+-- !query
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<2:int>
+-- !query output
+2
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+2


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to