This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e38c5b582440 [SPARK-53040][SQL] Ban self references inside topmost 
CTEs in Recursive CTEs
e38c5b582440 is described below

commit e38c5b5824404bad67c91f2608c7bd4fb3d312c2
Author: pavle-martinovic_data <pavle.martino...@databricks.com>
AuthorDate: Tue Aug 5 00:12:24 2025 +0800

    [SPARK-53040][SQL] Ban self references inside topmost CTEs in Recursive CTEs
    
    ### What changes were proposed in this pull request?
    
    Ban self references inside with topmost CTEs (that apply to the whole 
'UNION ALL') inside recursive CTEs.
    
    ### Why are the changes needed?
    
    Most other engines ban this behavior. An example of an affected query:
    
    ```
    WITH RECURSIVE t1 AS (
      WITH t2(n) AS (SELECT * FROM t1)
      SELECT 1 AS n
      UNION ALL
      SELECT n+1 FROM t2 WHERE n < 5)
    SELECT * FROM t1;
    ```
    
    This removes no functionality, since just moving the WITH beneath the UNION 
ALL will create an equivalent query.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing golden file tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #51748 from 
Pajaraja/pavle-martinovic_data/banSelfReferencesInTopmostCTE.
    
    Authored-by: pavle-martinovic_data <pavle.martino...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../src/main/resources/error/error-conditions.json |  2 +-
 .../sql/catalyst/analysis/ResolveWithCTE.scala     |  2 +-
 .../sql/catalyst/plans/logical/cteOperators.scala  | 16 +++++++
 .../analyzer-results/cte-recursion.sql.out         | 52 +++++-----------------
 .../analyzer-results/postgreSQL/with.sql.out       | 11 +----
 .../sql-tests/results/cte-recursion.sql.out        | 24 +++++-----
 .../sql-tests/results/postgreSQL/with.sql.out      | 11 +----
 7 files changed, 44 insertions(+), 74 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-conditions.json 
b/common/utils/src/main/resources/error/error-conditions.json
index 60b2df793230..9cf3ba1a029d 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -3499,7 +3499,7 @@
   },
   "INVALID_RECURSIVE_CTE" : {
     "message" : [
-      "Invalid recursive definition found. Recursive queries must contain an 
UNION or an UNION ALL statement with 2 children. The first child needs to be 
the anchor term without any recursive references."
+      "Invalid recursive definition found. Recursive queries must contain an 
UNION or an UNION ALL statement with 2 children. The first child needs to be 
the anchor term without any recursive references. Any top level inner CTE must 
not contain self references."
     ],
     "sqlState" : "42836"
   },
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
index 69b341b5574f..0f6fcec467ce 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
@@ -59,7 +59,7 @@ object ResolveWithCTE extends Rule[LogicalPlan] {
               cteDefMap.put(cteDef.id, cteDef)
             }
             cteDef
-          case cteDef if cteDef.hasSelfReferenceInAnchor =>
+          case cteDef if cteDef.hasSelfReferenceInAnchor || 
cteDef.hasSelfReferenceInSubCTE =>
             throw new AnalysisException(
               errorClass = "INVALID_RECURSIVE_CTE",
               messageParameters = Map.empty)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
index 68e93455d453..c3c662c471f8 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
@@ -150,6 +150,22 @@ case class CTERelationDef(
       false
     }
   }
+  lazy val hasSelfReferenceInSubCTE: Boolean = {
+    val withCTENode: Option[WithCTE] = child match {
+      case SubqueryAlias(_, withCTE @ WithCTE(_, _)) =>
+        Some(withCTE)
+      case SubqueryAlias(_, UnresolvedSubqueryColumnAliases(_, withCTE @ 
WithCTE(_, _))) =>
+        Some(withCTE)
+      case _ => None
+    }
+    if (withCTENode.isDefined) {
+      withCTENode.exists(_.cteDefs.exists(_.collectFirstWithSubqueries {
+        case CTERelationRef(this.id, _, _, _, _, true, _) => true
+      }.isDefined))
+    } else {
+      false
+    }
+  }
   lazy val hasSelfReferenceAsUnionLoopRef: Boolean = 
child.collectFirstWithSubqueries {
     case UnionLoopRef(this.id, _, _) => true
   }.getOrElse(false)
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out
index 32169cb51b96..6bb0b27ca9ea 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out
@@ -1583,28 +1583,11 @@ WITH RECURSIVE t1(n) AS (
     SELECT n+1 FROM t2 WHERE n < 5)
 SELECT * FROM t1
 -- !query analysis
-WithCTE
-:- CTERelationDef xxxx, false
-:  +- SubqueryAlias t1
-:     +- Project [1#x AS n#x]
-:        +- WithCTE
-:           :- CTERelationDef xxxx, false
-:           :  +- SubqueryAlias t2
-:           :     +- Project [n#x AS n#x]
-:           :        +- Project [n#x]
-:           :           +- SubqueryAlias t1
-:           :              +- Project [1#x AS n#x]
-:           :                 +- UnionLoopRef xxxx, [1#x], false
-:           +- UnionLoop xxxx
-:              :- Project [1 AS 1#x]
-:              :  +- OneRowRelation
-:              +- Project [(n#x + 1) AS (n + 1)#x]
-:                 +- Filter (n#x < 5)
-:                    +- SubqueryAlias t2
-:                       +- CTERelationRef xxxx, true, [n#x], false, false
-+- Project [n#x]
-   +- SubqueryAlias t1
-      +- CTERelationRef xxxx, true, [n#x], false, false
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
 
 
 -- !query
@@ -1615,26 +1598,11 @@ WITH RECURSIVE t1 AS (
     SELECT n+1 FROM t2 WHERE n < 5)
 SELECT * FROM t1
 -- !query analysis
-WithCTE
-:- CTERelationDef xxxx, false
-:  +- SubqueryAlias t1
-:     +- WithCTE
-:        :- CTERelationDef xxxx, false
-:        :  +- SubqueryAlias t2
-:        :     +- Project [n#x AS n#x]
-:        :        +- Project [n#x]
-:        :           +- SubqueryAlias t1
-:        :              +- UnionLoopRef xxxx, [n#x], false
-:        +- UnionLoop xxxx
-:           :- Project [1 AS n#x]
-:           :  +- OneRowRelation
-:           +- Project [(n#x + 1) AS (n + 1)#x]
-:              +- Filter (n#x < 5)
-:                 +- SubqueryAlias t2
-:                    +- CTERelationRef xxxx, true, [n#x], false, false
-+- Project [n#x]
-   +- SubqueryAlias t1
-      +- CTERelationRef xxxx, true, [n#x], false, false
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
 
 
 -- !query
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
index 2081f4a4f32a..edbdcba85148 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
@@ -1660,15 +1660,8 @@ SELECT * FROM outermost ORDER BY 1
 -- !query analysis
 org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "UNION_NOT_SUPPORTED_IN_RECURSIVE_CTE",
-  "sqlState" : "42836",
-  "queryContext" : [ {
-    "objectType" : "",
-    "objectName" : "",
-    "startIndex" : 1,
-    "stopIndex" : 185,
-    "fragment" : "WITH RECURSIVE outermost(x) AS (\n  WITH innermost as 
(SELECT 2 FROM outermost) -- fail\n    SELECT * FROM innermost\n    UNION 
SELECT * from outermost\n)\nSELECT * FROM outermost ORDER BY 1"
-  } ]
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
 }
 
 
diff --git 
a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out
index 6689bbeeefdd..310f313cd2a3 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out
@@ -1458,13 +1458,13 @@ WITH RECURSIVE t1(n) AS (
     SELECT n+1 FROM t2 WHERE n < 5)
 SELECT * FROM t1
 -- !query schema
-struct<n:int>
+struct<>
 -- !query output
-1
-2
-3
-4
-5
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
 
 
 -- !query
@@ -1475,13 +1475,13 @@ WITH RECURSIVE t1 AS (
     SELECT n+1 FROM t2 WHERE n < 5)
 SELECT * FROM t1
 -- !query schema
-struct<n:int>
+struct<>
 -- !query output
-1
-2
-3
-4
-5
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
 
 
 -- !query
diff --git 
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out 
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out
index 4094c69ec463..1e0441e07393 100644
--- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out
@@ -1543,15 +1543,8 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "UNION_NOT_SUPPORTED_IN_RECURSIVE_CTE",
-  "sqlState" : "42836",
-  "queryContext" : [ {
-    "objectType" : "",
-    "objectName" : "",
-    "startIndex" : 1,
-    "stopIndex" : 185,
-    "fragment" : "WITH RECURSIVE outermost(x) AS (\n  WITH innermost as 
(SELECT 2 FROM outermost) -- fail\n    SELECT * FROM innermost\n    UNION 
SELECT * from outermost\n)\nSELECT * FROM outermost ORDER BY 1"
-  } ]
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
 }
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to