This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e38c5b582440 [SPARK-53040][SQL] Ban self references inside topmost CTEs in Recursive CTEs e38c5b582440 is described below commit e38c5b5824404bad67c91f2608c7bd4fb3d312c2 Author: pavle-martinovic_data <pavle.martino...@databricks.com> AuthorDate: Tue Aug 5 00:12:24 2025 +0800 [SPARK-53040][SQL] Ban self references inside topmost CTEs in Recursive CTEs ### What changes were proposed in this pull request? Ban self references inside with topmost CTEs (that apply to the whole 'UNION ALL') inside recursive CTEs. ### Why are the changes needed? Most other engines ban this behavior. An example of an affected query: ``` WITH RECURSIVE t1 AS ( WITH t2(n) AS (SELECT * FROM t1) SELECT 1 AS n UNION ALL SELECT n+1 FROM t2 WHERE n < 5) SELECT * FROM t1; ``` This removes no functionality, since just moving the WITH beneath the UNION ALL will create an equivalent query. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing golden file tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51748 from Pajaraja/pavle-martinovic_data/banSelfReferencesInTopmostCTE. Authored-by: pavle-martinovic_data <pavle.martino...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../src/main/resources/error/error-conditions.json | 2 +- .../sql/catalyst/analysis/ResolveWithCTE.scala | 2 +- .../sql/catalyst/plans/logical/cteOperators.scala | 16 +++++++ .../analyzer-results/cte-recursion.sql.out | 52 +++++----------------- .../analyzer-results/postgreSQL/with.sql.out | 11 +---- .../sql-tests/results/cte-recursion.sql.out | 24 +++++----- .../sql-tests/results/postgreSQL/with.sql.out | 11 +---- 7 files changed, 44 insertions(+), 74 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 60b2df793230..9cf3ba1a029d 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -3499,7 +3499,7 @@ }, "INVALID_RECURSIVE_CTE" : { "message" : [ - "Invalid recursive definition found. Recursive queries must contain an UNION or an UNION ALL statement with 2 children. The first child needs to be the anchor term without any recursive references." + "Invalid recursive definition found. Recursive queries must contain an UNION or an UNION ALL statement with 2 children. The first child needs to be the anchor term without any recursive references. Any top level inner CTE must not contain self references." ], "sqlState" : "42836" }, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala index 69b341b5574f..0f6fcec467ce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala @@ -59,7 +59,7 @@ object ResolveWithCTE extends Rule[LogicalPlan] { cteDefMap.put(cteDef.id, cteDef) } cteDef - case cteDef if cteDef.hasSelfReferenceInAnchor => + case cteDef if cteDef.hasSelfReferenceInAnchor || cteDef.hasSelfReferenceInSubCTE => throw new AnalysisException( errorClass = "INVALID_RECURSIVE_CTE", messageParameters = Map.empty) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala index 68e93455d453..c3c662c471f8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala @@ -150,6 +150,22 @@ case class CTERelationDef( false } } + lazy val hasSelfReferenceInSubCTE: Boolean = { + val withCTENode: Option[WithCTE] = child match { + case SubqueryAlias(_, withCTE @ WithCTE(_, _)) => + Some(withCTE) + case SubqueryAlias(_, UnresolvedSubqueryColumnAliases(_, withCTE @ WithCTE(_, _))) => + Some(withCTE) + case _ => None + } + if (withCTENode.isDefined) { + withCTENode.exists(_.cteDefs.exists(_.collectFirstWithSubqueries { + case CTERelationRef(this.id, _, _, _, _, true, _) => true + }.isDefined)) + } else { + false + } + } lazy val hasSelfReferenceAsUnionLoopRef: Boolean = child.collectFirstWithSubqueries { case UnionLoopRef(this.id, _, _) => true }.getOrElse(false) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out index 32169cb51b96..6bb0b27ca9ea 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out @@ -1583,28 +1583,11 @@ WITH RECURSIVE t1(n) AS ( SELECT n+1 FROM t2 WHERE n < 5) SELECT * FROM t1 -- !query analysis -WithCTE -:- CTERelationDef xxxx, false -: +- SubqueryAlias t1 -: +- Project [1#x AS n#x] -: +- WithCTE -: :- CTERelationDef xxxx, false -: : +- SubqueryAlias t2 -: : +- Project [n#x AS n#x] -: : +- Project [n#x] -: : +- SubqueryAlias t1 -: : +- Project [1#x AS n#x] -: : +- UnionLoopRef xxxx, [1#x], false -: +- UnionLoop xxxx -: :- Project [1 AS 1#x] -: : +- OneRowRelation -: +- Project [(n#x + 1) AS (n + 1)#x] -: +- Filter (n#x < 5) -: +- SubqueryAlias t2 -: +- CTERelationRef xxxx, true, [n#x], false, false -+- Project [n#x] - +- SubqueryAlias t1 - +- CTERelationRef xxxx, true, [n#x], false, false +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} -- !query @@ -1615,26 +1598,11 @@ WITH RECURSIVE t1 AS ( SELECT n+1 FROM t2 WHERE n < 5) SELECT * FROM t1 -- !query analysis -WithCTE -:- CTERelationDef xxxx, false -: +- SubqueryAlias t1 -: +- WithCTE -: :- CTERelationDef xxxx, false -: : +- SubqueryAlias t2 -: : +- Project [n#x AS n#x] -: : +- Project [n#x] -: : +- SubqueryAlias t1 -: : +- UnionLoopRef xxxx, [n#x], false -: +- UnionLoop xxxx -: :- Project [1 AS n#x] -: : +- OneRowRelation -: +- Project [(n#x + 1) AS (n + 1)#x] -: +- Filter (n#x < 5) -: +- SubqueryAlias t2 -: +- CTERelationRef xxxx, true, [n#x], false, false -+- Project [n#x] - +- SubqueryAlias t1 - +- CTERelationRef xxxx, true, [n#x], false, false +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out index 2081f4a4f32a..edbdcba85148 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out @@ -1660,15 +1660,8 @@ SELECT * FROM outermost ORDER BY 1 -- !query analysis org.apache.spark.sql.AnalysisException { - "errorClass" : "UNION_NOT_SUPPORTED_IN_RECURSIVE_CTE", - "sqlState" : "42836", - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 1, - "stopIndex" : 185, - "fragment" : "WITH RECURSIVE outermost(x) AS (\n WITH innermost as (SELECT 2 FROM outermost) -- fail\n SELECT * FROM innermost\n UNION SELECT * from outermost\n)\nSELECT * FROM outermost ORDER BY 1" - } ] + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" } diff --git a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out index 6689bbeeefdd..310f313cd2a3 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out @@ -1458,13 +1458,13 @@ WITH RECURSIVE t1(n) AS ( SELECT n+1 FROM t2 WHERE n < 5) SELECT * FROM t1 -- !query schema -struct<n:int> +struct<> -- !query output -1 -2 -3 -4 -5 +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} -- !query @@ -1475,13 +1475,13 @@ WITH RECURSIVE t1 AS ( SELECT n+1 FROM t2 WHERE n < 5) SELECT * FROM t1 -- !query schema -struct<n:int> +struct<> -- !query output -1 -2 -3 -4 -5 +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out index 4094c69ec463..1e0441e07393 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out @@ -1543,15 +1543,8 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException { - "errorClass" : "UNION_NOT_SUPPORTED_IN_RECURSIVE_CTE", - "sqlState" : "42836", - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 1, - "stopIndex" : 185, - "fragment" : "WITH RECURSIVE outermost(x) AS (\n WITH innermost as (SELECT 2 FROM outermost) -- fail\n SELECT * FROM innermost\n UNION SELECT * from outermost\n)\nSELECT * FROM outermost ORDER BY 1" - } ] + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org