peter-toth commented on a change in pull request #33671:
URL: https://github.com/apache/spark/pull/33671#discussion_r684794342



##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
##########
@@ -20,25 +20,60 @@ package org.apache.spark.sql.catalyst.analysis
 import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, 
SubqueryAlias, With}
+import org.apache.spark.sql.catalyst.plans.logical.{Command, CTERelationDef, 
CTERelationRef, InsertIntoDir, LogicalPlan, ParsedStatement, SubqueryAlias, 
UnresolvedWith, WithCTE}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreePattern._
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, 
LegacyBehaviorPolicy}
 
 /**
- * Analyze WITH nodes and substitute child plan with CTE definitions.
+ * Analyze WITH nodes and substitute child plan with CTE references or CTE 
definitions depending
+ * on the conditions below:
+ * 1. If in legacy mode, or if the query is a SQL command or DML statement, 
replace with CTE
+ *    definitions, i.e., inline CTEs.
+ * 2. Otherwise, replace with CTE references `CTERelationRef`s. The decision 
to inline or not
+ *    inline will be made later by the rule `InlineCTE` after query analysis.
+ *
+ * All the CTE definitions that are not inlined after this substitution will 
be grouped together
+ * under one `WithCTE` node for each of the main query and the subqueries. Any 
of the main query
+ * or the subqueries that do not contain CTEs or have had all CTEs inlined 
will obviously not have
+ * any `WithCTE` nodes. If any though, the `WithCTE` node will be in the same 
place as where the
+ * outermost `With` node once was.
+ *
+ * The CTE definitions in a `WithCTE` node are kept in the order of how they 
have been resolved.
+ * That means the CTE definitions are guaranteed to be in topological order 
base on their
+ * dependency for any valid CTE query (i.e., given CTE definitions A and B 
with B referencing A,
+ * A is guaranteed to appear before B). Otherwise, it must be an invalid user 
query, and an
+ * analysis exception will be thrown later by relation resolving rules.
  */
 object CTESubstitution extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = {
-    LegacyBehaviorPolicy.withName(conf.getConf(LEGACY_CTE_PRECEDENCE_POLICY)) 
match {
-      case LegacyBehaviorPolicy.EXCEPTION =>
-        assertNoNameConflictsInCTE(plan)
-        traverseAndSubstituteCTE(plan)
-      case LegacyBehaviorPolicy.LEGACY =>
-        legacyTraverseAndSubstituteCTE(plan)
-      case LegacyBehaviorPolicy.CORRECTED =>
-        traverseAndSubstituteCTE(plan)
+    val isCommand = plan.find {
+      case _: Command | _: ParsedStatement | _: InsertIntoDir => true
+      case _ => false
+    }.isDefined
+    val cteDefs = mutable.ArrayBuffer.empty[CTERelationDef]
+    val (substituted, lastSubstituted) =
+      
LegacyBehaviorPolicy.withName(conf.getConf(LEGACY_CTE_PRECEDENCE_POLICY)) match 
{
+        case LegacyBehaviorPolicy.EXCEPTION =>
+          assertNoNameConflictsInCTE(plan)
+          traverseAndSubstituteCTE(plan, isCommand, cteDefs)
+        case LegacyBehaviorPolicy.LEGACY =>
+          (legacyTraverseAndSubstituteCTE(plan, cteDefs), None)
+        case LegacyBehaviorPolicy.CORRECTED =>
+          traverseAndSubstituteCTE(plan, isCommand, cteDefs)
+    }
+    if (cteDefs.isEmpty) {
+      substituted
+    } else if (substituted eq lastSubstituted.get) {
+      WithCTE(substituted, cteDefs)
+    } else {
+      var done = false
+      substituted.resolveOperators {

Review comment:
       Maybe we could use `substituted.resolveOperatorsWithPruning(_ => !done) 
{` to break out?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to