spark git commit: [SPARK-16771][SQL] WITH clause should not fall into infinite loop.

hvanhovell Fri, 12 Aug 2016 10:08:08 -0700

Repository: spark
Updated Branches:
  refs/heads/master bbae20ade -> 2a105134e



[SPARK-16771][SQL] WITH clause should not fall into infinite loop.

## What changes were proposed in this pull request?

This PR changes the CTE resolving rule to use only **forward-declared** tables 
in order to prevent infinite loops. More specifically, new logic is like the 
following.

* Resolve CTEs in `WITH` clauses first before replacing the main SQL body.
* When resolving CTEs, only forward-declared CTEs or base tables are referenced.
  - Self-referencing is not allowed any more.
  - Cross-referencing is not allowed any more.

**Reported Error Scenarios**
```scala
scala> sql("WITH t AS (SELECT 1 FROM t) SELECT * FROM t")
java.lang.StackOverflowError
...
scala> sql("WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * 
FROM t1, t2")
java.lang.StackOverflowError
...
```
Note that `t`, `t1`, and `t2` are not declared in database. Spark falls into 
infinite loops before resolving table names.

## How was this patch tested?

Pass the Jenkins tests with new two testcases.

Author: Dongjoon Hyun <dongj...@apache.org>

Closes #14397 from dongjoon-hyun/SPARK-16771-TREENODE.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a105134
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a105134
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a105134

Branch: refs/heads/master
Commit: 2a105134e9a3efd46b761fab5e563ddebb26575d
Parents: bbae20a
Author: Dongjoon Hyun <dongj...@apache.org>
Authored: Fri Aug 12 19:07:34 2016 +0200
Committer: Herman van Hovell <hvanhov...@databricks.com>
Committed: Fri Aug 12 19:07:34 2016 +0200

----------------------------------------------------------------------
 .../spark/sql/catalyst/analysis/Analyzer.scala  | 24 ++++-----
 .../spark/sql/catalyst/parser/AstBuilder.scala  |  2 +-
 .../plans/logical/basicLogicalOperators.scala   |  7 ++-
 .../sql/catalyst/parser/PlanParserSuite.scala   |  2 +-
 .../src/test/resources/sql-tests/inputs/cte.sql | 14 +++++
 .../resources/sql-tests/results/cte.sql.out     | 57 ++++++++++++++++++++
 6 files changed, 88 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 14a2a32..a2e276e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -125,22 +125,22 @@ class Analyzer(
   object CTESubstitution extends Rule[LogicalPlan] {
     // TODO allow subquery to define CTE
     def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators  {
-      case With(child, relations) => substituteCTE(child, relations)
+      case With(child, relations) =>
+        substituteCTE(child, relations.foldLeft(Seq.empty[(String, 
LogicalPlan)]) {
+          case (resolved, (name, relation)) =>
+            resolved :+ name -> ResolveRelations(substituteCTE(relation, 
resolved))
+        })
       case other => other
     }
 
-    def substituteCTE(plan: LogicalPlan, cteRelations: Map[String, 
LogicalPlan]): LogicalPlan = {
-      plan transform {
-        // In hive, if there is same table name in database and CTE definition,
-        // hive will use the table in database, not the CTE one.
-        // Taking into account the reasonableness and the implementation 
complexity,
-        // here use the CTE definition first, check table name only and ignore 
database name
-        // see https://github.com/apache/spark/pull/4929#discussion_r27186638 
for more info
+    def substituteCTE(plan: LogicalPlan, cteRelations: Seq[(String, 
LogicalPlan)]): LogicalPlan = {
+      plan transformDown {
         case u : UnresolvedRelation =>
-          val substituted = cteRelations.get(u.tableIdentifier.table).map { 
relation =>
-            val withAlias = u.alias.map(SubqueryAlias(_, relation))
-            withAlias.getOrElse(relation)
-          }
+          val substituted = cteRelations.find(x => resolver(x._1, 
u.tableIdentifier.table))
+            .map(_._2).map { relation =>
+              val withAlias = u.alias.map(SubqueryAlias(_, relation))
+              withAlias.getOrElse(relation)
+            }
           substituted.getOrElse(u)
         case other =>
           // This cannot be done in ResolveSubquery because ResolveSubquery 
does not know the CTE.

http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index c7fdc28..25c8445 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -97,7 +97,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with 
Logging {
       }
       // Check for duplicate names.
       checkDuplicateKeys(ctes, ctx)
-      With(query, ctes.toMap)
+      With(query, ctes)
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index eb612c4..2917d8d 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -392,11 +392,10 @@ case class InsertIntoTable(
  * This operator will be removed during analysis and the relations will be 
substituted into child.
  *
  * @param child The final query of this CTE.
- * @param cteRelations Queries that this CTE defined,
- *                     key is the alias of the CTE definition,
- *                     value is the CTE definition.
+ * @param cteRelations A sequence of pair (alias, the CTE definition) that 
this CTE defined
+ *                     Each CTE can see the base tables and the previously 
defined CTEs only.
  */
-case class With(child: LogicalPlan, cteRelations: Map[String, SubqueryAlias]) 
extends UnaryNode {
+case class With(child: LogicalPlan, cteRelations: Seq[(String, 
SubqueryAlias)]) extends UnaryNode {
   override def output: Seq[Attribute] = child.output
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 00a37cf..34d52c7 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -81,7 +81,7 @@ class PlanParserSuite extends PlanTest {
       val ctes = namedPlans.map {
         case (name, cte) =>
           name -> SubqueryAlias(name, cte)
-      }.toMap
+      }
       With(plan, ctes)
     }
     assertEqual(

http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/core/src/test/resources/sql-tests/inputs/cte.sql
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql 
b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
new file mode 100644
index 0000000..10d34de
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
@@ -0,0 +1,14 @@
+create temporary view t as select * from values 0, 1, 2 as t(id);
+create temporary view t2 as select * from values 0, 1 as t(id);
+
+-- WITH clause should not fall into infinite loop by referencing self
+WITH s AS (SELECT 1 FROM s) SELECT * FROM s;
+
+-- WITH clause should reference the base table
+WITH t AS (SELECT 1 FROM t) SELECT * FROM t;
+
+-- WITH clause should not allow cross reference
+WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2;
+
+-- WITH clause should reference the previous CTE
+WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1, t2;

http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/core/src/test/resources/sql-tests/results/cte.sql.out
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
new file mode 100644
index 0000000..ddee5bf
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
@@ -0,0 +1,57 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 6
+
+
+-- !query 0
+create temporary view t as select * from values 0, 1, 2 as t(id)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+create temporary view t2 as select * from values 0, 1 as t(id)
+-- !query 1 schema
+struct<>
+-- !query 1 output
+
+
+
+-- !query 2
+WITH s AS (SELECT 1 FROM s) SELECT * FROM s
+-- !query 2 schema
+struct<>
+-- !query 2 output
+org.apache.spark.sql.AnalysisException
+Table or view not found: s; line 1 pos 25
+
+
+-- !query 3
+WITH t AS (SELECT 1 FROM t) SELECT * FROM t
+-- !query 3 schema
+struct<1:int>
+-- !query 3 output
+1
+1
+1
+
+
+-- !query 4
+WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2
+-- !query 4 schema
+struct<>
+-- !query 4 output
+org.apache.spark.sql.AnalysisException
+Table or view not found: s2; line 1 pos 26
+
+
+-- !query 5
+WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1, t2
+-- !query 5 schema
+struct<id:int,2:int>
+-- !query 5 output
+0      2
+0      2
+1      2
+1      2


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-16771][SQL] WITH clause should not fall into infinite loop.

Reply via email to