[GitHub] spark pull request #21853: [SPARK-23957][SQL] Sorts in subqueries are redund...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/21853 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21853: [SPARK-23957][SQL] Sorts in subqueries are redund...
Github user maropu commented on a diff in the pull request: https://github.com/apache/spark/pull/21853#discussion_r204612114 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala --- @@ -970,4 +973,300 @@ class SubquerySuite extends QueryTest with SharedSQLContext { Row("3", "b") :: Row("4", "b") :: Nil) } } + + private def getNumSortsInQuery(query: String): Int = { +val plan = sql(query).queryExecution.optimizedPlan +getNumSorts(plan) + getSubqueryExpressions(plan).map{s => getNumSorts(s.plan)}.sum + } + + private def getSubqueryExpressions(plan: LogicalPlan): Seq[SubqueryExpression] = { +val subqueryExpressions = ArrayBuffer.empty[SubqueryExpression] +plan transformAllExpressions { + case s: SubqueryExpression => +subqueryExpressions ++= (getSubqueryExpressions(s.plan) :+ s) +s +} +subqueryExpressions + } + + private def getNumSorts(plan: LogicalPlan): Int = { +plan.collect { case s: Sort => s }.size + } + + test("SPARK-23957 Remove redundant sort from subquery plan(in subquery)") { +withTempView("t1", "t2", "t3") { + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t1") + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t2") + Seq((1, 1, 1), (2, 2, 2)).toDF("c1", "c2", "c3").createOrReplaceTempView("t3") + + // Simple order by + val query1 = +""" + |SELECT c1 FROM t1 + |WHERE + |c1 IN (SELECT c1 FROM t2 ORDER BY c1) +""".stripMargin + assert(getNumSortsInQuery(query1) == 0) + + // Nested order bys + val query2 = +""" + |SELECT c1 + |FROM t1 + |WHERE c1 IN (SELECT c1 + | FROM (SELECT * + | FROM t2 + | ORDER BY c2) + | ORDER BY c1) +""".stripMargin + assert(getNumSortsInQuery(query2) == 0) + + + // nested IN + val query3 = +""" + |SELECT c1 + |FROM t1 + |WHERE c1 IN (SELECT c1 + | FROM t2 + | WHERE c1 IN (SELECT c1 + |FROM t3 + |WHERE c1 = 1 + |ORDER BY c3) + | ORDER BY c2) +""".stripMargin + assert(getNumSortsInQuery(query3) == 0) + + // Complex subplan and multiple sorts + val query4 = +""" + |SELECT c1 + |FROM t1 + 
|WHERE c1 IN (SELECT c1 + | FROM (SELECT c1, c2, count(*) + | FROM t2 + | GROUP BY c1, c2 + | HAVING count(*) > 0 + | ORDER BY c2) + | ORDER BY c1) +""".stripMargin + assert(getNumSortsInQuery(query4) == 0) + + // Join in subplan + val query5 = +""" + |SELECT c1 FROM t1 + |WHERE + |c1 IN (SELECT t2.c1 FROM t2, t3 + | WHERE t2.c1 = t3.c1 + | ORDER BY t2.c1) +""".stripMargin + assert(getNumSortsInQuery(query5) == 0) + + val query6 = +""" + |SELECT c1 + |FROM t1 + |WHERE (c1, c2) IN (SELECT c1, max(c2) + |FROM (SELECT c1, c2, count(*) + |FROM t2 + |GROUP BY c1, c2 + |HAVING count(*) > 0 + |ORDER BY c2) + |GROUP BY c1 + |HAVING max(c2) > 0 + |ORDER BY c1) +""".stripMargin + // The rule to remove redundant sorts is not able to remove the inner sort under + // an Aggregate operator. We only remove the top level sort. + assert(getNumSortsInQuery(query6) == 1) + + // Cases when sort is not removed from the plan + // Limit on top of sort + val query7 = +""" + |SELECT c1 FROM t1 + |WHERE + |c1 IN (SELECT c1 FROM t2 ORDER BY c1 limit 1) +""".stripMargin + assert(getNumSortsInQuery(query7) == 1) + + // Sort below a set operations (intersect,
[GitHub] spark pull request #21853: [SPARK-23957][SQL] Sorts in subqueries are redund...
Github user maropu commented on a diff in the pull request: https://github.com/apache/spark/pull/21853#discussion_r204609653 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala --- @@ -164,10 +164,20 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) * Optimize all the subqueries inside expression. */ object OptimizeSubqueries extends Rule[LogicalPlan] { +private def removeTopLevelSorts(plan: LogicalPlan): LogicalPlan = { + plan match { +case Sort(_, _, child) => child +case Project(fields, child) => Project(fields, removeTopLevelSorts(child)) +case other => other + } +} def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case s: SubqueryExpression => val Subquery(newPlan) = Optimizer.this.execute(Subquery(s.plan)) -s.withNewPlan(newPlan) +// At this point we have an optimized subquery plan that we are going to attach +// to this subquery expression. Here we can safely remove any top level sorts --- End diff -- super nit: `any top level sort`? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21853: [SPARK-23957][SQL] Sorts in subqueries are redund...
Github user maropu commented on a diff in the pull request: https://github.com/apache/spark/pull/21853#discussion_r204609622 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala --- @@ -164,10 +164,20 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) * Optimize all the subqueries inside expression. */ object OptimizeSubqueries extends Rule[LogicalPlan] { +private def removeTopLevelSorts(plan: LogicalPlan): LogicalPlan = { --- End diff -- nit: `removeTopLevelSort`? (I think this func removes a single sort on the top?) --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21853: [SPARK-23957][SQL] Sorts in subqueries are redund...
Github user maropu commented on a diff in the pull request: https://github.com/apache/spark/pull/21853#discussion_r204609532 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala --- @@ -970,4 +973,300 @@ class SubquerySuite extends QueryTest with SharedSQLContext { Row("3", "b") :: Row("4", "b") :: Nil) } } + + private def getNumSortsInQuery(query: String): Int = { +val plan = sql(query).queryExecution.optimizedPlan +getNumSorts(plan) + getSubqueryExpressions(plan).map{s => getNumSorts(s.plan)}.sum + } + + private def getSubqueryExpressions(plan: LogicalPlan): Seq[SubqueryExpression] = { +val subqueryExpressions = ArrayBuffer.empty[SubqueryExpression] +plan transformAllExpressions { + case s: SubqueryExpression => +subqueryExpressions ++= (getSubqueryExpressions(s.plan) :+ s) +s +} +subqueryExpressions + } + + private def getNumSorts(plan: LogicalPlan): Int = { +plan.collect { case s: Sort => s }.size + } + + test("SPARK-23957 Remove redundant sort from subquery plan(in subquery)") { +withTempView("t1", "t2", "t3") { + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t1") + Seq((1, 1), (2, 2)).toDF("c1", "c2").createOrReplaceTempView("t2") + Seq((1, 1, 1), (2, 2, 2)).toDF("c1", "c2", "c3").createOrReplaceTempView("t3") + + // Simple order by + val query1 = +""" + |SELECT c1 FROM t1 + |WHERE + |c1 IN (SELECT c1 FROM t2 ORDER BY c1) +""".stripMargin + assert(getNumSortsInQuery(query1) == 0) + + // Nested order bys + val query2 = +""" + |SELECT c1 + |FROM t1 + |WHERE c1 IN (SELECT c1 + | FROM (SELECT * + | FROM t2 + | ORDER BY c2) + | ORDER BY c1) +""".stripMargin + assert(getNumSortsInQuery(query2) == 0) + + + // nested IN + val query3 = +""" + |SELECT c1 + |FROM t1 + |WHERE c1 IN (SELECT c1 + | FROM t2 + | WHERE c1 IN (SELECT c1 + |FROM t3 + |WHERE c1 = 1 + |ORDER BY c3) + | ORDER BY c2) +""".stripMargin + assert(getNumSortsInQuery(query3) == 0) + + // Complex subplan and multiple sorts + val query4 = +""" + |SELECT c1 + |FROM t1 + 
|WHERE c1 IN (SELECT c1 + | FROM (SELECT c1, c2, count(*) + | FROM t2 + | GROUP BY c1, c2 + | HAVING count(*) > 0 + | ORDER BY c2) + | ORDER BY c1) +""".stripMargin + assert(getNumSortsInQuery(query4) == 0) + + // Join in subplan + val query5 = +""" + |SELECT c1 FROM t1 + |WHERE + |c1 IN (SELECT t2.c1 FROM t2, t3 + | WHERE t2.c1 = t3.c1 + | ORDER BY t2.c1) +""".stripMargin + assert(getNumSortsInQuery(query5) == 0) + + val query6 = +""" + |SELECT c1 + |FROM t1 + |WHERE (c1, c2) IN (SELECT c1, max(c2) + |FROM (SELECT c1, c2, count(*) + |FROM t2 + |GROUP BY c1, c2 + |HAVING count(*) > 0 + |ORDER BY c2) + |GROUP BY c1 + |HAVING max(c2) > 0 + |ORDER BY c1) +""".stripMargin + // The rule to remove redundant sorts is not able to remove the inner sort under + // an Aggregate operator. We only remove the top level sort. + assert(getNumSortsInQuery(query6) == 1) + + // Cases when sort is not removed from the plan + // Limit on top of sort + val query7 = +""" + |SELECT c1 FROM t1 + |WHERE + |c1 IN (SELECT c1 FROM t2 ORDER BY c1 limit 1) +""".stripMargin + assert(getNumSortsInQuery(query7) == 1) + + // Sort below a set operations (intersect,
[GitHub] spark pull request #21853: [SPARK-23957][SQL] Sorts in subqueries are redund...
GitHub user dilipbiswal opened a pull request: https://github.com/apache/spark/pull/21853 [SPARK-23957][SQL] Sorts in subqueries are redundant and can be removed ## What changes were proposed in this pull request? Thanks to @henryr for the original idea at https://github.com/apache/spark/pull/21049 Description from the original PR: Subqueries (at least in SQL) have 'bag of tuples' semantics. Ordering them is therefore redundant (unless combined with a limit). This patch removes the top sort operators from the subquery plans. ## How was this patch tested? Added test cases in SubquerySuite to cover IN, EXISTS and scalar subqueries. Please review http://spark.apache.org/contributing.html before opening a pull request. You can merge this pull request into a Git repository by running: $ git pull https://github.com/dilipbiswal/spark SPARK-23957 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/21853.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #21853 commit 191c0eb9c12a1ba7ad210e4429feca11df832598 Author: Dilip Biswal Date: 2018-07-23T18:46:24Z [SPARK-23957] Sorts in subqueries are redundant and can be removed --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org