This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new af8e65f  [SPARK-32276][SQL] Remove redundant sorts before repartition 
nodes
af8e65f is described below

commit af8e65fca989518cf65ec47f77eea2ce649bd6bb
Author: Anton Okolnychyi <[email protected]>
AuthorDate: Tue Jul 14 21:17:33 2020 -0700

    [SPARK-32276][SQL] Remove redundant sorts before repartition nodes
    
    ### What changes were proposed in this pull request?
    
    This PR removes redundant sorts before repartition nodes with shuffles and 
repartitionByExpression with deterministic expressions.
    
    ### Why are the changes needed?
    
    It looks like our `EliminateSorts` rule can be extended further to remove 
sorts before repartition nodes that shuffle data, since such repartition 
operations change the ordering and distribution of data. That's why it is safe 
to perform the following rewrites:
    - `Repartition -> Sort -> Scan` as `Repartition -> Scan`
    - `Repartition -> Project -> Sort -> Scan` as `Repartition -> Project -> 
Scan`
    
    We don't apply this optimization to coalesce, as it uses 
`DefaultPartitionCoalescer`, which may preserve the ordering of data if there is 
no locality info in the parent RDD. At the same time, there is no guarantee 
that this will happen.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    More test cases.
    
    Closes #29089 from aokolnychyi/spark-32276.
    
    Authored-by: Anton Okolnychyi <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 .../spark/sql/catalyst/optimizer/Optimizer.scala   |   8 +
 .../EliminateSortsBeforeRepartitionSuite.scala     | 179 +++++++++++++++++++++
 2 files changed, 187 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index e481cdb..29f5399 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -974,6 +974,10 @@ object CombineFilters extends Rule[LogicalPlan] with 
PredicateHelper {
  *    and the Join conditions is deterministic
  * 5) if the Sort operator is within GroupBy separated by 0...n Project/Filter 
operators only,
  *    and the aggregate function is order irrelevant
+ * 6) if the Sort operator is within RepartitionByExpression separated by 
0...n Project/Filter
+ *    operators and the repartition expressions are deterministic
+ * 7) if the Sort operator is within Repartition separated by 0...n 
Project/Filter operators
+ *    and the repartition requires a shuffle
  */
 object EliminateSorts extends Rule[LogicalPlan] {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -987,6 +991,10 @@ object EliminateSorts extends Rule[LogicalPlan] {
       j.copy(left = recursiveRemoveSort(originLeft), right = 
recursiveRemoveSort(originRight))
     case g @ Aggregate(_, aggs, originChild) if isOrderIrrelevantAggs(aggs) =>
       g.copy(child = recursiveRemoveSort(originChild))
+    case r: RepartitionByExpression if 
r.partitionExpressions.forall(_.deterministic) =>
+      r.copy(child = recursiveRemoveSort(r.child))
+    case r: Repartition if r.shuffle =>
+      r.copy(child = recursiveRemoveSort(r.child))
   }
 
   private def recursiveRemoveSort(plan: LogicalPlan): LogicalPlan = plan match 
{
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala
new file mode 100644
index 0000000..ff5521f
--- /dev/null
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
+import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+
+class EliminateSortsBeforeRepartitionSuite extends PlanTest {
+
+  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, 
conf)
+  val analyzer = new Analyzer(catalog, conf)
+  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)
+
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches =
+      Batch("Default", FixedPoint(10),
+        FoldablePropagation,
+        LimitPushDown) ::
+      Batch("Eliminate Sorts", Once,
+        EliminateSorts) ::
+      Batch("Collapse Project", Once,
+        CollapseProject) :: Nil
+  }
+
+  def repartition(plan: LogicalPlan): LogicalPlan = plan.repartition(10)
+  def isOptimized: Boolean = true
+
+  test("sortBy") {
+    val plan = testRelation.select('a, 'b).sortBy('a.asc, 'b.desc)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    val correctPlan = if (isOptimized) {
+      repartition(testRelation.select('a, 'b))
+    } else {
+      planWithRepartition
+    }
+    comparePlans(optimizedPlan, analyzer.execute(correctPlan))
+  }
+
+  test("sortBy with projection") {
+    val plan = testRelation.select('a, 'b)
+      .sortBy('a.asc, 'b.asc)
+      .select('a + 1 as "a", 'b + 2 as "b")
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    val correctPlan = if (isOptimized) {
+      repartition(testRelation.select('a + 1 as "a", 'b + 2 as "b"))
+    } else {
+      planWithRepartition
+    }
+    comparePlans(optimizedPlan, analyzer.execute(correctPlan))
+  }
+
+  test("sortBy with projection and filter") {
+    val plan = testRelation.sortBy('a.asc, 'b.asc)
+      .select('a, 'b)
+      .where('a === 10)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    val correctPlan = if (isOptimized) {
+      repartition(testRelation.select('a, 'b).where('a === 10))
+    } else {
+      planWithRepartition
+    }
+    comparePlans(optimizedPlan, analyzer.execute(correctPlan))
+  }
+
+  test("sortBy with limit") {
+    val plan = testRelation.sortBy('a.asc, 'b.asc).limit(10)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    comparePlans(optimizedPlan, analyzer.execute(planWithRepartition))
+  }
+
+  test("sortBy with non-deterministic projection") {
+    val plan = testRelation.sortBy('a.asc, 'b.asc).select(rand(1), 'a, 'b)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    comparePlans(optimizedPlan, analyzer.execute(planWithRepartition))
+  }
+
+  test("orderBy") {
+    val plan = testRelation.select('a, 'b).orderBy('a.asc, 'b.asc)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    val correctPlan = if (isOptimized) {
+      repartition(testRelation.select('a, 'b))
+    } else {
+      planWithRepartition
+    }
+    comparePlans(optimizedPlan, analyzer.execute(correctPlan))
+  }
+
+  test("orderBy with projection") {
+    val plan = testRelation.select('a, 'b)
+      .orderBy('a.asc, 'b.asc)
+      .select('a + 1 as "a", 'b + 2 as "b")
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    val correctPlan = if (isOptimized) {
+      repartition(testRelation.select('a + 1 as "a", 'b + 2 as "b"))
+    } else {
+      planWithRepartition
+    }
+    comparePlans(optimizedPlan, analyzer.execute(correctPlan))
+  }
+
+  test("orderBy with projection and filter") {
+    val plan = testRelation.orderBy('a.asc, 'b.asc)
+      .select('a, 'b)
+      .where('a === 10)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    val correctPlan = if (isOptimized) {
+      repartition(testRelation.select('a, 'b).where('a === 10))
+    } else {
+      planWithRepartition
+    }
+    comparePlans(optimizedPlan, analyzer.execute(correctPlan))
+  }
+
+  test("orderBy with limit") {
+    val plan = testRelation.orderBy('a.asc, 'b.asc).limit(10)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    comparePlans(optimizedPlan, analyzer.execute(planWithRepartition))
+  }
+
+  test("orderBy with non-deterministic projection") {
+    val plan = testRelation.orderBy('a.asc, 'b.asc).select(rand(1), 'a, 'b)
+    val planWithRepartition = repartition(plan)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    comparePlans(optimizedPlan, analyzer.execute(planWithRepartition))
+  }
+}
+
+class EliminateSortsBeforeRepartitionByExprsSuite extends 
EliminateSortsBeforeRepartitionSuite {
+  override def repartition(plan: LogicalPlan): LogicalPlan = 
plan.distribute('a, 'b)(10)
+  override def isOptimized: Boolean = true
+
+  test("sortBy before repartition with non-deterministic expressions") {
+    val plan = testRelation.sortBy('a.asc, 'b.asc).limit(10)
+    val planWithRepartition = plan.distribute(rand(1).asc, 'a.asc)(20)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    comparePlans(optimizedPlan, analyzer.execute(planWithRepartition))
+  }
+
+  test("orderBy before repartition with non-deterministic expressions") {
+    val plan = testRelation.orderBy('a.asc, 'b.asc).limit(10)
+    val planWithRepartition = plan.distribute(rand(1).asc, 'a.asc)(20)
+    val optimizedPlan = Optimize.execute(analyzer.execute(planWithRepartition))
+    comparePlans(optimizedPlan, analyzer.execute(planWithRepartition))
+  }
+}
+
+class EliminateSortsBeforeCoalesceSuite extends 
EliminateSortsBeforeRepartitionSuite {
+  override def repartition(plan: LogicalPlan): LogicalPlan = plan.coalesce(1)
+  override def isOptimized: Boolean = false
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to