cloud-fan commented on code in PR #38950:
URL: https://github.com/apache/spark/pull/38950#discussion_r1054407365


##########
sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala:
##########
@@ -404,8 +409,104 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
         s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
         "ON i.id = p.item_id AND i.arrive_time = p.time ORDER BY id, purchase_price, sale_price")
 
+    checkAnswer(df,
+      Seq(Row(1, "aa", 40.0, 42.0), Row(2, "bb", 10.0, 11.0)))
+
+    val shuffles = collectShuffles(df.queryExecution.executedPlan)
+    assert(shuffles.isEmpty, "should not add shuffle when partition keys mismatch")
+  }
+
+  test("SPARK-41413: partitioned join: partition values from one side are subset of those from " +
+      "the other side") {
+    val items_partitions = Array(bucket(4, "id"))
+    createTable(items, items_schema, items_partitions)
+
+    sql(s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    val purchases_partitions = Array(bucket(4, "item_id"))
+    createTable(purchases, purchases_schema, purchases_partitions)
+
+    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 19.5, cast('2020-02-01' as timestamp))")
+
+    val df = sql("SELECT id, name, i.price as purchase_price, p.price as sale_price " +
+        s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
+        "ON i.id = p.item_id ORDER BY id, purchase_price, sale_price")
+
+    val shuffles = collectShuffles(df.queryExecution.executedPlan)
+    shuffles.foreach { s =>
+      assert(s.outputPartitioning.isInstanceOf[KeyGroupedPartitioning],
+        s"expected KeyGroupedPartitioning but found ${s.outputPartitioning.getClass.getName}")
+      assert(s.outputPartitioning.numPartitions == 3,
+        s"expected 3 partitions for $s but found ${s.outputPartitioning.numPartitions}")
+    }
+    assert(shuffles.isEmpty, "should not contain any shuffle")
+    checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0), Row(3, "bb", 10.0, 19.5)))
+  }
+
+  test("SPARK-41413: partitioned join: partition values from both sides overlaps") {
+    val items_partitions = Array(identity("id"))
+    createTable(items, items_schema, items_partitions)
+
+    sql(s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    val purchases_partitions = Array(identity("item_id"))
+    createTable(purchases, purchases_schema, purchases_partitions)
+    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(2, 19.5, cast('2020-02-01' as timestamp)), " +
+        "(4, 30.0, cast('2020-02-01' as timestamp))")
+
+    val df = sql("SELECT id, name, i.price as purchase_price, p.price as sale_price " +
+        s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
+        "ON i.id = p.item_id ORDER BY id, purchase_price, sale_price")
+
+    val shuffles = collectShuffles(df.queryExecution.executedPlan)
+    shuffles.foreach { s =>

Review Comment:
   ditto



##########
sql/core/src/test/scala/org/apache/spark/sql/connector/KeyGroupedPartitioningSuite.scala:
##########
@@ -404,8 +409,104 @@ class KeyGroupedPartitioningSuite extends DistributionAndOrderingSuiteBase {
         s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
         "ON i.id = p.item_id AND i.arrive_time = p.time ORDER BY id, purchase_price, sale_price")
 
+    checkAnswer(df,
+      Seq(Row(1, "aa", 40.0, 42.0), Row(2, "bb", 10.0, 11.0)))
+
+    val shuffles = collectShuffles(df.queryExecution.executedPlan)
+    assert(shuffles.isEmpty, "should not add shuffle when partition keys mismatch")
+  }
+
+  test("SPARK-41413: partitioned join: partition values from one side are subset of those from " +
+      "the other side") {
+    val items_partitions = Array(bucket(4, "id"))
+    createTable(items, items_schema, items_partitions)
+
+    sql(s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    val purchases_partitions = Array(bucket(4, "item_id"))
+    createTable(purchases, purchases_schema, purchases_partitions)
+
+    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 19.5, cast('2020-02-01' as timestamp))")
+
+    val df = sql("SELECT id, name, i.price as purchase_price, p.price as sale_price " +
+        s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
+        "ON i.id = p.item_id ORDER BY id, purchase_price, sale_price")
+
+    val shuffles = collectShuffles(df.queryExecution.executedPlan)
+    shuffles.foreach { s =>
+      assert(s.outputPartitioning.isInstanceOf[KeyGroupedPartitioning],
+        s"expected KeyGroupedPartitioning but found ${s.outputPartitioning.getClass.getName}")
+      assert(s.outputPartitioning.numPartitions == 3,
+        s"expected 3 partitions for $s but found ${s.outputPartitioning.numPartitions}")
+    }
+    assert(shuffles.isEmpty, "should not contain any shuffle")
+    checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0), Row(3, "bb", 10.0, 19.5)))
+  }
+
+  test("SPARK-41413: partitioned join: partition values from both sides overlaps") {
+    val items_partitions = Array(identity("id"))
+    createTable(items, items_schema, items_partitions)
+
+    sql(s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    val purchases_partitions = Array(identity("item_id"))
+    createTable(purchases, purchases_schema, purchases_partitions)
+    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(1, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(2, 19.5, cast('2020-02-01' as timestamp)), " +
+        "(4, 30.0, cast('2020-02-01' as timestamp))")
+
+    val df = sql("SELECT id, name, i.price as purchase_price, p.price as sale_price " +
+        s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
+        "ON i.id = p.item_id ORDER BY id, purchase_price, sale_price")
+
+    val shuffles = collectShuffles(df.queryExecution.executedPlan)
+    shuffles.foreach { s =>
+      assert(s.outputPartitioning.isInstanceOf[KeyGroupedPartitioning],
+        s"expected KeyGroupedPartitioning but found ${s.outputPartitioning.getClass.getName}")
+      assert(s.outputPartitioning.numPartitions == 4,
+        s"expected 4 partitions for $s but found ${s.outputPartitioning.numPartitions}")
+    }
+    assert(shuffles.isEmpty, "should not contain any shuffle")
+    checkAnswer(df, Seq(Row(1, "aa", 40.0, 42.0), Row(2, "bb", 10.0, 19.5)))
+  }
+
+  test("SPARK-41413: partitioned join: non-overlapping partition values from both sides") {
+    val items_partitions = Array(identity("id"))
+    createTable(items, items_schema, items_partitions)
+    sql(s"INSERT INTO testcat.ns.$items VALUES " +
+        "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
+        "(2, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
+        "(3, 'cc', 15.5, cast('2020-02-01' as timestamp))")
+
+    val purchases_partitions = Array(identity("item_id"))
+    createTable(purchases, purchases_schema, purchases_partitions)
+    sql(s"INSERT INTO testcat.ns.$purchases VALUES " +
+        "(4, 42.0, cast('2020-01-01' as timestamp)), " +
+        "(5, 19.5, cast('2020-02-01' as timestamp)), " +
+        "(6, 30.0, cast('2020-02-01' as timestamp))")
+
+    val df = sql("SELECT id, name, i.price as purchase_price, p.price as sale_price " +
+        s"FROM testcat.ns.$items i JOIN testcat.ns.$purchases p " +
+        "ON i.id = p.item_id ORDER BY id, purchase_price, sale_price")
+
     val shuffles = collectShuffles(df.queryExecution.executedPlan)
-    assert(shuffles.nonEmpty, "should add shuffle when partition keys mismatch")
+    shuffles.foreach { s =>

Review Comment:
   ditto



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to