cloud-fan commented on a change in pull request #32816:
URL: https://github.com/apache/spark/pull/32816#discussion_r696361665



##########
File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
##########
@@ -1908,6 +1914,69 @@ class AdaptiveQueryExecSuite
     }
   }
 
+  test("SPARK-33832: Support optimize skew join even if introduce extra 
shuffle") {
+    withSQLConf(
+      SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+      SQLConf.ADAPTIVE_OPTIMIZE_SKEWS_IN_REBALANCE_PARTITIONS_ENABLED.key -> 
"false",
+      SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
+      SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100",
+      SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",
+      SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key -> "1",
+      SQLConf.SHUFFLE_PARTITIONS.key -> "10",
+      SQLConf.ADAPTIVE_FORCE_OPTIMIZE_SKEWED_JOIN.key -> "true") {
+      withTempView("skewData1", "skewData2") {
+        spark
+          .range(0, 1000, 1, 10)
+          .selectExpr("id % 3 as key1", "id as value1")
+          .createOrReplaceTempView("skewData1")
+        spark
+          .range(0, 1000, 1, 10)
+          .selectExpr("id % 1 as key2", "id as value2")
+          .createOrReplaceTempView("skewData2")
+
+        // check if optimized skewed join does not satisfy the required 
distribution
+        Seq(true, false).foreach { hasRequiredDistribution =>
+          Seq(true, false).foreach { hasPartitionNumber =>
+            val repartition = if (hasRequiredDistribution) {
+              s"/*+ repartition(${ if (hasPartitionNumber) "10," else ""}key1) 
*/"
+            } else {
+              ""
+            }
+
+            // check required distribution and extra shuffle
+            val (_, adaptive1) =
+              runAdaptiveAndVerifyResult(s"SELECT $repartition key1 FROM 
skewData1 " +
+                s"JOIN skewData2 ON key1 = key2 GROUP BY key1")
+            val shuffles1 = findTopLevelShuffle(adaptive1)
+            assert(shuffles1.size == 3)
+            assert(shuffles1.head.shuffleOrigin == ENSURE_REQUIREMENTS)
+            val smj1 = findTopLevelSortMergeJoin(adaptive1)
+            assert(smj1.size == 1 && smj1.exists(_.isSkewJoin))

Review comment:
       Suggestion: `smj1.exists(_.isSkewJoin)` -> `smj1.head.isSkewJoin`, since `smj1.size == 1` is already checked in the same assertion.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to