sunchao commented on code in PR #55927:
URL: https://github.com/apache/spark/pull/55927#discussion_r3268819175
##########
sql/core/src/test/scala/org/apache/spark/sql/execution/joins/OuterJoinSuite.scala:
##########
@@ -345,4 +346,249 @@ class OuterJoinSuite extends SharedSparkSession with SQLTestData {
val df2 = join("SHUFFLE_MERGE(t1)")
checkAnswer(df1, identity, df2.collect().toSeq)
}
+
+ test("ordinary outer equi-join spreads NULL keys in shuffle partitioning") {
+ val nullableLeft = Seq(
+ (Integer.valueOf(1), "left-1"),
+ (null.asInstanceOf[Integer], "left-null-1"),
+ (null.asInstanceOf[Integer], "left-null-2")).toDF("k", "lv")
+ val nullableRight = Seq(
+ (Integer.valueOf(1), "right-1"),
+ (null.asInstanceOf[Integer], "right-null")).toDF("k", "rv")
+ val joinCondition = (nullableLeft("k") === nullableRight("k")).expr
+ val join = Join(nullableLeft.logicalPlan, nullableRight.logicalPlan,
+ LeftOuter, Some(joinCondition), JoinHint.NONE)
+
+ ExtractEquiJoinKeys.unapply(join).foreach {
Review Comment:
Updated
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledJoin.scala:
##########
@@ -28,6 +29,15 @@ import
org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Dist
trait ShuffledJoin extends JoinCodegenSupport {
def isSkewJoin: Boolean
+ private lazy val canSpreadNullJoinKeys: Boolean = {
+ // Null-safe equality usually rewrites to non-null shuffle keys. The NullType corner can still
+ // produce NULL shuffle keys, but shuffled join execution already treats those rows as
+ // unmatched, so spreading them does not change the result.
+ val isOuterJoin = joinType == LeftOuter || joinType == RightOuter || joinType == FullOuter
+ conf.getConf(SQLConf.SHUFFLE_SPREAD_NULL_JOIN_KEYS_ENABLED) &&
+ isOuterJoin
+ }
Review Comment:
Updated
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]