Github user vidma commented on a diff in the pull request:
https://github.com/apache/spark/pull/9451#discussion_r44222500
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
---
@@ -904,3 +909,36 @@ object RemoveLiteralFromGroupExpressions extends
Rule[LogicalPlan] {
a.copy(groupingExpressions = newGrouping)
}
}
+
+/**
+ * For an inner join - remove rows with null keys on both sides
+ */
+object JoinSkewOptimizer extends Rule[LogicalPlan] with PredicateHelper {
+ /**
+ * Adds a null filter on given columns, if any
+ */
+ def addNullFilter(columns: Seq[Expression], expr: LogicalPlan):
LogicalPlan = {
+ columns.map(IsNotNull)
+ .reduceLeftOption(And)
+ .map(Filter(_, expr))
+ .getOrElse(expr)
+ }
+
+ private def hasNullableKeys(leftKeys: Seq[Expression], rightKeys:
Seq[Expression]) = {
+ leftKeys.exists(_.nullable) || rightKeys.exists(_.nullable)
+ }
+
+ def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+ case join @ Join(left, right, joinType, originalJoinCondition) =>
+ join match {
+ case ExtractEquiJoinKeys(_, leftKeys, rightKeys, _, _, _)
+ if hasNullableKeys(leftKeys, rightKeys) && Seq(Inner,
LeftSemi).contains(joinType) =>
+ // add a non-null join-key filter on both sides of join
+ val newLeft = addNullFilter(leftKeys.filter(_.nullable), left)
+ val newRight = addNullFilter(rightKeys.filter(_.nullable),
right)
+ Join(newLeft, newRight, joinType, originalJoinCondition)
--- End diff --
In the `Inner | Semi` join case, the null filter could instead be added to
`joinCondition` (rather than to the left/right relations), assuming that it'll be pushed
down by subsequent optimizer rules.
Which do you prefer?
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]