Github user marmbrus commented on a diff in the pull request:
https://github.com/apache/spark/pull/1147#discussion_r15678978
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/joins.scala ---
@@ -37,6 +37,135 @@ case object BuildLeft extends BuildSide
@DeveloperApi
case object BuildRight extends BuildSide
+/**
+ * Constant Value for Binary Join Node
+ */
+object BinaryJoinNode {
+ val SINGLE_NULL_LIST = Seq[Row](null)
+ val EMPTY_NULL_LIST = Seq[Row]()
+}
+
+// TODO If join key was null should be considered as equal? In Hive this
is configurable.
+
+/**
+ * Output the tuples for the matched (with the same join key) join group,
base on the join types,
+ * Both input iterators should be repeatable.
+ */
+trait BinaryRepeatableIteratorNode extends BinaryNode {
+ self: Product =>
+
+ val leftNullRow = new GenericRow(left.output.length)
+ val rightNullRow = new GenericRow(right.output.length)
+
+ val joinedRow = new JoinedRow()
+
+ val boundCondition = InterpretedPredicate(
+ condition
+ .map(c => BindReferences.bindReference(c, left.output ++
right.output))
+ .getOrElse(Literal(true)))
+
+ def condition: Option[Expression]
+ def joinType: JoinType
+
+ // TODO we need to rewrite all of the iterators with our own
implementation instead of the scala
+ // iterator for performance / memory usage reason.
+
+ def leftOuterIterator(key: Row, leftIter: Iterable[Row], rightIter:
Iterable[Row])
+ : Iterator[Row] = {
+ leftIter.iterator.flatMap { l =>
+ joinedRow.withLeft(l)
+ var matched = false
+ (if (!key.anyNull) rightIter else
BinaryJoinNode.EMPTY_NULL_LIST).collect {
+ case r if (boundCondition(joinedRow.withRight(r))) => {
+ matched = true
+ joinedRow.copy
+ }
+ } ++ BinaryJoinNode.SINGLE_NULL_LIST.collect {
+ case dummy if (!matched) => {
+ joinedRow.withRight(rightNullRow).copy
+ }
+ }
+ }
+ }
+
+ // TODO need to unit test this, currently it's the dead code, but should
be used in SortMergeJoin
+ def leftSemiIterator(key: Row, leftIter: Iterable[Row], rightIter:
Iterable[Row])
+ : Iterator[Row] = {
+ leftIter.iterator.filter { l =>
+ joinedRow.withLeft(l)
+ (if (!key.anyNull) rightIter else
BinaryJoinNode.EMPTY_NULL_LIST).exists {
+ case r => (boundCondition(joinedRow.withRight(r)))
+ }
+ }
+ }
+
+ def rightOuterIterator(key: Row, leftIter: Iterable[Row], rightIter:
Iterable[Row])
+ : Iterator[Row] = {
+ rightIter.iterator.flatMap{r =>
+ joinedRow.withRight(r)
+ var matched = false
+ (if (!key.anyNull) leftIter else
BinaryJoinNode.EMPTY_NULL_LIST).collect {
+ case l if (boundCondition(joinedRow.withLeft(l))) => {
+ matched = true
+ joinedRow.copy
+ }
+ } ++ BinaryJoinNode.SINGLE_NULL_LIST.collect {
--- End diff --
I'm having a pretty hard time following the logic of each of these cases
that are being concatenated with `++`. I think it might be clearer if you assign
each intermediate phase to a variable (e.g., `matchedPairs`,
`unmatchedLeftRows`...) with a comment about what conditions are being checked,
why, and what the result is.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---