c21 commented on a change in pull request #32476: URL: https://github.com/apache/spark/pull/32476#discussion_r628966219
########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala ########## @@ -353,12 +353,37 @@ case class SortMergeJoinExec( } } - override def supportCodegen: Boolean = { - joinType.isInstanceOf[InnerLike] + private lazy val (streamedPlan, bufferedPlan) = joinType match { Review comment: @maropu - yes, this is used for code-gen only. Note that here we only pattern match inner/left outer/right outer joins, so with a plain `val` (instead of `lazy val`) it would throw an exception for other join types. ########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala ########## @@ -418,115 +443,140 @@ case class SortMergeJoinExec( // Inline mutable state since not many join operations in a task val matches = ctx.addMutableState(clsName, "matches", v => s"$v = new $clsName($inMemoryThreshold, $spillThreshold);", forceInline = true) - // Copy the left keys as class members so they could be used in next function call. - val matchedKeyVars = copyKeys(ctx, leftKeyVars) + // Copy the streamed keys as class members so they could be used in next function call. + val matchedKeyVars = copyKeys(ctx, streamedKeyVars) + + // Handle the case when streamed rows has any NULL keys. + val handleStreamedAnyNull = joinType match { + case _: InnerLike => + // Skip streamed row. + s""" + |$streamedRow = null; + |continue; + """.stripMargin + case LeftOuter | RightOuter => + // Eagerly return streamed row. + s""" + |if (!$matches.isEmpty()) { + | $matches.clear(); + |} + |return false; Review comment: Wanted to avoid `clear()` if `isEmpty()` is true. `ExternalAppendOnlyUnsafeRowArray.isEmpty()` is very cheap but `clear()` sets multiple variables. 
########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala ########## @@ -418,115 +443,140 @@ case class SortMergeJoinExec( // Inline mutable state since not many join operations in a task val matches = ctx.addMutableState(clsName, "matches", v => s"$v = new $clsName($inMemoryThreshold, $spillThreshold);", forceInline = true) - // Copy the left keys as class members so they could be used in next function call. - val matchedKeyVars = copyKeys(ctx, leftKeyVars) + // Copy the streamed keys as class members so they could be used in next function call. + val matchedKeyVars = copyKeys(ctx, streamedKeyVars) + + // Handle the case when streamed rows has any NULL keys. + val handleStreamedAnyNull = joinType match { + case _: InnerLike => + // Skip streamed row. + s""" + |$streamedRow = null; + |continue; + """.stripMargin + case LeftOuter | RightOuter => + // Eagerly return streamed row. + s""" + |if (!$matches.isEmpty()) { + | $matches.clear(); + |} + |return false; + """.stripMargin + case x => + throw new IllegalArgumentException( + s"SortMergeJoin.genScanner should not take $x as the JoinType") + } - ctx.addNewFunction("findNextInnerJoinRows", + // Handle the case when streamed keys less than buffered keys. + val handleStreamedLessThanBuffered = joinType match { + case _: InnerLike => + // Skip streamed row. + s"$streamedRow = null;" + case LeftOuter | RightOuter => + // Eagerly return with streamed row. + "return false;" + case x => + throw new IllegalArgumentException( + s"SortMergeJoin.genScanner should not take $x as the JoinType") + } + + ctx.addNewFunction("findNextJoinRows", s""" - |private boolean findNextInnerJoinRows( - | scala.collection.Iterator leftIter, - | scala.collection.Iterator rightIter) { - | $leftRow = null; + |private boolean findNextJoinRows( Review comment: > For example, if there are too many matched duplicate rows in the buffered side, it seems we don't need to put all the rows in matches, right? 
Why don't we need to put all the rows? We need to evaluate all the rows on the buffered side for the join anyway, right? ########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala ########## @@ -418,115 +443,140 @@ case class SortMergeJoinExec( // Inline mutable state since not many join operations in a task val matches = ctx.addMutableState(clsName, "matches", v => s"$v = new $clsName($inMemoryThreshold, $spillThreshold);", forceInline = true) - // Copy the left keys as class members so they could be used in next function call. - val matchedKeyVars = copyKeys(ctx, leftKeyVars) + // Copy the streamed keys as class members so they could be used in next function call. + val matchedKeyVars = copyKeys(ctx, streamedKeyVars) + + // Handle the case when streamed rows has any NULL keys. + val handleStreamedAnyNull = joinType match { + case _: InnerLike => + // Skip streamed row. + s""" + |$streamedRow = null; + |continue; + """.stripMargin + case LeftOuter | RightOuter => + // Eagerly return streamed row. + s""" + |if (!$matches.isEmpty()) { + | $matches.clear(); + |} + |return false; + """.stripMargin + case x => + throw new IllegalArgumentException( + s"SortMergeJoin.genScanner should not take $x as the JoinType") + } - ctx.addNewFunction("findNextInnerJoinRows", + // Handle the case when streamed keys less than buffered keys. + val handleStreamedLessThanBuffered = joinType match { + case _: InnerLike => + // Skip streamed row. + s"$streamedRow = null;" + case LeftOuter | RightOuter => + // Eagerly return with streamed row. 
+ "return false;" + case x => + throw new IllegalArgumentException( + s"SortMergeJoin.genScanner should not take $x as the JoinType") + } + + ctx.addNewFunction("findNextJoinRows", s""" - |private boolean findNextInnerJoinRows( - | scala.collection.Iterator leftIter, - | scala.collection.Iterator rightIter) { - | $leftRow = null; + |private boolean findNextJoinRows( Review comment: > In the outer case, a return value is not used? Yes. Otherwise it's very hard to re-use code in `findNextJoinRows`. I can make a further change so that `findNextJoinRows` does not return anything when it's an outer join. Do we want to do that? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org