cloud-fan commented on a change in pull request #26164: [SPARK-21492][SQL] Fix
memory leak in SortMergeJoin
URL: https://github.com/apache/spark/pull/26164#discussion_r336534176
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
##########
@@ -1040,3 +1044,49 @@ class JoinSuite extends QueryTest with
SharedSparkSession {
checkAnswer(df, Row(1, 2, 1, 2) :: Nil)
}
}
+
+class JoinWithResourceCleanSuite extends JoinSuite with BeforeAndAfterAll {
+ import testImplicits._
+ import scala.collection.mutable.ArrayBuffer
+
+ private def checkCleanupResourceTriggered(plan: SparkPlan) :
ArrayBuffer[SortExec] = {
+ // Check cleanupResources are finally triggered in SortExec node
+ val sorts = new ArrayBuffer[SortExec]()
+ plan.foreachUp {
+ case s: SortExec => sorts += s
+ case _ =>
+ }
+ sorts.foreach { sort =>
+ val sortExec = spy(sort)
+ verify(sortExec, atLeastOnce).cleanupResources()
+ verify(sortExec.rowSorter, atLeastOnce).cleanupResources()
+ }
+ sorts
+ }
+
+ override def checkAnswer(df: => DataFrame, rows: Seq[Row]): Unit = {
+ withSQLConf(
+ SQLConf.SORT_MERGE_JOIN_EXEC_EAGER_CLEANUP_RESOURCES.key -> "true") {
+ checkCleanupResourceTriggered(df.queryExecution.sparkPlan)
+ super.checkAnswer(df, rows)
+ }
+ }
+
+ test("cleanupResource in code generation") {
+ withSQLConf(
+ SQLConf.SORT_MERGE_JOIN_EXEC_EAGER_CLEANUP_RESOURCES.key -> "true",
+ SQLConf.SHUFFLE_PARTITIONS.key -> "1",
+ SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") {
+ val df1 = spark.range(0, 10, 1, 2)
+ val df2 = spark.range(10).select($"id".as("b1"), (- $"id").as("b2"))
+ val res = df1.join(df2, $"id" === $"b1" && $"id" ===
$"b2").select($"b1", $"b2", $"id")
+
+ val sorts = checkCleanupResourceTriggered(res.queryExecution.sparkPlan)
Review comment:
`checkCleanupResourceTriggered` is called twice: once here and once in
`checkAnswer`. Is that expected?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]