This is an automated email from the ASF dual-hosted git repository.
gengliangwang pushed a commit to branch branch-4.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.2 by this push:
new dfdf66edfbcb [SPARK-57037][CORE][TESTS] Force GC before allocating
large array in SorterSuite to fix flaky OOM
dfdf66edfbcb is described below
commit dfdf66edfbcb44c4329d8854d0f8413bcca82953
Author: Gengliang Wang <[email protected]>
AuthorDate: Tue May 26 14:11:11 2026 -0700
[SPARK-57037][CORE][TESTS] Force GC before allocating large array in
SorterSuite to fix flaky OOM
### What changes were proposed in this pull request?
`SorterSuite`'s `"java.lang.ArrayIndexOutOfBoundsException in TimSort"`
test allocates a ~1 GB byte array. Previously this test called `System.gc()`
unconditionally before the allocation, but `System.gc()` is only a hint and may
not actually run before the next allocation.
This PR:
- Lifts `runGC()` (a `WeakReference` busy-wait that ensures GC has actually
run) from `ContextCleanerSuiteBase` to `SparkFunSuite`, so all test suites can
reuse it. `ContextCleanerSuite`'s callers now inherit the same method.
- Adds a `retryOnOOM[T](body: => T): T` helper on `SparkFunSuite` that runs
`body` and, if it throws `OutOfMemoryError`, calls `runGC()` and retries once.
- In `SorterSuite`, replaces the unconditional `System.gc()` call before
the allocation with `retryOnOOM(new Array[Byte](arrayToSortSize))`, so the GC
only fires on the slow path.
### Why are the changes needed?
The test is flaky in CI with `java.lang.OutOfMemoryError: Java heap space`.
The preceding test `"SPARK-5984 TimSort bug"` allocates a ~256 MB int array,
which may not be reclaimed before this test's >1 GB allocation if GC has not
actually run.
Example failed job:
https://github.com/gengliangwang/spark/actions/runs/26343824415/job/77552562041
A previous fix (`0390e4b44aa`) added `System.gc()` before the allocation,
but that is only a hint and does not guarantee GC has run. Forcing the GC only
on the failure path also avoids paying the cost on the happy path.
### Does this PR introduce _any_ user-facing change?
No. Test-only change.
### How was this patch tested?
Monitor CI stability.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Code
Closes #56086 from gengliangwang/fix-sorter-suite-oom.
Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
(cherry picked from commit 4606a4b9005558b2bf721ea2edd235098304c712)
Signed-off-by: Gengliang Wang <[email protected]>
---
.../org/apache/spark/ContextCleanerSuite.scala | 15 -------------
.../scala/org/apache/spark/SparkFunSuite.scala | 25 ++++++++++++++++++++++
.../apache/spark/util/collection/SorterSuite.scala | 6 ++++--
3 files changed, 29 insertions(+), 17 deletions(-)
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 05709c9bdd75..813de4132ab2 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -17,9 +17,6 @@
package org.apache.spark
-import java.lang.ref.WeakReference
-import java.util.concurrent.TimeUnit
-
import scala.collection.mutable.HashSet
import scala.util.Random
@@ -96,18 +93,6 @@ abstract class ContextCleanerSuiteBase(val shuffleManager: Class[_] = classOf[So
rdd
}
- /** Run GC and make sure it actually has run */
- protected def runGC(): Unit = {
- val weakRef = new WeakReference(new Object())
- val startTimeNs = System.nanoTime()
- System.gc() // Make a best effort to run the garbage collection. It *usually* runs GC.
- // Wait until a weak reference object has been GCed
- while (System.nanoTime() - startTimeNs < TimeUnit.SECONDS.toNanos(10) && weakRef.get != null) {
- System.gc()
- Thread.sleep(200)
- }
- }
-
protected def cleaner = sc.cleaner.get
}
diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
index 15e150ab8b93..a0f17f8af3f3 100644
--- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -17,6 +17,9 @@
package org.apache.spark
+import java.lang.ref.WeakReference
+import java.util.concurrent.TimeUnit
+
import scala.annotation.tailrec
import org.scalactic.source.Position
@@ -97,4 +100,26 @@ abstract class SparkFunSuite
test(testNamePrefix + s" ${param._1}", testTags: _*)(testFun(param._2))
}
}
+
+ /** Run GC and make sure it actually has run. */
+ protected def runGC(): Unit = {
+ val weakRef = new WeakReference(new Object())
+ val startTimeNs = System.nanoTime()
+ System.gc() // Make a best effort to run the garbage collection. It *usually* runs GC.
+ // Wait until a weak reference object has been GCed
+ while (System.nanoTime() - startTimeNs < TimeUnit.SECONDS.toNanos(10) && weakRef.get != null) {
+ System.gc()
+ Thread.sleep(200)
+ }
+ }
+
+ /** Run `body`; if it throws OutOfMemoryError, force a GC and retry once. */
+ protected def retryOnOOM[T](body: => T): T = {
+ try body
+ catch {
+ case _: OutOfMemoryError =>
+ runGC()
+ body
+ }
+ }
}
diff --git a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
index 7551327d704b..2767769924bc 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala
@@ -71,7 +71,6 @@ class SorterSuite extends SparkFunSuite {
}
test("java.lang.ArrayIndexOutOfBoundsException in TimSort") {
- System.gc()
// scalastyle:off
val runLengths = Array(76405736, 74830360, 1181532, 787688, 1575376, 2363064, 3938440, 6301504,
1181532, 393844, 15753760, 1575376, 787688, 393844, 1969220, 3150752, 1181532,787688, 5513816, 3938440,
@@ -140,7 +139,10 @@ class SorterSuite extends SparkFunSuite {
21, 20, 22, 18, 452, 114, 95, 18, 17, 21, 36, 18, 17, 115, 76, 144, 44, 38, 61,20, 19, 21, 17)
// scalastyle:on
val arrayToSortSize = 1091482190
- val arrayToSort = new Array[Byte](arrayToSortSize)
+ // Memory held by the previous test (e.g. the ~256 MB int array in "SPARK-5984
+ // TimSort bug") may not be reclaimed before this >1 GB allocation, causing flaky
+ // OOM in CI. Force a GC and retry once on OOM.
+ val arrayToSort = retryOnOOM(new Array[Byte](arrayToSortSize))
var sum: Int = -1
for (i <- runLengths) {
sum += i
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]