This is an automated email from the ASF dual-hosted git repository.
zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/uniffle.git
The following commit(s) were added to refs/heads/master by this push:
new d6c598820 feat(spark): Show shuffle failures into spark UI (#2668)
d6c598820 is described below
commit d6c5988208f071d7f2e226c3c45b8f623c49ff71
Author: Junfan Zhang <[email protected]>
AuthorDate: Thu Nov 13 17:01:22 2025 +0800
feat(spark): Show shuffle failures into spark UI (#2668)
### What changes were proposed in this pull request?
Show the shuffle failure reason in the Spark UI
### Why are the changes needed?
Follow-up to issue #2508, to make it easier to find the root cause of
shuffle failures
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Internal job tests
---
.../scala/org/apache/spark/UniffleListener.scala | 15 +++++++++------
.../scala/org/apache/spark/ui/ShufflePage.scala | 21 ++++++++++++++++++++-
2 files changed, 29 insertions(+), 7 deletions(-)
diff --git
a/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
b/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
index 92187912f..4770b89ff 100644
---
a/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
+++
b/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
@@ -128,12 +128,13 @@ class UniffleListener(conf: SparkConf, kvstore:
ElementTrackingStore)
aggregatedShuffleWriteTimes.inc(event.getWriteTimes)
totalUncompressedShuffleBytes.addAndGet(event.getUncompressedByteSize)
- if (event.isShuffleWriteFailed) {
+ val failureReason = event.getFailureReason
+ if (StringUtils.isNotEmpty(failureReason)) {
+ writeTaskInfo.failureReasons.add(failureReason)
writeTaskInfo.failedTaskNumber += 1
if (event.getTaskAttemptNumber >
writeTaskInfo.failedTaskMaxAttemptNumber) {
writeTaskInfo.failedTaskMaxAttemptNumber = event.getTaskAttemptNumber
}
- writeTaskInfo.failureReasons.add(event.getFailureReason)
}
}
@@ -157,12 +158,14 @@ class UniffleListener(conf: SparkConf, kvstore:
ElementTrackingStore)
}
aggregatedShuffleReadTimes.merge(event.getShuffleReadTimes)
- if (event.isShuffleReadFailed) {
+ val failureReason = event.getFailureReason
+ if (StringUtils.isNotEmpty(failureReason)) {
+ readTaskInfo.failureReasons.add(failureReason)
readTaskInfo.failedTaskNumber += 1
- if (event.getTaskAttemptNumber >
readTaskInfo.failedTaskMaxAttemptNumber) {}
- readTaskInfo.failedTaskMaxAttemptNumber = event.getTaskAttemptNumber
+ if (event.getTaskAttemptNumber >
readTaskInfo.failedTaskMaxAttemptNumber) {
+ readTaskInfo.failedTaskMaxAttemptNumber = event.getTaskAttemptNumber
+ }
}
- readTaskInfo.failureReasons.add(event.getFailureReason)
}
override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
diff --git
a/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
b/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
index 4667775f6..ecdbd0c93 100644
---
a/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
+++
b/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
@@ -18,7 +18,6 @@
package org.apache.spark.ui
import org.apache.spark.internal.Logging
-import org.apache.spark.shuffle.events.ShuffleWriteTimes
import org.apache.spark.util.Utils
import org.apache.spark.{AggregatedShuffleMetric, AggregatedShuffleReadMetric,
AggregatedShuffleWriteMetric, AggregatedTaskInfoUIData, ShuffleType}
@@ -462,6 +461,26 @@ class ShufflePage(parent: ShuffleTab) extends
WebUIPage("") with Logging {
{readTimesUI}
</div>
</div>
+
+ <div>
+ <span class="collapse-failures-properties collapse-table"
+ onClick="collapseTable('collapse-failures-properties',
'failures-table')">
+ <h4>
+ <span class="collapse-table-arrow arrow-closed"></span>
+ <a>Shuffle Failures</a>
+ </h4>
+ </span>
+ <div class="failures-table collapsible-table collapsed">
+ <h4>Write {writeSummary.failedTaskNumber} Failures
(maxAttemptNumber:{writeSummary.failedTaskMaxAttemptNumber})</h4>
+ <pre>
+ {writeSummary.failureReasons.mkString("\n\n")}
+ </pre>
+ <h4>Read {readSummary.failedTaskNumber} Failures
(maxAttemptNumber:{readSummary.failedTaskMaxAttemptNumber})</h4>
+ <pre>
+ {readSummary.failureReasons.mkString("\n\n")}
+ </pre>
+ </div>
+ </div>
</div>
}