This is an automated email from the ASF dual-hosted git repository.

zuston pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/uniffle.git


The following commit(s) were added to refs/heads/master by this push:
     new d6c598820 feat(spark): Show shuffle failures into spark UI (#2668)
d6c598820 is described below

commit d6c5988208f071d7f2e226c3c45b8f623c49ff71
Author: Junfan Zhang <[email protected]>
AuthorDate: Thu Nov 13 17:01:22 2025 +0800

    feat(spark): Show shuffle failures into spark UI (#2668)
    
    ### What changes were proposed in this pull request?
    
    Show the shuffle failure reasons in the Spark UI.
    
    ### Why are the changes needed?
    
    Follow-up to issue #2508: make it easier to find the root cause of
    shuffle failures.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Internal job tests
---
 .../scala/org/apache/spark/UniffleListener.scala    | 15 +++++++++------
 .../scala/org/apache/spark/ui/ShufflePage.scala     | 21 ++++++++++++++++++++-
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git 
a/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala 
b/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
index 92187912f..4770b89ff 100644
--- 
a/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
+++ 
b/client-spark/extension/src/main/scala/org/apache/spark/UniffleListener.scala
@@ -128,12 +128,13 @@ class UniffleListener(conf: SparkConf, kvstore: 
ElementTrackingStore)
     aggregatedShuffleWriteTimes.inc(event.getWriteTimes)
     totalUncompressedShuffleBytes.addAndGet(event.getUncompressedByteSize)
 
-    if (event.isShuffleWriteFailed) {
+    val failureReason = event.getFailureReason
+    if (StringUtils.isNotEmpty(failureReason)) {
+      writeTaskInfo.failureReasons.add(failureReason)
       writeTaskInfo.failedTaskNumber += 1
       if (event.getTaskAttemptNumber > 
writeTaskInfo.failedTaskMaxAttemptNumber) {
         writeTaskInfo.failedTaskMaxAttemptNumber = event.getTaskAttemptNumber
       }
-      writeTaskInfo.failureReasons.add(event.getFailureReason)
     }
   }
 
@@ -157,12 +158,14 @@ class UniffleListener(conf: SparkConf, kvstore: 
ElementTrackingStore)
     }
     aggregatedShuffleReadTimes.merge(event.getShuffleReadTimes)
 
-    if (event.isShuffleReadFailed) {
+    val failureReason = event.getFailureReason
+    if (StringUtils.isNotEmpty(failureReason)) {
+      readTaskInfo.failureReasons.add(failureReason)
       readTaskInfo.failedTaskNumber += 1
-      if (event.getTaskAttemptNumber > 
readTaskInfo.failedTaskMaxAttemptNumber) {}
-      readTaskInfo.failedTaskMaxAttemptNumber = event.getTaskAttemptNumber
+      if (event.getTaskAttemptNumber > 
readTaskInfo.failedTaskMaxAttemptNumber) {
+        readTaskInfo.failedTaskMaxAttemptNumber = event.getTaskAttemptNumber
+      }
     }
-    readTaskInfo.failureReasons.add(event.getFailureReason)
   }
 
   override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
diff --git 
a/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala 
b/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
index 4667775f6..ecdbd0c93 100644
--- 
a/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
+++ 
b/client-spark/extension/src/main/scala/org/apache/spark/ui/ShufflePage.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.ui
 
 import org.apache.spark.internal.Logging
-import org.apache.spark.shuffle.events.ShuffleWriteTimes
 import org.apache.spark.util.Utils
 import org.apache.spark.{AggregatedShuffleMetric, AggregatedShuffleReadMetric, 
AggregatedShuffleWriteMetric, AggregatedTaskInfoUIData, ShuffleType}
 
@@ -462,6 +461,26 @@ class ShufflePage(parent: ShuffleTab) extends 
WebUIPage("") with Logging {
             {readTimesUI}
           </div>
         </div>
+
+        <div>
+          <span class="collapse-failures-properties collapse-table"
+                onClick="collapseTable('collapse-failures-properties', 
'failures-table')">
+            <h4>
+              <span class="collapse-table-arrow arrow-closed"></span>
+              <a>Shuffle Failures</a>
+            </h4>
+          </span>
+          <div class="failures-table collapsible-table collapsed">
+            <h4>Write {writeSummary.failedTaskNumber} Failures 
(maxAttemptNumber:{writeSummary.failedTaskMaxAttemptNumber})</h4>
+            <pre>
+              {writeSummary.failureReasons.mkString("\n\n")}
+            </pre>
+            <h4>Read {readSummary.failedTaskNumber} Failures 
(maxAttemptNumber:{readSummary.failedTaskMaxAttemptNumber})</h4>
+            <pre>
+              {readSummary.failureReasons.mkString("\n\n")}
+            </pre>
+          </div>
+        </div>
       </div>
     }
 

Reply via email to