This is an automated email from the ASF dual-hosted git repository.

nicholasjiang pushed a commit to branch branch-0.5
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/branch-0.5 by this push:
     new 4de52a391 [CELEBORN-1701][FOLLOWUP] Support stage rerun for shuffle data lost
4de52a391 is described below

commit 4de52a3919b3d703d336adaa8666f2b27bc69ea8
Author: mingji <[email protected]>
AuthorDate: Thu Dec 26 17:58:41 2024 +0800

    [CELEBORN-1701][FOLLOWUP] Support stage rerun for shuffle data lost
    
    ### What changes were proposed in this pull request?
    Fix an error that may cause the application master to retry stage rerun infinitely.
    
    ### Why are the changes needed?
    Correct the parameters passed.
    
    ### Does this PR introduce _any_ user-facing change?
    NO.
    
    ### How was this patch tested?
    GA.
    
    Closes #3033 from FMX/b1071-1.
    
    Authored-by: mingji <[email protected]>
    Signed-off-by: SteNicholas <[email protected]>
    (cherry picked from commit 52fa151aa4c618ee82370ba3959298b4056ee352)
    Signed-off-by: SteNicholas <[email protected]>
---
 .../spark/shuffle/celeborn/CelebornShuffleReader.scala | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/shuffle/celeborn/CelebornShuffleReader.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/shuffle/celeborn/CelebornShuffleReader.scala
index 745cc0770..f4edc0f2f 100644
--- a/client-spark/spark-3/src/main/scala/org/apache/spark/shuffle/celeborn/CelebornShuffleReader.scala
+++ b/client-spark/spark-3/src/main/scala/org/apache/spark/shuffle/celeborn/CelebornShuffleReader.scala
@@ -111,7 +111,7 @@ class CelebornShuffleReader[K, C](
       fileGroups = shuffleClient.updateFileGroup(shuffleId, startPartition)
     } catch {
       case ce @ (_: CelebornIOException | _: PartitionUnRetryAbleException) =>
-        handleFetchExceptions(shuffleId, 0, ce)
+        handleFetchExceptions(handle.shuffleId, shuffleId, 0, ce)
       case e: Throwable => throw e
     }
 
@@ -254,7 +254,7 @@ class CelebornShuffleReader[K, C](
           if (exceptionRef.get() != null) {
             exceptionRef.get() match {
              case ce @ (_: CelebornIOException | _: PartitionUnRetryAbleException) =>
-                handleFetchExceptions(handle.shuffleId, partitionId, ce)
-                handleFetchExceptions(handle.shuffleId, partitionId, ce)
+                handleFetchExceptions(handle.shuffleId, shuffleId, partitionId, ce)
               case e => throw e
             }
           }
@@ -289,7 +289,7 @@ class CelebornShuffleReader[K, C](
         iter
       } catch {
         case e @ (_: CelebornIOException | _: PartitionUnRetryAbleException) =>
-          handleFetchExceptions(handle.shuffleId, partitionId, e)
+          handleFetchExceptions(handle.shuffleId, shuffleId, partitionId, e)
       }
     }
 
@@ -369,17 +369,21 @@ class CelebornShuffleReader[K, C](
     }
   }
 
-  private def handleFetchExceptions(shuffleId: Int, partitionId: Int, ce: Throwable) = {
+  private def handleFetchExceptions(
+      appShuffleId: Int,
+      shuffleId: Int,
+      partitionId: Int,
+      ce: Throwable) = {
     if (throwsFetchFailure &&
-      shuffleClient.reportShuffleFetchFailure(handle.shuffleId, shuffleId)) {
+      shuffleClient.reportShuffleFetchFailure(appShuffleId, shuffleId)) {
      logWarning(s"Handle fetch exceptions for ${shuffleId}-${partitionId}", ce)
       throw new FetchFailedException(
         null,
-        handle.shuffleId,
+        appShuffleId,
         -1,
         -1,
         partitionId,
-        SparkUtils.FETCH_FAILURE_ERROR_MSG + handle.shuffleId + "/" + shuffleId,
+        SparkUtils.FETCH_FAILURE_ERROR_MSG + appShuffleId + "/" + shuffleId,
         ce)
     } else
       throw ce

Reply via email to