This is an automated email from the ASF dual-hosted git repository.

nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git


The following commit(s) were added to refs/heads/main by this push:
     new 9e00d726e [CELEBORN-1714] Optimize handleApplicationLost
9e00d726e is described below

commit 9e00d726e802363645e896a40bb18ac7c439cd36
Author: xzh <[email protected]>
AuthorDate: Fri Nov 15 19:30:06 2024 +0800

    [CELEBORN-1714] Optimize handleApplicationLost
    
    ### What changes were proposed in this pull request?
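    Call `handleApplicationLost` directly when an application times out, instead of sending an `ApplicationLost` message to the Master itself via `self.askSync` and retrying up to 3 times. The `ApplicationLostResponse` reply is now sent only when the RPC context is non-null.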
    
    ### Why are the changes needed?
    The `ApplicationLost` event triggered by `timeoutDeadApplications` should be handled promptly, rather than waiting to be processed in the Master RPC queue.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    
    Closes #2910 from xy2953396112/optimize_applost.
    
    Authored-by: xzh <[email protected]>
    Signed-off-by: SteNicholas <[email protected]>
---
 .../org/apache/celeborn/service/deploy/master/Master.scala | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 3a684d9a1..310eaf6e1 100644
--- 
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++ 
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -635,15 +635,7 @@ private[celeborn] class Master(
       if (heartbeatTime < currentTime - appHeartbeatTimeoutMs) {
         logWarning(s"Application $appId timeout, trigger applicationLost 
event.")
         val requestId = MasterClient.genRequestId()
-        var res = self.askSync[ApplicationLostResponse](ApplicationLost(appId, 
requestId))
-        var retry = 1
-        while (res.status != StatusCode.SUCCESS && retry <= 3) {
-          res = self.askSync[ApplicationLostResponse](ApplicationLost(appId, 
requestId))
-          retry += 1
-        }
-        if (retry > 3) {
-          logWarning(s"Handle ApplicationLost event for $appId failed more 
than 3 times!")
-        }
+        handleApplicationLost(null, appId, requestId)
       }
     }
   }
@@ -1060,7 +1052,9 @@ private[celeborn] class Master(
         if (hasHDFSStorage || hasS3Storage) {
           checkAndCleanExpiredAppDirsOnDFS(appId)
         }
-        context.reply(ApplicationLostResponse(StatusCode.SUCCESS))
+        if (context != null) {
+          context.reply(ApplicationLostResponse(StatusCode.SUCCESS))
+        }
       }
     })
   }

Reply via email to