This is an automated email from the ASF dual-hosted git repository.
nicholasjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/celeborn.git
The following commit(s) were added to refs/heads/main by this push:
new 9e00d726e [CELEBORN-1714] Optimize handleApplicationLost
9e00d726e is described below
commit 9e00d726e802363645e896a40bb18ac7c439cd36
Author: xzh <[email protected]>
AuthorDate: Fri Nov 15 19:30:06 2024 +0800
[CELEBORN-1714] Optimize handleApplicationLost
### What changes were proposed in this pull request?
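Optimize `handleApplicationLost`: when the application heartbeat timeout check
in `Master` detects a timed-out application, it now calls
`handleApplicationLost(null, appId, requestId)` directly instead of sending an
`ApplicationLost` message to itself via `self.askSync` (and retrying up to 3
times on a non-SUCCESS status). Because the direct call passes no RPC context,
`handleApplicationLost` now replies with `ApplicationLostResponse` only when
`context` is not null.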
### Why are the changes needed?
The `ApplicationLost` event triggered by `timeoutDeadApplications` should be
handled promptly, rather than waiting to be processed in the Master RPC queue.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Closes #2910 from xy2953396112/optimize_applost.
Authored-by: xzh <[email protected]>
Signed-off-by: SteNicholas <[email protected]>
---
.../org/apache/celeborn/service/deploy/master/Master.scala | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
index 3a684d9a1..310eaf6e1 100644
---
a/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
+++
b/master/src/main/scala/org/apache/celeborn/service/deploy/master/Master.scala
@@ -635,15 +635,7 @@ private[celeborn] class Master(
if (heartbeatTime < currentTime - appHeartbeatTimeoutMs) {
logWarning(s"Application $appId timeout, trigger applicationLost
event.")
val requestId = MasterClient.genRequestId()
- var res = self.askSync[ApplicationLostResponse](ApplicationLost(appId,
requestId))
- var retry = 1
- while (res.status != StatusCode.SUCCESS && retry <= 3) {
- res = self.askSync[ApplicationLostResponse](ApplicationLost(appId,
requestId))
- retry += 1
- }
- if (retry > 3) {
- logWarning(s"Handle ApplicationLost event for $appId failed more
than 3 times!")
- }
+ handleApplicationLost(null, appId, requestId)
}
}
}
@@ -1060,7 +1052,9 @@ private[celeborn] class Master(
if (hasHDFSStorage || hasS3Storage) {
checkAndCleanExpiredAppDirsOnDFS(appId)
}
- context.reply(ApplicationLostResponse(StatusCode.SUCCESS))
+ if (context != null) {
+ context.reply(ApplicationLostResponse(StatusCode.SUCCESS))
+ }
}
})
}