autumnust commented on a change in pull request #2940: GOBBLIN-1099: Handle 
orphaned Yarn containers in Gobblin-on-Yarn clus…
URL: https://github.com/apache/incubator-gobblin/pull/2940#discussion_r399583366
 
 

 ##########
 File path: gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnService.java
 ##########
 @@ -635,10 +644,26 @@ protected void handleContainerCompletion(ContainerStatus 
containerStatus) {
           containerStatus.getContainerId(), containerStatus.getDiagnostics()));
     }
 
-    if 
(this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != 
null) {
-      LOGGER.info("Container release requested, so not spawning a replacement 
for containerId {}",
-          containerStatus.getContainerId());
-      return;
+    if (containerStatus.getExitStatus() == ContainerExitStatus.ABORTED) {
+      if 
(this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != 
null) {
+        LOGGER.info("Container release requested, so not spawning a 
replacement for containerId {}", containerStatus.getContainerId());
+        return;
+      } else {
+        LOGGER.info("Container {} aborted due to lost NM", 
containerStatus.getContainerId());
+       // Container release was not requested. Likely, the container was 
running on a node on which the NM died.
+       // In this case, RM assumes that the containers are "lost", even though 
the container process may still be
+        // running on the node. We need to ensure that the Helix instances 
running on the orphaned containers
+        // are fenced off from the Helix cluster to avoid double publishing 
and state being committed by the
+        // instances.
+        if (!UNKNOWN_HELIX_INSTANCE.equals(completedInstanceName)) {
+          String clusterName = this.helixManager.getClusterName();
+          //Disable the orphaned instance.
+          if (HelixUtils.isInstanceLive(helixManager, completedInstanceName)) {
+            LOGGER.info("Disabling the Helix instance {}", 
completedInstanceName);
+            
this.helixManager.getClusterManagmentTool().enableInstance(clusterName, 
completedInstanceName, false);
 
 Review comment:
   enableInstance: this is a very good API :)

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to