Repository: hadoop
Updated Branches:
  refs/heads/branch-2 3eaa79c9c -> a16ba4296
YARN-4051. ContainerKillEvent lost when container is still recovering and application finishes. Contributed by sandflee Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/a16ba429 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/a16ba429 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/a16ba429 Branch: refs/heads/branch-2 Commit: a16ba4296e163d5cb4caed129f2f1612a69a8d84 Parents: 3eaa79c Author: Jason Lowe <[email protected]> Authored: Thu Mar 16 09:31:20 2017 -0500 Committer: Jason Lowe <[email protected]> Committed: Thu Mar 16 09:31:20 2017 -0500 ---------------------------------------------------------------------- .../containermanager/ContainerManagerImpl.java | 53 +++++++++++++++++--- .../application/ApplicationImpl.java | 4 +- .../containermanager/container/Container.java | 2 + .../container/ContainerImpl.java | 8 +++ .../TestContainerManagerRecovery.java | 18 ++++--- .../nodemanager/webapp/MockContainer.java | 5 ++ 6 files changed, 74 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/a16ba429/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java index 0dc0f9c..7a121c0 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java @@ -386,15 +386,15 @@ public class ContainerManagerImpl extends CompositeService implements LOG.info("Recovering " + containerId + " in state " + rcs.getStatus() + " with exit code " + rcs.getExitCode()); - if (context.getApplications().containsKey(appId)) { + Application app = context.getApplications().get(appId); + if (app != null) { Credentials credentials = YarnServerSecurityUtils.parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), credentials, metrics, token, context, rcs); context.getContainers().put(containerId, container); - dispatcher.getEventHandler().handle( - new ApplicationContainerInitEvent(container)); + app.handle(new ApplicationContainerInitEvent(container)); if (rcs.getRecoveryType() == RecoveredContainerType.KILL) { dispatcher.getEventHandler().handle( new ContainerKillEvent(containerId, ContainerExitStatus.ABORTED, @@ -1234,6 +1234,10 @@ public class ContainerManagerImpl extends CompositeService implements + " is not handled by this NodeManager"); } } else { + if (container.isRecovering()) { + throw new NMNotYetReadyException("Container " + containerIDStr + + " is recovering, try later"); + } context.getNMStateStore().storeContainerKilled(containerID); container.sendKillEvent(ContainerExitStatus.KILLED_BY_APPMASTER, "Container killed by the ApplicationMaster."); @@ -1377,6 +1381,21 @@ public class ContainerManagerImpl extends CompositeService implements + " FINISH_APPS event"); continue; } + + boolean shouldDropEvent = false; + for (Container container : app.getContainers().values()) { + if 
(container.isRecovering()) { + LOG.info("drop FINISH_APPS event to " + appID + " because " + + "container " + container.getContainerId() + + " is recovering"); + shouldDropEvent = true; + break; + } + } + if (shouldDropEvent) { + continue; + } + String diagnostic = ""; if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) { diagnostic = "Application killed on shutdown"; @@ -1391,10 +1410,32 @@ public class ContainerManagerImpl extends CompositeService implements case FINISH_CONTAINERS: CMgrCompletedContainersEvent containersFinishedEvent = (CMgrCompletedContainersEvent) event; - for (ContainerId container : containersFinishedEvent + for (ContainerId containerId : containersFinishedEvent .getContainersToCleanup()) { - this.dispatcher.getEventHandler().handle( - new ContainerKillEvent(container, + ApplicationId appId = + containerId.getApplicationAttemptId().getApplicationId(); + Application app = this.context.getApplications().get(appId); + if (app == null) { + LOG.warn("couldn't find app " + appId + " while processing" + + " FINISH_CONTAINERS event"); + continue; + } + + Container container = app.getContainers().get(containerId); + if (container == null) { + LOG.warn("couldn't find container " + containerId + + " while processing FINISH_CONTAINERS event"); + continue; + } + + if (container.isRecovering()) { + LOG.info("drop FINISH_CONTAINERS event to " + containerId + + " because container is recovering"); + continue; + } + + this.dispatcher.getEventHandler().handle( + new ContainerKillEvent(containerId, ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, "Container Killed by ResourceManager")); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/a16ba429/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java ---------------------------------------------------------------------- diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java index d3548f8..9a6262a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java @@ -20,8 +20,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.application; import java.io.IOException; import java.util.EnumSet; -import java.util.HashMap; import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; @@ -84,7 +84,7 @@ public class ApplicationImpl implements Application { private LogAggregationContext logAggregationContext; Map<ContainerId, Container> containers = - new HashMap<ContainerId, Container>(); + new ConcurrentHashMap<>(); /** * The timestamp when the log aggregation has started for this application. 
http://git-wip-us.apache.org/repos/asf/hadoop/blob/a16ba429/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java index 77ac357..67c1cc9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java @@ -89,4 +89,6 @@ public interface Container extends EventHandler<ContainerEvent> { void sendLaunchEvent(); void sendKillEvent(int exitStatus, String description); + + boolean isRecovering(); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/a16ba429/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 
a2e8f07..4d93aa7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -1746,4 +1746,12 @@ public class ContainerImpl implements Container { public void commitUpgrade() { this.reInitContext = null; } + + @Override + public boolean isRecovering() { + boolean isRecovering = ( + recoveredStatus != RecoveredContainerStatus.REQUESTED && + getContainerState() == ContainerState.NEW); + return isRecovering; + } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/a16ba429/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java index eb30c5d..6fae625 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java @@ -87,6 +87,7 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; @@ -248,8 +249,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { // simulate application completion List<ApplicationId> finishedApps = new ArrayList<ApplicationId>(); finishedApps.add(appId); - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); // restart and verify app is marked for finishing @@ -263,8 +264,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { assertNotNull(app); // no longer saving FINISH_APP event in NM stateStore, // simulate by resending FINISH_APP event - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); assertTrue(context.getApplicationACLsManager().checkAccess( UserGroupInformation.createRemoteUser(modUser), @@ -335,8 +336,8 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { // simulate application completion List<ApplicationId> finishedApps = new 
ArrayList<ApplicationId>(); finishedApps.add(appId); - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); app.handle(new ApplicationEvent(app.getAppId(), @@ -357,8 +358,9 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest { // no longer saving FINISH_APP event in NM stateStore, // simulate by resending FINISH_APP event - cm.handle(new CMgrCompletedAppsEvent(finishedApps, - CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER)); + app.handle(new ApplicationFinishEvent( + appId, "Application killed by ResourceManager")); + waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP); // TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP // is needed. http://git-wip-us.apache.org/repos/asf/hadoop/blob/a16ba429/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java index 92966ab..247f2b0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java @@ -224,4 +224,9 @@ public class MockContainer implements 
Container { public void sendKillEvent(int exitStatus, String description) { } + + @Override + public boolean isRecovering() { + return false; + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
