Repository: hadoop
Updated Branches:
refs/heads/branch-2.8 72fbdf708 -> 5d86ce7b0
YARN-4051. ContainerKillEvent lost when container is still recovering and
application finishes. Contributed by sandflee
(cherry picked from commit a16ba4296e163d5cb4caed129f2f1612a69a8d84)
Conflicts:
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/5d86ce7b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/5d86ce7b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/5d86ce7b
Branch: refs/heads/branch-2.8
Commit: 5d86ce7b0d416ba18a0be3a07195de4dc28b5bc5
Parents: 72fbdf7
Author: Jason Lowe <[email protected]>
Authored: Thu Mar 16 09:31:20 2017 -0500
Committer: Jason Lowe <[email protected]>
Committed: Thu Mar 16 09:36:34 2017 -0500
----------------------------------------------------------------------
.../containermanager/ContainerManagerImpl.java | 53 +++++++++++++++++---
.../application/ApplicationImpl.java | 4 +-
.../containermanager/container/Container.java | 1 +
.../container/ContainerImpl.java | 8 +++
.../TestContainerManagerRecovery.java | 18 ++++---
.../nodemanager/webapp/MockContainer.java | 5 ++
6 files changed, 73 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/5d86ce7b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
index 57272fb..cca73ac 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
@@ -340,15 +340,15 @@ public class ContainerManagerImpl extends
CompositeService implements
LOG.info("Recovering " + containerId + " in state " + rcs.getStatus()
+ " with exit code " + rcs.getExitCode());
- if (context.getApplications().containsKey(appId)) {
+ Application app = context.getApplications().get(appId);
+ if (app != null) {
Credentials credentials =
YarnServerSecurityUtils.parseCredentials(launchContext);
Container container = new ContainerImpl(getConfig(), dispatcher,
req.getContainerLaunchContext(),
credentials, metrics, token, context, rcs);
context.getContainers().put(containerId, container);
- dispatcher.getEventHandler().handle(
- new ApplicationContainerInitEvent(container));
+ app.handle(new ApplicationContainerInitEvent(container));
} else {
if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) {
LOG.warn(containerId + " has no corresponding application!");
@@ -1168,6 +1168,10 @@ public class ContainerManagerImpl extends
CompositeService implements
+ " is not handled by this NodeManager");
}
} else {
+ if (container.isRecovering()) {
+ throw new NMNotYetReadyException("Container " + containerIDStr
+ + " is recovering, try later");
+ }
context.getNMStateStore().storeContainerKilled(containerID);
dispatcher.getEventHandler().handle(
new ContainerKillEvent(containerID,
@@ -1313,6 +1317,21 @@ public class ContainerManagerImpl extends
CompositeService implements
+ " FINISH_APPS event");
continue;
}
+
+ boolean shouldDropEvent = false;
+ for (Container container : app.getContainers().values()) {
+ if (container.isRecovering()) {
+ LOG.info("drop FINISH_APPS event to " + appID + " because "
+ + "container " + container.getContainerId()
+ + " is recovering");
+ shouldDropEvent = true;
+ break;
+ }
+ }
+ if (shouldDropEvent) {
+ continue;
+ }
+
String diagnostic = "";
if (appsFinishedEvent.getReason() ==
CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) {
diagnostic = "Application killed on shutdown";
@@ -1327,10 +1346,32 @@ public class ContainerManagerImpl extends
CompositeService implements
case FINISH_CONTAINERS:
CMgrCompletedContainersEvent containersFinishedEvent =
(CMgrCompletedContainersEvent) event;
- for (ContainerId container : containersFinishedEvent
+ for (ContainerId containerId : containersFinishedEvent
.getContainersToCleanup()) {
- this.dispatcher.getEventHandler().handle(
- new ContainerKillEvent(container,
+ ApplicationId appId =
+ containerId.getApplicationAttemptId().getApplicationId();
+ Application app = this.context.getApplications().get(appId);
+ if (app == null) {
+ LOG.warn("couldn't find app " + appId + " while processing"
+ + " FINISH_CONTAINERS event");
+ continue;
+ }
+
+ Container container = app.getContainers().get(containerId);
+ if (container == null) {
+ LOG.warn("couldn't find container " + containerId
+ + " while processing FINISH_CONTAINERS event");
+ continue;
+ }
+
+ if (container.isRecovering()) {
+ LOG.info("drop FINISH_CONTAINERS event to " + containerId
+ + " because container is recovering");
+ continue;
+ }
+
+ this.dispatcher.getEventHandler().handle(
+ new ContainerKillEvent(containerId,
ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
"Container Killed by ResourceManager"));
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/5d86ce7b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
index efa258a..d614fc6 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
@@ -20,8 +20,8 @@ package
org.apache.hadoop.yarn.server.nodemanager.containermanager.application;
import java.io.IOException;
import java.util.EnumSet;
-import java.util.HashMap;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
@@ -77,7 +77,7 @@ public class ApplicationImpl implements Application {
private LogAggregationContext logAggregationContext;
Map<ContainerId, Container> containers =
- new HashMap<ContainerId, Container>();
+ new ConcurrentHashMap<>();
public ApplicationImpl(Dispatcher dispatcher, String user, ApplicationId
appId,
Credentials credentials, Context context) {
http://git-wip-us.apache.org/repos/asf/hadoop/blob/5d86ce7b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
index 1d2ec56..8c63c95 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java
@@ -57,4 +57,5 @@ public interface Container extends
EventHandler<ContainerEvent> {
String toString();
+ boolean isRecovering();
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/5d86ce7b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
index 052fe8d..9d1d3c7 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
@@ -1188,4 +1188,12 @@ public class ContainerImpl implements Container {
LocalResourceRequest resource) {
return container.resourcesUploadPolicies.get(resource);
}
+
+ @Override
+ public boolean isRecovering() {
+ boolean isRecovering = (
+ recoveredStatus != RecoveredContainerStatus.REQUESTED &&
+ getContainerState() == ContainerState.NEW);
+ return isRecovering;
+ }
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/5d86ce7b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 6cfa914..ee863c7 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -87,6 +87,7 @@ import
org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent;
import
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType;
+import
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent;
import
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl;
import
org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
@@ -246,8 +247,8 @@ public class TestContainerManagerRecovery extends
BaseContainerManagerTest {
// simulate application completion
List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
finishedApps.add(appId);
- cm.handle(new CMgrCompletedAppsEvent(finishedApps,
- CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
+ app.handle(new ApplicationFinishEvent(
+ appId, "Application killed by ResourceManager"));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
// restart and verify app is marked for finishing
@@ -261,8 +262,8 @@ public class TestContainerManagerRecovery extends
BaseContainerManagerTest {
assertNotNull(app);
// no longer saving FINISH_APP event in NM stateStore,
// simulate by resending FINISH_APP event
- cm.handle(new CMgrCompletedAppsEvent(finishedApps,
- CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
+ app.handle(new ApplicationFinishEvent(
+ appId, "Application killed by ResourceManager"));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
assertTrue(context.getApplicationACLsManager().checkAccess(
UserGroupInformation.createRemoteUser(modUser),
@@ -333,8 +334,8 @@ public class TestContainerManagerRecovery extends
BaseContainerManagerTest {
// simulate application completion
List<ApplicationId> finishedApps = new ArrayList<ApplicationId>();
finishedApps.add(appId);
- cm.handle(new CMgrCompletedAppsEvent(finishedApps,
- CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
+ app.handle(new ApplicationFinishEvent(
+ appId, "Application killed by ResourceManager"));
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
app.handle(new ApplicationEvent(app.getAppId(),
@@ -355,8 +356,9 @@ public class TestContainerManagerRecovery extends
BaseContainerManagerTest {
// no longer saving FINISH_APP event in NM stateStore,
// simulate by resending FINISH_APP event
- cm.handle(new CMgrCompletedAppsEvent(finishedApps,
- CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
+ app.handle(new ApplicationFinishEvent(
+ appId, "Application killed by ResourceManager"));
+
waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
// TODO need to figure out why additional APPLICATION_RESOURCES_CLEANEDUP
// is needed.
http://git-wip-us.apache.org/repos/asf/hadoop/blob/5d86ce7b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
index 700d2e7..716fabb 100644
---
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/MockContainer.java
@@ -144,4 +144,9 @@ public class MockContainer implements Container {
public NMContainerStatus getNMContainerStatus() {
return null;
}
+
+ @Override
+ public boolean isRecovering() {
+ return false;
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]