YARN-8153. Guaranteed containers always stay in SCHEDULED on NM after restart.
Contributed by Yang Wang.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/226bedc0
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/226bedc0
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/226bedc0
Branch: refs/heads/HDFS-7240
Commit: 226bedc0239ba23f3ca0c40dac6aab3777d3ada6
Parents: 375654c
Author: Weiwei Yang <w...@apache.org>
Authored: Fri Apr 13 13:17:37 2018 +0800
Committer: Weiwei Yang <w...@apache.org>
Committed: Fri Apr 13 13:17:37 2018 +0800
----------------------------------------------------------------------
.../scheduler/ContainerScheduler.java | 7 ++-
.../TestContainerManagerRecovery.java | 51 ++++++++++++++++++++
2 files changed, 56 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/226bedc0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
index d9b713f..57368ab 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
@@ -501,8 +501,11 @@ public class ContainerScheduler extends AbstractService implements
   private void startContainer(Container container) {
     LOG.info("Starting container [" + container.getContainerId()+ "]");
-    runningContainers.put(container.getContainerId(), container);
-    this.utilizationTracker.addContainerResources(container);
+    // Skip adding to runningContainers and utilization when recovering
+    if (!runningContainers.containsKey(container.getContainerId())) {
+      runningContainers.put(container.getContainerId(), container);
+      this.utilizationTracker.addContainerResources(container);
+    }
     if (container.getContainerTokenIdentifier().getExecutionType() ==
         ExecutionType.OPPORTUNISTIC) {
       this.metrics.startOpportunisticContainer(container.getResource());
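
For context on the one-line guard above: when NM recovery is enabled, the recovery path registers a live container with the scheduler before its launch is resumed, so startContainer can be invoked for a container that is already tracked. Without the guard, such a container's resources were counted twice, the utilization tracker over-reported node load, and the node could appear too full to start anything else, which matches the symptom in the commit title. A minimal stand-alone sketch of the idempotent-registration pattern (class, field, and method names below are illustrative only, not the NM's actual internals):

import java.util.HashMap;
import java.util.Map;

public class SchedulerSketch {
  private final Map<String, Integer> runningContainers = new HashMap<>();
  private int trackedMemMB = 0;  // stand-in for the utilization tracker

  /** Called on normal start and again when a recovered launch resumes. */
  public void startContainer(String containerId, int memMB) {
    // Same guard as the patch: a container already registered during
    // recovery must not have its resources added a second time.
    if (!runningContainers.containsKey(containerId)) {
      runningContainers.put(containerId, memMB);
      trackedMemMB += memMB;
    }
  }

  public static void main(String[] args) {
    SchedulerSketch s = new SchedulerSketch();
    s.startContainer("container_1", 1024);  // recovery registers it
    s.startContainer("container_1", 1024);  // resumed launch calls again
    System.out.println(s.trackedMemMB);     // 1024, not 2048
  }
}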
http://git-wip-us.apache.org/repos/asf/hadoop/blob/226bedc0/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index cad835c..bf8b500 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -67,6 +67,7 @@ import org.apache.hadoop.yarn.api.records.LocalResourceType;
 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
 import org.apache.hadoop.yarn.api.records.LogAggregationContext;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceUtilization;
 import org.apache.hadoop.yarn.api.records.Token;
 import org.apache.hadoop.yarn.api.records.URL;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -91,6 +92,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Ap
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEvent;
@@ -439,6 +441,54 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
     assertNotNull(app);
     containerStatus = getContainerStatus(context, cm, cid);
     assertEquals(targetResource, containerStatus.getCapability());
+    cm.stop();
+  }
+
+  @Test
+  public void testContainerSchedulerRecovery() throws Exception {
+    conf.setBoolean(YarnConfiguration.NM_RECOVERY_ENABLED, true);
+    conf.setBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, true);
+    NMStateStoreService stateStore = new NMMemoryStateStoreService();
+    stateStore.init(conf);
+    stateStore.start();
+    context = createContext(conf, stateStore);
+    ContainerManagerImpl cm = createContainerManager(context, delSrvc);
+    ((NMContext) context).setContainerManager(cm);
+    cm.init(conf);
+    cm.start();
+    // add an application by starting a container
+    ApplicationId appId = ApplicationId.newInstance(0, 1);
+    ApplicationAttemptId attemptId =
+        ApplicationAttemptId.newInstance(appId, 1);
+    ContainerId cid = ContainerId.newContainerId(attemptId, 1);
+
+    commonLaunchContainer(appId, cid, cm);
+
+    Application app = context.getApplications().get(appId);
+    assertNotNull(app);
+
+    ResourceUtilization utilization =
+        ResourceUtilization.newInstance(1024, 2048, 0.25F);
+    assertEquals(1, cm.getContainerScheduler().getNumRunningContainers());
+    assertEquals(utilization,
+        cm.getContainerScheduler().getCurrentUtilization());
+
+    // restart and verify that the container scheduler recovered correctly
+    cm.stop();
+    context = createContext(conf, stateStore);
+    cm = createContainerManager(context, delSrvc);
+    ((NMContext) context).setContainerManager(cm);
+    cm.init(conf);
+    cm.start();
+    assertEquals(1, context.getApplications().size());
+    app = context.getApplications().get(appId);
+    assertNotNull(app);
+    waitForNMContainerState(cm, cid, ContainerState.RUNNING);
+
+    assertEquals(1, cm.getContainerScheduler().getNumRunningContainers());
+    assertEquals(utilization,
+        cm.getContainerScheduler().getCurrentUtilization());
+    cm.stop();
   }

   @Test
@@ -494,6 +544,7 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
         resourceMappings.getAssignedResources("numa").equals(numaResources));
     Assert.assertTrue(
         resourceMappings.getAssignedResources("fpga").equals(fpgaResources));
+    cm.stop();
   }

   @Test
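
A side note on the new test: the before/after comparison works because ResourceUtilization compares by value, so the utilization snapshot taken before the restart can be asserted directly against what the recovered scheduler reports. A small sketch of that comparison, reusing the test's values (the class name below is illustrative, not part of the patch):

import org.apache.hadoop.yarn.api.records.ResourceUtilization;

public class UtilizationEqualityCheck {
  public static void main(String[] args) {
    // newInstance(pmem in MB, vmem in MB, cpu fraction) -- the same values
    // the test expects both before and after the ContainerManager restart.
    ResourceUtilization before = ResourceUtilization.newInstance(1024, 2048, 0.25F);
    ResourceUtilization after = ResourceUtilization.newInstance(1024, 2048, 0.25F);
    System.out.println(before.equals(after)); // true: equality is value-based
  }
}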