http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java index 22aa0ee..5a89e54 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java @@ -272,6 +272,11 @@ public class MockNodes { @Override public void setUntrackedTimeStamp(long timeStamp) { } + + @Override + public Integer getDecommissioningTimeout() { + return null; + } }; private static RMNode buildRMNode(int rack, final Resource perNode,
http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java index 5856e59..f843261 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java @@ -709,6 +709,9 @@ public class MockRM extends ResourceManager { public void waitForState(NodeId nodeId, NodeState finalState) throws InterruptedException { RMNode node = getRMContext().getRMNodes().get(nodeId); + if (node == null) { + node = getRMContext().getInactiveRMNodes().get(nodeId); + } Assert.assertNotNull("node shouldn't be null", node); int timeWaiting = 0; while (!finalState.equals(node.getState())) { @@ -722,11 +725,17 @@ public class MockRM extends ResourceManager { timeWaiting += WAIT_MS_PER_LOOP; } - System.out.println("Node State is : " + node.getState()); + System.out.println("Node " + nodeId + " State is : " + node.getState()); Assert.assertEquals("Node state is not correct (timedout)", finalState, node.getState()); } + public void sendNodeEvent(MockNM nm, RMNodeEventType event) throws Exception { + RMNodeImpl node = (RMNodeImpl) + getRMContext().getRMNodes().get(nm.getNodeId()); + node.handle(new RMNodeEvent(nm.getNodeId(), event)); + } + public KillApplicationResponse killApp(ApplicationId appId) throws Exception { ApplicationClientProtocol client = getClientRMService(); KillApplicationRequest req = KillApplicationRequest.newInstance(appId); http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestDecommissioningNodesWatcher.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestDecommissioningNodesWatcher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestDecommissioningNodesWatcher.java new file mode 100644 index 0000000..690de30 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestDecommissioningNodesWatcher.java @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerState; +import org.apache.hadoop.yarn.api.records.ContainerStatus; +import org.apache.hadoop.yarn.api.records.NodeId; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus; +import org.apache.hadoop.yarn.server.api.records.NodeStatus; +import org.apache.hadoop.yarn.server.resourcemanager.DecommissioningNodesWatcher.DecommissioningNodeStatus; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType; +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +/** + * This class tests DecommissioningNodesWatcher. + */ +public class TestDecommissioningNodesWatcher { + private MockRM rm; + + @Test + public void testDecommissioningNodesWatcher() throws Exception { + Configuration conf = new Configuration(); + conf.set(YarnConfiguration.RM_NODE_GRACEFUL_DECOMMISSION_TIMEOUT, "40"); + + rm = new MockRM(conf); + rm.start(); + + DecommissioningNodesWatcher watcher = + new DecommissioningNodesWatcher(rm.getRMContext()); + + MockNM nm1 = rm.registerNode("host1:1234", 10240); + RMNode node1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId()); + NodeId id1 = nm1.getNodeId(); + + rm.waitForState(id1, NodeState.RUNNING); + Assert.assertFalse(watcher.checkReadyToBeDecommissioned(id1)); + + RMApp app = rm.submitApp(2000); + MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1); + + // Setup nm1 as DECOMMISSIONING for DecommissioningNodesWatcher. + rm.sendNodeEvent(nm1, RMNodeEventType.GRACEFUL_DECOMMISSION); + rm.waitForState(id1, NodeState.DECOMMISSIONING); + + // Update status with decreasing number of running containers until 0. + watcher.update(node1, createNodeStatus(id1, app, 12)); + watcher.update(node1, createNodeStatus(id1, app, 11)); + Assert.assertFalse(watcher.checkReadyToBeDecommissioned(id1)); + + watcher.update(node1, createNodeStatus(id1, app, 1)); + Assert.assertEquals(DecommissioningNodeStatus.WAIT_CONTAINER, + watcher.checkDecommissioningStatus(id1)); + + watcher.update(node1, createNodeStatus(id1, app, 0)); + Assert.assertEquals(DecommissioningNodeStatus.WAIT_APP, + watcher.checkDecommissioningStatus(id1)); + + // Set app to be FINISHED and verified DecommissioningNodeStatus is READY. + MockRM.finishAMAndVerifyAppState(app, rm, nm1, am); + rm.waitForState(app.getApplicationId(), RMAppState.FINISHED); + Assert.assertEquals(DecommissioningNodeStatus.READY, + watcher.checkDecommissioningStatus(id1)); + } + + @After + public void tearDown() { + if (rm != null) { + rm.stop(); + } + } + + private NodeStatus createNodeStatus( + NodeId nodeId, RMApp app, int numRunningContainers) { + return NodeStatus.newInstance( + nodeId, 0, getContainerStatuses(app, numRunningContainers), + new ArrayList<ApplicationId>(), + NodeHealthStatus.newInstance( + true, "", System.currentTimeMillis() - 1000), + null, null, null); + } + + // Get mocked ContainerStatus for bunch of containers, + // where numRunningContainers are RUNNING. + private List<ContainerStatus> getContainerStatuses( + RMApp app, int numRunningContainers) { + // Total 12 containers + final int total = 12; + numRunningContainers = Math.min(total, numRunningContainers); + List<ContainerStatus> output = new ArrayList<ContainerStatus>(); + for (int i = 0; i < total; i++) { + ContainerState cstate = (i >= numRunningContainers)? + ContainerState.COMPLETE : ContainerState.RUNNING; + output.add(ContainerStatus.newInstance( + ContainerId.newContainerId( + ApplicationAttemptId.newInstance(app.getApplicationId(), i), 1), + cstate, "Dummy", 0)); + } + return output; + } +} + http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java index 83a7c73..e82b93c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMNodeTransitions.java @@ -254,17 +254,6 @@ public class TestRMNodeTransitions { cm.getNumDecommissioningNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); - - // Verify node in DECOMMISSIONING will be changed by status update - // without running apps - statusEvent = getMockRMNodeStatusEventWithoutRunningApps(); - node.handle(statusEvent); - Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState()); - Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); - Assert.assertEquals("Decommissioning Nodes", initialDecommissioning - 1, - cm.getNumDecommissioningNMs()); - Assert.assertEquals("Decommissioned Nodes", initialDecommissioned + 1, - cm.getNumDecommisionedNMs()); } @Test http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java index 098ba54..aa5b336 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java @@ -68,6 +68,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NodeLabelsUtils; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl; @@ -87,6 +88,9 @@ public class TestResourceTrackerService extends NodeLabelTestBase { private final static File TEMP_DIR = new File(System.getProperty( "test.build.data", "/tmp"), "decommision"); private final File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt"); + private final File excludeHostFile = new File(TEMP_DIR + File.separator + + "excludeHostFile.txt"); + private MockRM rm; /** @@ -222,6 +226,109 @@ public class TestResourceTrackerService extends NodeLabelTestBase { } /** + * Graceful decommission node with no running application. + */ + @Test + public void testGracefulDecommissionNoApp() throws Exception { + Configuration conf = new Configuration(); + conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile + .getAbsolutePath()); + + writeToHostsFile(""); + rm = new MockRM(conf); + rm.start(); + + MockNM nm1 = rm.registerNode("host1:1234", 5120); + MockNM nm2 = rm.registerNode("host2:5678", 10240); + MockNM nm3 = rm.registerNode("host3:4433", 5120); + + int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs(); + NodeHeartbeatResponse nodeHeartbeat1 = nm1.nodeHeartbeat(true); + NodeHeartbeatResponse nodeHeartbeat2 = nm2.nodeHeartbeat(true); + NodeHeartbeatResponse nodeHeartbeat3 = nm3.nodeHeartbeat(true); + + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat1.getNodeAction())); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat2.getNodeAction())); + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat3.getNodeAction())); + + rm.waitForState(nm2.getNodeId(), NodeState.RUNNING); + rm.waitForState(nm3.getNodeId(), NodeState.RUNNING); + + // Graceful decommission both host2 and host3. + writeToHostsFile("host2", "host3"); + rm.getNodesListManager().refreshNodes(conf, true); + + rm.waitForState(nm2.getNodeId(), NodeState.DECOMMISSIONING); + rm.waitForState(nm3.getNodeId(), NodeState.DECOMMISSIONING); + + nodeHeartbeat1 = nm1.nodeHeartbeat(true); + nodeHeartbeat2 = nm2.nodeHeartbeat(true); + nodeHeartbeat3 = nm3.nodeHeartbeat(true); + + checkDecommissionedNMCount(rm, metricCount + 2); + rm.waitForState(nm2.getNodeId(), NodeState.DECOMMISSIONED); + rm.waitForState(nm3.getNodeId(), NodeState.DECOMMISSIONED); + + Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat1.getNodeAction())); + Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat2.getNodeAction()); + Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat3.getNodeAction()); + } + + /** + * Graceful decommission node with running application. + */ + @Test + public void testGracefulDecommissionWithApp() throws Exception { + Configuration conf = new Configuration(); + conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile + .getAbsolutePath()); + + writeToHostsFile(""); + rm = new MockRM(conf); + rm.start(); + + MockNM nm1 = rm.registerNode("host1:1234", 10240); + MockNM nm2 = rm.registerNode("host2:5678", 20480); + MockNM nm3 = rm.registerNode("host3:4433", 10240); + NodeId id1 = nm1.getNodeId(); + NodeId id3 = nm3.getNodeId(); + rm.waitForState(id1, NodeState.RUNNING); + rm.waitForState(id3, NodeState.RUNNING); + + // Create an app and launch two containers on host1. + RMApp app = rm.submitApp(2000); + MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1); + ApplicationAttemptId aaid = app.getCurrentAppAttempt().getAppAttemptId(); + + // Graceful decommission host1 and host3 + writeToHostsFile("host1", "host3"); + rm.getNodesListManager().refreshNodes(conf, true); + rm.waitForState(id1, NodeState.DECOMMISSIONING); + rm.waitForState(id3, NodeState.DECOMMISSIONING); + + // host1 should be DECOMMISSIONING due to running containers. + // host3 should become DECOMMISSIONED. + nm1.nodeHeartbeat(aaid, 2, ContainerState.RUNNING); + nm3.nodeHeartbeat(true); + rm.waitForState(id1, NodeState.DECOMMISSIONING); + rm.waitForState(id3, NodeState.DECOMMISSIONED); + nm1.nodeHeartbeat(aaid, 2, ContainerState.RUNNING); + + // Complete containers on host1. + // Since the app is still RUNNING, expect NodeAction.NORMAL. + NodeHeartbeatResponse nodeHeartbeat1 = + nm1.nodeHeartbeat(aaid, 2, ContainerState.COMPLETE); + Assert.assertEquals(NodeAction.NORMAL, nodeHeartbeat1.getNodeAction()); + + // Finish the app and verified DECOMMISSIONED. + MockRM.finishAMAndVerifyAppState(app, rm, nm1, am); + rm.waitForState(app.getApplicationId(), RMAppState.FINISHED); + nodeHeartbeat1 = nm1.nodeHeartbeat(aaid, 2, ContainerState.COMPLETE); + Assert.assertEquals(NodeAction.SHUTDOWN, nodeHeartbeat1.getNodeAction()); + rm.waitForState(id1, NodeState.DECOMMISSIONED); + } + + /** * Decommissioning using a post-configured include hosts file */ @Test @@ -1225,19 +1332,17 @@ public class TestResourceTrackerService extends NodeLabelTestBase { MockNM nm1 = new MockNM("host1:1234", 5120, resourceTrackerService); RegisterNodeManagerResponse response = nm1.registerNode(); Assert.assertEquals(NodeAction.NORMAL, response.getNodeAction()); + int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs(); writeToHostsFile("host2"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); rm.getNodesListManager().refreshNodes(conf); NodeHeartbeatResponse heartbeatResponse = nm1.nodeHeartbeat(true); Assert.assertEquals(NodeAction.SHUTDOWN, heartbeatResponse.getNodeAction()); - int shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs(); - checkShutdownNMCount(rm, shutdownNMsCount); checkDecommissionedNMCount(rm, decommisionedNMsCount); request.setNodeId(nm1.getNodeId()); resourceTrackerService.unRegisterNodeManager(request); - shutdownNMsCount = ClusterMetrics.getMetrics().getNumShutdownNMs(); - checkShutdownNMCount(rm, shutdownNMsCount); + checkShutdownNMCount(rm, ++shutdownNMsCount); checkDecommissionedNMCount(rm, decommisionedNMsCount); // 1. Register the Node Manager @@ -1273,8 +1378,6 @@ public class TestResourceTrackerService extends NodeLabelTestBase { nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(true); - File excludeHostFile = - new File(TEMP_DIR + File.separator + "excludeHostFile.txt"); writeToHostsFile(excludeHostFile, "host1"); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath()); @@ -1300,8 +1403,6 @@ public class TestResourceTrackerService extends NodeLabelTestBase { Assert.assertEquals("The inactiveRMNodes should contain an entry for the" + "decommissioned node", 1, rm1.getRMContext().getInactiveRMNodes().size()); - excludeHostFile = - new File(TEMP_DIR + File.separator + "excludeHostFile.txt"); writeToHostsFile(excludeHostFile, ""); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath()); @@ -1331,8 +1432,6 @@ public class TestResourceTrackerService extends NodeLabelTestBase { nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(true); //host3 will not register or heartbeat - File excludeHostFile = - new File(TEMP_DIR + File.separator + "excludeHostFile.txt"); writeToHostsFile(excludeHostFile, "host3", "host2"); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath()); @@ -1364,14 +1463,12 @@ public class TestResourceTrackerService extends NodeLabelTestBase { MockNM nm2 = rm.registerNode("host2:5678", 10240); nm1.nodeHeartbeat(true); nm2.nodeHeartbeat(true); - File excludeHostFile = - new File(TEMP_DIR + File.separator + "excludeHostFile.txt"); writeToHostsFile(excludeHostFile, "host3", "host2"); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, excludeHostFile.getAbsolutePath()); writeToHostsFile(hostFile, "host1", "host2"); writeToHostsFile(excludeHostFile, "host1"); - rm.getNodesListManager().refreshNodesGracefully(conf); + rm.getNodesListManager().refreshNodesGracefully(conf, null); rm.drainEvents(); nm1.nodeHeartbeat(true); rm.drainEvents(); @@ -1380,7 +1477,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { .getInactiveRMNodes().get(nm1.getNodeId()).getState() == NodeState .DECOMMISSIONED); writeToHostsFile(excludeHostFile, ""); - rm.getNodesListManager().refreshNodesGracefully(conf); + rm.getNodesListManager().refreshNodesGracefully(conf, null); rm.drainEvents(); Assert.assertTrue("Node " + nm1.getNodeId().getHost() + " should be Decommissioned", rm.getRMContext() @@ -1390,7 +1487,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { } /** - * Remove a node from all lists and check if its forgotten + * Remove a node from all lists and check if its forgotten. */ @Test public void testNodeRemovalNormally() throws Exception { @@ -1411,7 +1508,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { public void refreshNodesOption(boolean doGraceful, Configuration conf) throws Exception { if (doGraceful) { - rm.getNodesListManager().refreshNodesGracefully(conf); + rm.getNodesListManager().refreshNodesGracefully(conf, null); } else { rm.getNodesListManager().refreshNodes(conf); } @@ -1420,8 +1517,6 @@ public class TestResourceTrackerService extends NodeLabelTestBase { public void testNodeRemovalUtil(boolean doGraceful) throws Exception { Configuration conf = new Configuration(); int timeoutValue = 500; - File excludeHostFile = new File(TEMP_DIR + File.separator + - "excludeHostFile.txt"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, ""); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, ""); conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC, @@ -1455,18 +1550,23 @@ public class TestResourceTrackerService extends NodeLabelTestBase { conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile .getAbsolutePath()); refreshNodesOption(doGraceful, conf); + if (doGraceful) { + rm.waitForState(nm2.getNodeId(), NodeState.DECOMMISSIONING); + } nm1.nodeHeartbeat(true); + nm2.nodeHeartbeat(true); rm.drainEvents(); Assert.assertTrue("Node should not be in active node list", !rmContext.getRMNodes().containsKey(nm2.getNodeId())); RMNode rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId()); Assert.assertEquals("Node should be in inactive node list", - rmNode.getState(), NodeState.SHUTDOWN); + rmNode.getState(), + doGraceful? NodeState.DECOMMISSIONED : NodeState.SHUTDOWN); Assert.assertEquals("Active nodes should be 2", metrics.getNumActiveNMs(), 2); - Assert.assertEquals("Shutdown nodes should be 1", - metrics.getNumShutdownNMs(), 1); + Assert.assertEquals("Shutdown nodes should be expected", + metrics.getNumShutdownNMs(), doGraceful? 0 : 1); int nodeRemovalTimeout = conf.getInt( @@ -1491,14 +1591,18 @@ public class TestResourceTrackerService extends NodeLabelTestBase { rm.drainEvents(); writeToHostsFile("host1", ip); refreshNodesOption(doGraceful, conf); + rm.waitForState(nm2.getNodeId(), + doGraceful? NodeState.DECOMMISSIONING : NodeState.SHUTDOWN); + nm2.nodeHeartbeat(true); rm.drainEvents(); rmNode = rmContext.getInactiveRMNodes().get(nm2.getNodeId()); Assert.assertEquals("Node should be shutdown", - rmNode.getState(), NodeState.SHUTDOWN); + rmNode.getState(), + doGraceful? NodeState.DECOMMISSIONED : NodeState.SHUTDOWN); Assert.assertEquals("Active nodes should be 2", metrics.getNumActiveNMs(), 2); - Assert.assertEquals("Shutdown nodes should be 1", - metrics.getNumShutdownNMs(), 1); + Assert.assertEquals("Shutdown nodes should be expected", + metrics.getNumShutdownNMs(), doGraceful? 0 : 1); //add back the node before timer expires latch.await(maxThreadSleeptime - 2000, TimeUnit.MILLISECONDS); @@ -1542,6 +1646,20 @@ public class TestResourceTrackerService extends NodeLabelTestBase { } //Test decommed/ing node that transitions to untracked,timer should remove + testNodeRemovalUtilDecomToUntracked(rmContext, conf, nm1, nm2, nm3, + maxThreadSleeptime, doGraceful); + rm.stop(); + } + + // A helper method used by testNodeRemovalUtil to avoid exceeding + // max allowed length. + private void testNodeRemovalUtilDecomToUntracked( + RMContext rmContext, Configuration conf, + MockNM nm1, MockNM nm2, MockNM nm3, + long maxThreadSleeptime, boolean doGraceful) throws Exception { + ClusterMetrics metrics = ClusterMetrics.getMetrics(); + String ip = NetUtils.normalizeHostName("localhost"); + CountDownLatch latch = new CountDownLatch(1); writeToHostsFile("host1", ip, "host2"); writeToHostsFile(excludeHostFile, "host2"); refreshNodesOption(doGraceful, conf); @@ -1549,7 +1667,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { //nm2.nodeHeartbeat(true); nm3.nodeHeartbeat(true); latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS); - rmNode = doGraceful ? rmContext.getRMNodes().get(nm2.getNodeId()) : + RMNode rmNode = doGraceful ? rmContext.getRMNodes().get(nm2.getNodeId()) : rmContext.getInactiveRMNodes().get(nm2.getNodeId()); Assert.assertNotEquals("Timer for this node was not canceled!", rmNode, null); @@ -1560,6 +1678,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { writeToHostsFile("host1", ip); writeToHostsFile(excludeHostFile, ""); refreshNodesOption(doGraceful, conf); + nm2.nodeHeartbeat(true); latch.await(maxThreadSleeptime, TimeUnit.MILLISECONDS); rmNode = doGraceful ? rmContext.getRMNodes().get(nm2.getNodeId()) : rmContext.getInactiveRMNodes().get(nm2.getNodeId()); @@ -1571,16 +1690,12 @@ public class TestResourceTrackerService extends NodeLabelTestBase { metrics.getNumShutdownNMs(), 0); Assert.assertEquals("Active nodes should be 2", metrics.getNumActiveNMs(), 2); - - rm.stop(); } private void testNodeRemovalUtilLost(boolean doGraceful) throws Exception { Configuration conf = new Configuration(); conf.setLong(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, 2000); int timeoutValue = 500; - File excludeHostFile = new File(TEMP_DIR + File.separator + - "excludeHostFile.txt"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, @@ -1613,7 +1728,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { Assert.assertEquals("All 3 nodes should be active", metrics.getNumActiveNMs(), 3); int waitCount = 0; - while(waitCount ++<20){ + while(waitCount++ < 20){ synchronized (this) { wait(200); } @@ -1665,8 +1780,6 @@ public class TestResourceTrackerService extends NodeLabelTestBase { throws Exception { Configuration conf = new Configuration(); int timeoutValue = 500; - File excludeHostFile = new File(TEMP_DIR + File.separator + - "excludeHostFile.txt"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, @@ -1737,8 +1850,6 @@ public class TestResourceTrackerService extends NodeLabelTestBase { throws Exception { Configuration conf = new Configuration(); int timeoutValue = 500; - File excludeHostFile = new File(TEMP_DIR + File.separator + - "excludeHostFile.txt"); conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile.getAbsolutePath()); conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, @@ -1782,15 +1893,19 @@ public class TestResourceTrackerService extends NodeLabelTestBase { nm2.nodeHeartbeat(false); nm3.nodeHeartbeat(true); rm.drainEvents(); - Assert.assertNotEquals("host2 should be a shutdown NM!", - rmContext.getInactiveRMNodes().get(nm2.getNodeId()), null); - Assert.assertEquals("host2 should be a shutdown NM!", - rmContext.getInactiveRMNodes().get(nm2.getNodeId()).getState(), - NodeState.SHUTDOWN); + if (!doGraceful) { + Assert.assertNotEquals("host2 should be a shutdown NM!", + rmContext.getInactiveRMNodes().get(nm2.getNodeId()), null); + Assert.assertEquals("host2 should be a shutdown NM!", + rmContext.getInactiveRMNodes().get(nm2.getNodeId()).getState(), + NodeState.SHUTDOWN); + } Assert.assertEquals("There should be 2 Active NM!", clusterMetrics.getNumActiveNMs(), 2); - Assert.assertEquals("There should be 1 Shutdown NM!", - clusterMetrics.getNumShutdownNMs(), 1); + if (!doGraceful) { + Assert.assertEquals("There should be 1 Shutdown NM!", + clusterMetrics.getNumShutdownNMs(), 1); + } Assert.assertEquals("There should be 0 Unhealthy NM!", clusterMetrics.getUnhealthyNMs(), 0); int nodeRemovalTimeout = @@ -1818,7 +1933,7 @@ public class TestResourceTrackerService extends NodeLabelTestBase { } private void writeToHostsFile(String... hosts) throws IOException { - writeToHostsFile(hostFile, hosts); + writeToHostsFile(hostFile, hosts); } private void writeToHostsFile(File file, String... hosts) http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java index 2c926d9..e7c7e51 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java @@ -210,8 +210,6 @@ public class TestNMReconnect extends ParameterizedSchedulerTestBase { nm1.registerNode(); rm.waitForState(nm1.getNodeId(), NodeState.RUNNING); - rm.getRMContext().getNodesListManager().getHostsReader(). - getExcludedHosts().add("127.0.0.1"); rm.getRMContext().getDispatcher().getEventHandler().handle( new RMNodeEvent(nm1.getNodeId(), RMNodeEventType.GRACEFUL_DECOMMISSION)); http://git-wip-us.apache.org/repos/asf/hadoop/blob/0da69c32/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnCommands.md ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnCommands.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnCommands.md index 40704f0..2c38967 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnCommands.md +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnCommands.md @@ -243,7 +243,7 @@ Usage: | COMMAND\_OPTIONS | Description | |:---- |:---- | | -refreshQueues | Reload the queues' acls, states and scheduler specific properties. ResourceManager will reload the mapred-queues configuration file. | -| -refreshNodes | Refresh the hosts information at the ResourceManager. | +| -refreshNodes [-g|graceful [timeout in seconds] -client|server] | Refresh the hosts information at the ResourceManager. -g option indicates graceful decommission of excluded hosts, in which case, the optional timeout indicates maximal time in seconds ResourceManager should wait before forcefully mark the node as decommissioned. | | -refreshNodesResources | Refresh resources of NodeManagers at the ResourceManager. | | -refreshSuperUserGroupsConfiguration | Refresh superuser proxy groups mappings. | | -refreshUserToGroupsMappings | Refresh user-to-groups mappings. | --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org