[
https://issues.apache.org/jira/browse/YARN-11497?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17726417#comment-17726417
]
ASF GitHub Bot commented on YARN-11497:
---------------------------------------
slfan1989 commented on code in PR #5681:
URL: https://github.com/apache/hadoop/pull/5681#discussion_r1206082428
##########
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java:
##########
@@ -3222,4 +3222,89 @@ public void testDecommissionWithoutIncludeFile() throws
Exception {
rm.close();
}
+
+ /**
+ * Decommissioning with selective states for untracked nodes.
+ */
+ @Test
+ public void testDecommissionWithSelectiveStates() throws Exception {
+ // clear exclude hosts
+ writeToHostsFile(excludeHostFile, "");
+ // init conf:
+ // (1) set untracked removal timeout to 500ms
+ // (2) set exclude path (no include path)
+ // (3) enable node untracked without pre-configured include path
+ Configuration conf = new Configuration();
+
conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC,
+ 500);
+ conf.setBoolean(
+ YarnConfiguration.RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH,
true);
+ conf.setStrings(
+
YarnConfiguration.RM_NODEMANAGER_UNTRACKED_NODE_SELECTIVE_STATES_TO_REMOVE,
+ "DECOMMISSIONED", "SHUTDOWN");
+ conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
+ excludeHostFile.getAbsolutePath());
+
+ rm = new MockRM(conf);
+ rm.start();
+ MockNM nm1 = rm.registerNode("host1:1234", 10240);
+ MockNM nm2 = rm.registerNode("host2:1234", 10240);
+ MockNM nm3 = rm.registerNode("host3:1234", 10240);
+ MockNM nm4 = rm.registerNode("host4:1234", 10240);
+ Assert.assertEquals(4, rm.getRMContext().getRMNodes().size());
+ Assert.assertEquals(0, rm.getRMContext().getInactiveRMNodes().size());
+
+ // decommission nm1 via adding nm1 into exclude hosts
+ RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
+ writeToHostsFile(excludeHostFile, "host1");
+ rm.getNodesListManager().refreshNodes(conf);
+ rm.drainEvents();
+ Assert.assertEquals(rmNode1.getState(), NodeState.DECOMMISSIONED);
+ Assert.assertEquals(3, rm.getRMContext().getRMNodes().size());
+ Assert.assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+ Assert.assertEquals(new HashSet(Arrays.asList(nm1.getNodeId())),
+ rm.getRMContext().getInactiveRMNodes().keySet());
+
+ // remove nm1 from exclude hosts, so that it will be marked as untracked
+ // and removed from inactive nodes after the timeout
+ writeToHostsFile(excludeHostFile, "");
+ rm.getNodesListManager().refreshNodes(conf);
+ // confirmed that nm1 should be removed from inactive nodes in 1 second
+ GenericTestUtils.waitFor(() ->
rm.getRMContext().getInactiveRMNodes().size() == 0,
+ 100, 1000);
+
+ // lost nm2
+ RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
+ rm.getRMContext().getDispatcher().getEventHandler()
+ .handle(new RMNodeEvent(nm2.getNodeId(), RMNodeEventType.EXPIRE));
Review Comment:
Indentation
##########
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java:
##########
@@ -3222,4 +3222,89 @@ public void testDecommissionWithoutIncludeFile() throws
Exception {
rm.close();
}
+
+ /**
+ * Decommissioning with selective states for untracked nodes.
+ */
+ @Test
+ public void testDecommissionWithSelectiveStates() throws Exception {
+ // clear exclude hosts
+ writeToHostsFile(excludeHostFile, "");
+ // init conf:
+ // (1) set untracked removal timeout to 500ms
+ // (2) set exclude path (no include path)
+ // (3) enable node untracked without pre-configured include path
+ Configuration conf = new Configuration();
+
conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC,
+ 500);
+ conf.setBoolean(
+ YarnConfiguration.RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH,
true);
+ conf.setStrings(
+
YarnConfiguration.RM_NODEMANAGER_UNTRACKED_NODE_SELECTIVE_STATES_TO_REMOVE,
+ "DECOMMISSIONED", "SHUTDOWN");
+ conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
+ excludeHostFile.getAbsolutePath());
+
+ rm = new MockRM(conf);
+ rm.start();
+ MockNM nm1 = rm.registerNode("host1:1234", 10240);
+ MockNM nm2 = rm.registerNode("host2:1234", 10240);
+ MockNM nm3 = rm.registerNode("host3:1234", 10240);
+ MockNM nm4 = rm.registerNode("host4:1234", 10240);
+ Assert.assertEquals(4, rm.getRMContext().getRMNodes().size());
+ Assert.assertEquals(0, rm.getRMContext().getInactiveRMNodes().size());
+
+ // decommission nm1 via adding nm1 into exclude hosts
+ RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
+ writeToHostsFile(excludeHostFile, "host1");
+ rm.getNodesListManager().refreshNodes(conf);
+ rm.drainEvents();
+ Assert.assertEquals(rmNode1.getState(), NodeState.DECOMMISSIONED);
+ Assert.assertEquals(3, rm.getRMContext().getRMNodes().size());
+ Assert.assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+ Assert.assertEquals(new HashSet(Arrays.asList(nm1.getNodeId())),
+ rm.getRMContext().getInactiveRMNodes().keySet());
+
+ // remove nm1 from exclude hosts, so that it will be marked as untracked
+ // and removed from inactive nodes after the timeout
+ writeToHostsFile(excludeHostFile, "");
+ rm.getNodesListManager().refreshNodes(conf);
+ // confirmed that nm1 should be removed from inactive nodes in 1 second
+ GenericTestUtils.waitFor(() ->
rm.getRMContext().getInactiveRMNodes().size() == 0,
+ 100, 1000);
+
+ // lost nm2
+ RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
+ rm.getRMContext().getDispatcher().getEventHandler()
+ .handle(new RMNodeEvent(nm2.getNodeId(), RMNodeEventType.EXPIRE));
+ rm.drainEvents();
+ Assert.assertEquals(rmNode2.getState(), NodeState.LOST);
+ Assert.assertEquals(2, rm.getRMContext().getRMNodes().size());
+ Assert.assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+ // confirmed that nm2 should not be removed from inactive nodes in 1 second
+ GenericTestUtils.waitFor(() ->
rm.getRMContext().getInactiveRMNodes().size() == 1,
+ 100, 1000);
Review Comment:
Indentation
> Support removal of only selective node states in untracked removal flow
> -----------------------------------------------------------------------
>
> Key: YARN-11497
> URL: https://issues.apache.org/jira/browse/YARN-11497
> Project: Hadoop YARN
> Issue Type: Improvement
> Reporter: Mudit Sharma
> Priority: Major
> Labels: pull-request-available
>
> Currently, inactive nodes are removed from the YARN local memory irrespective
> of which state they are in. This makes the node removal process not very
> configurable.
> After this patch: https://issues.apache.org/jira/browse/YARN-10854
> If autoscaling is enabled, many nodes go into the DECOMMISSIONED state, but
> nodes in other states such as LOST and SHUTDOWN are far fewer, and systems
> might want them to remain visible on the UI for better tracking.
> The proposal is to introduce a new config, which when set, will allow only
> selective node states to be removed after going into untracked state.
>
> PR: [https://github.com/apache/hadoop/pull/5681]
> Any thoughts, suggestions, or feedback are welcome!
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]