[ 
https://issues.apache.org/jira/browse/YARN-11497?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17726417#comment-17726417
 ] 

ASF GitHub Bot commented on YARN-11497:
---------------------------------------

slfan1989 commented on code in PR #5681:
URL: https://github.com/apache/hadoop/pull/5681#discussion_r1206082428


##########
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java:
##########
@@ -3222,4 +3222,89 @@ public void testDecommissionWithoutIncludeFile() throws 
Exception {
 
     rm.close();
   }
+
+  /**
+   * Decommissioning with selective states for untracked nodes.
+   */
+  @Test
+  public void testDecommissionWithSelectiveStates() throws Exception {
+    // clear exclude hosts
+    writeToHostsFile(excludeHostFile, "");
+    // init conf:
+    // (1) set untracked removal timeout to 500ms
+    // (2) set exclude path (no include path)
+    // (3) enable node untracked without pre-configured include path
+    Configuration conf = new Configuration();
+    
conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC,
+            500);
+    conf.setBoolean(
+            YarnConfiguration.RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH, 
true);
+    conf.setStrings(
+            
YarnConfiguration.RM_NODEMANAGER_UNTRACKED_NODE_SELECTIVE_STATES_TO_REMOVE,
+            "DECOMMISSIONED", "SHUTDOWN");
+    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
+            excludeHostFile.getAbsolutePath());
+
+    rm = new MockRM(conf);
+    rm.start();
+    MockNM nm1 = rm.registerNode("host1:1234", 10240);
+    MockNM nm2 = rm.registerNode("host2:1234", 10240);
+    MockNM nm3 = rm.registerNode("host3:1234", 10240);
+    MockNM nm4 = rm.registerNode("host4:1234", 10240);
+    Assert.assertEquals(4, rm.getRMContext().getRMNodes().size());
+    Assert.assertEquals(0, rm.getRMContext().getInactiveRMNodes().size());
+
+    // decommission nm1 via adding nm1 into exclude hosts
+    RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
+    writeToHostsFile(excludeHostFile, "host1");
+    rm.getNodesListManager().refreshNodes(conf);
+    rm.drainEvents();
+    Assert.assertEquals(rmNode1.getState(), NodeState.DECOMMISSIONED);
+    Assert.assertEquals(3, rm.getRMContext().getRMNodes().size());
+    Assert.assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+    Assert.assertEquals(new HashSet(Arrays.asList(nm1.getNodeId())),
+            rm.getRMContext().getInactiveRMNodes().keySet());
+
+    // remove nm1 from exclude hosts, so that it will be marked as untracked
+    // and removed from inactive nodes after the timeout
+    writeToHostsFile(excludeHostFile, "");
+    rm.getNodesListManager().refreshNodes(conf);
+    // confirmed that nm1 should be removed from inactive nodes in 1 second
+    GenericTestUtils.waitFor(() -> 
rm.getRMContext().getInactiveRMNodes().size() == 0,
+            100, 1000);
+
+    // lost nm2
+    RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
+    rm.getRMContext().getDispatcher().getEventHandler()
+            .handle(new RMNodeEvent(nm2.getNodeId(), RMNodeEventType.EXPIRE));

Review Comment:
   Indentation



##########
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceTrackerService.java:
##########
@@ -3222,4 +3222,89 @@ public void testDecommissionWithoutIncludeFile() throws 
Exception {
 
     rm.close();
   }
+
+  /**
+   * Decommissioning with selective states for untracked nodes.
+   */
+  @Test
+  public void testDecommissionWithSelectiveStates() throws Exception {
+    // clear exclude hosts
+    writeToHostsFile(excludeHostFile, "");
+    // init conf:
+    // (1) set untracked removal timeout to 500ms
+    // (2) set exclude path (no include path)
+    // (3) enable node untracked without pre-configured include path
+    Configuration conf = new Configuration();
+    
conf.setInt(YarnConfiguration.RM_NODEMANAGER_UNTRACKED_REMOVAL_TIMEOUT_MSEC,
+            500);
+    conf.setBoolean(
+            YarnConfiguration.RM_ENABLE_NODE_UNTRACKED_WITHOUT_INCLUDE_PATH, 
true);
+    conf.setStrings(
+            
YarnConfiguration.RM_NODEMANAGER_UNTRACKED_NODE_SELECTIVE_STATES_TO_REMOVE,
+            "DECOMMISSIONED", "SHUTDOWN");
+    conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
+            excludeHostFile.getAbsolutePath());
+
+    rm = new MockRM(conf);
+    rm.start();
+    MockNM nm1 = rm.registerNode("host1:1234", 10240);
+    MockNM nm2 = rm.registerNode("host2:1234", 10240);
+    MockNM nm3 = rm.registerNode("host3:1234", 10240);
+    MockNM nm4 = rm.registerNode("host4:1234", 10240);
+    Assert.assertEquals(4, rm.getRMContext().getRMNodes().size());
+    Assert.assertEquals(0, rm.getRMContext().getInactiveRMNodes().size());
+
+    // decommission nm1 via adding nm1 into exclude hosts
+    RMNode rmNode1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
+    writeToHostsFile(excludeHostFile, "host1");
+    rm.getNodesListManager().refreshNodes(conf);
+    rm.drainEvents();
+    Assert.assertEquals(rmNode1.getState(), NodeState.DECOMMISSIONED);
+    Assert.assertEquals(3, rm.getRMContext().getRMNodes().size());
+    Assert.assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+    Assert.assertEquals(new HashSet(Arrays.asList(nm1.getNodeId())),
+            rm.getRMContext().getInactiveRMNodes().keySet());
+
+    // remove nm1 from exclude hosts, so that it will be marked as untracked
+    // and removed from inactive nodes after the timeout
+    writeToHostsFile(excludeHostFile, "");
+    rm.getNodesListManager().refreshNodes(conf);
+    // confirmed that nm1 should be removed from inactive nodes in 1 second
+    GenericTestUtils.waitFor(() -> 
rm.getRMContext().getInactiveRMNodes().size() == 0,
+            100, 1000);
+
+    // lost nm2
+    RMNode rmNode2 = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
+    rm.getRMContext().getDispatcher().getEventHandler()
+            .handle(new RMNodeEvent(nm2.getNodeId(), RMNodeEventType.EXPIRE));
+    rm.drainEvents();
+    Assert.assertEquals(rmNode2.getState(), NodeState.LOST);
+    Assert.assertEquals(2, rm.getRMContext().getRMNodes().size());
+    Assert.assertEquals(1, rm.getRMContext().getInactiveRMNodes().size());
+    // confirmed that nm2 should not be removed from inactive nodes in 1 second
+    GenericTestUtils.waitFor(() -> 
rm.getRMContext().getInactiveRMNodes().size() == 1,
+            100, 1000);

Review Comment:
   Indentation





> Support removal of only selective node states in untracked removal flow
> -----------------------------------------------------------------------
>
>                 Key: YARN-11497
>                 URL: https://issues.apache.org/jira/browse/YARN-11497
>             Project: Hadoop YARN
>          Issue Type: Improvement
>            Reporter: Mudit Sharma
>            Priority: Major
>              Labels: pull-request-available
>
> Currently, inactive nodes are removed from the YARN local memory irrespective 
> of which state they are in. This makes the node removal process not very 
> configurable.
> After this patch: https://issues.apache.org/jira/browse/YARN-10854
> If autoscaling is enabled, a large number of nodes go into the DECOMMISSIONED 
> state, whereas nodes in other states such as LOST and SHUTDOWN are far fewer, 
> and systems might want them to remain visible on the UI for better tracking.
> The proposal is to introduce a new config which, when set, will allow only 
> selective node states to be removed after a node goes into the untracked state.
>  
> PR: [https://github.com/apache/hadoop/pull/5681]
> Any thoughts, suggestions, or feedback are welcome!



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to