[
https://issues.apache.org/jira/browse/YARN-11420?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17695085#comment-17695085
]
ASF GitHub Bot commented on YARN-11420:
---------------------------------------
K0K0V0K commented on code in PR #5317:
URL: https://github.com/apache/hadoop/pull/5317#discussion_r1121770810
##########
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java:
##########
@@ -125,576 +117,362 @@ public void postTransition(
org.apache.hadoop.yarn.server.nodemanager.containermanager.container
.ContainerState afterState,
ContainerEvent processedEvent) {
- synchronized (TRANSITION_COUNTER) {
- if (beforeState != afterState) {
- ContainerId id = op.getContainerId();
- TRANSITION_COUNTER
- .putIfAbsent(id, new HashMap<>());
- long sum = TRANSITION_COUNTER.get(id)
- .compute(afterState,
- (state, count) -> count == null ? 1 : count + 1);
- LOG.info("***** " + id +
- " Transition from " + beforeState +
- " to " + afterState +
- "sum:" + sum);
- }
+ if (beforeState != afterState &&
+ afterState ==
org.apache.hadoop.yarn.server.nodemanager.containermanager.container
+ .ContainerState.RUNNING) {
+ RUNNING_TRANSITIONS.compute(op.getContainerId(),
+ (containerId, counter) -> counter == null ? 1 : ++counter);
}
}
-
- /**
- * Get the current number of state transitions.
- * This is useful to check, if an event has occurred in unit tests.
- * @param id Container id to check
- * @param state Return the overall number of transitions to this state
- * @return Number of transitions to the state specified
- */
- static long getTransitionCounter(ContainerId id,
- org.apache.hadoop.yarn.server.nodemanager
- .containermanager.container
- .ContainerState state) {
- Long ret = TRANSITION_COUNTER.getOrDefault(id, new HashMap<>())
- .get(state);
- return ret != null ? ret : 0;
- }
}
- @Before
- public void setup() throws YarnException, IOException {
- // start minicluster
+ public void setup() throws YarnException, IOException, InterruptedException,
TimeoutException {
conf = new YarnConfiguration();
- // Turn on state tracking
conf.set(YarnConfiguration.NM_CONTAINER_STATE_TRANSITION_LISTENERS,
DebugSumContainerStateListener.class.getName());
- yarnCluster =
- new MiniYARNCluster(TestAMRMClient.class.getName(), nodeCount, 1, 1);
+ startYarnCluster();
+ startYarnClient();
+ UserGroupInformation.setLoginUser(UserGroupInformation
+ .createRemoteUser(UserGroupInformation.getCurrentUser().getUserName()));
+ UserGroupInformation.getCurrentUser().addToken(appAttempt.getAMRMToken());
+ nmTokenCache = new NMTokenCache();
+ startRMClient();
+ startNMClient();
+ }
+
+
+ private void startYarnCluster() {
+ yarnCluster = new MiniYARNCluster(TestNMClient.class.getName(), 3, 1, 1);
yarnCluster.init(conf);
yarnCluster.start();
- assertNotNull(yarnCluster);
assertEquals(STATE.STARTED, yarnCluster.getServiceState());
+ }
- // start rm client
+ private void startYarnClient()
+ throws IOException, YarnException, InterruptedException,
TimeoutException {
yarnClient = (YarnClientImpl) YarnClient.createYarnClient();
yarnClient.init(conf);
yarnClient.start();
- assertNotNull(yarnClient);
assertEquals(STATE.STARTED, yarnClient.getServiceState());
-
- // get node info
nodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
-
- // submit new app
- ApplicationSubmissionContext appContext =
+ ApplicationSubmissionContext appContext =
yarnClient.createApplication().getApplicationSubmissionContext();
ApplicationId appId = appContext.getApplicationId();
- // set the application name
appContext.setApplicationName("Test");
- // Set the priority for the application master
Priority pri = Priority.newInstance(0);
appContext.setPriority(pri);
- // Set the queue to which this application is to be submitted in the RM
appContext.setQueue("default");
- // Set up the container launch context for the application master
- ContainerLaunchContext amContainer = Records
- .newRecord(ContainerLaunchContext.class);
+ ContainerLaunchContext amContainer =
Records.newRecord(ContainerLaunchContext.class);
appContext.setAMContainerSpec(amContainer);
- // unmanaged AM
appContext.setUnmanagedAM(true);
- // Create the request to send to the applications manager
- SubmitApplicationRequest appRequest = Records
- .newRecord(SubmitApplicationRequest.class);
+ SubmitApplicationRequest appRequest =
Records.newRecord(SubmitApplicationRequest.class);
appRequest.setApplicationSubmissionContext(appContext);
- // Submit the application to the applications manager
yarnClient.submitApplication(appContext);
+ GenericTestUtils.waitFor(() ->
yarnCluster.getResourceManager().getRMContext().getRMApps()
+ .get(appId).getCurrentAppAttempt().getAppAttemptState() ==
RMAppAttemptState.LAUNCHED,
+ 100, 30_000, "Failed to start app");
+ appAttempt = yarnCluster.getResourceManager().getRMContext().getRMApps()
+ .get(appId).getCurrentAppAttempt();
+ }
- // wait for app to start
- int iterationsLeft = 30;
- RMAppAttempt appAttempt = null;
- while (iterationsLeft > 0) {
- ApplicationReport appReport = yarnClient.getApplicationReport(appId);
- if (appReport.getYarnApplicationState() ==
- YarnApplicationState.ACCEPTED) {
- attemptId = appReport.getCurrentApplicationAttemptId();
- appAttempt =
- yarnCluster.getResourceManager().getRMContext().getRMApps()
- .get(attemptId.getApplicationId()).getCurrentAppAttempt();
- while (true) {
- if (appAttempt.getAppAttemptState() == RMAppAttemptState.LAUNCHED) {
- break;
- }
- }
- break;
- }
- sleep(1000);
- --iterationsLeft;
- }
- if (iterationsLeft == 0) {
- fail("Application hasn't bee started");
- }
-
- // Just dig into the ResourceManager and get the AMRMToken just for the
sake
- // of testing.
- UserGroupInformation.setLoginUser(UserGroupInformation
- .createRemoteUser(UserGroupInformation.getCurrentUser().getUserName()));
- UserGroupInformation.getCurrentUser().addToken(appAttempt.getAMRMToken());
-
- //creating an instance NMTokenCase
- nmTokenCache = new NMTokenCache();
-
- // start am rm client
- rmClient =
- (AMRMClientImpl<ContainerRequest>) AMRMClient
- .<ContainerRequest> createAMRMClient();
-
- //setting an instance NMTokenCase
+ private void startRMClient() {
+ rmClient = (AMRMClientImpl<ContainerRequest>)
AMRMClient.createAMRMClient();
rmClient.setNMTokenCache(nmTokenCache);
rmClient.init(conf);
rmClient.start();
- assertNotNull(rmClient);
assertEquals(STATE.STARTED, rmClient.getServiceState());
+ }
- // start am nm client
+ private void startNMClient() {
nmClient = (NMClientImpl) NMClient.createNMClient();
-
- //propagating the AMRMClient NMTokenCache instance
nmClient.setNMTokenCache(rmClient.getNMTokenCache());
nmClient.init(conf);
nmClient.start();
- assertNotNull(nmClient);
assertEquals(STATE.STARTED, nmClient.getServiceState());
}
- @After
- public void tearDown() {
+ public void tearDown() throws InterruptedException {
rmClient.stop();
yarnClient.stop();
- yarnCluster.stop();
+ yarnCluster.asyncStop(this);
}
- private void stopNmClient(boolean stopContainers) {
- assertNotNull("Null nmClient", nmClient);
- // leave one unclosed
- assertEquals(1, nmClient.startedContainers.size());
- // default true
- assertTrue(nmClient.getCleanupRunningContainers().get());
- nmClient.cleanupRunningContainersOnStop(stopContainers);
- assertEquals(stopContainers, nmClient.getCleanupRunningContainers().get());
- nmClient.stop();
- }
-
- @Test (timeout = 180000)
+ @Test (timeout = 180_000 * MAX_EARLY_FINISH)
public void testNMClientNoCleanupOnStop()
- throws YarnException, IOException {
-
- rmClient.registerApplicationMaster("Host", 10000, "");
-
- testContainerManagement(nmClient, allocateContainers(rmClient, 5));
-
- rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED,
- null, null);
- // don't stop the running containers
- stopNmClient(false);
- assertFalse(nmClient.startedContainers.isEmpty());
- //now cleanup
- nmClient.cleanupRunningContainers();
- assertEquals(0, nmClient.startedContainers.size());
+ throws YarnException, IOException, InterruptedException,
TimeoutException {
+ int earlyFinishCounter = MAX_EARLY_FINISH;
Review Comment:
Sure,
In short: this test is a mess.
Long: if the test cannot allocate a container (for example, due to low memory),
there will be a RUNNING -> COMPLETED status transition, but during the test we
are interested in some hack that happens in the RUNNING state, so what else can we
do in this case? My original answer was to rerun the test and try to allocate
a container, but now I think we should just delete this hack.
What do you think?
> Stabilize TestNMClient
> ----------------------
>
> Key: YARN-11420
> URL: https://issues.apache.org/jira/browse/YARN-11420
> Project: Hadoop YARN
> Issue Type: Improvement
> Components: yarn
> Affects Versions: 3.4.0
> Reporter: Bence Kosztolnik
> Assignee: Bence Kosztolnik
> Priority: Major
> Labels: pull-request-available
>
> The TestNMClient test methods can get stuck if the test container fails while
> the test is expecting it to be in the running state. This can happen, for example,
> if the container fails due to low memory. To fix this, the test should tolerate
> some failures like this.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]