susheel-gupta commented on code in PR #5317:
URL: https://github.com/apache/hadoop/pull/5317#discussion_r1400210934
##########
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java:
##########
@@ -125,576 +117,344 @@ public void postTransition(
org.apache.hadoop.yarn.server.nodemanager.containermanager.container
.ContainerState afterState,
ContainerEvent processedEvent) {
- synchronized (TRANSITION_COUNTER) {
- if (beforeState != afterState) {
- ContainerId id = op.getContainerId();
- TRANSITION_COUNTER
- .putIfAbsent(id, new HashMap<>());
- long sum = TRANSITION_COUNTER.get(id)
- .compute(afterState,
- (state, count) -> count == null ? 1 : count + 1);
- LOG.info("***** " + id +
- " Transition from " + beforeState +
- " to " + afterState +
- "sum:" + sum);
- }
+ if (beforeState != afterState &&
+ afterState ==
org.apache.hadoop.yarn.server.nodemanager.containermanager.container
+ .ContainerState.RUNNING) {
+ RUNNING_TRANSITIONS.compute(op.getContainerId(),
+ (containerId, counter) -> counter == null ? 1 : ++counter);
}
}
-
- /**
- * Get the current number of state transitions.
- * This is useful to check, if an event has occurred in unit tests.
- * @param id Container id to check
- * @param state Return the overall number of transitions to this state
- * @return Number of transitions to the state specified
- */
- static long getTransitionCounter(ContainerId id,
- org.apache.hadoop.yarn.server.nodemanager
- .containermanager.container
- .ContainerState state) {
- Long ret = TRANSITION_COUNTER.getOrDefault(id, new HashMap<>())
- .get(state);
- return ret != null ? ret : 0;
- }
}
- @Before
- public void setup() throws YarnException, IOException {
- // start minicluster
+ public void setup() throws YarnException, IOException, InterruptedException,
TimeoutException {
conf = new YarnConfiguration();
- // Turn on state tracking
conf.set(YarnConfiguration.NM_CONTAINER_STATE_TRANSITION_LISTENERS,
DebugSumContainerStateListener.class.getName());
- yarnCluster =
- new MiniYARNCluster(TestAMRMClient.class.getName(), nodeCount, 1, 1);
+ startYarnCluster();
+ startYarnClient();
+ UserGroupInformation.setLoginUser(UserGroupInformation
+ .createRemoteUser(UserGroupInformation.getCurrentUser().getUserName()));
+ UserGroupInformation.getCurrentUser().addToken(appAttempt.getAMRMToken());
+ nmTokenCache = new NMTokenCache();
+ startRMClient();
+ startNMClient();
+ }
+
+
+ private void startYarnCluster() {
+ yarnCluster = new MiniYARNCluster(TestNMClient.class.getName(), 3, 1, 1);
yarnCluster.init(conf);
yarnCluster.start();
- assertNotNull(yarnCluster);
assertEquals(STATE.STARTED, yarnCluster.getServiceState());
+ }
- // start rm client
+ private void startYarnClient()
+ throws IOException, YarnException, InterruptedException,
TimeoutException {
yarnClient = (YarnClientImpl) YarnClient.createYarnClient();
yarnClient.init(conf);
yarnClient.start();
- assertNotNull(yarnClient);
assertEquals(STATE.STARTED, yarnClient.getServiceState());
-
- // get node info
nodeReports = yarnClient.getNodeReports(NodeState.RUNNING);
- // submit new app
- ApplicationSubmissionContext appContext =
+ ApplicationSubmissionContext appContext =
yarnClient.createApplication().getApplicationSubmissionContext();
ApplicationId appId = appContext.getApplicationId();
- // set the application name
appContext.setApplicationName("Test");
- // Set the priority for the application master
Priority pri = Priority.newInstance(0);
appContext.setPriority(pri);
- // Set the queue to which this application is to be submitted in the RM
appContext.setQueue("default");
- // Set up the container launch context for the application master
- ContainerLaunchContext amContainer = Records
- .newRecord(ContainerLaunchContext.class);
+ ContainerLaunchContext amContainer =
Records.newRecord(ContainerLaunchContext.class);
appContext.setAMContainerSpec(amContainer);
- // unmanaged AM
appContext.setUnmanagedAM(true);
- // Create the request to send to the applications manager
- SubmitApplicationRequest appRequest = Records
- .newRecord(SubmitApplicationRequest.class);
+
+ SubmitApplicationRequest appRequest =
Records.newRecord(SubmitApplicationRequest.class);
appRequest.setApplicationSubmissionContext(appContext);
- // Submit the application to the applications manager
yarnClient.submitApplication(appContext);
+ GenericTestUtils.waitFor(() ->
yarnCluster.getResourceManager().getRMContext().getRMApps()
+ .get(appId).getCurrentAppAttempt().getAppAttemptState() ==
RMAppAttemptState.LAUNCHED,
+ 100, 30_000, "Failed to start app");
+ appAttempt = yarnCluster.getResourceManager().getRMContext().getRMApps()
+ .get(appId).getCurrentAppAttempt();
+ }
- // wait for app to start
- int iterationsLeft = 30;
- RMAppAttempt appAttempt = null;
- while (iterationsLeft > 0) {
- ApplicationReport appReport = yarnClient.getApplicationReport(appId);
- if (appReport.getYarnApplicationState() ==
- YarnApplicationState.ACCEPTED) {
- attemptId = appReport.getCurrentApplicationAttemptId();
- appAttempt =
- yarnCluster.getResourceManager().getRMContext().getRMApps()
- .get(attemptId.getApplicationId()).getCurrentAppAttempt();
- while (true) {
- if (appAttempt.getAppAttemptState() == RMAppAttemptState.LAUNCHED) {
- break;
- }
- }
- break;
- }
- sleep(1000);
- --iterationsLeft;
- }
- if (iterationsLeft == 0) {
- fail("Application hasn't bee started");
- }
-
- // Just dig into the ResourceManager and get the AMRMToken just for the
sake
- // of testing.
- UserGroupInformation.setLoginUser(UserGroupInformation
- .createRemoteUser(UserGroupInformation.getCurrentUser().getUserName()));
- UserGroupInformation.getCurrentUser().addToken(appAttempt.getAMRMToken());
-
- //creating an instance NMTokenCase
- nmTokenCache = new NMTokenCache();
-
- // start am rm client
- rmClient =
- (AMRMClientImpl<ContainerRequest>) AMRMClient
- .<ContainerRequest> createAMRMClient();
-
- //setting an instance NMTokenCase
+ private void startRMClient() {
+ rmClient = (AMRMClientImpl<ContainerRequest>)
AMRMClient.createAMRMClient();
rmClient.setNMTokenCache(nmTokenCache);
rmClient.init(conf);
rmClient.start();
- assertNotNull(rmClient);
assertEquals(STATE.STARTED, rmClient.getServiceState());
+ }
- // start am nm client
+ private void startNMClient() {
nmClient = (NMClientImpl) NMClient.createNMClient();
-
- //propagating the AMRMClient NMTokenCache instance
nmClient.setNMTokenCache(rmClient.getNMTokenCache());
nmClient.init(conf);
nmClient.start();
- assertNotNull(nmClient);
assertEquals(STATE.STARTED, nmClient.getServiceState());
}
- @After
- public void tearDown() {
+ public void tearDown() throws InterruptedException {
rmClient.stop();
yarnClient.stop();
- yarnCluster.stop();
- }
-
- private void stopNmClient(boolean stopContainers) {
- assertNotNull("Null nmClient", nmClient);
- // leave one unclosed
- assertEquals(1, nmClient.startedContainers.size());
- // default true
- assertTrue(nmClient.getCleanupRunningContainers().get());
- nmClient.cleanupRunningContainersOnStop(stopContainers);
- assertEquals(stopContainers, nmClient.getCleanupRunningContainers().get());
- nmClient.stop();
+ yarnCluster.asyncStop(this);
}
- @Test (timeout = 180000)
+ @Test (timeout = 180_000 * MAX_EARLY_FINISH)
public void testNMClientNoCleanupOnStop()
- throws YarnException, IOException {
-
- rmClient.registerApplicationMaster("Host", 10000, "");
+ throws YarnException, IOException, InterruptedException,
TimeoutException {
+ runTest(() -> {
+ stopNmClient();
+ assertFalse(nmClient.startedContainers.isEmpty());
+ nmClient.cleanupRunningContainers();
+ assertEquals(0, nmClient.startedContainers.size());
+ });
+ }
- testContainerManagement(nmClient, allocateContainers(rmClient, 5));
+ @Test (timeout = 200_000 * MAX_EARLY_FINISH)
+ public void testNMClient()
+ throws YarnException, IOException, InterruptedException,
TimeoutException {
+ runTest(() -> {
+ // stop the running containers on close
+ assertFalse(nmClient.startedContainers.isEmpty());
+ nmClient.cleanupRunningContainersOnStop(true);
+ assertTrue(nmClient.getCleanupRunningContainers().get());
+ nmClient.stop();
+ });
+ }
- rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED,
- null, null);
- // don't stop the running containers
- stopNmClient(false);
- assertFalse(nmClient.startedContainers.isEmpty());
- //now cleanup
- nmClient.cleanupRunningContainers();
- assertEquals(0, nmClient.startedContainers.size());
+ public void runTest(
+ Runnable test
+ ) throws IOException, InterruptedException, YarnException, TimeoutException {
+ int earlyFinishCounter = MAX_EARLY_FINISH;
+ int earlyFinishCounterWhenTestWasStarted;
+ do {
+ earlyFinishCounterWhenTestWasStarted = earlyFinishCounter;
+ setup();
+ rmClient.registerApplicationMaster("Host", 10_000, "");
+ testContainerManagement(nmClient, allocateContainers(rmClient));
+ rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED,
null, null);
+ test.run();
+ tearDown();
+ } while (earlyFinishCounter != 0 && earlyFinishCounter !=
earlyFinishCounterWhenTestWasStarted);
+ if (earlyFinishCounter == 0) {
+ fail("Too many early finish exception happened");
+ }
}
- @Test (timeout = 200000)
- public void testNMClient()
- throws YarnException, IOException {
- rmClient.registerApplicationMaster("Host", 10000, "");
-
- testContainerManagement(nmClient, allocateContainers(rmClient, 5));
-
- rmClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED,
- null, null);
- // stop the running containers on close
- assertFalse(nmClient.startedContainers.isEmpty());
- nmClient.cleanupRunningContainersOnStop(true);
+ private void stopNmClient() {
+ assertNotNull("Null nmClient", nmClient);
+ // leave one unclosed
+ assertEquals(1, nmClient.startedContainers.size());
+ // default true
assertTrue(nmClient.getCleanupRunningContainers().get());
+ nmClient.cleanupRunningContainersOnStop(false);
+ assertFalse(nmClient.getCleanupRunningContainers().get());
nmClient.stop();
}
private Set<Container> allocateContainers(
- AMRMClientImpl<ContainerRequest> rmClient, int num)
- throws YarnException, IOException {
- // setup container request
- Resource capability = Resource.newInstance(1024, 0);
- Priority priority = Priority.newInstance(0);
- String node = nodeReports.get(0).getNodeId().getHost();
- String rack = nodeReports.get(0).getRackName();
- String[] nodes = new String[] {node};
- String[] racks = new String[] {rack};
-
- for (int i = 0; i < num; ++i) {
- rmClient.addContainerRequest(new ContainerRequest(capability, nodes,
- racks, priority));
+ AMRMClientImpl<ContainerRequest> rmClient
+ ) throws YarnException, IOException {
+ for (int i = 0; i < NUMBER_OF_CONTAINERS; ++i) {
+ rmClient.addContainerRequest(new ContainerRequest(
+ Resource.newInstance(1024, 0),
+ new String[] {nodeReports.get(0).getNodeId().getHost()},
+ new String[] {nodeReports.get(0).getRackName()},
+ Priority.newInstance(0)
+ ));
}
-
- int containersRequestedAny = rmClient.getTable(0)
- .get(priority, ResourceRequest.ANY, ExecutionType.GUARANTEED,
- capability).remoteRequest.getNumContainers();
-
- // RM should allocate container within 2 calls to allocate()
- int allocatedContainerCount = 0;
- int iterationsLeft = 2;
- Set<Container> containers = new TreeSet<Container>();
- while (allocatedContainerCount < containersRequestedAny
- && iterationsLeft > 0) {
+ Set<Container> allocatedContainers = new TreeSet<>();
+ while (allocatedContainers.size() < NUMBER_OF_CONTAINERS) {
AllocateResponse allocResponse = rmClient.allocate(0.1f);
-
- allocatedContainerCount += allocResponse.getAllocatedContainers().size();
- for(Container container : allocResponse.getAllocatedContainers()) {
- containers.add(container);
- }
- if (!allocResponse.getNMTokens().isEmpty()) {
- for (NMToken token : allocResponse.getNMTokens()) {
- rmClient.getNMTokenCache().setToken(token.getNodeId().toString(),
- token.getToken());
- }
+ allocatedContainers.addAll(allocResponse.getAllocatedContainers());
+ for (NMToken token : allocResponse.getNMTokens()) {
+ rmClient.getNMTokenCache().setToken(token.getNodeId().toString(),
token.getToken());
}
- if(allocatedContainerCount < containersRequestedAny) {
- // sleep to let NM's heartbeat to RM and trigger allocations
- sleep(1000);
+ if (allocatedContainers.size() < NUMBER_OF_CONTAINERS) {
+ sleep(100);
}
-
- --iterationsLeft;
}
- return containers;
+ return allocatedContainers;
}
- private void testContainerManagement(NMClientImpl client,
- Set<Container> containers) throws YarnException, IOException {
+ private void testContainerManagement(
+ NMClientImpl client, Set<Container> containers
+ ) throws YarnException, IOException {
int size = containers.size();
int i = 0;
for (Container container : containers) {
// getContainerStatus shouldn't be called before startContainer,
// otherwise, NodeManager cannot find the container
- try {
- client.getContainerStatus(container.getId(), container.getNodeId());
- fail("Exception is expected");
- } catch (YarnException e) {
- assertTrue("The thrown exception is not expected",
- e.getMessage().contains("is not handled by this NodeManager"));
- }
+ assertYarnException(
+ () -> client.getContainerStatus(container.getId(),
container.getNodeId()),
+ IS_NOT_HANDLED_BY_THIS_NODEMANAGER);
// upadateContainerResource shouldn't be called before startContainer,
// otherwise, NodeManager cannot find the container
- try {
- client.updateContainerResource(container);
- fail("Exception is expected");
- } catch (YarnException e) {
- assertTrue("The thrown exception is not expected",
- e.getMessage().contains("is not handled by this NodeManager"));
- }
-
+ assertYarnException(
+ () -> client.updateContainerResource(container),
+ IS_NOT_HANDLED_BY_THIS_NODEMANAGER);
// restart shouldn't be called before startContainer,
// otherwise, NodeManager cannot find the container
- try {
- client.restartContainer(container.getId());
- fail("Exception is expected");
- } catch (YarnException e) {
- assertTrue("The thrown exception is not expected",
- e.getMessage().contains("Unknown container"));
- }
-
+ assertYarnException(
+ () -> client.restartContainer(container.getId()),
+ UNKNOWN_NODEMANAGER);
// rollback shouldn't be called before startContainer,
// otherwise, NodeManager cannot find the container
- try {
- client.rollbackLastReInitialization(container.getId());
- fail("Exception is expected");
- } catch (YarnException e) {
- assertTrue("The thrown exception is not expected",
- e.getMessage().contains("Unknown container"));
- }
-
+ assertYarnException(
+ () -> client.rollbackLastReInitialization(container.getId()),
+ UNKNOWN_NODEMANAGER);
// commit shouldn't be called before startContainer,
// otherwise, NodeManager cannot find the container
- try {
- client.commitLastReInitialization(container.getId());
- fail("Exception is expected");
- } catch (YarnException e) {
- assertTrue("The thrown exception is not expected",
- e.getMessage().contains("Unknown container"));
- }
-
+ assertYarnException(
+ () -> client.commitLastReInitialization(container.getId()),
+ UNKNOWN_NODEMANAGER);
// stopContainer shouldn't be called before startContainer,
// otherwise, an exception will be thrown
- try {
- client.stopContainer(container.getId(), container.getNodeId());
- fail("Exception is expected");
- } catch (YarnException e) {
- if (!e.getMessage()
- .contains("is not handled by this NodeManager")) {
- throw new AssertionError("Exception is not expected: ", e);
- }
- }
+ assertYarnException(
+ () -> client.stopContainer(container.getId(), container.getNodeId()),
+ IS_NOT_HANDLED_BY_THIS_NODEMANAGER);
Credentials ts = new Credentials();
DataOutputBuffer dob = new DataOutputBuffer();
ts.writeTokenStorageToStream(dob);
- ByteBuffer securityTokens =
- ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
- ContainerLaunchContext clc =
- Records.newRecord(ContainerLaunchContext.class);
- if (Shell.WINDOWS) {
- clc.setCommands(
- Arrays.asList("ping", "-n", "10000000", "127.0.0.1", ">nul"));
- } else {
- clc.setCommands(Arrays.asList("sleep", "1000000"));
- }
+ ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0,
dob.getLength());
+ ContainerLaunchContext clc =
Records.newRecord(ContainerLaunchContext.class);
+ clc.setCommands(Shell.WINDOWS
+ ? Arrays.asList("ping", "-n", "10000000", "127.0.0.1", ">nul")
+ : Arrays.asList("sleep", "1000000")
+ );
clc.setTokens(securityTokens);
- try {
- client.startContainer(container, clc);
- } catch (YarnException e) {
- throw new AssertionError("Exception is not expected ", e);
- }
- List<Integer> exitStatuses = Collections.singletonList(-1000);
+ client.startContainer(container, clc);
+ List<Integer> exitStatuses = Arrays.asList(-1000, -105);
// leave one container unclosed
if (++i < size) {
testContainer(client, i, container, clc, exitStatuses);
-
}
}
}
private void testContainer(NMClientImpl client, int i, Container container,
ContainerLaunchContext clc, List<Integer>
exitCode)
- throws YarnException, IOException {
- // NodeManager may still need some time to make the container started
+ throws YarnException, IOException {
testGetContainerStatus(container, i, ContainerState.RUNNING, "",
- exitCode);
- waitForContainerTransitionCount(container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState.RUNNING, 1);
- // Test increase container API and make sure requests can reach NM
+ exitCode);
+ waitForContainerRunningTransitionCount(container, 1);
testIncreaseContainerResource(container);
-
- testRestartContainer(container.getId());
+ testRestartContainer(container);
testGetContainerStatus(container, i, ContainerState.RUNNING,
- "will be Restarted", exitCode);
- waitForContainerTransitionCount(container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState.RUNNING, 2);
-
+ "will be Restarted", exitCode);
+ waitForContainerRunningTransitionCount(container, 2);
if (i % 2 == 0) {
- testReInitializeContainer(container.getId(), clc, false);
+ testReInitializeContainer(container, clc, false);
testGetContainerStatus(container, i, ContainerState.RUNNING,
- "will be Re-initialized", exitCode);
- waitForContainerTransitionCount(container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState.RUNNING, 3);
-
- testRollbackContainer(container.getId(), false);
+ "will be Re-initialized", exitCode);
+ waitForContainerRunningTransitionCount(container, 3);
+ testContainerRollback(container, true);
testGetContainerStatus(container, i, ContainerState.RUNNING,
- "will be Rolled-back", exitCode);
- waitForContainerTransitionCount(container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState.RUNNING, 4);
-
- testCommitContainer(container.getId(), true);
- testReInitializeContainer(container.getId(), clc, false);
+ "will be Rolled-back", exitCode);
+ waitForContainerRunningTransitionCount(container, 4);
+ testContainerCommit(container, false);
+ testReInitializeContainer(container, clc, false);
testGetContainerStatus(container, i, ContainerState.RUNNING,
- "will be Re-initialized", exitCode);
- waitForContainerTransitionCount(container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState.RUNNING, 5);
- testCommitContainer(container.getId(), false);
+ "will be Re-initialized", exitCode);
+ waitForContainerRunningTransitionCount(container, 5);
+ testContainerCommit(container, true);
} else {
- testReInitializeContainer(container.getId(), clc, true);
+ testReInitializeContainer(container, clc, true);
testGetContainerStatus(container, i, ContainerState.RUNNING,
- "will be Re-initialized", exitCode);
- waitForContainerTransitionCount(container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState.RUNNING, 3);
- testRollbackContainer(container.getId(), true);
- testCommitContainer(container.getId(), true);
+ "will be Re-initialized", exitCode);
+ waitForContainerRunningTransitionCount(container, 3);
+ testContainerRollback(container, false);
+ testContainerCommit(container, false);
}
-
- try {
- client.stopContainer(container.getId(), container.getNodeId());
- } catch (YarnException e) {
- throw (AssertionError)
- (new AssertionError("Exception is not expected: " + e, e));
- }
-
- // getContainerStatus can be called after stopContainer
- try {
- // O is possible if CLEANUP_CONTAINER is executed too late
- // -105 is possible if the container is not terminated but killed
- testGetContainerStatus(container, i, ContainerState.COMPLETE,
- "Container killed by the ApplicationMaster.",
- Arrays.asList(
- ContainerExitStatus.KILLED_BY_APPMASTER,
- ContainerExitStatus.SUCCESS));
- } catch (YarnException e) {
- // The exception is possible because, after the container is stopped,
- // it may be removed from NM's context.
- if (!e.getMessage()
- .contains("was recently stopped on node manager")) {
- throw (AssertionError)
- (new AssertionError("Exception is not expected: ", e));
- }
- }
- }
-
- /**
- * Wait until the container reaches a state N times.
- * @param container container to watch
- * @param state state to test
- * @param transitions the number N above
- * @throws YarnException This happens if the test times out while waiting
- */
- private void waitForContainerTransitionCount(
- Container container,
- org.apache.hadoop.yarn.server.nodemanager.
- containermanager.container.ContainerState state, long transitions)
- throws YarnException {
- long transitionCount = -1;
- do {
- if (transitionCount != -1) {
- try {
- Thread.sleep(10);
- } catch (InterruptedException e) {
- throw new YarnException(
- "Timeout at transition count:" + transitionCount, e);
- }
- }
- transitionCount = DebugSumContainerStateListener
- .getTransitionCounter(container.getId(), state);
- } while (transitionCount != transitions);
+ client.stopContainer(container.getId(), container.getNodeId());
+ testGetContainerStatus(container, i, ContainerState.COMPLETE,
+ "killed by the ApplicationMaster", exitCode);
}
- private void sleep(int sleepTime) {
- try {
- Thread.sleep(sleepTime);
- } catch (InterruptedException e) {
- e.printStackTrace();
+ private void waitForContainerRunningTransitionCount(Container container,
long transitions) {
+ while (DebugSumContainerStateListener.RUNNING_TRANSITIONS
Review Comment:
Sometimes we are waiting for the container to acquire state running, but
before we start to wait it will jump to completed, that cause the error. And we
got timeout. I think this is what stuck means.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]