Repository: hadoop
Updated Branches:
  refs/heads/branch-2 3d36f75f2 -> 1d34a4805
YARN-7382. NoSuchElementException in FairScheduler after failover causes RM crash (rkanter)

(cherry picked from commit 025c6565725c1819566377632753e8b9055617a6)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1d34a480
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1d34a480
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1d34a480

Branch: refs/heads/branch-2
Commit: 1d34a4805e0b5472bb039ae05cdb052e2976ca14
Parents: 3d36f75
Author: Robert Kanter <[email protected]>
Authored: Tue Oct 24 10:21:44 2017 -0700
Committer: Robert Kanter <[email protected]>
Committed: Tue Oct 24 10:29:36 2017 -0700

----------------------------------------------------------------------
 .../scheduler/fair/FSAppAttempt.java | 10 ++++++++++
 .../TestWorkPreservingRMRestart.java | 21 +++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/1d34a480/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java
index 006acea..21863b8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSAppAttempt.java
@@ -665,6 +665,16 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
       if (!rmContainer.getState().equals(RMContainerState.COMPLETED)) {
         getQueue().incUsedResource(rmContainer.getContainer().getResource());
       }
+
+      // If not running unmanaged, the first container we recover is always
+      // the AM. Set the amResource for this app and update the leaf queue's AM
+      // usage
+      if (!isAmRunning() && !getUnmanagedAM()) {
+        Resource resource = rmContainer.getAllocatedResource();
+        setAMResource(resource);
+        getQueue().addAMResourceUsage(resource);
+        setAmRunning(true);
+      }
     } finally {
       writeLock.unlock();
     }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1d34a480/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java
index eb73db1..59f6092 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java
@@ -66,6 +66,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueu
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSParentQueue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSQueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerTestBase;
@@ -154,6 +155,7 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
         new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
     nm1.registerNode();
     RMApp app1 = rm1.submitApp(200);
+    Resource amResources = app1.getAMResourceRequests().get(0).getCapability();
     MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
 
     // clear queue metrics
@@ -236,7 +238,8 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
     if (getSchedulerType() == SchedulerType.CAPACITY) {
       checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
     } else {
-      checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
+      checkFSQueue(rm2, schedulerApp, usedResources, availableResources,
+          amResources);
     }
 
     // *********** check scheduler attempt state.********
@@ -306,6 +309,7 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
     RMApp app1 = rm1.submitApp(200, "dynamicQApp",
         UserGroupInformation.getCurrentUser().getShortUserName(), null,
         ReservationSystemTestUtil.getReservationQueueName());
+    Resource amResources = app1.getAMResourceRequests().get(0).getCapability();
     MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
 
     // clear queue metrics
@@ -380,7 +384,8 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
     if (getSchedulerType() == SchedulerType.CAPACITY) {
       checkCSQueue(rm2, schedulerApp, nmResource, nmResource, usedResources, 2);
     } else {
-      checkFSQueue(rm2, schedulerApp, usedResources, availableResources);
+      checkFSQueue(rm2, schedulerApp, usedResources, availableResources,
+          amResources);
     }
 
     // *********** check scheduler attempt state.********
@@ -452,7 +457,7 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
 
   private void checkFSQueue(ResourceManager rm,
       SchedulerApplication schedulerApp, Resource usedResources,
-      Resource availableResources) throws Exception {
+      Resource availableResources, Resource amResources) throws Exception {
     // waiting for RM's scheduling apps
     int retry = 0;
     Resource assumedFairShare = Resource.newInstance(8192, 8);
@@ -484,6 +489,16 @@ public class TestWorkPreservingRMRestart extends ParameterizedSchedulerTestBase
     assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemorySize(),
         availableResources.getVirtualCores(), usedResources.getMemorySize(),
         usedResources.getVirtualCores());
+
+    // ************ check AM resources ****************
+    assertEquals(amResources,
+        schedulerApp.getCurrentAppAttempt().getAMResource());
+    FSQueueMetrics fsQueueMetrics =
+        (FSQueueMetrics) schedulerApp.getQueue().getMetrics();
+    assertEquals(amResources.getMemorySize(),
+        fsQueueMetrics.getAMResourceUsageMB());
+    assertEquals(amResources.getVirtualCores(),
+        fsQueueMetrics.getAMResourceUsageVCores());
   }
 
   // create 3 container reports for AM


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
