YARN-4546. ResourceManager crash due to scheduling opportunity overflow. Contributed by Jason Lowe.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/c1462a67 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/c1462a67 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/c1462a67 Branch: refs/heads/HDFS-1312 Commit: c1462a67ff7bb632df50e1c52de971cced56c6a3 Parents: 2d16f40 Author: Junping Du <[email protected]> Authored: Wed Jan 6 05:49:24 2016 -0800 Committer: Junping Du <[email protected]> Committed: Wed Jan 6 05:49:24 2016 -0800 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 9 ++++++++ .../scheduler/SchedulerApplicationAttempt.java | 11 ++++++++-- .../TestSchedulerApplicationAttempt.java | 22 ++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/c1462a67/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 28935c6..0ec5acdf 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -1215,6 +1215,9 @@ Release 2.8.0 - UNRELEASED YARN-1382. Remove unusableRMNodesConcurrentSet (never used) in NodeListManager to get rid of memory leak. (Rohith Sharma K S via junping_du) + YARN-4546. ResourceManager crash due to scheduling opportunity overflow. + (Jason Lowe via junping_du) + Release 2.7.3 - UNRELEASED INCOMPATIBLE CHANGES @@ -1270,6 +1273,9 @@ Release 2.7.3 - UNRELEASED YARN-4510. Fix SLS startup failure caused by NPE. (Bibin A Chundatt via wangda) + YARN-4546. ResourceManager crash due to scheduling opportunity overflow. + (Jason Lowe via junping_du) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES @@ -2130,6 +2136,9 @@ Release 2.6.4 - UNRELEASED YARN-3697. FairScheduler: ContinuousSchedulingThread can fail to shutdown. (Zhihai Xu via kasha) + YARN-4546. ResourceManager crash due to scheduling opportunity overflow. + (Jason Lowe via junping_du) + Release 2.6.3 - 2015-12-17 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/c1462a67/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java index 4d81350..c1f1c3d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java @@ -625,8 +625,10 @@ public class SchedulerApplicationAttempt implements SchedulableEntity { public synchronized void addSchedulingOpportunity(Priority priority) { - schedulingOpportunities.setCount(priority, - schedulingOpportunities.count(priority) + 1); + int count = schedulingOpportunities.count(priority); + if (count < Integer.MAX_VALUE) { + schedulingOpportunities.setCount(priority, count + 1); + } } public synchronized void subtractSchedulingOpportunity(Priority priority) { @@ -661,6 +663,11 @@ public class SchedulerApplicationAttempt implements SchedulableEntity { schedulingOpportunities.setCount(priority, 0); } + @VisibleForTesting + void setSchedulingOpportunities(Priority priority, int count) { + schedulingOpportunities.setCount(priority, count); + } + synchronized AggregateAppResourceUsage getRunningAggregateAppResourceUsage() { long currentTimeMillis = System.currentTimeMillis(); // Don't walk the whole container list if the resources were computed http://git-wip-us.apache.org/repos/asf/hadoop/blob/c1462a67/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerApplicationAttempt.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerApplicationAttempt.java index 987aa68..88216f8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerApplicationAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerApplicationAttempt.java @@ -249,4 +249,26 @@ public class TestSchedulerApplicationAttempt { assertEquals(0.0f, app.getResourceUsageReport().getClusterUsagePercentage(), 0.0f); } + + @Test + public void testSchedulingOpportunityOverflow() throws Exception { + ApplicationAttemptId attemptId = createAppAttemptId(0, 0); + Queue queue = createQueue("test", null); + RMContext rmContext = mock(RMContext.class); + when(rmContext.getEpoch()).thenReturn(3L); + SchedulerApplicationAttempt app = new SchedulerApplicationAttempt( + attemptId, "user", queue, queue.getActiveUsersManager(), rmContext); + Priority priority = Priority.newInstance(1); + assertEquals(0, app.getSchedulingOpportunities(priority)); + app.addSchedulingOpportunity(priority); + assertEquals(1, app.getSchedulingOpportunities(priority)); + // verify the count is capped at MAX_VALUE and does not overflow + app.setSchedulingOpportunities(priority, Integer.MAX_VALUE - 1); + assertEquals(Integer.MAX_VALUE - 1, + app.getSchedulingOpportunities(priority)); + app.addSchedulingOpportunity(priority); + assertEquals(Integer.MAX_VALUE, app.getSchedulingOpportunities(priority)); + app.addSchedulingOpportunity(priority); + assertEquals(Integer.MAX_VALUE, app.getSchedulingOpportunities(priority)); + } }
