Author: yhemanth
Date: Tue May 12 13:38:19 2009
New Revision: 773891
URL: http://svn.apache.org/viewvc?rev=773891&view=rev
Log:
Merge -r 773888:773889 from trunk to branch 0.20 to fix HADOOP-5641.
Modified:
hadoop/core/branches/branch-0.20/ (props changed)
hadoop/core/branches/branch-0.20/CHANGES.txt (contents, props changed)
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java
Propchange: hadoop/core/branches/branch-0.20/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 12 13:38:19 2009
@@ -1,2 +1,2 @@
/hadoop/core/branches/branch-0.19:713112
-/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746338,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755960,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,771661,772844,772876,772920
+/hadoop/core/trunk:727001,727117,727191,727212,727217,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,736426,738328,738697,740077,740157,741703,741762,743745,743816,743892,744894,745180,746010,746206,746227,746233,746274,746338,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755960,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,771661,772844,772876,772920,773889
Modified: hadoop/core/branches/branch-0.20/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/CHANGES.txt?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.20/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.20/CHANGES.txt Tue May 12 13:38:19 2009
@@ -61,6 +61,9 @@
KILLED (this used to happen when the SetupTask would come back with a
success after the job has been killed). (Amar Kamat via ddas)
+ HADOOP-5641. Fix a NullPointerException in capacity scheduler's memory
+ based scheduling code when jobs get retired. (yhemanth)
+
Release 0.20.0 - 2009-04-15
INCOMPATIBLE CHANGES
Propchange: hadoop/core/branches/branch-0.20/CHANGES.txt
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 12 13:38:19 2009
@@ -1,3 +1,3 @@
/hadoop/core/branches/branch-0.18/CHANGES.txt:727226
/hadoop/core/branches/branch-0.19/CHANGES.txt:713112
-/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752514,752555,752590,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,772844,772876,772920
+/hadoop/core/trunk/CHANGES.txt:727001,727117,727191,727212,727228,727255,727869,728187,729052,729987,732385,732572,732613,732777,732838,732869,733887,734870,734916,735082,736426,738602,738697,739416,740077,740157,741703,741762,743296,743745,743816,743892,744894,745180,745268,746010,746193,746206,746227,746233,746274,746902-746903,746925,746944,746968,746970,747279,747289,747802,748084,748090,748783,749262,749318,749863,750533,752073,752514,752555,752590,752609,752834,752836,752913,752932,753112-753113,753346,754645,754847,754927,755035,755226,755348,755370,755418,755426,755790,755905,755938,755986,755998,756352,757448,757624,757849,758156,759398,759932,760502,760783,761046,761482,761632,762216,762879,763107,763502,764967,765016,765809,765951,771607,772844,772876,772920,773889
Modified:
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
---
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java
(original)
+++
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java
Tue May 12 13:38:19 2009
@@ -413,7 +413,8 @@
}
}
else {
- // mem requirements not met. Rather than look at the next job,
+ // mem requirements not met or could not be computed for this TT.
+ // Rather than look at the next job,
// we return nothing to the TT, with the hope that we improve
// chances of finding a suitable TT for this job. This lets us
// avoid starving jobs with high mem requirements.
Modified:
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
---
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java
(original)
+++
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/MemoryMatcher.java
Tue May 12 13:38:19 2009
@@ -112,7 +112,8 @@
* residing on the given TaskTracker.
*
* @param taskTracker
- * @return amount of memory that is used by the residing tasks
+ * @return amount of memory that is used by the residing tasks,
+ * null if memory cannot be computed for some reason.
*/
private synchronized Memory getMemReservedForTasks(
TaskTrackerStatus taskTracker) {
@@ -141,6 +142,26 @@
// accounted in used memory.
if ((task.getRunState() == TaskStatus.State.RUNNING)
|| (task.getRunState() == TaskStatus.State.COMMIT_PENDING)) {
+ JobInProgress job = scheduler.taskTrackerManager.getJob(
+ task.getTaskID().getJobID());
+ if (job == null) {
+ // This scenario can happen if a job was completed/killed
+ // and retired from JT's memory. In this state, we can ignore
+ // the running task status and compute memory for the rest of
+ // the tasks. However, any scheduling done with this computation
+ // could result in over-subscribing of memory for tasks on this
+ // TT (as the unaccounted for task is still running).
+ // So, it is safer to not schedule anything for this TT.
+ // One of the ways of doing that is to return null from here
+ // and check for null in the calling method.
+ LOG.info("Task tracker: " + taskTracker.getHost() + " is reporting "
+ + "a running / commit pending task: " + task.getTaskID()
+ + " but no corresponding job was found. "
+ + "Maybe job was retired. Not computing "
+ + "memory values for this TT.");
+ return null;
+ }
+
JobConf jConf =
scheduler.taskTrackerManager.getJob(task.getTaskID().getJobID())
.getJobConf();
@@ -194,6 +215,16 @@
}
Memory memReservedForTasks = getMemReservedForTasks(taskTracker);
+ if (memReservedForTasks == null) {
+ // For some reason, maybe because we could not find the job
+ // corresponding to a running task (as can happen if the job
+ // is retired in between), we could not compute the memory state
+ // on this TT. Treat this as an error, and fail memory
+ // requirements.
+ LOG.info("Could not compute memory for taskTracker: "
+ + taskTracker.getHost() + ". Failing memory requirements.");
+ return false;
+ }
long vmemUsedOnTT = memReservedForTasks.vmem;
long pmemUsedOnTT = memReservedForTasks.pmem;
@@ -243,4 +274,4 @@
+ jobVMemForTask + " jobPMemForTask = " + jobPMemForTask);
return true;
}
-}
\ No newline at end of file
+}
Modified:
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java
URL:
http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java?rev=773891&r1=773890&r2=773891&view=diff
==============================================================================
---
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java
(original)
+++
hadoop/core/branches/branch-0.20/src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java
Tue May 12 13:38:19 2009
@@ -379,6 +379,10 @@
job.kill();
}
+ public void removeJob(JobID jobid) {
+ jobs.remove(jobid);
+ }
+
@Override
public JobInProgress getJob(JobID jobid) {
return jobs.get(jobid);
@@ -1785,6 +1789,84 @@
assertNull(scheduler.assignTasks(tracker("tt1")));
}
+ /**
+ * Testcase to verify fix for an NPE (HADOOP-5641), when memory based
+ * scheduling is enabled and jobs are retired from memory when tasks
+ * are still active on some Tasktrackers.
+ *
+ * @throws IOException
+ */
+ public void testMemoryMatchingWithRetiredJobs() throws IOException {
+ // create a cluster with a single node.
+ LOG.debug("Starting cluster with 1 tasktracker, 2 map and 2 reduce slots");
+ taskTrackerManager = new FakeTaskTrackerManager(1, 2, 2);
+ TaskTrackerStatus.ResourceStatus ttStatus =
+ taskTrackerManager.getTaskTracker("tt1").getResourceStatus();
+ LOG.debug("Assume TT has 4 GB virtual mem and 2 GB RAM");
+ ttStatus.setTotalVirtualMemory(4 * 1024 * 1024 * 1024L);
+ ttStatus.setReservedVirtualMemory(0);
+ ttStatus.setTotalPhysicalMemory(2 * 1024 * 1024 * 1024L);
+ ttStatus.setReservedPhysicalMemory(0);
+
+ // create scheduler
+ ArrayList<FakeQueueInfo> queues = new ArrayList<FakeQueueInfo>();
+ queues.add(new FakeQueueInfo("default", 100.0f, true, 100));
+ taskTrackerManager.addQueues(new String[] { "default" });
+ resConf.setFakeQueues(queues);
+ scheduler.setTaskTrackerManager(taskTrackerManager);
+ // enable memory-based scheduling
+ LOG.debug("By default, jobs get 0.5 GB per task vmem" +
+ " and 2 GB max vmem, with 50% of it for RAM");
+ scheduler.getConf().setLong(JobConf.MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY,
+ 512 * 1024 * 1024L);
+ scheduler.getConf().setLong(JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY,
+ 2 * 1024 * 1024 * 1024L);
+ resConf.setDefaultPercentOfPmemInVmem(50.0f);
+ resConf.setLimitMaxPmemForTasks(1 * 1024 * 1024 * 1024L);
+ scheduler.setResourceManagerConf(resConf);
+ scheduler.start();
+
+ // submit a normal job
+ LOG.debug("Submitting a normal job with 2 maps and 2 reduces");
+ JobConf jConf = new JobConf();
+ jConf.setNumMapTasks(2);
+ jConf.setNumReduceTasks(2);
+ jConf.setQueueName("default");
+ jConf.setUser("u1");
+ FakeJobInProgress job1 = submitJobAndInit(JobStatus.PREP, jConf);
+
+ // 1st cycle - 1 map gets assigned.
+ Task t = checkAssignment("tt1", "attempt_test_0001_m_000001_0 on tt1");
+
+ // kill this job !
+ taskTrackerManager.killJob(job1.getJobID());
+
+ // retire the job
+ taskTrackerManager.removeJob(job1.getJobID());
+
+ // submit another job.
+ LOG.debug("Submitting another normal job with 1 map and 1 reduce");
+ jConf = new JobConf();
+ jConf.setNumMapTasks(1);
+ jConf.setNumReduceTasks(1);
+ jConf.setQueueName("default");
+ jConf.setUser("u1");
+ FakeJobInProgress job2 = submitJobAndInit(JobStatus.PREP, jConf);
+
+ // 2nd cycle - nothing should get assigned. Memory matching code
+ // will see the job is missing and fail memory requirements.
+ assertNull(scheduler.assignTasks(tracker("tt1")));
+ // calling again should not make a difference, as the task is still running
+ assertNull(scheduler.assignTasks(tracker("tt1")));
+
+ // finish the task on the tracker.
+ taskTrackerManager.finishTask("tt1", t.getTaskID().toString(), job1);
+ // now a new task can be assigned.
+ t = checkAssignment("tt1", "attempt_test_0002_m_000001_0 on tt1");
+ // reduce can be assigned.
+ t = checkAssignment("tt1", "attempt_test_0002_r_000001_0 on tt1");
+ }
+
protected TaskTrackerStatus tracker(String taskTrackerName) {
return taskTrackerManager.getTaskTracker(taskTrackerName);
}