Author: mc
Date: Sun Jun 26 11:03:17 2005
New Revision: 201885
URL: http://svn.apache.org/viewcvs?rev=201885&view=rev
Log:
Abort the job if any single task fails too many times (MAX_TASK_FAILURES,
currently 3). This prevents runaway processes in which a failing task is
re-executed over and over.
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=201885&r1=201884&r2=201885&view=diff
==============================================================================
---
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
(original)
+++
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
Sun Jun 26 11:03:17 2005
@@ -33,6 +33,7 @@
*******************************************************/
public class JobTracker implements MRConstants, InterTrackerProtocol,
JobSubmissionProtocol {
static final int TRACKERINFO_PORT = 7845;
+ static final int MAX_TASK_FAILURES = 3;
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.mapred.JobTracker");
public static JobTracker tracker = null;
@@ -475,6 +476,7 @@
TreeMap completeMapTasks = new TreeMap();
TreeMap incompleteReduceTasks = new TreeMap();
TreeMap completeReduceTasks = new TreeMap();
+ TreeMap taskFailures = new TreeMap();
// Info for user; useless for JobTracker
int numMapTasks = 0;
@@ -665,6 +667,15 @@
completeReduceTasks.remove(taskid);
incompleteReduceTasks.put(taskid, t);
}
+
+ // Check if we need to kill the job because of excess failures
+ Integer failures = (Integer) taskFailures.get(taskid);
+ int numFailures = ((failures == null) ? 0 : failures.intValue()) + 1;
+ taskFailures.put(taskid, new Integer(numFailures));
+ if (numFailures >= MAX_TASK_FAILURES) {
+ kill();
+ }
+
if (status.getRunState() == JobStatus.RUNNING) {
executeTask(taskid);
}