Author: cutting
Date: Wed May  2 13:03:56 2007
New Revision: 534606

URL: http://svn.apache.org/viewvc?view=rev&rev=534606
Log:
HADOOP-1304. Make configurable the maximum number of task attempts before a
job fails. Contributed by Devaraj.
Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/conf/hadoop-default.xml
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java
    lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Wed May  2 13:03:56 2007
@@ -306,6 +306,9 @@
     the new format. Please backup your data first before upgrading
     (using 'hadoop distcp' for example). (tomwhite)
 
+91. HADOOP-1304. Make configurable the maximum number of task
+    attempts before a job fails. (Devaraj Das via cutting)
+
 Release 0.12.3 - 2007-04-06

Modified: lucene/hadoop/trunk/conf/hadoop-default.xml
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/conf/hadoop-default.xml?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/conf/hadoop-default.xml (original)
+++ lucene/hadoop/trunk/conf/hadoop-default.xml Wed May  2 13:03:56 2007
@@ -544,6 +544,24 @@
 </property>
 
 <property>
+  <name>mapred.map.max.attempts</name>
+  <value>4</value>
+  <description>Expert: The maximum number of attempts per map task.
+  In other words, framework will try to execute a map task these many number
+  of times before giving up on it.
+  </description>
+</property>
+
+<property>
+  <name>mapred.reduce.max.attempts</name>
+  <value>4</value>
+  <description>Expert: The maximum number of attempts per reduce task.
+  In other words, framework will try to execute a reduce task these many number
+  of times before giving up on it.
+  </description>
+</property>
+
+<property>
   <name>mapred.reduce.parallel.copies</name>
   <value>5</value>
   <description>The default number of parallel transfers run by reduce

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/JobConf.java Wed May  2 13:03:56 2007
@@ -510,7 +510,41 @@
   public int getNumReduceTasks() { return getInt("mapred.reduce.tasks", 1); }
   public void setNumReduceTasks(int n) { setInt("mapred.reduce.tasks", n); }
 
+
+  /** Get the configured number of maximum attempts that will be made to run a
+   * map task, as specified by the <code>mapred.map.max.attempts</code>
+   * property. If this property is not already set, the default is 4 attempts
+   * @return the max number of attempts
+   */
+  public int getMaxMapAttempts() {
+    return getInt("mapred.map.max.attempts", 4);
+  }
+  /** Expert: Set the number of maximum attempts that will be made to run a
+   * map task
+   * @param n the number of attempts
+   *
+   */
+  public void setMaxMapAttempts(int n) {
+    setInt("mapred.map.max.attempts", n);
+  }
+  /** Get the configured number of maximum attempts that will be made to run a
+   * reduce task, as specified by the <code>mapred.reduce.max.attempts</code>
+   * property. If this property is not already set, the default is 4 attempts
+   * @return the max number of attempts
+   */
+  public int getMaxReduceAttempts() {
+    return getInt("mapred.reduce.max.attempts", 4);
+  }
+  /** Expert: Set the number of maximum attempts that will be made to run a
+   * reduce task
+   * @param n the number of attempts
+   *
+   */
+  public void setMaxReduceAttempts(int n) {
+    setInt("mapred.reduce.max.attempts", n);
+  }
+
   /**
    * Get the user-specified job name. This is only used to identify the
    * job to the user.

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java?view=diff&rev=534606&r1=534605&r2=534606
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskInProgress.java Wed May  2 13:03:56 2007
@@ -49,7 +49,7 @@
 ////////////////////////////////////////////////////////
 class TaskInProgress {
   static final int MAX_TASK_EXECS = 1;
-  static final int MAX_TASK_FAILURES = 4;
+  int maxTaskAttempts = 4;
   static final double SPECULATIVE_GAP = 0.2;
   static final long SPECULATIVE_LAG = 60 * 1000;
   private static NumberFormat idFormat = NumberFormat.getInstance();
@@ -125,6 +125,7 @@
     this.job = job;
     this.conf = conf;
     this.partition = partition;
+    setMaxTaskAttempts();
     init(uniqueString);
   }
 
@@ -141,8 +142,19 @@
     this.jobtracker = jobtracker;
     this.job = job;
     this.conf = conf;
+    setMaxTaskAttempts();
     init(uniqueString);
   }
+  /**
+   * Set the max number of attempts before we declare a TIP as "failed"
+   */
+  private void setMaxTaskAttempts() {
+    if (isMapTask()) {
+      this.maxTaskAttempts = conf.getMaxMapAttempts();
+    } else {
+      this.maxTaskAttempts = conf.getMaxReduceAttempts();
+    }
+  }
 
   /**
    * Make a unique name for this TIP.
@@ -430,7 +442,7 @@
       numKilledTasks++;
     }
 
-    if (numTaskFailures >= MAX_TASK_FAILURES) {
+    if (numTaskFailures >= maxTaskAttempts) {
      LOG.info("TaskInProgress " + getTIPId() + " has failed " + numTaskFailures + " times.");
       kill();
     }
@@ -620,11 +632,11 @@
     // Create the 'taskid'; do not count the 'killed' tasks against the job!
     String taskid = null;
 
-    if (nextTaskId < (MAX_TASK_EXECS + MAX_TASK_FAILURES + numKilledTasks)) {
+    if (nextTaskId < (MAX_TASK_EXECS + maxTaskAttempts + numKilledTasks)) {
       taskid = new String("task_" + taskIdPrefix + "_" + nextTaskId);
       ++nextTaskId;
     } else {
-      LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + MAX_TASK_FAILURES) +
+      LOG.warn("Exceeded limit of " + (MAX_TASK_EXECS + maxTaskAttempts) +
               " (plus " + numKilledTasks + " killed)" +
               " attempts for the tip '" + getTIPId() + "'");
       return null;
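
For context (not part of the commit), here is a minimal sketch of how a job driver might use the per-job attempt limits introduced by this patch. The driver class name and the commented-out submission call are placeholder assumptions; only setMaxMapAttempts(), getMaxMapAttempts(), getMaxReduceAttempts(), and the mapred.*.max.attempts properties come from this change.

// Hypothetical driver showing the new per-job attempt limits; MyDriver is a
// placeholder name, not part of the patch.
import org.apache.hadoop.mapred.JobConf;

public class MyDriver {
  public static void main(String[] args) {
    // Job configuration; unset values fall back to hadoop-default.xml.
    JobConf conf = new JobConf(MyDriver.class);

    // Allow each map task up to 8 attempts before its TIP (and hence the job)
    // is declared failed; leave reduce tasks at the default.
    conf.setMaxMapAttempts(8);

    // Both getters default to 4 when the corresponding property is unset.
    System.out.println("mapred.map.max.attempts    = " + conf.getMaxMapAttempts());
    System.out.println("mapred.reduce.max.attempts = " + conf.getMaxReduceAttempts());

    // Submission would then proceed as usual, e.g. JobClient.runJob(conf);
  }
}

Setting the value on the JobConf overrides the cluster-wide default of 4 shipped in hadoop-default.xml; each TaskInProgress now reads its limit through these accessors instead of the old hard-coded MAX_TASK_FAILURES constant.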