Author: tomwhite Date: Mon Aug 6 13:52:00 2007 New Revision: 563270 URL: http://svn.apache.org/viewvc?view=rev&rev=563270 Log: HADOOP-1610. Add metrics for failed tasks. Contributed by Devaraj Das.
Modified: lucene/hadoop/trunk/CHANGES.txt lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Modified: lucene/hadoop/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?view=diff&rev=563270&r1=563269&r2=563270 ============================================================================== --- lucene/hadoop/trunk/CHANGES.txt (original) +++ lucene/hadoop/trunk/CHANGES.txt Mon Aug 6 13:52:00 2007 @@ -18,6 +18,9 @@ easier to read. Also remove numbering, to make merging easier. (cutting) + HADOOP-1610. Add metrics for failed tasks. + (Devaraj Das via tomwhite) + OPTIMIZATIONS HADOOP-1565. Reduce memory usage of NameNode by replacing Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java?view=diff&rev=563270&r1=563269&r2=563270 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskRunner.java Mon Aug 6 13:52:00 2007 @@ -421,6 +421,9 @@ int exit_code = process.waitFor(); if (!killed && exit_code != 0) { + if (exit_code == 65) { + tracker.getTaskTrackerMetrics().taskFailedPing(); + } throw new IOException("Task process exit with nonzero status of " + exit_code + "."); } Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java?view=diff&rev=563270&r1=563269&r2=563270 ============================================================================== --- lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java (original) +++ lucene/hadoop/trunk/src/java/org/apache/hadoop/mapred/TaskTracker.java Mon Aug 6 13:52:00 2007 @@ -213,9 +213,11 @@ shuffleMetricsRecord.update(); } } - private class TaskTrackerMetrics implements Updater { + public class TaskTrackerMetrics implements Updater { private MetricsRecord metricsRecord = null; private int numCompletedTasks = 0; + private int timedoutTasks = 0; + private int tasksFailedPing = 0; TaskTrackerMetrics() { JobConf conf = getJobConf(); @@ -232,6 +234,15 @@ synchronized void completeTask() { ++numCompletedTasks; } + + synchronized void timedoutTask() { + ++timedoutTasks; + } + + synchronized void taskFailedPing() { + ++tasksFailedPing; + } + /** * Since this object is a registered updater, this method will be called * periodically, e.g. every 5 seconds. @@ -243,15 +254,23 @@ metricsRecord.setMetric("reduces_running", reduceTotal); metricsRecord.setMetric("taskSlots", (short)maxCurrentTasks); metricsRecord.incrMetric("tasks_completed", numCompletedTasks); - metricsRecord.update(); + metricsRecord.incrMetric("tasks_failed_timeout", timedoutTasks); + metricsRecord.incrMetric("tasks_failed_ping", tasksFailedPing); } numCompletedTasks = 0; + timedoutTasks = 0; + tasksFailedPing = 0; } + metricsRecord.update(); } } private TaskTrackerMetrics myMetrics = null; + public TaskTrackerMetrics getTaskTrackerMetrics() { + return myMetrics; + } + /** * A list of tips that should be cleaned up. */ @@ -991,6 +1010,7 @@ LOG.info(tip.getTask().getTaskId() + ": " + msg); ReflectionUtils.logThreadInfo(LOG, "lost task", 30); tip.reportDiagnosticInfo(msg); + myMetrics.timedoutTask(); purgeTask(tip, true); } }