This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 582cdd417 NUTCH-3058 Fetcher: counter for hung threads (#820)
582cdd417 is described below

commit 582cdd417b8ead6183db8cd6e787f612dbcd9f90
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Sep 16 19:37:28 2024 +0200

    NUTCH-3058 Fetcher: counter for hung threads (#820)
    
    - add counter FetcherStatus:hungThreads
    - log stack traces of hung threads with level WARN (instead of DEBUG)
---
 conf/nutch-default.xml                         |  7 ++--
 src/java/org/apache/nutch/fetcher/Fetcher.java | 48 +++++++++++++++++---------
 2 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index fe6eeccf7..b1c2b1556 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1200,10 +1200,11 @@
 <property>
   <name>fetcher.threads.timeout.divisor</name>
   <value>2</value>
-  <description>(EXPERT)The thread time-out divisor to use. By default threads have a time-out
+  <description>(EXPERT) The thread time-out divisor to use. By default threads have a time-out
   value of mapreduce.task.timeout / 2. Increase this setting if the fetcher waits too
-  long before killing hanged threads. Be careful, a too high setting (+8) will most likely kill the
-  fetcher threads prematurely.
+  long before killing hung threads. Be careful, a too high setting (+8) will most likely kill the
+  fetcher threads prematurely. The fetcher thread time-out avoids that the task timeout (defined by
+  the Hadoop configuration property mapreduce.task.timeout) is reached and the fetcher job is failed.
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index d1774f530..5ae13a52b 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -419,27 +419,41 @@ public class Fetcher extends NutchTool implements Tool {
                   .increment(hitByTimeLimit);
           }
 
-          // some requests seem to hang, despite all intentions
+          /*
+           * Some requests seem to hang, with no fetches finished and no new
+           * fetches started during half of the MapReduce task timeout
+           * (mapreduce.task.timeout, default value: 10 minutes). In order to
+           * avoid that the task timeout is hit and the fetcher job is failed,
+           * we stop the fetching now.
+           */
           if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
-            if (LOG.isWarnEnabled()) {
-              LOG.warn("Aborting with {} hung threads.", activeThreads);
-              for (int i = 0; i < fetcherThreads.size(); i++) {
-                FetcherThread thread = fetcherThreads.get(i);
-                if (thread.isAlive()) {
-                  LOG.warn("Thread #{} hung while processing {}", i,
-                      thread.getReprUrl());
-                  if (LOG.isDebugEnabled()) {
-                    StackTraceElement[] stack = thread.getStackTrace();
-                    StringBuilder sb = new StringBuilder();
-                    sb.append("Stack of thread #").append(i).append(":\n");
-                    for (StackTraceElement s : stack) {
-                      sb.append(s.toString()).append('\n');
-                    }
-                    LOG.debug(sb.toString());
-                  }
+            LOG.warn("Aborting with {} hung threads.", activeThreads);
+            innerContext.getCounter("FetcherStatus", "hungThreads")
+                .increment(activeThreads.get());
+            for (int i = 0; i < fetcherThreads.size(); i++) {
+              FetcherThread thread = fetcherThreads.get(i);
+              if (thread.isAlive()) {
+                LOG.warn("Thread #{} hung while processing {}", i,
+                    thread.getReprUrl());
+                StackTraceElement[] stack = thread.getStackTrace();
+                StringBuilder sb = new StringBuilder();
+                sb.append("Stack of thread #").append(i).append(":\n");
+                for (StackTraceElement s : stack) {
+                  sb.append(s.toString()).append('\n');
                 }
+                LOG.warn(sb.toString());
               }
             }
+            /*
+             * log and count queued items dropped from the fetch queues because
+             * of the timeout
+             */
+            LOG.warn("Aborting with {} queued fetch items in {} queues{}.",
+                fetchQueues.getTotalSize(), fetchQueues.getQueueCount(),
+                feeder.isAlive() ? " (queue feeder still alive)" : "");
+            int hitByTimeout = fetchQueues.emptyQueues();
+            innerContext.getCounter("FetcherStatus", "hitByTimeout")
+                .increment(hitByTimeout);
             return;
           }
 

Reply via email to