This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 582cdd417 NUTCH-3058 Fetcher: counter for hung threads (#820)
582cdd417 is described below
commit 582cdd417b8ead6183db8cd6e787f612dbcd9f90
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Sep 16 19:37:28 2024 +0200
NUTCH-3058 Fetcher: counter for hung threads (#820)
- add counter FetcherStatus:hungThreads
- log stack traces of hung threads with level WARN (instead of DEBUG)
---
conf/nutch-default.xml | 7 ++--
src/java/org/apache/nutch/fetcher/Fetcher.java | 48 +++++++++++++++++---------
2 files changed, 35 insertions(+), 20 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index fe6eeccf7..b1c2b1556 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1200,10 +1200,11 @@
<property>
<name>fetcher.threads.timeout.divisor</name>
<value>2</value>
- <description>(EXPERT)The thread time-out divisor to use. By default threads
have a time-out
+ <description>(EXPERT) The thread time-out divisor to use. By default threads
have a time-out
value of mapreduce.task.timeout / 2. Increase this setting if the fetcher
waits too
- long before killing hanged threads. Be careful, a too high setting (+8) will
most likely kill the
- fetcher threads prematurely.
+ long before killing hung threads. Be careful, a too high setting (+8) will
most likely kill the
+ fetcher threads prematurely. The fetcher thread time-out prevents the
task timeout (defined by
+ the Hadoop configuration property mapreduce.task.timeout) from being reached and the
fetcher job from failing.
</description>
</property>
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index d1774f530..5ae13a52b 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -419,27 +419,41 @@ public class Fetcher extends NutchTool implements Tool {
.increment(hitByTimeLimit);
}
- // some requests seem to hang, despite all intentions
+ /*
+ * Some requests seem to hang, with no fetches finished and no new
+ * fetches started during the fetcher thread timeout (by default half of
+ * the MapReduce task timeout, mapreduce.task.timeout, default value:
+ * 10 minutes). To prevent the task timeout from being hit and the
+ * fetcher job from failing, we stop fetching now.
+ */
if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout)
{
- if (LOG.isWarnEnabled()) {
- LOG.warn("Aborting with {} hung threads.", activeThreads);
- for (int i = 0; i < fetcherThreads.size(); i++) {
- FetcherThread thread = fetcherThreads.get(i);
- if (thread.isAlive()) {
- LOG.warn("Thread #{} hung while processing {}", i,
- thread.getReprUrl());
- if (LOG.isDebugEnabled()) {
- StackTraceElement[] stack = thread.getStackTrace();
- StringBuilder sb = new StringBuilder();
- sb.append("Stack of thread #").append(i).append(":\n");
- for (StackTraceElement s : stack) {
- sb.append(s.toString()).append('\n');
- }
- LOG.debug(sb.toString());
- }
+ LOG.warn("Aborting with {} hung threads.", activeThreads);
+ innerContext.getCounter("FetcherStatus", "hungThreads")
+ .increment(activeThreads.get());
+ for (int i = 0; i < fetcherThreads.size(); i++) {
+ FetcherThread thread = fetcherThreads.get(i);
+ if (thread.isAlive()) {
+ LOG.warn("Thread #{} hung while processing {}", i,
+ thread.getReprUrl());
+ StackTraceElement[] stack = thread.getStackTrace();
+ StringBuilder sb = new StringBuilder();
+ sb.append("Stack of thread #").append(i).append(":\n");
+ for (StackTraceElement s : stack) {
+ sb.append(s.toString()).append('\n');
}
+ LOG.warn(sb.toString());
}
}
+ /*
+ * log and count queued items dropped from the fetch queues because
+ * of the timeout
+ */
+ LOG.warn("Aborting with {} queued fetch items in {} queues{}.",
+ fetchQueues.getTotalSize(), fetchQueues.getQueueCount(),
+ feeder.isAlive() ? " (queue feeder still alive)" : "");
+ int hitByTimeout = fetchQueues.emptyQueues();
+ innerContext.getCounter("FetcherStatus", "hitByTimeout")
+ .increment(hitByTimeout);
return;
}