This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 42ae2a34505e23319861e7b31fd9f87f1af68749
Author: Sebastian Nagel <sebast...@commoncrawl.org>
AuthorDate: Fri Jan 14 18:31:31 2022 +0100

    NUTCH-2946 Fetcher: slow down fetching from hosts where requests fail repeatedly
    with exceptions or HTTP status codes mapped to ProtocolStatus.EXCEPTION
    (HTTP 403 Forbidden, 429 Too many requests, 5xx server errors, etc.)
---
 conf/nutch-default.xml                                | 19 +++++++++++++++----
 .../org/apache/nutch/fetcher/FetchItemQueues.java     | 12 ++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 29a4716b5..7775fc70d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1081,10 +1081,21 @@
 <property>
   <name>fetcher.max.exceptions.per.queue</name>
   <value>-1</value>
-  <description>The maximum number of protocol-level exceptions (e.g. timeouts) per
-  host (or IP) queue. Once this value is reached, any remaining entries from this
-  queue are purged, effectively stopping the fetching from this host/IP. The default
-  value of -1 deactivates this limit.
+  <description>The maximum number of protocol-level exceptions
+  (e.g. timeouts) or HTTP status codes mapped to ProtocolStatus.EXCEPTION
+  per host (or IP) queue. Once this value is reached, any remaining entries
+  from this queue are purged, effectively stopping the fetching from this
+  host/IP. The default value of -1 deactivates this limit.
+  </description>
+</property>
+
+<property>
+  <name>fetcher.exceptions.per.queue.delay</name>
+  <value>-1</value>
+  <description>Additional delay in milliseconds slowing down fetches from a queue
+  if an exception has occurred (see also fetcher.max.exceptions.per.queue).
+  The delay grows logarithmically with the number of observed exceptions:
+     delay = fetcher.exceptions.per.queue.delay * log2(1 + num_exception_in_queue)
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index ceb8cab59..0faf391ce 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -56,6 +56,7 @@ public class FetchItemQueues {
   long minCrawlDelay;
   long timelimit = -1;
   int maxExceptionsPerQueue = -1;
+  long exceptionsPerQueueDelay = -1;
   Configuration conf;
 
   public static final String QUEUE_MODE_HOST = "byHost";
@@ -84,6 +85,8 @@ public class FetchItemQueues {
     this.timelimit = conf.getLong("fetcher.timelimit", -1);
     this.maxExceptionsPerQueue = conf.getInt(
         "fetcher.max.exceptions.per.queue", -1);
+    this.exceptionsPerQueueDelay = conf
+        .getLong("fetcher.exceptions.per.queue.delay", -1);
 
     int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
         -1);
@@ -268,6 +271,15 @@ public class FetchItemQueues {
     if (delay > 0) {
       fiq.nextFetchTime.getAndAdd(delay);
       LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay);
+    } else if (exceptionsPerQueueDelay > 0) {
+      // delay the next fetch by a time span growing at log scale
+      // with the number of observed exceptions
+      long exceptionDelay = (long) (exceptionsPerQueueDelay
+          * Math.log(1 + excCount) / Math.log(2));
+      fiq.nextFetchTime.getAndAdd(exceptionDelay);
+      LOG.info(
+          "* queue: {} >> delayed next fetch by {} ms after {} exceptions in 
queue",
+          queueid, exceptionDelay, excCount);
     }
     if (fiq.getQueueSize() == 0) {
       return 0;

Reply via email to