This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 42ae2a34505e23319861e7b31fd9f87f1af68749
Author: Sebastian Nagel <sebast...@commoncrawl.org>
AuthorDate: Fri Jan 14 18:31:31 2022 +0100

    NUTCH-2946 Fetcher: slow down fetching from hosts where requests fail repeatedly with exceptions or HTTP status codes mapped to ProtocolStatus.EXCEPTION (HTTP 403 Forbidden, 429 Too many requests, 5xx server errors, etc.)
---
 conf/nutch-default.xml | 19 +++++++++++++++----
 .../org/apache/nutch/fetcher/FetchItemQueues.java | 12 ++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 29a4716b5..7775fc70d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1081,10 +1081,21 @@
 <property>
 <name>fetcher.max.exceptions.per.queue</name>
 <value>-1</value>
- <description>The maximum number of protocol-level exceptions (e.g. timeouts) per
- host (or IP) queue. Once this value is reached, any remaining entries from this
- queue are purged, effectively stopping the fetching from this host/IP. The default
- value of -1 deactivates this limit.
+ <description>The maximum number of protocol-level exceptions
+ (e.g. timeouts) or HTTP status codes mapped to ProtocolStatus.EXCEPTION
+ per host (or IP) queue. Once this value is reached, any remaining entries
+ from this queue are purged, effectively stopping the fetching from this
+ host/IP. The default value of -1 deactivates this limit.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.exceptions.per.queue.delay</name>
+ <value>-1</value>
+ <description>Additional delay in milliseconds slowing down fetches from a queue
+ if an exception has occurred (see also fetcher.max.exceptions.per.queue).
+ The delay grows logarithmically with the number of observed exceptions:
+ delay = fetcher.exceptions.per.queue.delay * log2(1 + num_exception_in_queue)
 </description>
 </property>

diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index ceb8cab59..0faf391ce 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -56,6 +56,7 @@ public class FetchItemQueues {
 long minCrawlDelay;
 long timelimit = -1;
 int maxExceptionsPerQueue = -1;
+ long exceptionsPerQueueDelay = -1;
 Configuration conf;

 public static final String QUEUE_MODE_HOST = "byHost";
@@ -84,6 +85,8 @@ public class FetchItemQueues {
 this.timelimit = conf.getLong("fetcher.timelimit", -1);
 this.maxExceptionsPerQueue = conf.getInt(
 "fetcher.max.exceptions.per.queue", -1);
+ this.exceptionsPerQueueDelay = conf
+ .getLong("fetcher.exceptions.per.queue.delay", -1);

 int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
 -1);
@@ -268,6 +271,15 @@ public class FetchItemQueues {
 if (delay > 0) {
 fiq.nextFetchTime.getAndAdd(delay);
 LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay);
+ } else if (exceptionsPerQueueDelay > 0) {
+ // delay the next fetch by a time span growing at log scale
+ // with the number of observed exceptions
+ long exceptionDelay = (long) (exceptionsPerQueueDelay
+ * Math.log(1 + excCount) / Math.log(2));
+ fiq.nextFetchTime.getAndAdd(exceptionDelay);
+ LOG.info(
+ "* queue: {} >> delayed next fetch by {} ms after {} exceptions in queue",
+ queueid, exceptionDelay, excCount);
 }
 if (fiq.getQueueSize() == 0) {
 return 0;