This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit bdbe7b330b5e7fd712f1b5126f69e2efebb194e8 Author: Sebastian Nagel <sebast...@commoncrawl.org> AuthorDate: Tue May 3 17:14:03 2022 +0200 NUTCH-2946 Fetcher: optionally slow down fetching from hosts with repeated exceptions - configure the delay in seconds as a float instead of milliseconds - use the value of fetcher.server.delay as default - double the delay with every observed exception (exponential backoff) but cap the growth at 2**31 to avoid overflows --- conf/nutch-default.xml | 14 +++++++++----- .../org/apache/nutch/fetcher/FetchItemQueues.java | 19 +++++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 7775fc70d..7faa6fdcd 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1091,11 +1091,15 @@ <property> <name>fetcher.exceptions.per.queue.delay</name> - <value>-1</value> - <description>Additional delay in milliseconds slowing down fetches from a queue - if an exception has occurred (see also fetcher.max.exceptions.per.queue). - The delay grows logarithmically with the number of observed exceptions: - delay = fetcher.exceptions.per.queue.delay * log2(1 + num_exception_in_queue) + <value>${fetcher.server.delay}</value> + <description>Initial value (in seconds) of an additional dynamic + delay slowing down fetches from a queue after an exception has + occurred (see also fetcher.max.exceptions.per.queue). Starting with + the initial value the delay doubles with every observed exception: + + delay = fetcher.exceptions.per.queue.delay * 2^num_exception_in_queue + + An initial value of 0.0 disables this exponential backoff mechanism. </description> </property> diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java index 0faf391ce..4914cd207 100644 --- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java +++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java @@ -85,8 +85,8 @@ public class FetchItemQueues { this.timelimit = conf.getLong("fetcher.timelimit", -1); this.maxExceptionsPerQueue = conf.getInt( "fetcher.max.exceptions.per.queue", -1); - this.exceptionsPerQueueDelay = conf - .getLong("fetcher.exceptions.per.queue.delay", -1); + this.exceptionsPerQueueDelay = (long) (conf + .getFloat("fetcher.exceptions.per.queue.delay", .0f) * 1000); int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds", -1); @@ -272,10 +272,17 @@ public class FetchItemQueues { fiq.nextFetchTime.getAndAdd(delay); LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay); } else if (exceptionsPerQueueDelay > 0) { - // delay the next fetch by a time span growing at log scale - // with the number of observed exceptions - long exceptionDelay = (long) (exceptionsPerQueueDelay - * Math.log(1 + excCount) / Math.log(2)); + /* + * Delay the next fetch by a time span growing exponentially with the + * number of observed exceptions. This dynamic delay is added to the + * constant delay. In order to avoid overflows, the exponential backoff is + * capped at 2**31 + */ + long exceptionDelay = exceptionsPerQueueDelay; + if (excCount > 1) { + // double the initial delay with every observed exception + exceptionDelay *= 2L << Math.min((excCount - 2), 31); + } fiq.nextFetchTime.getAndAdd(exceptionDelay); LOG.info( "* queue: {} >> delayed next fetch by {} ms after {} exceptions in queue",