This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new 34e7b03  NUTCH-2929 Fetcher: start threads slowly to avoid that resources are temporarily exhausted
                  - sleep for a configurable delay (fetcher.threads.start.delay)
                    before starting the next Fetcher thread to avoid that resources
                    (DNS, Tika XML parser pools) are temporarily exhausted when
                    Fetcher threads fetch the first pages simultaneously
     new f4ce845  Merge pull request #722 from sebastian-nagel/NUTCH-2929-fetcher-threads-slow-start
34e7b03 is described below

commit 34e7b03fda40b53bdeb41984eb4ee4125a512841
Author: Sebastian Nagel <sebast...@commoncrawl.org>
AuthorDate: Tue Jan 11 13:43:55 2022 +0100

    NUTCH-2929 Fetcher: start threads slowly to avoid that resources are temporarily exhausted
    - sleep for a configurable delay (fetcher.threads.start.delay)
      before starting the next Fetcher thread to avoid that resources
      (DNS, Tika XML parser pools) are temporarily exhausted when
      Fetcher threads fetch the first pages simultaneously
---
 conf/nutch-default.xml                         | 9 +++++++++
 src/java/org/apache/nutch/fetcher/Fetcher.java | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 2b6fff2..c305fa8 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -976,6 +976,15 @@
 </property>
 
 <property>
+  <name>fetcher.threads.start.delay</name>
+  <value>10</value>
+  <description>Delay in milliseconds between starting Fetcher threads
+  to avoid that all threads simultaneously fetch the first pages and
+  cause that DNS or other resources are temporarily exhausted.
+  </description>
+</property>
+
+<property>
   <name>fetcher.threads.per.queue</name>
   <value>1</value>
   <description>This number is the maximum number of threads that
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 21e0f87..9456c58 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -223,7 +223,13 @@ public class Fetcher extends NutchTool implements Tool {
       feeder.setTimeLimit(timelimit);
     feeder.start();
 
+    int startDelay = conf.getInt("fetcher.threads.start.delay", 10);
     for (int i = 0; i < threadCount; i++) { // spawn threads
+      if (startDelay > 0 && i > 0) {
+        // short delay to avoid that DNS or other resources are temporarily
+        // exhausted by all threads fetching simultaneously the first pages
+        Thread.sleep(startDelay);
+      }
       FetcherThread t = new FetcherThread(conf, getActiveThreads(), fetchQueues,
           feeder, spinWaiting, lastRequestStart, innerContext, errors, segmentName,
           parsing, storingContent, pages, bytes);