This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 34e7b03 NUTCH-2929 Fetcher: start threads slowly to avoid temporarily
exhausting resources - sleep for a configurable delay
(fetcher.threads.start.delay) before starting the next Fetcher thread, so
that resources (DNS, Tika XML parser pools) are not temporarily exhausted
when Fetcher threads fetch the first pages simultaneously
new f4ce845 Merge pull request #722 from
sebastian-nagel/NUTCH-2929-fetcher-threads-slow-start
34e7b03 is described below
commit 34e7b03fda40b53bdeb41984eb4ee4125a512841
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Jan 11 13:43:55 2022 +0100
NUTCH-2929 Fetcher: start threads slowly to avoid temporarily
exhausting resources
- sleep for a configurable delay (fetcher.threads.start.delay) before
starting the next Fetcher thread, so that resources (DNS, Tika XML
parser pools) are not temporarily exhausted when Fetcher threads fetch
the first pages simultaneously
---
conf/nutch-default.xml | 9 +++++++++
src/java/org/apache/nutch/fetcher/Fetcher.java | 6 ++++++
2 files changed, 15 insertions(+)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 2b6fff2..c305fa8 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -976,6 +976,15 @@
</property>
<property>
+ <name>fetcher.threads.start.delay</name>
+ <value>10</value>
+ <description>Delay in milliseconds between starting Fetcher threads,
+ so that the threads do not all fetch their first pages simultaneously
+ and temporarily exhaust DNS or other resources.
+ </description>
+</property>
+
+<property>
<name>fetcher.threads.per.queue</name>
<value>1</value>
<description>This number is the maximum number of threads that
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 21e0f87..9456c58 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -223,7 +223,13 @@ public class Fetcher extends NutchTool implements Tool {
feeder.setTimeLimit(timelimit);
feeder.start();
+ int startDelay = conf.getInt("fetcher.threads.start.delay", 10);
for (int i = 0; i < threadCount; i++) { // spawn threads
+ if (startDelay > 0 && i > 0) {
+ // short delay to avoid that DNS or other resources are temporarily
+ // exhausted by all threads fetching simultaneously the first pages
+ Thread.sleep(startDelay);
+ }
FetcherThread t = new FetcherThread(conf, getActiveThreads(),
fetchQueues, feeder, spinWaiting, lastRequestStart, innerContext,
errors, segmentName, parsing, storingContent, pages, bytes);