This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 34e7b03  NUTCH-2929 Fetcher: start threads slowly to avoid that 
resources are temporarily exhausted - sleep for a configurable delay 
(fetcher.threads.start.delay) before starting the next Fetcher thread to 
avoid that resources (DNS, Tika XML parser pools) are temporarily exhausted 
when Fetcher threads fetch the first pages simultaneously
     new f4ce845  Merge pull request #722 from 
sebastian-nagel/NUTCH-2929-fetcher-threads-slow-start
34e7b03 is described below

commit 34e7b03fda40b53bdeb41984eb4ee4125a512841
Author: Sebastian Nagel <sebast...@commoncrawl.org>
AuthorDate: Tue Jan 11 13:43:55 2022 +0100

    NUTCH-2929 Fetcher: start threads slowly to avoid that resources are 
temporarily exhausted
    - sleep for a configurable delay (fetcher.threads.start.delay) before 
starting the next
      Fetcher thread to avoid that resources (DNS, Tika XML parser pools) are
      temporarily exhausted when Fetcher threads fetch the first pages 
simultaneously
---
 conf/nutch-default.xml                         | 9 +++++++++
 src/java/org/apache/nutch/fetcher/Fetcher.java | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 2b6fff2..c305fa8 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -976,6 +976,15 @@
 </property>
 
 <property>
+  <name>fetcher.threads.start.delay</name>
+  <value>10</value>
+  <description>Delay in milliseconds between starting Fetcher threads, so
+  that they do not all fetch their first pages simultaneously and temporarily
+  exhaust DNS or other resources. A value of 0 or less disables the delay.
+  </description>
+</property>
+
+<property>
   <name>fetcher.threads.per.queue</name>
   <value>1</value>
   <description>This number is the maximum number of threads that
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java 
b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 21e0f87..9456c58 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -223,7 +223,13 @@ public class Fetcher extends NutchTool implements Tool {
           feeder.setTimeLimit(timelimit);
         feeder.start();
 
+        int startDelay = conf.getInt("fetcher.threads.start.delay", 10);
         for (int i = 0; i < threadCount; i++) { // spawn threads
+          if (startDelay > 0 && i > 0) {
+            // short delay so that threads do not all fetch their first pages
+            // simultaneously and temporarily exhaust DNS or other resources
+            Thread.sleep(startDelay);
+          }
           FetcherThread t = new FetcherThread(conf, getActiveThreads(),
               fetchQueues, feeder, spinWaiting, lastRequestStart, innerContext,
               errors, segmentName, parsing, storingContent, pages, bytes);

Reply via email to