This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new b081c75d8 NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
b081c75d8 is described below

commit b081c75d87be61e42297c952298b72eb7ff2a6dc
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Sun Oct 1 14:08:39 2023 +0200

    NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
---
 conf/nutch-default.xml                                        | 11 ++++++-----
 .../apache/nutch/protocol/http/api/HttpRobotRulesParser.java  |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 18ed56b03..d8bf76486 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -141,8 +141,9 @@
   <name>http.robots.503.defer.visits</name>
   <value>true</value>
   <description>Temporarily suspend fetching from a host if the
-  robots.txt response is HTTP 503 or any other 5xx server error. See
-  also http.robots.503.defer.visits.delay and
+  robots.txt response is HTTP 503 or any other 5xx server error
+  and HTTP 429 Too Many Requests. See also
+  http.robots.503.defer.visits.delay and
   http.robots.503.defer.visits.retries</description>
 </property>
 
@@ -150,7 +151,7 @@
   <name>http.robots.503.defer.visits.delay</name>
   <value>300000</value>
   <description>Time in milliseconds to suspend crawling a host if the
-  robots.txt response is HTTP 5xx - see
+  robots.txt response is HTTP 5xx or 429 Too Many Requests - see
   http.robots.503.defer.visits.</description>
 </property>
 
@@ -158,8 +159,8 @@
   <name>http.robots.503.defer.visits.retries</name>
   <value>3</value>
   <description>Number of retries crawling a host if the robots.txt
-  response is HTTP 5xx - see http.robots.503.defer.visits. After n
-  retries the host queue is dropped for this segment/cycle.
+  response is HTTP 5xx or 429 - see http.robots.503.defer.visits.
+  After n retries the host queue is dropped for this segment/cycle.
   </description>
 </property>
 
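For reference, the three properties touched above are normally overridden in nutch-site.xml rather than by editing nutch-default.xml. A minimal sketch of such an override, reusing the default values visible in the hunk (the values are illustrative, not tuning advice):

<!-- sketch of a nutch-site.xml override; values mirror the defaults above -->
<property>
  <name>http.robots.503.defer.visits</name>
  <value>true</value>
</property>
<property>
  <name>http.robots.503.defer.visits.delay</name>
  <value>300000</value> <!-- milliseconds to suspend a host after a 5xx/429 robots.txt response -->
</property>
<property>
  <name>http.robots.503.defer.visits.retries</name>
  <value>3</value> <!-- after n retries the host queue is dropped for this segment/cycle -->
</property>
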
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 8d7263e3e..ec5e77e43 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -229,7 +229,8 @@ public class HttpRobotRulesParser extends RobotRulesParser {
         else if ((code == 403) && (!allowForbidden))
           robotRules = FORBID_ALL_RULES; // use forbid all
 
-        else if (code >= 500) {
+        else if (code >= 500 || code == 429) {
+          // 5xx server errors or 429 Too Many Requests
           cacheRule = false; // try again later to fetch robots.txt
           if (deferVisits503) {
             // signal fetcher to suspend crawling for this host

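For illustration, the changed condition can be read as a small predicate: any 5xx server error, and now also 429 Too Many Requests, marks the robots.txt fetch as temporarily failed, so the result is not cached and the host may be deferred. The following self-contained sketch is not Nutch code; the class and method names are made up:

// Hypothetical sketch of the decision the patch changes: which robots.txt
// HTTP status codes should defer further fetching from a host.
// Not part of HttpRobotRulesParser; names are illustrative only.
public class RobotsStatusSketch {

  /** True if the robots.txt response should suspend crawling the host,
   *  mirroring the condition in the hunk above (5xx or 429). */
  static boolean shouldDeferVisits(int code) {
    return code >= 500 || code == 429;
  }

  public static void main(String[] args) {
    int[] codes = {200, 403, 429, 500, 503};
    for (int code : codes) {
      System.out.println(code + " -> defer=" + shouldDeferVisits(code));
    }
  }
}

With http.robots.503.defer.visits set to true, such a response makes the fetcher suspend the host for http.robots.503.defer.visits.delay milliseconds, retrying up to http.robots.503.defer.visits.retries times before the host queue is dropped for the segment/cycle.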