This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new b081c75d8 NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
b081c75d8 is described below
commit b081c75d87be61e42297c952298b72eb7ff2a6dc
Author: Sebastian Nagel <[email protected]>
AuthorDate: Sun Oct 1 14:08:39 2023 +0200
NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
---
conf/nutch-default.xml | 11 ++++++-----
.../apache/nutch/protocol/http/api/HttpRobotRulesParser.java | 3 ++-
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 18ed56b03..d8bf76486 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -141,8 +141,9 @@
<name>http.robots.503.defer.visits</name>
<value>true</value>
<description>Temporarily suspend fetching from a host if the
- robots.txt response is HTTP 503 or any other 5xx server error. See
- also http.robots.503.defer.visits.delay and
+ robots.txt response is HTTP 503 or any other 5xx server error
+ or HTTP 429 Too Many Requests. See also
+ http.robots.503.defer.visits.delay and
http.robots.503.defer.visits.retries</description>
</property>
@@ -150,7 +151,7 @@
<name>http.robots.503.defer.visits.delay</name>
<value>300000</value>
<description>Time in milliseconds to suspend crawling a host if the
- robots.txt response is HTTP 5xx - see
+ robots.txt response is HTTP 5xx or 429 Too Many Requests - see
http.robots.503.defer.visits.</description>
</property>
@@ -158,8 +159,8 @@
<name>http.robots.503.defer.visits.retries</name>
<value>3</value>
<description>Number of retries crawling a host if the robots.txt
- response is HTTP 5xx - see http.robots.503.defer.visits. After n
- retries the host queue is dropped for this segment/cycle.
+ response is HTTP 5xx or 429 - see http.robots.503.defer.visits.
+ After n retries the host queue is dropped for this segment/cycle.
</description>
</property>
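For context, the three properties above are normally tuned in nutch-site.xml rather than by editing nutch-default.xml. A minimal sketch of such an override, assuming the standard Nutch configuration overlay; the delay and retry values chosen here are illustrative examples, not defaults or recommendations from this commit:

    <?xml version="1.0"?>
    <configuration>
      <!-- Suspend fetching from a host when its robots.txt response is
           a 5xx server error or 429 Too Many Requests (default: true). -->
      <property>
        <name>http.robots.503.defer.visits</name>
        <value>true</value>
      </property>
      <!-- Illustrative: wait 10 minutes instead of the default 300000 ms. -->
      <property>
        <name>http.robots.503.defer.visits.delay</name>
        <value>600000</value>
      </property>
      <!-- Illustrative: allow 5 retries instead of the default 3 before the
           host queue is dropped for the segment/cycle. -->
      <property>
        <name>http.robots.503.defer.visits.retries</name>
        <value>5</value>
      </property>
    </configuration>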
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 8d7263e3e..ec5e77e43 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -229,7 +229,8 @@ public class HttpRobotRulesParser extends RobotRulesParser {
else if ((code == 403) && (!allowForbidden))
robotRules = FORBID_ALL_RULES; // use forbid all
- else if (code >= 500) {
+ else if (code >= 500 || code == 429) {
+ // 5xx server errors or 429 Too Many Requests
cacheRule = false; // try again later to fetch robots.txt
if (deferVisits503) {
// signal fetcher to suspend crawling for this host
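In isolation, the new condition treats HTTP 429 exactly like a 5xx server error when deciding whether to skip caching the rules and defer visits to the host. A standalone sketch of that check; the class and method names are hypothetical and not part of HttpRobotRulesParser:

    public class RobotsFetchStatus {
        /**
         * True for robots.txt responses treated as temporary failures:
         * 5xx server errors and, with NUTCH-3011, 429 Too Many Requests.
         * On a temporary failure the rules are not cached and, if
         * http.robots.503.defer.visits is true, the host is suspended.
         */
        public static boolean isTemporaryFailure(int code) {
            return code >= 500 || code == 429;
        }

        public static void main(String[] args) {
            System.out.println(isTemporaryFailure(503)); // true: server error
            System.out.println(isTemporaryFailure(429)); // true: new with this change
            System.out.println(isTemporaryFailure(404)); // false: not a temporary failure
        }
    }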