This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new b081c75d8 NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
b081c75d8 is described below

commit b081c75d87be61e42297c952298b72eb7ff2a6dc
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Sun Oct 1 14:08:39 2023 +0200

    NUTCH-3011 HttpRobotRulesParser: handle HTTP 429 Too Many Requests same as server errors (HTTP 5xx)
---
 conf/nutch-default.xml                                        | 11 ++++++-----
 .../apache/nutch/protocol/http/api/HttpRobotRulesParser.java |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 18ed56b03..d8bf76486 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -141,8 +141,9 @@
   <name>http.robots.503.defer.visits</name>
   <value>true</value>
   <description>Temporarily suspend fetching from a host if the
-  robots.txt response is HTTP 503 or any other 5xx server error. See
-  also http.robots.503.defer.visits.delay and
+  robots.txt response is HTTP 503 or any other 5xx server error
+  and HTTP 429 Too Many Requests. See also
+  http.robots.503.defer.visits.delay and
   http.robots.503.defer.visits.retries</description>
 </property>

@@ -150,7 +151,7 @@
   <name>http.robots.503.defer.visits.delay</name>
   <value>300000</value>
   <description>Time in milliseconds to suspend crawling a host if the
-  robots.txt response is HTTP 5xx - see
+  robots.txt response is HTTP 5xx or 429 Too Many Requests - see
   http.robots.503.defer.visits.</description>
 </property>

@@ -158,8 +159,8 @@
   <name>http.robots.503.defer.visits.retries</name>
   <value>3</value>
   <description>Number of retries crawling a host if the robots.txt
-  response is HTTP 5xx - see http.robots.503.defer.visits. After n
-  retries the host queue is dropped for this segment/cycle.
+  response is HTTP 5xx or 429 - see http.robots.503.defer.visits.
+  After n retries the host queue is dropped for this segment/cycle.
   </description>
 </property>

diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 8d7263e3e..ec5e77e43 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -229,7 +229,8 @@ public class HttpRobotRulesParser extends RobotRulesParser {
     else if ((code == 403) && (!allowForbidden))
       robotRules = FORBID_ALL_RULES; // use forbid all

-    else if (code >= 500) {
+    else if (code >= 500 || code == 429) {
+      // 5xx server errors or 429 Too Many Requests
       cacheRule = false; // try again later to fetch robots.txt
       if (deferVisits503) {
         // signal fetcher to suspend crawling for this host
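
For readers who want to see the effect of the change in isolation, the following is a minimal, self-contained Java sketch of the status-code handling after this commit. The class, enum, and method names (RobotsResponsePolicy, Action, classify) are hypothetical and only illustrate the decision logic; they are not part of the Nutch API, which handles this inside HttpRobotRulesParser as shown in the diff above.

public class RobotsResponsePolicy {

  public enum Action {
    USE_RULES,        // 2xx: parse and cache the fetched robots.txt
    FORBID_ALL,       // 403 (when forbidden responses are not allowed): block the host
    DEFER_AND_RETRY,  // 5xx or 429: do not cache, temporarily suspend the host
    ALLOW_ALL         // other codes (e.g. 404): no robots.txt, allow all
  }

  /**
   * Mirrors the branch changed in HttpRobotRulesParser: HTTP 429
   * "Too Many Requests" is treated like a 5xx server error, i.e. the
   * result is not cached and fetching from the host is deferred
   * (see http.robots.503.defer.visits).
   */
  public static Action classify(int code, boolean allowForbidden) {
    if (code >= 200 && code < 300) {
      return Action.USE_RULES;
    } else if (code == 403 && !allowForbidden) {
      return Action.FORBID_ALL;
    } else if (code >= 500 || code == 429) {
      // the new condition: 429 joins the server-error branch
      return Action.DEFER_AND_RETRY;
    }
    return Action.ALLOW_ALL;
  }

  public static void main(String[] args) {
    System.out.println(classify(429, false)); // DEFER_AND_RETRY (new behavior)
    System.out.println(classify(503, false)); // DEFER_AND_RETRY
    System.out.println(classify(403, false)); // FORBID_ALL
    System.out.println(classify(404, false)); // ALLOW_ALL
  }
}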