This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new ecdd19dbd NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779)
ecdd19dbd is described below
commit ecdd19dbdd4424bf9b9bce206f23992140ee43fe
Author: Sebastian Nagel <[email protected]>
AuthorDate: Sat Oct 21 15:53:25 2023 +0200
NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779)
- follow multiple redirects when fetching robots.txt
- number of followed redirects is configurable by the property http.robots.redirect.max (default: 5); see the configuration example below
Improvements to RobotRulesParser's robots.txt test utility
- bug fix: the passed agent names need to be transferred
to the property http.robots.agents earlier, before the
protocol plugins are configured
- more verbose debug logging
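The new limit is read from the Nutch configuration, so it can be overridden in nutch-site.xml like any other property. A minimal sketch (the value 8 is illustrative only and not part of this commit):

  <property>
    <name>http.robots.redirect.max</name>
    <value>8</value>
    <description>Follow up to 8 consecutive robots.txt redirects instead
    of the default 5 (RFC 9309 asks crawlers to follow at least five
    consecutive redirects).</description>
  </property>

Setting the value to 0 would stop redirect following altogether; judging from the loop added below, a redirected robots.txt is then handled like a missing one ("assuming no robots.txt, allow all").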
---
conf/nutch-default.xml | 10 ++
.../apache/nutch/protocol/RobotRulesParser.java | 32 +++--
.../protocol/http/api/HttpRobotRulesParser.java | 141 ++++++++++++++++-----
3 files changed, 143 insertions(+), 40 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 58455b338..18ed56b03 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -163,6 +163,16 @@
</description>
</property>
+<property>
+ <name>http.robots.redirect.max</name>
+ <value>5</value>
+ <description>Maximum number of redirects followed when fetching
+ a robots.txt file. RFC 9309 specifies that "crawlers SHOULD
+ follow at least five consecutive redirects, even across authorities
+ (for example, hosts in the case of HTTP)."
+ </description>
+</property>
+
<property>
<name>http.agent.description</name>
<value></value>
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 562c2c694..d73c07506 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -98,6 +98,7 @@ public abstract class RobotRulesParser implements Tool {
protected Configuration conf;
protected Set<String> agentNames;
+ protected int maxNumRedirects = 5;
/** set of host names or IPs to be explicitly excluded from robots.txt checking */
protected Set<String> allowList = new HashSet<>();
@@ -149,6 +150,10 @@ public abstract class RobotRulesParser implements Tool {
}
}
}
+ LOG.info("Checking robots.txt for the following agent names: {}", agentNames);
+
+ maxNumRedirects = conf.getInt("http.robots.redirect.max", 5);
+ LOG.info("Following max. {} robots.txt redirects", maxNumRedirects);
String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
if (confAllowList == null) {
@@ -294,8 +299,11 @@ public abstract class RobotRulesParser implements Tool {
"",
"<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
"\tIf <robots-file-or-url> starts with a protocol specification",
- "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched",
- "\tusing the specified protocol. Otherwise, a local file is assumed.",
+ "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path",
+ "\tand query are removed and the path \"/robots.txt\" is appended.",
+ "\tThe resulting URL (the canonical robots.txt location) is then",
+ "\tfetched using the specified protocol.",
+ "\tIf the URL does not include a protocol, a local file is assumed.",
"",
"<url-file>\tlocal file with URLs (one per line), for every URL",
"\tthe path part (including the query) is checked whether",
@@ -323,6 +331,16 @@ public abstract class RobotRulesParser implements Tool {
return -1;
}
+ if (args.length > 2) {
+ // set agent name from command-line in configuration
+ // Note: when fetching via protocol this must be done
+ // before the protocol is configured
+ String agents = args[2];
+ conf.set("http.robots.agents", agents);
+ conf.set("http.agent.name", agents.split(",")[0]);
+ setConf(conf);
+ }
+
Protocol protocol = null;
URL robotsTxtUrl = null;
if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
@@ -334,6 +352,7 @@ public abstract class RobotRulesParser implements Tool {
ProtocolFactory factory = new ProtocolFactory(conf);
try {
protocol = factory.getProtocol(robotsTxtUrl);
+ LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass());
} catch (ProtocolNotFound e) {
LOG.error("No protocol found for {}: {}", args[0],
StringUtils.stringifyException(e));
@@ -357,14 +376,6 @@ public abstract class RobotRulesParser implements Tool {
File urlFile = new File(args[1]);
- if (args.length > 2) {
- // set agent name from command-line in configuration and update parser
- String agents = args[2];
- conf.set("http.robots.agents", agents);
- conf.set("http.agent.name", agents.split(",")[0]);
- setConf(conf);
- }
-
List<Content> robotsTxtContent = null;
if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
robotsTxtContent = new LinkedList<>();
@@ -373,6 +384,7 @@ public abstract class RobotRulesParser implements Tool {
try {
BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl,
robotsTxtContent);
+ LOG.debug("Robots.txt rules:\n{}", rules);
if (robotsTxtContent != null) {
for (Content robotsTxt : robotsTxtContent) {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index db09a0c88..8d7263e3e 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -17,12 +17,15 @@
package org.apache.nutch.protocol.http.api;
import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
import java.net.URL;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
+import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
@@ -87,6 +90,13 @@ public class HttpRobotRulesParser extends RobotRulesParser {
* {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
* rules are cached to avoid re-fetching and re-parsing it again.
*
+ * <p>Following
+ * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.3.1.2">RFC
+ * 9309, section 2.3.1.2. Redirects</a>, up to five consecutive HTTP redirects
+ * are followed when fetching the robots.txt file. The max. number of
+ * redirects followed is configurable by the property
+ * <code>http.robots.redirect.max</code>.</p>
+ *
* @param http
* The {@link Protocol} object
* @param url
@@ -114,11 +124,11 @@ public class HttpRobotRulesParser extends RobotRulesParser {
if (robotRules != null) {
return robotRules; // cached rule
} else if (LOG.isTraceEnabled()) {
- LOG.trace("cache miss {}", url);
+ LOG.trace("Robots.txt cache miss {}", url);
}
boolean cacheRule = true;
- URL redir = null;
+ Set<String> redirectCacheKeys = new HashSet<>();
if (isAllowListed(url)) {
// check in advance whether a host is allowlisted
@@ -129,43 +139,97 @@ public class HttpRobotRulesParser extends RobotRulesParser {
url.getHost());
} else {
+ URL robotsUrl = null, robotsUrlRedir = null;
try {
- URL robotsUrl = new URL(url, "/robots.txt");
+ robotsUrl = new URL(url, "/robots.txt");
+
+ /*
+ * Redirect counter - following redirects up to the configured maximum
+ * ("five consecutive redirects" as per RFC 9309).
+ */
+ int numRedirects = 0;
+ /*
+ * The base URL to resolve relative redirect locations is set initially
+ * to the default URL path ("/robots.txt") and updated when redirects
+ * were followed.
+ */
+ robotsUrlRedir = robotsUrl;
+
Response response = ((HttpBase) http).getResponse(robotsUrl,
new CrawlDatum(), true);
+ int code = response.getCode();
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, robotsUrl, response);
}
- // try one level of redirection ?
- if (response.getCode() == 301 || response.getCode() == 302) {
- String redirection = response.getHeader("Location");
- if (redirection == null) {
- // some versions of MS IIS are known to mangle this header
- redirection = response.getHeader("location");
+
+ while (isRedirect(code) && numRedirects < maxNumRedirects) {
+ numRedirects++;
+
+ String redirectionLocation = response.getHeader("Location");
+ if (StringUtils.isNotBlank(redirectionLocation)) {
+ LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir,
+ redirectionLocation);
+ try {
+ robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
+ } catch (MalformedURLException e) {
+ LOG.info(
+ "Failed to resolve redirect location for robots.txt: {} -> {} ({})",
+ robotsUrlRedir, redirectionLocation, e.getMessage());
+ break;
+ }
+ response = ((HttpBase) http).getResponse(robotsUrlRedir,
+ new CrawlDatum(), true);
+ code = response.getCode();
+ if (robotsTxtContent != null) {
+ addRobotsContent(robotsTxtContent, robotsUrlRedir, response);
+ }
+ } else {
+ LOG.info(
+ "No HTTP redirect Location header for robots.txt: {} (status code: {})",
+ robotsUrlRedir, code);
+ break;
}
- if (redirection != null) {
- if (!redirection.startsWith("http")) {
- // RFC says it should be absolute, but apparently it isn't
- redir = new URL(url, redirection);
+
+ if ("/robots.txt".equals(robotsUrlRedir.getFile())) {
+ /*
+ * If a redirect points to a path /robots.txt on a different host
+ * (or a different authority scheme://host:port/, in general), we
+ * can lookup the cache for cached rules from the target host.
+ */
+ String redirectCacheKey = getCacheKey(robotsUrlRedir);
+ robotRules = CACHE.get(redirectCacheKey);
+ LOG.debug(
+ "Found cached robots.txt rules for {} (redirected to {}) under target key {}",
+ url, robotsUrlRedir, redirectCacheKey);
+ if (robotRules != null) {
+ /* If found, cache and return the rules for the source host. */
+ CACHE.put(cacheKey, robotRules);
+ return robotRules;
} else {
- redir = new URL(redirection);
+ /*
+ * Remember the target host/authority, we can cache the rules,
+ * too.
+ */
+ redirectCacheKeys.add(redirectCacheKey);
}
+ }
- response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
- true);
- if (robotsTxtContent != null) {
- addRobotsContent(robotsTxtContent, redir, response);
- }
+ if (numRedirects == maxNumRedirects && isRedirect(code)) {
+ LOG.info(
+ "Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)",
+ url);
}
}
- if (response.getCode() == 200) // found rules: parse them
+ LOG.debug("Fetched robots.txt for {} with status code {}", url, code);
+ if (code == 200) // found rules: parse them
robotRules = parseRules(url.toString(), response.getContent(),
response.getHeader("Content-Type"), agentNames);
- else if ((response.getCode() == 403) && (!allowForbidden))
+ else if ((code == 403) && (!allowForbidden))
robotRules = FORBID_ALL_RULES; // use forbid all
- else if (response.getCode() >= 500) {
+
+ else if (code >= 500) {
cacheRule = false; // try again later to fetch robots.txt
if (deferVisits503) {
// signal fetcher to suspend crawling for this host
@@ -177,8 +241,15 @@ public class HttpRobotRulesParser extends RobotRulesParser {
robotRules = EMPTY_RULES; // use default rules
}
} catch (Throwable t) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+ if (robotsUrl == null || robotsUrlRedir == null) {
+ LOG.info("Couldn't get robots.txt for {}", url, t);
+ } else if (robotsUrl.equals(robotsUrlRedir)) {
+ LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl,
+ t);
+ } else {
+ LOG.info(
+ "Couldn't get redirected robots.txt for {} (redirected to {}): {}",
+ url, robotsUrlRedir, t);
}
cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
@@ -187,17 +258,27 @@ public class HttpRobotRulesParser extends RobotRulesParser {
if (cacheRule) {
CACHE.put(cacheKey, robotRules); // cache rules for host
- if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())
- && "/robots.txt".equals(redir.getFile())) {
- // cache also for the redirected host
- // if the URL path is /robots.txt
- CACHE.put(getCacheKey(redir), robotRules);
+ for (String redirectCacheKey : redirectCacheKeys) {
+ /*
+ * and also for redirect target hosts where URL path and query were
+ * found to be "/robots.txt"
+ */
+ CACHE.put(redirectCacheKey, robotRules);
}
}
return robotRules;
}
+ /**
+ * @param code
+ * HTTP response status code
+ * @return whether the status code signals a redirect to a different location
+ */
+ private boolean isRedirect(int code) {
+ return (code == 301 || code == 302 || code == 303 || code == 307 || code == 308);
+ }
+
/**
* Append {@link Content} of robots.txt to {@literal robotsTxtContent}
*