This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new ecdd19dbd  NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779)
ecdd19dbd is described below

commit ecdd19dbdd4424bf9b9bce206f23992140ee43fe
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Sat Oct 21 15:53:25 2023 +0200

    NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779)

    - follow multiple redirects when fetching robots.txt
    - number of followed redirects is configurable by the property
      http.robots.redirect.max (default: 5)

    Improvements to RobotRulesParser's robots.txt test utility
    - bug fix: the passed agent names need to be transferred to the property
      http.robots.agents earlier, before the protocol plugins are configured
    - more verbose debug logging
---
 conf/nutch-default.xml                             |  10 ++
 .../apache/nutch/protocol/RobotRulesParser.java    |  32 +++--
 .../protocol/http/api/HttpRobotRulesParser.java    | 141 ++++++++++++++++-----
 3 files changed, 143 insertions(+), 40 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 58455b338..18ed56b03 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -163,6 +163,16 @@
   </description>
 </property>
 
+<property>
+  <name>http.robots.redirect.max</name>
+  <value>5</value>
+  <description>Maximum number of redirects followed when fetching
+  a robots.txt file. RFC 9309 specifies that "crawlers SHOULD
+  follow at least five consecutive redirects, even across authorities
+  (for example, hosts in the case of HTTP)."
+  </description>
+</property>
+
 <property>
   <name>http.agent.description</name>
   <value></value>
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 562c2c694..d73c07506 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -98,6 +98,7 @@ public abstract class RobotRulesParser implements Tool {
 
   protected Configuration conf;
   protected Set<String> agentNames;
+  protected int maxNumRedirects = 5;
 
   /** set of host names or IPs to be explicitly excluded from robots.txt checking */
   protected Set<String> allowList = new HashSet<>();
@@ -149,6 +150,10 @@ public abstract class RobotRulesParser implements Tool {
         }
       }
     }
+    LOG.info("Checking robots.txt for the following agent names: {}", agentNames);
+
+    maxNumRedirects = conf.getInt("http.robots.redirect.max", 5);
+    LOG.info("Following max. {} robots.txt redirects", maxNumRedirects);
 
     String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
     if (confAllowList == null) {
@@ -294,8 +299,11 @@
        "",
        "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
        "\tIf <robots-file-or-url> starts with a protocol specification",
-       "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched",
-       "\tusing the specified protocol. Otherwise, a local file is assumed.",
+       "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path",
+       "\tand query are removed and the path \"/robots.txt\" is appended.",
+       "\tThe resulting URL (the canonical robots.txt location) is then",
+       "\tfetched using the specified protocol.",
+       "\tIf the URL does not include a protocol, a local file is assumed.",
        "",
        "<url-file>\tlocal file with URLs (one per line), for every URL",
        "\tthe path part (including the query) is checked whether",
@@ -323,6 +331,16 @@ public abstract class RobotRulesParser implements Tool {
       return -1;
     }
 
+    if (args.length > 2) {
+      // set agent name from command-line in configuration
+      // Note: when fetching via protocol this must be done
+      // before the protocol is configured
+      String agents = args[2];
+      conf.set("http.robots.agents", agents);
+      conf.set("http.agent.name", agents.split(",")[0]);
+      setConf(conf);
+    }
+
     Protocol protocol = null;
     URL robotsTxtUrl = null;
     if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
@@ -334,6 +352,7 @@ public abstract class RobotRulesParser implements Tool {
       ProtocolFactory factory = new ProtocolFactory(conf);
       try {
         protocol = factory.getProtocol(robotsTxtUrl);
+        LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass());
       } catch (ProtocolNotFound e) {
         LOG.error("No protocol found for {}: {}", args[0],
             StringUtils.stringifyException(e));
@@ -357,14 +376,6 @@ public abstract class RobotRulesParser implements Tool {
 
     File urlFile = new File(args[1]);
 
-    if (args.length > 2) {
-      // set agent name from command-line in configuration and update parser
-      String agents = args[2];
-      conf.set("http.robots.agents", agents);
-      conf.set("http.agent.name", agents.split(",")[0]);
-      setConf(conf);
-    }
-
     List<Content> robotsTxtContent = null;
     if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
       robotsTxtContent = new LinkedList<>();
@@ -373,6 +384,7 @@ public abstract class RobotRulesParser implements Tool {
     try {
       BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl,
           robotsTxtContent);
+      LOG.debug("Robots.txt rules:\n{}", rules);
 
       if (robotsTxtContent != null) {
         for (Content robotsTxt : robotsTxtContent) {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index db09a0c88..8d7263e3e 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -17,12 +17,15 @@
 package org.apache.nutch.protocol.http.api;
 
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
@@ -87,6 +90,13 @@ public class HttpRobotRulesParser extends RobotRulesParser {
    * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
    * rules are cached to avoid re-fetching and re-parsing it again.
    *
+   * <p>Following
+   * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.3.1.2">RFC
+   * 9309, section 2.3.1.2. Redirects</a>, up to five consecutive HTTP redirects
+   * are followed when fetching the robots.txt file. The max. number of
+   * redirects followed is configurable by the property
+   * <code>http.robots.redirect.max</code>.</p>
+   *
    * @param http
    *          The {@link Protocol} object
    * @param url
@@ -114,11 +124,11 @@ public class HttpRobotRulesParser extends RobotRulesParser {
     if (robotRules != null) {
       return robotRules; // cached rule
     } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss {}", url);
+      LOG.trace("Robots.txt cache miss {}", url);
     }
 
     boolean cacheRule = true;
-    URL redir = null;
+    Set<String> redirectCacheKeys = new HashSet<>();
 
     if (isAllowListed(url)) {
       // check in advance whether a host is allowlisted
@@ -129,43 +139,97 @@ public class HttpRobotRulesParser extends RobotRulesParser {
           url.getHost());
     } else {
+      URL robotsUrl = null, robotsUrlRedir = null;
       try {
-        URL robotsUrl = new URL(url, "/robots.txt");
+        robotsUrl = new URL(url, "/robots.txt");
+
+        /*
+         * Redirect counter - following redirects up to the configured maximum
+         * ("five consecutive redirects" as per RFC 9309).
+         */
+        int numRedirects = 0;
+        /*
+         * The base URL to resolve relative redirect locations is set initially
+         * to the default URL path ("/robots.txt") and updated when redirects
+         * were followed.
+         */
+        robotsUrlRedir = robotsUrl;
+
         Response response = ((HttpBase) http).getResponse(robotsUrl,
             new CrawlDatum(), true);
+        int code = response.getCode();
         if (robotsTxtContent != null) {
           addRobotsContent(robotsTxtContent, robotsUrl, response);
         }
-        // try one level of redirection ?
-        if (response.getCode() == 301 || response.getCode() == 302) {
-          String redirection = response.getHeader("Location");
-          if (redirection == null) {
-            // some versions of MS IIS are known to mangle this header
-            redirection = response.getHeader("location");
+
+        while (isRedirect(code) && numRedirects < maxNumRedirects) {
+          numRedirects++;
+
+          String redirectionLocation = response.getHeader("Location");
+          if (StringUtils.isNotBlank(redirectionLocation)) {
+            LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir,
+                redirectionLocation);
+            try {
+              robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
+            } catch (MalformedURLException e) {
+              LOG.info(
+                  "Failed to resolve redirect location for robots.txt: {} -> {} ({})",
+                  robotsUrlRedir, redirectionLocation, e.getMessage());
+              break;
+            }
+            response = ((HttpBase) http).getResponse(robotsUrlRedir,
+                new CrawlDatum(), true);
+            code = response.getCode();
+            if (robotsTxtContent != null) {
+              addRobotsContent(robotsTxtContent, robotsUrlRedir, response);
+            }
+          } else {
+            LOG.info(
+                "No HTTP redirect Location header for robots.txt: {} (status code: {})",
+                robotsUrlRedir, code);
+            break;
           }
-          if (redirection != null) {
-            if (!redirection.startsWith("http")) {
-              // RFC says it should be absolute, but apparently it isn't
-              redir = new URL(url, redirection);
+
+          if ("/robots.txt".equals(robotsUrlRedir.getFile())) {
+            /*
+             * If a redirect points to a path /robots.txt on a different host
+             * (or a different authority scheme://host:port/, in general), we
+             * can lookup the cache for cached rules from the target host.
+             */
+            String redirectCacheKey = getCacheKey(robotsUrlRedir);
+            robotRules = CACHE.get(redirectCacheKey);
+            LOG.debug(
+                "Found cached robots.txt rules for {} (redirected to {}) under target key {}",
+                url, robotsUrlRedir, redirectCacheKey);
+            if (robotRules != null) {
+              /* If found, cache and return the rules for the source host. */
+              CACHE.put(cacheKey, robotRules);
+              return robotRules;
             } else {
-              redir = new URL(redirection);
+              /*
+               * Remember the target host/authority, we can cache the rules,
+               * too.
+               */
+              redirectCacheKeys.add(redirectCacheKey);
             }
+          }
 
-            response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
-                true);
-            if (robotsTxtContent != null) {
-              addRobotsContent(robotsTxtContent, redir, response);
-            }
+          if (numRedirects == maxNumRedirects && isRedirect(code)) {
+            LOG.info(
+                "Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)",
+                url);
           }
         }
 
-        if (response.getCode() == 200) // found rules: parse them
+        LOG.debug("Fetched robots.txt for {} with status code {}", url, code);
+        if (code == 200) // found rules: parse them
           robotRules = parseRules(url.toString(), response.getContent(),
               response.getHeader("Content-Type"), agentNames);
 
-        else if ((response.getCode() == 403) && (!allowForbidden))
+        else if ((code == 403) && (!allowForbidden))
          robotRules = FORBID_ALL_RULES; // use forbid all
-        else if (response.getCode() >= 500) {
+
+        else if (code >= 500) {
           cacheRule = false; // try again later to fetch robots.txt
           if (deferVisits503) {
             // signal fetcher to suspend crawling for this host
@@ -177,8 +241,15 @@ public class HttpRobotRulesParser extends RobotRulesParser {
           robotRules = EMPTY_RULES; // use default rules
         }
       } catch (Throwable t) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        if (robotsUrl == null || robotsUrlRedir == null) {
+          LOG.info("Couldn't get robots.txt for {}", url, t);
+        } else if (robotsUrl.equals(robotsUrlRedir)) {
+          LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl,
+              t);
+        } else {
+          LOG.info(
+              "Couldn't get redirected robots.txt for {} (redirected to {}): {}",
+              url, robotsUrlRedir, t);
         }
         cacheRule = false; // try again later to fetch robots.txt
         robotRules = EMPTY_RULES;
@@ -187,17 +258,27 @@ public class HttpRobotRulesParser extends RobotRulesParser {
 
     if (cacheRule) {
       CACHE.put(cacheKey, robotRules); // cache rules for host
-      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())
-          && "/robots.txt".equals(redir.getFile())) {
-        // cache also for the redirected host
-        // if the URL path is /robots.txt
-        CACHE.put(getCacheKey(redir), robotRules);
+      for (String redirectCacheKey : redirectCacheKeys) {
+        /*
+         * and also for redirect target hosts where URL path and query were
+         * found to be "/robots.txt"
+         */
+        CACHE.put(redirectCacheKey, robotRules);
       }
     }
 
     return robotRules;
   }
 
+  /**
+   * @param code
+   *          HTTP response status code
+   * @return whether the status code signals a redirect to a different location
+   */
+  private boolean isRedirect(int code) {
+    return (code == 301 || code == 302 || code == 303 || code == 307
+        || code == 308);
+  }
+
   /**
    * Append {@link Content} of robots.txt to {@literal robotsTxtContent}
    *
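
As context for the change above: the loop added to HttpRobotRulesParser keeps
fetching while the status code is a redirect (301, 302, 303, 307, 308) and the
redirect count is below http.robots.redirect.max, resolving relative Location
headers against the previous URL. The following self-contained sketch shows
the same RFC 9309 logic using plain java.net.http (Java 11+); the class name,
example URL, and use of HttpClient are illustrative assumptions only -- the
commit itself fetches through HttpBase.getResponse() and consults the per-host
rules cache shown in the diff.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Optional;

public class RobotsRedirectSketch {

  // RFC 9309: crawlers SHOULD follow at least five consecutive redirects
  private static final int MAX_REDIRECTS = 5;

  // same status codes as the isRedirect() helper added in this commit
  private static boolean isRedirect(int code) {
    return code == 301 || code == 302 || code == 303 || code == 307
        || code == 308;
  }

  public static void main(String[] args) throws Exception {
    // follow redirects manually to enforce our own limit
    HttpClient client = HttpClient.newBuilder()
        .followRedirects(HttpClient.Redirect.NEVER).build();

    URI robotsUri = URI.create("https://www.example.org/robots.txt");
    HttpResponse<String> response = client.send(
        HttpRequest.newBuilder(robotsUri).GET().build(),
        HttpResponse.BodyHandlers.ofString());

    int numRedirects = 0;
    while (isRedirect(response.statusCode()) && numRedirects < MAX_REDIRECTS) {
      numRedirects++;
      Optional<String> location = response.headers().firstValue("Location");
      if (location.isEmpty()) {
        break; // no Location header, nothing to follow
      }
      // resolve relative redirect locations against the current URI
      robotsUri = robotsUri.resolve(location.get());
      response = client.send(HttpRequest.newBuilder(robotsUri).GET().build(),
          HttpResponse.BodyHandlers.ofString());
    }

    if (isRedirect(response.statusCode())) {
      // still redirected after the maximum: treat as unavailable (allow all)
      System.out.println("Too many robots.txt redirects for " + robotsUri);
    } else {
      System.out.println(
          "robots.txt fetched with status " + response.statusCode());
    }
  }
}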
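A short usage note: the new property can be adjusted like any other Nutch
setting, either in conf/nutch-site.xml or programmatically before the parser
is configured. A minimal sketch, assuming the usual NutchConfiguration entry
point (the override value 10 is only an example):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class RedirectMaxConfigSketch {
  public static void main(String[] args) {
    // loads nutch-default.xml and nutch-site.xml from the classpath
    Configuration conf = NutchConfiguration.create();

    // override the default of 5 introduced by this commit
    conf.setInt("http.robots.redirect.max", 10);

    // RobotRulesParser.setConf() reads the property the same way
    System.out.println("Following max. "
        + conf.getInt("http.robots.redirect.max", 5)
        + " robots.txt redirects");
  }
}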