This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new ecdd19dbd  NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779)
ecdd19dbd is described below

commit ecdd19dbdd4424bf9b9bce206f23992140ee43fe
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Sat Oct 21 15:53:25 2023 +0200

    NUTCH-2990 HttpRobotRulesParser to follow 5 redirects as specified by RFC 9309 (#779)

    - follow multiple redirects when fetching robots.txt
    - number of followed redirects is configurable by the property
      http.robots.redirect.max (default: 5)

    Improvements to RobotRulesParser's robots.txt test utility
    - bug fix: the passed agent names need to be transferred to the property
      http.robots.agents earlier, before the protocol plugins are configured
    - more verbose debug logging
---
 conf/nutch-default.xml                             |  10 ++
 .../apache/nutch/protocol/RobotRulesParser.java    |  32 +++--
 .../protocol/http/api/HttpRobotRulesParser.java    | 141 ++++++++++++++++-----
 3 files changed, 143 insertions(+), 40 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 58455b338..18ed56b03 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -163,6 +163,16 @@
   </description>
 </property>
 
+<property>
+  <name>http.robots.redirect.max</name>
+  <value>5</value>
+  <description>Maximum number of redirects followed when fetching
+  a robots.txt file. RFC 9309 specifies that "crawlers SHOULD
+  follow at least five consecutive redirects, even across authorities
+  (for example, hosts in the case of HTTP)."
+  </description>
+</property>
+
 <property>
   <name>http.agent.description</name>
   <value></value>
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 562c2c694..d73c07506 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -98,6 +98,7 @@ public abstract class RobotRulesParser implements Tool {
 
   protected Configuration conf;
   protected Set<String> agentNames;
+  protected int maxNumRedirects = 5;
 
   /** set of host names or IPs to be explicitly excluded from robots.txt checking */
   protected Set<String> allowList = new HashSet<>();
@@ -149,6 +150,10 @@ public abstract class RobotRulesParser implements Tool {
         }
       }
     }
+    LOG.info("Checking robots.txt for the following agent names: {}", agentNames);
+
+    maxNumRedirects = conf.getInt("http.robots.redirect.max", 5);
+    LOG.info("Following max. {} robots.txt redirects", maxNumRedirects);
 
     String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
     if (confAllowList == null) {
@@ -294,8 +299,11 @@
        "",
        "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
        "\tIf <robots-file-or-url> starts with a protocol specification",
-       "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched",
-       "\tusing the specified protocol. Otherwise, a local file is assumed.",
+       "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path",
+       "\tand query are removed and the path \"/robots.txt\" is appended.",
+       "\tThe resulting URL (the canonical robots.txt location) is then",
+       "\tfetched using the specified protocol.",
+       "\tIf the URL does not include a protocol, a local file is assumed.",
        "",
        "<url-file>\tlocal file with URLs (one per line), for every URL",
        "\tthe path part (including the query) is checked whether",
@@ -323,6 +331,16 @@ public abstract class RobotRulesParser implements Tool {
       return -1;
     }
 
+    if (args.length > 2) {
+      // set agent name from command-line in configuration
+      // Note: when fetching via protocol this must be done
+      // before the protocol is configured
+      String agents = args[2];
+      conf.set("http.robots.agents", agents);
+      conf.set("http.agent.name", agents.split(",")[0]);
+      setConf(conf);
+    }
+
     Protocol protocol = null;
     URL robotsTxtUrl = null;
     if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
@@ -334,6 +352,7 @@ public abstract class RobotRulesParser implements Tool {
       ProtocolFactory factory = new ProtocolFactory(conf);
       try {
         protocol = factory.getProtocol(robotsTxtUrl);
+        LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass());
       } catch (ProtocolNotFound e) {
         LOG.error("No protocol found for {}: {}", args[0],
             StringUtils.stringifyException(e));
@@ -357,14 +376,6 @@ public abstract class RobotRulesParser implements Tool {
 
     File urlFile = new File(args[1]);
 
-    if (args.length > 2) {
-      // set agent name from command-line in configuration and update parser
-      String agents = args[2];
-      conf.set("http.robots.agents", agents);
-      conf.set("http.agent.name", agents.split(",")[0]);
-      setConf(conf);
-    }
-
     List<Content> robotsTxtContent = null;
     if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
       robotsTxtContent = new LinkedList<>();
@@ -373,6 +384,7 @@ public abstract class RobotRulesParser implements Tool {
     try {
       BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl,
           robotsTxtContent);
+      LOG.debug("Robots.txt rules:\n{}", rules);
 
       if (robotsTxtContent != null) {
         for (Content robotsTxt : robotsTxtContent) {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index db09a0c88..8d7263e3e 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -17,12 +17,15 @@
 package org.apache.nutch.protocol.http.api;
 
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
@@ -87,6 +90,13 @@ public class HttpRobotRulesParser extends RobotRulesParser {
    * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
    * rules are cached to avoid re-fetching and re-parsing it again.
    *
+   * <p>Following
+   * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.3.1.2">RFC
+   * 9309, section 2.3.1.2. Redirects</a>, up to five consecutive HTTP redirects
+   * are followed when fetching the robots.txt file. The max. number of
+   * redirects followed is configurable by the property
+   * <code>http.robots.redirect.max</code>.</p>
+   *
    * @param http
    *          The {@link Protocol} object
    * @param url
@@ -114,11 +124,11 @@ public class HttpRobotRulesParser extends RobotRulesParser {
     if (robotRules != null) {
       return robotRules; // cached rule
     } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss {}", url);
+      LOG.trace("Robots.txt cache miss {}", url);
     }
 
     boolean cacheRule = true;
-    URL redir = null;
+    Set<String> redirectCacheKeys = new HashSet<>();
 
     if (isAllowListed(url)) {
       // check in advance whether a host is allowlisted
@@ -129,43 +139,97 @@ public class HttpRobotRulesParser extends RobotRulesParser {
           url.getHost());
     } else {
+      URL robotsUrl = null, robotsUrlRedir = null;
       try {
-        URL robotsUrl = new URL(url, "/robots.txt");
+        robotsUrl = new URL(url, "/robots.txt");
+
+        /*
+         * Redirect counter - following redirects up to the configured maximum
+         * ("five consecutive redirects" as per RFC 9309).
+         */
+        int numRedirects = 0;
+        /*
+         * The base URL to resolve relative redirect locations is set initially
+         * to the default URL path ("/robots.txt") and updated when redirects
+         * were followed.
+         */
+        robotsUrlRedir = robotsUrl;
+
         Response response = ((HttpBase) http).getResponse(robotsUrl,
             new CrawlDatum(), true);
+        int code = response.getCode();
         if (robotsTxtContent != null) {
           addRobotsContent(robotsTxtContent, robotsUrl, response);
         }
-        // try one level of redirection ?
-        if (response.getCode() == 301 || response.getCode() == 302) {
-          String redirection = response.getHeader("Location");
-          if (redirection == null) {
-            // some versions of MS IIS are known to mangle this header
-            redirection = response.getHeader("location");
+
+        while (isRedirect(code) && numRedirects < maxNumRedirects) {
+          numRedirects++;
+
+          String redirectionLocation = response.getHeader("Location");
+          if (StringUtils.isNotBlank(redirectionLocation)) {
+            LOG.debug("Following robots.txt redirect: {} -> {}", robotsUrlRedir,
+                redirectionLocation);
+            try {
+              robotsUrlRedir = new URL(robotsUrlRedir, redirectionLocation);
+            } catch (MalformedURLException e) {
+              LOG.info(
+                  "Failed to resolve redirect location for robots.txt: {} -> {} ({})",
+                  robotsUrlRedir, redirectionLocation, e.getMessage());
+              break;
+            }
+            response = ((HttpBase) http).getResponse(robotsUrlRedir,
+                new CrawlDatum(), true);
+            code = response.getCode();
+            if (robotsTxtContent != null) {
+              addRobotsContent(robotsTxtContent, robotsUrlRedir, response);
+            }
+          } else {
+            LOG.info(
+                "No HTTP redirect Location header for robots.txt: {} (status code: {})",
+                robotsUrlRedir, code);
+            break;
           }
-          if (redirection != null) {
-            if (!redirection.startsWith("http")) {
-              // RFC says it should be absolute, but apparently it isn't
-              redir = new URL(url, redirection);
+
+          if ("/robots.txt".equals(robotsUrlRedir.getFile())) {
+            /*
+             * If a redirect points to a path /robots.txt on a different host
+             * (or a different authority scheme://host:port/, in general), we
+             * can lookup the cache for cached rules from the target host.
+             */
+            String redirectCacheKey = getCacheKey(robotsUrlRedir);
+            robotRules = CACHE.get(redirectCacheKey);
+            LOG.debug(
+                "Found cached robots.txt rules for {} (redirected to {}) under target key {}",
+                url, robotsUrlRedir, redirectCacheKey);
+            if (robotRules != null) {
+              /* If found, cache and return the rules for the source host. */
+              CACHE.put(cacheKey, robotRules);
+              return robotRules;
             } else {
-              redir = new URL(redirection);
+              /*
+               * Remember the target host/authority, we can cache the rules,
+               * too.
+               */
+              redirectCacheKeys.add(redirectCacheKey);
             }
+          }
 
-            response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
-                true);
-            if (robotsTxtContent != null) {
-              addRobotsContent(robotsTxtContent, redir, response);
-            }
+          if (numRedirects == maxNumRedirects && isRedirect(code)) {
+            LOG.info(
+                "Reached maximum number of robots.txt redirects for {} (assuming no robots.txt, allow all)",
+                url);
           }
         }
 
-        if (response.getCode() == 200) // found rules: parse them
+        LOG.debug("Fetched robots.txt for {} with status code {}", url, code);
+        if (code == 200) // found rules: parse them
           robotRules = parseRules(url.toString(), response.getContent(),
               response.getHeader("Content-Type"), agentNames);
 
-        else if ((response.getCode() == 403) && (!allowForbidden))
+        else if ((code == 403) && (!allowForbidden))
          robotRules = FORBID_ALL_RULES; // use forbid all
-        else if (response.getCode() >= 500) {
+
+        else if (code >= 500) {
           cacheRule = false; // try again later to fetch robots.txt
           if (deferVisits503) {
             // signal fetcher to suspend crawling for this host
@@ -177,8 +241,15 @@ public class HttpRobotRulesParser extends RobotRulesParser {
           robotRules = EMPTY_RULES; // use default rules
         }
       } catch (Throwable t) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        if (robotsUrl == null || robotsUrlRedir == null) {
+          LOG.info("Couldn't get robots.txt for {}", url, t);
+        } else if (robotsUrl.equals(robotsUrlRedir)) {
+          LOG.info("Couldn't get robots.txt for {} ({}): {}", url, robotsUrl,
+              t);
+        } else {
+          LOG.info(
+              "Couldn't get redirected robots.txt for {} (redirected to {}): {}",
+              url, robotsUrlRedir, t);
         }
         cacheRule = false; // try again later to fetch robots.txt
         robotRules = EMPTY_RULES;
@@ -187,17 +258,27 @@ public class HttpRobotRulesParser extends RobotRulesParser {
 
     if (cacheRule) {
       CACHE.put(cacheKey, robotRules); // cache rules for host
-      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())
-          && "/robots.txt".equals(redir.getFile())) {
-        // cache also for the redirected host
-        // if the URL path is /robots.txt
-        CACHE.put(getCacheKey(redir), robotRules);
+      for (String redirectCacheKey : redirectCacheKeys) {
+        /*
+         * and also for redirect target hosts where URL path and query were
+         * found to be "/robots.txt"
+         */
+        CACHE.put(redirectCacheKey, robotRules);
       }
     }
 
     return robotRules;
   }
 
+  /**
+   * @param code
+   *          HTTP response status code
+   * @return whether the status code signals a redirect to a different location
+   */
+  private boolean isRedirect(int code) {
+    return (code == 301 || code == 302 || code == 303 || code == 307
+        || code == 308);
+  }
+
   /**
    * Append {@link Content} of robots.txt to {@literal robotsTxtContent}
    *
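
As context for the change above: the loop added to HttpRobotRulesParser keeps
fetching while the status code is a redirect (301, 302, 303, 307, 308) and the
redirect count is below http.robots.redirect.max, resolving relative Location
headers against the previous URL. The following self-contained sketch shows
the same RFC 9309 logic using plain java.net.http (Java 11+); the class name,
example URL, and use of HttpClient are illustrative assumptions only -- the
commit itself fetches through HttpBase.getResponse() and consults the per-host
rules cache shown in the diff.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Optional;

public class RobotsRedirectSketch {

  // RFC 9309: crawlers SHOULD follow at least five consecutive redirects
  private static final int MAX_REDIRECTS = 5;

  // same status codes as the isRedirect() helper added in this commit
  private static boolean isRedirect(int code) {
    return code == 301 || code == 302 || code == 303 || code == 307
        || code == 308;
  }

  public static void main(String[] args) throws Exception {
    // follow redirects manually to enforce our own limit
    HttpClient client = HttpClient.newBuilder()
        .followRedirects(HttpClient.Redirect.NEVER).build();

    URI robotsUri = URI.create("https://www.example.org/robots.txt");
    HttpResponse<String> response = client.send(
        HttpRequest.newBuilder(robotsUri).GET().build(),
        HttpResponse.BodyHandlers.ofString());

    int numRedirects = 0;
    while (isRedirect(response.statusCode()) && numRedirects < MAX_REDIRECTS) {
      numRedirects++;
      Optional<String> location = response.headers().firstValue("Location");
      if (location.isEmpty()) {
        break; // no Location header, nothing to follow
      }
      // resolve relative redirect locations against the current URI
      robotsUri = robotsUri.resolve(location.get());
      response = client.send(HttpRequest.newBuilder(robotsUri).GET().build(),
          HttpResponse.BodyHandlers.ofString());
    }

    if (isRedirect(response.statusCode())) {
      // still redirected after the maximum: treat as unavailable (allow all)
      System.out.println("Too many robots.txt redirects for " + robotsUri);
    } else {
      System.out.println(
          "robots.txt fetched with status " + response.statusCode());
    }
  }
}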
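A short usage note: the new property can be adjusted like any other Nutch
setting, either in conf/nutch-site.xml or programmatically before the parser
is configured. A minimal sketch, assuming the usual NutchConfiguration entry
point (the override value 10 is only an example):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class RedirectMaxConfigSketch {
  public static void main(String[] args) {
    // loads nutch-default.xml and nutch-site.xml from the classpath
    Configuration conf = NutchConfiguration.create();

    // override the default of 5 introduced by this commit
    conf.setInt("http.robots.redirect.max", 10);

    // RobotRulesParser.setConf() reads the property the same way
    System.out.println("Following max. "
        + conf.getInt("http.robots.redirect.max", 5)
        + " robots.txt redirects");
  }
}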