This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new 070c115cf  NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4
070c115cf is described below

commit 070c115cfadbc937a8ad0add6447461983e92028
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Tue Aug 22 11:39:22 2023 +0200

    NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4

    - split and lowercase agent names (if multiple) at configuration time
      and pass as collection to SimpleRobotRulesParser
    - update RobotRulesParser command-line help
    - update unit tests to use new API
    - update description of Nutch properties to reflect the changes due to
      the usage of the new API entry point and the upgrade to crawler-commons 1.4
---
 conf/nutch-default.xml                             | 34 +++++----
 .../apache/nutch/protocol/RobotRulesParser.java    | 71 +++++++++++++-----
 .../protocol/http/api/TestRobotRulesParser.java    | 87 ++++++++++++++++------
 3 files changed, 135 insertions(+), 57 deletions(-)
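As a quick illustration of the key API change (a sketch, not code from this
commit): crawler-commons 1.4 deprecates the entry point taking a
comma-separated agent-name string in favour of one taking a collection of
lowercased names. The robots.txt content, URL, and agent name "mybot" below
are invented for the example, assuming crawler-commons 1.4 on the classpath:

    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class ParseRulesSketch {
      public static void main(String[] args) {
        // Invented robots.txt content for the example
        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Disallow: /private/").getBytes();

        // New crawler-commons 1.4 entry point: agent names are passed as a
        // collection and are expected to be lowercase.
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("mybot"));

        System.out.println(rules.isAllowed("http://example.com/private/x")); // false
        System.out.println(rules.isAllowed("http://example.com/index.html")); // true
      }
    }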
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 379b5ef5d..e98bd5570 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -72,9 +72,18 @@
 <property>
   <name>http.agent.name</name>
   <value></value>
-  <description>HTTP 'User-Agent' request header. MUST NOT be empty -
+  <description>'User-Agent' name: a single word uniquely identifying your crawler.
+
+  The value is used to select the group of robots.txt rules addressing your
+  crawler. It is also sent as part of the HTTP 'User-Agent' request header.
+
+  This property MUST NOT be empty -
   please set this to a single word uniquely related to your organization.
 
+  Following RFC 9309 the 'User-Agent' name (aka. 'product token')
+  "MUST contain only uppercase and lowercase letters ('a-z' and
+  'A-Z'), underscores ('_'), and hyphens ('-')."
+
   NOTE: You should also check other related properties:
@@ -84,7 +93,6 @@ http.agent.version
 
   and set their values appropriately.
-
   </description>
 </property>
@@ -95,13 +103,13 @@
   parser would look for in robots.txt. Multiple agents can be provided using
   comma as a delimiter. eg. mybot,foo-spider,bar-crawler
 
-  The ordering of agents does NOT matter and the robots parser would make
-  decision based on the agent which matches first to the robots rules.
-  Also, there is NO need to add a wildcard (ie. "*") to this string as the
-  robots parser would smartly take care of a no-match situation.
+  The ordering of agents does NOT matter and the robots.txt parser combines
+  all rules to any of the agent names. Also, there is NO need to add
+  a wildcard (ie. "*") to this string as the robots parser would smartly
+  take care of a no-match situation.
 
   If no value is specified, by default HTTP agent (ie. 'http.agent.name')
-  would be used for user agent matching by the robots parser.
+  is used for user-agent matching by the robots parser.
   </description>
 </property>
@@ -166,9 +174,9 @@
 <property>
   <name>http.agent.url</name>
   <value></value>
-  <description>A URL to advertise in the User-Agent header.  This will
+  <description>A URL to advertise in the User-Agent header. This will
   appear in parenthesis after the agent name. Custom dictates that this
-  should be a URL of a page explaining the purpose and behavior of this
+  should be a URL to a page that explains the purpose and behavior of this
   crawler.
   </description>
 </property>
@@ -176,9 +184,9 @@
 <property>
   <name>http.agent.email</name>
   <value></value>
-  <description>An email address to advertise in the HTTP 'From' request
-  header and User-Agent header. A good practice is to mangle this
-  address (e.g. 'info at example dot com') to avoid spamming.
+  <description>An email address to advertise in the HTTP 'User-Agent' (and
+  'From') request headers. A good practice is to mangle this address
+  (e.g. 'info at example dot com') to avoid spamming.
   </description>
 </property>
@@ -202,7 +210,7 @@
   <name>http.agent.rotate.file</name>
   <value>agents.txt</value>
   <description>
-  File containing alternative user agent names to be used instead of
+  File containing alternative user-agent names to be used instead of
   http.agent.name on a rotating basis if http.agent.rotate is true.
   Each line of the file should contain exactly one agent
   specification including name, version, description, URL, etc.
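The reworded http.robots.agents description above reflects a behavioural
change: the rules of all matching agent groups are now combined, rather than
only the first matching group being applied. A small sketch of this behaviour
(not from the commit; robots.txt content and agent names are invented):

    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class CombinedGroupsSketch {
      public static void main(String[] args) {
        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Disallow: /a/",
            "",
            "User-agent: foo-spider",
            "Disallow: /b/").getBytes();

        // Both names match a group, so the rules of both groups apply.
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("mybot", "foo-spider"));

        System.out.println(rules.isAllowed("http://example.com/a/page")); // false
        System.out.println(rules.isAllowed("http://example.com/b/page")); // false
        System.out.println(rules.isAllowed("http://example.com/c/page")); // true
      }
    }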
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 1493bc292..562c2c694 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -24,12 +24,13 @@ import java.io.LineNumberReader;
 import java.lang.invoke.MethodHandles;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.Collection;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.LinkedHashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
-import java.util.StringTokenizer;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -96,7 +97,7 @@ public abstract class RobotRulesParser implements Tool {
   }
 
   protected Configuration conf;
-  protected String agentNames;
+  protected Set<String> agentNames;
 
   /** set of host names or IPs to be explicitly excluded from robots.txt checking */
   protected Set<String> allowList = new HashSet<>();
@@ -114,6 +115,7 @@ public abstract class RobotRulesParser implements Tool {
   /**
    * Set the {@link Configuration} object
    */
+  @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -122,26 +124,30 @@ public abstract class RobotRulesParser implements Tool {
     if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
       throw new RuntimeException("Agent name not configured!");
     }
-    agentNames = agentName;
+    agentNames = new LinkedHashSet<>();
+    if (!agentName.equals("*")) {
+      /*
+       * skip wildcard "*" - crawler-commons' SimpleRobotRulesParser expects an
+       * empty set of agent names to use the wildcard rules
+       */
+      agentNames.add(agentName.toLowerCase());
+    }
 
     // If there are any other agents specified, append those to the list of
     // agents
-    String otherAgents = conf.get("http.robots.agents");
-    if (otherAgents != null && !otherAgents.trim().isEmpty()) {
-      StringTokenizer tok = new StringTokenizer(otherAgents, ",");
-      StringBuilder sb = new StringBuilder(agentNames);
-      while (tok.hasMoreTokens()) {
-        String str = tok.nextToken().trim();
-        if (str.equals("*") || str.equals(agentName)) {
-          // skip wildcard "*" or agent name itself
-          // (required for backward compatibility, cf. NUTCH-1715 and
-          // NUTCH-1718)
+    String[] otherAgents = conf.getStrings("http.robots.agents");
+    if (otherAgents != null && otherAgents.length > 0) {
+      for (String otherAgent : otherAgents) {
+        otherAgent = otherAgent.toLowerCase();
+        if (otherAgent.equals("*") || otherAgent.equalsIgnoreCase(agentName)) {
+          /*
+           * skip wildcard "*" or agent name itself (required for backward
+           * compatibility, cf. NUTCH-1715 and NUTCH-1718)
+           */
         } else {
-          sb.append(",").append(str);
+          agentNames.add(otherAgent);
        }
      }
-
-      agentNames = sb.toString();
     }
 
     String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
@@ -166,6 +172,7 @@ public abstract class RobotRulesParser implements Tool {
   /**
    * Get the {@link Configuration} object
    */
+  @Override
   public Configuration getConf() {
     return conf;
   }
@@ -188,10 +195,10 @@ public abstract class RobotRulesParser implements Tool {
   /**
    * Parses the robots content using the {@link SimpleRobotRulesParser} from
-   * crawler commons
+   * crawler-commons
    *
    * @param url
-   *          A string containing url
+   *          The robots.txt URL
    * @param content
    *          Contents of the robots file in a byte array
    * @param contentType
@@ -201,11 +208,32 @@ public abstract class RobotRulesParser implements Tool {
    *          matching
    * @return BaseRobotRules object
    */
+  @Deprecated
   public BaseRobotRules parseRules(String url, byte[] content,
       String contentType, String robotName) {
     return robotParser.parseContent(url, content, contentType, robotName);
   }
 
+  /**
+   * Parses the robots content using the {@link SimpleRobotRulesParser} from
+   * crawler-commons
+   *
+   * @param url
+   *          The robots.txt URL
+   * @param content
+   *          Contents of the robots file in a byte array
+   * @param contentType
+   *          The content type of the robots file
+   * @param robotNames
+   *          A collection containing all the robots agent names used by parser
+   *          for matching
+   * @return BaseRobotRules object
+   */
+  public BaseRobotRules parseRules(String url, byte[] content,
+      String contentType, Collection<String> robotNames) {
+    return robotParser.parseContent(url, content, contentType, robotNames);
+  }
+
   /**
    * Fetch robots.txt (or it's protocol-specific equivalent) which applies to
    * the given URL, parse it and return the set of robot rules applicable for
@@ -274,8 +302,9 @@ public abstract class RobotRulesParser implements Tool {
         "\tit is allowed by the robots.txt rules. Other parts of the URLs",
         "\t(mainly the host) are ignored.",
         "",
-        "<agent-names>\tcomma-separated list of agent names",
+        "<agent-names>\tuser-agent name (aka. \"product token\")",
         "\tused to select rules from the robots.txt file.",
+        "\tMultiple agent names can be passed as comma-separated string.",
         "\tIf no agent name is given the properties http.agent.name",
         "\tand http.robots.agents are used.",
         "\tIf also http.agent.name and http.robots.agents are empty,",
@@ -353,7 +382,8 @@ public abstract class RobotRulesParser implements Tool {
       }
     }
 
-    System.out.println("Testing robots.txt for agent names: " + agentNames);
+    System.out.println("Testing robots.txt for agent names: "
+        + (agentNames.isEmpty() ? "* (any other agent)" : agentNames));
 
     LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
     String testPath;
@@ -393,6 +423,7 @@ public abstract class RobotRulesParser implements Tool {
    */
   private static class TestRobotRulesParser extends RobotRulesParser {
 
+    @Override
     public void setConf(Configuration conf) {
       /*
        * Make sure that agent name is not empty so that
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 265abf934..202d2d08b 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.protocol.http.api;
 
+import java.util.Set;
+
 import org.junit.Assert;
 import org.junit.Test;
@@ -94,6 +96,64 @@ public class TestRobotRulesParser {
     parser = new HttpRobotRulesParser();
   }
 
+  private void testRulesOnPaths(String agent, String[] paths,
+      boolean[] results) {
+    for (int counter = 0; counter < paths.length; counter++) {
+      boolean res = rules.isAllowed(paths[counter]);
+      Assert.assertTrue(
+          "testing on agent (" + agent + "), and " + "path " + paths[counter]
+              + " got " + res + ", expected " + results[counter],
+          res == results[counter]);
+    }
+  }
+
+  /**
+   * Test that the robots rules are interpreted correctly by the robots rules
+   * parser.
+   */
+  @Test
+  public void testRobotsAgent() {
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
+    testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);
+
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
+    testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);
+
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(MULTIPLE_AGENTS.toLowerCase().split("\\s*,\\s*")));
+    testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
+  }
+
+  /**
+   * Test that the crawl delay is extracted from the robots file for respective
+   * agent. If its not specified for a given agent, default value must be
+   * returned.
+   */
+  @Test
+  public void testCrawlDelay() {
+    // for SINGLE_AGENT1, the crawl delay of 10 seconds, i.e. 10000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT1 + " : ",
+        (rules.getCrawlDelay() == 10000));
+
+    // for SINGLE_AGENT2, the crawl delay of 20 seconds, i.e. 20000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT2 + " : ",
+        (rules.getCrawlDelay() == 20000));
+
+    // for UNKNOWN_AGENT, the default crawl delay must be returned.
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(UNKNOWN_AGENT.toLowerCase()));
+    Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+        (rules.getCrawlDelay() == Long.MIN_VALUE));
+  }
+
   /**
    * Test that the robots rules are interpreted correctly by the robots rules
    * parser.
@@ -103,36 +163,15 @@ public class TestRobotRulesParser {
   public void testRobotsAgentDeprecatedAPIMethod() {
     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
         CONTENT_TYPE, SINGLE_AGENT1);
-
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agent (" + SINGLE_AGENT1 + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1[counter]);
-    }
+    testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);
 
     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
         CONTENT_TYPE, SINGLE_AGENT2);
-
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agent (" + SINGLE_AGENT2 + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT2[counter]);
-    }
+    testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);
 
     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
         CONTENT_TYPE, MULTIPLE_AGENTS);
-
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1_AND_AGENT2[counter]);
-    }
+    testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
   }
 
   /**
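For completeness, the crawl-delay semantics exercised by testCrawlDelay()
above: the delay is reported in milliseconds, and Long.MIN_VALUE signals that
no delay is set for the matched agent. A sketch with invented robots.txt
content (the test's own ROBOTS_STRING and agent constants are not shown in
this excerpt):

    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class CrawlDelaySketch {
      public static void main(String[] args) {
        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Crawl-delay: 10").getBytes();

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

        // "Crawl-delay: 10" (seconds) is reported as 10000 milliseconds.
        BaseRobotRules rules = parser.parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("mybot"));
        System.out.println(rules.getCrawlDelay()); // 10000

        // No group matches "otherbot" and there is no wildcard group, so no
        // crawl delay is defined: Long.MIN_VALUE means "unset".
        rules = parser.parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("otherbot"));
        System.out.println(rules.getCrawlDelay() == Long.MIN_VALUE); // true
      }
    }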