This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new a24ec5c5b NUTCH-2995 Upgrade to crawler-commons 1.4
a24ec5c5b is described below
commit a24ec5c5b761476897c7fff0bfd3d5107995fedc
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Aug 22 10:36:45 2023 +0200
NUTCH-2995 Upgrade to crawler-commons 1.4
- upgrade to crawler-commons from 1.3 to 1.4
- update Javadoc and improve code formatting of robots.txt unit tests
- fix robots.txt unit tests to reflect changes in
crawler-commons due to RFC 9309 compliance and merging of rule groups
(see https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1)
- mark unit tests for deprecated API endpoints as deprecated
---
ivy/ivy.xml | 2 +-
.../protocol/http/api/TestRobotRulesParser.java | 102 +++++++++++++++------
2 files changed, 74 insertions(+), 30 deletions(-)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 269f521c8..18a6df230 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -65,7 +65,7 @@
<dependency org="com.google.guava" name="guava" rev="31.1-jre"
/>
- <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.3" />
+ <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.4" />
<dependency org="com.google.code.gson" name="gson" rev="2.9.1"/>
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 93bb51b22..265abf934 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -22,32 +22,37 @@ import org.junit.Test;
import crawlercommons.robots.BaseRobotRules;
/**
- * JUnit test case which tests 1. that robots filtering is performed correctly
- * as per the agent name 2. that crawl delay is extracted correctly from the
- * robots file
- *
+ * JUnit test case which tests
+ * <ol>
+ * <li>that robots filtering is performed correctly as per the agent name</li>
+ * <li>that crawl delay is extracted correctly from the robots.txt file</li>
+ * </ol>
*/
public class TestRobotRulesParser {
private static final String CONTENT_TYPE = "text/plain";
- private static final String SINGLE_AGENT = "Agent1";
- private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+ private static final String SINGLE_AGENT1 = "Agent1";
+ private static final String SINGLE_AGENT2 = "Agent2";
+ private static final String MULTIPLE_AGENTS = "Agent2, Agent1"; // rules are merged for both agents
private static final String UNKNOWN_AGENT = "AgentABC";
private static final String CR = "\r";
- private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
- + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
- + CR
- + "Crawl-delay: 10"
- + CR // set crawl delay for Agent1 as 10 sec
- + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
- + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
- + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
- // crawl
- // delay
- // for
- // other
- // agents
+ private static final String ROBOTS_STRING = //
+ "User-Agent: Agent1 #foo" + CR //
+ + "Disallow: /a" + CR //
+ + "Disallow: /b/a" + CR //
+ + "#Disallow: /c" + CR //
+ + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 seconds
+ + "" + CR //
+ + "" + CR //
+ + "User-Agent: Agent2" + CR //
+ + "Disallow: /a/bloh" + CR //
+ + "Disallow: /c" + CR //
+ + "Disallow: /foo" + CR //
+ + "Crawl-delay: 20" + CR // Agent2: 20 seconds
+ + "" + CR //
+ + "User-Agent: *" + CR //
+ + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
private static final String[] TEST_PATHS = new String[] {
"http://example.com/a", "http://example.com/a/bloh/foo.html",
@@ -55,7 +60,8 @@ public class TestRobotRulesParser {
"http://example.com/b/a/index.html",
"http://example.com/foo/bar/baz.html" };
- private static final boolean[] RESULTS = new boolean[] { false, // /a
+ private static final boolean[] RESULTS_AGENT1 = new boolean[] { //
+ false, // /a
false, // /a/bloh/foo.html
true, // /b
true, // /c
@@ -63,6 +69,24 @@ public class TestRobotRulesParser {
true // /foo/bar/baz.html
};
+ private static final boolean[] RESULTS_AGENT2 = new boolean[] { //
+ true, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ false, // /c
+ true, // /b/a/index.html
+ false // /foo/bar/baz.html
+ };
+
+ private static final boolean[] RESULTS_AGENT1_AND_AGENT2 = new boolean[] { //
+ false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ false, // /c
+ false, // /b/a/index.html
+ false // /foo/bar/baz.html
+ };
+
private HttpRobotRulesParser parser;
private BaseRobotRules rules;
@@ -74,17 +98,29 @@ public class TestRobotRulesParser {
* Test that the robots rules are interpreted correctly by the robots rules
* parser.
*/
+ @Deprecated
@Test
- public void testRobotsAgent() {
+ public void testRobotsAgentDeprecatedAPIMethod() {
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT1);
+
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ Assert.assertTrue(
+ "testing on agent (" + SINGLE_AGENT1 + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1[counter]);
+ }
+
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
- CONTENT_TYPE, SINGLE_AGENT);
+ CONTENT_TYPE, SINGLE_AGENT2);
for (int counter = 0; counter < TEST_PATHS.length; counter++) {
Assert.assertTrue(
- "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+ "testing on agent (" + SINGLE_AGENT2 + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT2[counter]);
}
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
@@ -95,7 +131,7 @@ public class TestRobotRulesParser {
"testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1_AND_AGENT2[counter]);
}
}
@@ -104,15 +140,23 @@ public class TestRobotRulesParser {
* agent. If its not specified for a given agent, default value must be
* returned.
*/
+ @Deprecated
@Test
- public void testCrawlDelay() {
- // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+ public void testCrawlDelayDeprecatedAPIMethod() {
+ // for SINGLE_AGENT1, the crawl delay of 10 seconds, i.e. 10000 msec must be
// returned by the parser
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
- CONTENT_TYPE, SINGLE_AGENT);
- Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+ CONTENT_TYPE, SINGLE_AGENT1);
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT1 + " : ",
(rules.getCrawlDelay() == 10000));
+ // for SINGLE_AGENT2, the crawl delay of 20 seconds, i.e. 20000 msec must be
+ // returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT2);
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT2 + " : ",
+ (rules.getCrawlDelay() == 20000));
+
// for UNKNOWN_AGENT, the default crawl delay must be returned.
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, UNKNOWN_AGENT);