This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new a24ec5c5b NUTCH-2995 Upgrade to crawler-commons 1.4
a24ec5c5b is described below
commit a24ec5c5b761476897c7fff0bfd3d5107995fedc
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Aug 22 10:36:45 2023 +0200
NUTCH-2995 Upgrade to crawler-commons 1.4
- upgrade to crawler-commons from 1.3 to 1.4
- update Javadoc and improve code formatting of robots.txt unit tests
- fix robots.txt unit tests to reflect changes in
crawler-commons due to RFC 9309 compliance and merging of rule groups
(see https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1)
- mark unit tests for deprecated API endpoints as deprecated
---
ivy/ivy.xml | 2 +-
.../protocol/http/api/TestRobotRulesParser.java | 102 +++++++++++++++------
2 files changed, 74 insertions(+), 30 deletions(-)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 269f521c8..18a6df230 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -65,7 +65,7 @@
<dependency org="com.google.guava" name="guava" rev="31.1-jre"
/>
- <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.3" />
+ <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.4" />
<dependency org="com.google.code.gson" name="gson" rev="2.9.1"/>
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 93bb51b22..265abf934 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -22,32 +22,37 @@ import org.junit.Test;
import crawlercommons.robots.BaseRobotRules;
/**
- * JUnit test case which tests 1. that robots filtering is performed correctly
- * as per the agent name 2. that crawl delay is extracted correctly from the
- * robots file
- *
+ * JUnit test case which tests
+ * <ol>
+ * <li>that robots filtering is performed correctly as per the agent name</li>
+ * <li>that crawl delay is extracted correctly from the robots.txt file</li>
+ * </ol>
*/
public class TestRobotRulesParser {
private static final String CONTENT_TYPE = "text/plain";
- private static final String SINGLE_AGENT = "Agent1";
- private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+ private static final String SINGLE_AGENT1 = "Agent1";
+ private static final String SINGLE_AGENT2 = "Agent2";
+ private static final String MULTIPLE_AGENTS = "Agent2, Agent1"; // rules are merged for both agents
private static final String UNKNOWN_AGENT = "AgentABC";
private static final String CR = "\r";
- private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
- + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
- + CR
- + "Crawl-delay: 10"
- + CR // set crawl delay for Agent1 as 10 sec
- + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
- + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
- + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
- // crawl
- // delay
- // for
- // other
- // agents
+ private static final String ROBOTS_STRING = //
+ "User-Agent: Agent1 #foo" + CR //
+ + "Disallow: /a" + CR //
+ + "Disallow: /b/a" + CR //
+ + "#Disallow: /c" + CR //
+ + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 seconds
+ + "" + CR //
+ + "" + CR //
+ + "User-Agent: Agent2" + CR //
+ + "Disallow: /a/bloh" + CR //
+ + "Disallow: /c" + CR //
+ + "Disallow: /foo" + CR //
+ + "Crawl-delay: 20" + CR // Agent2: 20 seconds
+ + "" + CR //
+ + "User-Agent: *" + CR //
+ + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
private static final String[] TEST_PATHS = new String[] {
"http://example.com/a", "http://example.com/a/bloh/foo.html",
@@ -55,7 +60,8 @@ public class TestRobotRulesParser {
"http://example.com/b/a/index.html",
"http://example.com/foo/bar/baz.html" };
- private static final boolean[] RESULTS = new boolean[] { false, // /a
+ private static final boolean[] RESULTS_AGENT1 = new boolean[] { //
+ false, // /a
false, // /a/bloh/foo.html
true, // /b
true, // /c
@@ -63,6 +69,24 @@ public class TestRobotRulesParser {
true // /foo/bar/baz.html
};
+ private static final boolean[] RESULTS_AGENT2 = new boolean[] { //
+ true, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ false, // /c
+ true, // /b/a/index.html
+ false // /foo/bar/baz.html
+ };
+
+ private static final boolean[] RESULTS_AGENT1_AND_AGENT2 = new boolean[] { //
+ false, // /a
+ false, // /a/bloh/foo.html
+ true, // /b
+ false, // /c
+ false, // /b/a/index.html
+ false // /foo/bar/baz.html
+ };
+
private HttpRobotRulesParser parser;
private BaseRobotRules rules;
@@ -74,17 +98,29 @@ public class TestRobotRulesParser {
* Test that the robots rules are interpreted correctly by the robots rules
* parser.
*/
+ @Deprecated
@Test
- public void testRobotsAgent() {
+ public void testRobotsAgentDeprecatedAPIMethod() {
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT1);
+
+ for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+ Assert.assertTrue(
+ "testing on agent (" + SINGLE_AGENT1 + "), and " + "path "
+ + TEST_PATHS[counter] + " got "
+ + rules.isAllowed(TEST_PATHS[counter]),
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1[counter]);
+ }
+
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
- CONTENT_TYPE, SINGLE_AGENT);
+ CONTENT_TYPE, SINGLE_AGENT2);
for (int counter = 0; counter < TEST_PATHS.length; counter++) {
Assert.assertTrue(
- "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+ "testing on agent (" + SINGLE_AGENT2 + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT2[counter]);
}
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
@@ -95,7 +131,7 @@ public class TestRobotRulesParser {
"testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+ TEST_PATHS[counter] + " got "
+ rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+ rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1_AND_AGENT2[counter]);
}
}
@@ -104,15 +140,23 @@ public class TestRobotRulesParser {
* agent. If its not specified for a given agent, default value must be
* returned.
*/
+ @Deprecated
@Test
- public void testCrawlDelay() {
- // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+ public void testCrawlDelayDeprecatedAPIMethod() {
+ // for SINGLE_AGENT1, the crawl delay of 10 seconds, i.e. 10000 msec must be
// returned by the parser
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
- CONTENT_TYPE, SINGLE_AGENT);
- Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+ CONTENT_TYPE, SINGLE_AGENT1);
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT1 + " : ",
(rules.getCrawlDelay() == 10000));
+ // for SINGLE_AGENT2, the crawl delay of 20 seconds, i.e. 20000 msec must be
+ // returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, SINGLE_AGENT2);
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT2 + " : ",
+ (rules.getCrawlDelay() == 20000));
+
// for UNKNOWN_AGENT, the default crawl delay must be returned.
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, UNKNOWN_AGENT);