This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 070c115cf NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4
070c115cf is described below
commit 070c115cfadbc937a8ad0add6447461983e92028
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Aug 22 11:39:22 2023 +0200
NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4
- split and lowercase agent names (if multiple) at configuration time
and pass as collection to SimpleRobotRulesParser
- update RobotRulesParser command-line help
- update unit tests to use new API
- update description of Nutch properties to reflect the changes due to
the usage of the new API entry point and the upgrade to crawler-commons
1.4
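
    [Editor's note: a minimal, illustrative sketch of the crawler-commons 1.4 entry point adopted
    by this commit — agent names are split and lowercased once, then passed as a collection so
    that the rules of all matching robots.txt groups are combined. The class name, configuration
    string, robots.txt content, and URLs below are hypothetical, not part of the commit.]

    import java.nio.charset.StandardCharsets;
    import java.util.LinkedHashSet;
    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class RobotsParserSketch {
      public static void main(String[] args) {
        // Hypothetical configuration value combining http.agent.name and http.robots.agents
        String configuredAgents = "MyBot,Foo-Spider";

        // Split and lowercase the agent names once, at configuration time
        Set<String> agentNames = new LinkedHashSet<>();
        for (String name : configuredAgents.split("\\s*,\\s*")) {
          agentNames.add(name.toLowerCase());
        }

        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Disallow: /private/",
            "",
            "User-agent: foo-spider",
            "Disallow: /tmp/",
            "Crawl-delay: 5").getBytes(StandardCharsets.UTF_8);

        // New crawler-commons 1.4 entry point: pass the whole collection of agent names
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        BaseRobotRules rules = parser.parseContent("http://example.com/robots.txt",
            robotsTxt, "text/plain", agentNames);

        // Rules of all groups matching any of the agent names are combined
        System.out.println(rules.isAllowed("http://example.com/private/x.html")); // expected: false
        System.out.println(rules.isAllowed("http://example.com/tmp/x.html"));     // expected: false
        System.out.println(rules.getCrawlDelay());                                // expected: 5000 (msec)
      }
    }
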
---
conf/nutch-default.xml | 34 +++++----
.../apache/nutch/protocol/RobotRulesParser.java | 71 +++++++++++++-----
.../protocol/http/api/TestRobotRulesParser.java | 87 ++++++++++++++++------
3 files changed, 135 insertions(+), 57 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 379b5ef5d..e98bd5570 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -72,9 +72,18 @@
<property>
<name>http.agent.name</name>
<value></value>
- <description>HTTP 'User-Agent' request header. MUST NOT be empty -
+ <description>'User-Agent' name: a single word uniquely identifying your crawler.
+
+ The value is used to select the group of robots.txt rules addressing your
+ crawler. It is also sent as part of the HTTP 'User-Agent' request header.
+
+ This property MUST NOT be empty -
please set this to a single word uniquely related to your organization.
+ Following RFC 9309 the 'User-Agent' name (aka. 'product token')
+ "MUST contain only uppercase and lowercase letters ('a-z' and
+ 'A-Z'), underscores ('_'), and hyphens ('-')."
+
NOTE: You should also check other related properties:
http.robots.agents
@@ -84,7 +93,6 @@
http.agent.version
and set their values appropriately.
-
</description>
</property>
@@ -95,13 +103,13 @@
parser would look for in robots.txt. Multiple agents can be provided using
comma as a delimiter. eg. mybot,foo-spider,bar-crawler
- The ordering of agents does NOT matter and the robots parser would make
- decision based on the agent which matches first to the robots rules.
- Also, there is NO need to add a wildcard (ie. "*") to this string as the
- robots parser would smartly take care of a no-match situation.
+ The ordering of agents does NOT matter and the robots.txt parser combines
+ all rules to any of the agent names. Also, there is NO need to add
+ a wildcard (ie. "*") to this string as the robots parser would smartly
+ take care of a no-match situation.
If no value is specified, by default HTTP agent (ie. 'http.agent.name')
- would be used for user agent matching by the robots parser.
+ is used for user-agent matching by the robots parser.
</description>
</property>
@@ -166,9 +174,9 @@
<property>
<name>http.agent.url</name>
<value></value>
- <description>A URL to advertise in the User-Agent header. This will
+ <description>A URL to advertise in the User-Agent header. This will
appear in parenthesis after the agent name. Custom dictates that this
- should be a URL of a page explaining the purpose and behavior of this
+ should be a URL to a page that explains the purpose and behavior of this
crawler.
</description>
</property>
@@ -176,9 +184,9 @@
<property>
<name>http.agent.email</name>
<value></value>
- <description>An email address to advertise in the HTTP 'From' request
- header and User-Agent header. A good practice is to mangle this
- address (e.g. 'info at example dot com') to avoid spamming.
+ <description>An email address to advertise in the HTTP 'User-Agent' (and
+ 'From') request headers. A good practice is to mangle this address
+ (e.g. 'info at example dot com') to avoid spamming.
</description>
</property>
@@ -202,7 +210,7 @@
<name>http.agent.rotate.file</name>
<value>agents.txt</value>
<description>
- File containing alternative user agent names to be used instead of
+ File containing alternative user-agent names to be used instead of
http.agent.name on a rotating basis if http.agent.rotate is true.
Each line of the file should contain exactly one agent
specification including name, version, description, URL, etc.
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 1493bc292..562c2c694 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -24,12 +24,13 @@ import java.io.LineNumberReader;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.Collection;
import java.util.HashSet;
import java.util.Hashtable;
+import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
-import java.util.StringTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -96,7 +97,7 @@ public abstract class RobotRulesParser implements Tool {
}
protected Configuration conf;
- protected String agentNames;
+ protected Set<String> agentNames;
/** set of host names or IPs to be explicitly excluded from robots.txt
checking */
protected Set<String> allowList = new HashSet<>();
@@ -114,6 +115,7 @@ public abstract class RobotRulesParser implements Tool {
/**
* Set the {@link Configuration} object
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -122,26 +124,30 @@ public abstract class RobotRulesParser implements Tool {
if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
throw new RuntimeException("Agent name not configured!");
}
- agentNames = agentName;
+ agentNames = new LinkedHashSet<>();
+ if (!agentName.equals("*")) {
+ /*
+ * skip wildcard "*" - crawler-commons' SimpleRobotRulesParser expects an
+ * empty set of agent names to use the wildcard rules
+ */
+ agentNames.add(agentName.toLowerCase());
+ }
// If there are any other agents specified, append those to the list of
// agents
- String otherAgents = conf.get("http.robots.agents");
- if (otherAgents != null && !otherAgents.trim().isEmpty()) {
- StringTokenizer tok = new StringTokenizer(otherAgents, ",");
- StringBuilder sb = new StringBuilder(agentNames);
- while (tok.hasMoreTokens()) {
- String str = tok.nextToken().trim();
- if (str.equals("*") || str.equals(agentName)) {
- // skip wildcard "*" or agent name itself
- // (required for backward compatibility, cf. NUTCH-1715 and
- // NUTCH-1718)
+ String[] otherAgents = conf.getStrings("http.robots.agents");
+ if (otherAgents != null && otherAgents.length > 0) {
+ for (String otherAgent : otherAgents) {
+ otherAgent = otherAgent.toLowerCase();
+ if (otherAgent.equals("*") || otherAgent.equalsIgnoreCase(agentName)) {
+ /*
+ * skip wildcard "*" or agent name itself (required for backward
+ * compatibility, cf. NUTCH-1715 and NUTCH-1718)
+ */
} else {
- sb.append(",").append(str);
+ agentNames.add(otherAgent);
}
}
-
- agentNames = sb.toString();
}
String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
@@ -166,6 +172,7 @@ public abstract class RobotRulesParser implements Tool {
/**
* Get the {@link Configuration} object
*/
+ @Override
public Configuration getConf() {
return conf;
}
@@ -188,10 +195,10 @@ public abstract class RobotRulesParser implements Tool {
/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from
- * crawler commons
+ * crawler-commons
*
* @param url
- * A string containing url
+ * The robots.txt URL
* @param content
* Contents of the robots file in a byte array
* @param contentType
@@ -201,11 +208,32 @@ public abstract class RobotRulesParser implements Tool {
* matching
* @return BaseRobotRules object
*/
+ @Deprecated
public BaseRobotRules parseRules(String url, byte[] content,
String contentType, String robotName) {
return robotParser.parseContent(url, content, contentType, robotName);
}
+ /**
+ * Parses the robots content using the {@link SimpleRobotRulesParser} from
+ * crawler-commons
+ *
+ * @param url
+ * The robots.txt URL
+ * @param content
+ * Contents of the robots file in a byte array
+ * @param contentType
+ * The content type of the robots file
+ * @param robotNames
+ * A collection containing all the robots agent names used by parser
+ * for matching
+ * @return BaseRobotRules object
+ */
+ public BaseRobotRules parseRules(String url, byte[] content,
+ String contentType, Collection<String> robotNames) {
+ return robotParser.parseContent(url, content, contentType, robotNames);
+ }
+
/**
* Fetch robots.txt (or it's protocol-specific equivalent) which applies to
* the given URL, parse it and return the set of robot rules applicable for
@@ -274,8 +302,9 @@ public abstract class RobotRulesParser implements Tool {
"\tit is allowed by the robots.txt rules. Other parts of the URLs",
"\t(mainly the host) are ignored.",
"",
- "<agent-names>\tcomma-separated list of agent names",
+ "<agent-names>\tuser-agent name (aka. \"product token\")",
"\tused to select rules from the robots.txt file.",
+ "\tMultiple agent names can be passed as comma-separated string.",
"\tIf no agent name is given the properties http.agent.name",
"\tand http.robots.agents are used.",
"\tIf also http.agent.name and http.robots.agents are empty,",
@@ -353,7 +382,8 @@ public abstract class RobotRulesParser implements Tool {
}
}
- System.out.println("Testing robots.txt for agent names: " + agentNames);
+ System.out.println("Testing robots.txt for agent names: "
+ + (agentNames.isEmpty() ? "* (any other agent)" : agentNames));
LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
String testPath;
@@ -393,6 +423,7 @@ public abstract class RobotRulesParser implements Tool {
*/
private static class TestRobotRulesParser extends RobotRulesParser {
+ @Override
public void setConf(Configuration conf) {
/*
* Make sure that agent name is not empty so that
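
[Editor's note: a short, hypothetical caller-side sketch of the deprecated and new parseRules
overloads shown above; HttpRobotRulesParser is instantiated the same way as in the updated unit
test in the next file, and the robots.txt content and URLs are illustrative.]

    import java.nio.charset.StandardCharsets;
    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import org.apache.nutch.protocol.http.api.HttpRobotRulesParser;

    public class ParseRulesMigrationSketch {
      public static void main(String[] args) {
        byte[] robotsTxt = "User-agent: mybot\nDisallow: /private/\n"
            .getBytes(StandardCharsets.UTF_8);
        HttpRobotRulesParser parser = new HttpRobotRulesParser();

        // Deprecated entry point: a single agent name as String
        BaseRobotRules old = parser.parseRules("http://example.com/robots.txt",
            robotsTxt, "text/plain", "mybot");

        // New entry point: a collection of lowercased agent names
        BaseRobotRules current = parser.parseRules("http://example.com/robots.txt",
            robotsTxt, "text/plain", Set.of("mybot"));

        System.out.println(old.isAllowed("http://example.com/private/"));     // expected: false
        System.out.println(current.isAllowed("http://example.com/private/")); // expected: false
      }
    }
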
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 265abf934..202d2d08b 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.protocol.http.api;
+import java.util.Set;
+
import org.junit.Assert;
import org.junit.Test;
@@ -94,6 +96,64 @@ public class TestRobotRulesParser {
parser = new HttpRobotRulesParser();
}
+ private void testRulesOnPaths(String agent, String[] paths,
+ boolean[] results) {
+ for (int counter = 0; counter < paths.length; counter++) {
+ boolean res = rules.isAllowed(paths[counter]);
+ Assert.assertTrue(
+ "testing on agent (" + agent + "), and " + "path " + paths[counter]
+ + " got " + res + ", expected " + results[counter],
+ res == results[counter]);
+ }
+ }
+
+ /**
+ * Test that the robots rules are interpreted correctly by the robots rules
+ * parser.
+ */
+ @Test
+ public void testRobotsAgent() {
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
+ testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);
+
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
+ testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);
+
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, Set.of(MULTIPLE_AGENTS.toLowerCase().split("\\s*,\\s*")));
+ testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
+ }
+
+ /**
+ * Test that the crawl delay is extracted from the robots file for respective
+ * agent. If its not specified for a given agent, default value must be
+ * returned.
+ */
+ @Test
+ public void testCrawlDelay() {
+ // for SINGLE_AGENT1, the crawl delay of 10 seconds, i.e. 10000 msec must be
+ // returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT1 + " : ",
+ (rules.getCrawlDelay() == 10000));
+
+ // for SINGLE_AGENT2, the crawl delay of 20 seconds, i.e. 20000 msec must be
+ // returned by the parser
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
+ Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT2 + " : ",
+ (rules.getCrawlDelay() == 20000));
+
+ // for UNKNOWN_AGENT, the default crawl delay must be returned.
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ CONTENT_TYPE, Set.of(UNKNOWN_AGENT.toLowerCase()));
+ Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+ (rules.getCrawlDelay() == Long.MIN_VALUE));
+ }
+
/**
* Test that the robots rules are interpreted correctly by the robots rules
* parser.
@@ -103,36 +163,15 @@ public class TestRobotRulesParser {
public void testRobotsAgentDeprecatedAPIMethod() {
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT1);
-
- for (int counter = 0; counter < TEST_PATHS.length; counter++) {
- Assert.assertTrue(
- "testing on agent (" + SINGLE_AGENT1 + "), and " + "path "
- + TEST_PATHS[counter] + " got "
- + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1[counter]);
- }
+ testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT2);
-
- for (int counter = 0; counter < TEST_PATHS.length; counter++) {
- Assert.assertTrue(
- "testing on agent (" + SINGLE_AGENT2 + "), and " + "path "
- + TEST_PATHS[counter] + " got "
- + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT2[counter]);
- }
+ testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, MULTIPLE_AGENTS);
-
- for (int counter = 0; counter < TEST_PATHS.length; counter++) {
- Assert.assertTrue(
- "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
- + TEST_PATHS[counter] + " got "
- + rules.isAllowed(TEST_PATHS[counter]),
- rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1_AND_AGENT2[counter]);
- }
+ testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
}
/**