This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new 070c115cf  NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4
070c115cf is described below

commit 070c115cfadbc937a8ad0add6447461983e92028
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Tue Aug 22 11:39:22 2023 +0200

    NUTCH-2996 Use new SimpleRobotRulesParser API entry point crawler-commons 1.4

    - split and lowercase agent names (if multiple) at configuration time
      and pass as collection to SimpleRobotRulesParser
    - update RobotRulesParser command-line help
    - update unit tests to use new API
    - update description of Nutch properties to reflect the changes due to
      the usage of the new API entry point and the upgrade to crawler-commons 1.4
---
 conf/nutch-default.xml                             | 34 +++++----
 .../apache/nutch/protocol/RobotRulesParser.java    | 71 +++++++++++++-----
 .../protocol/http/api/TestRobotRulesParser.java    | 87 ++++++++++++++++------
 3 files changed, 135 insertions(+), 57 deletions(-)
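As a quick illustration of the key API change (a sketch, not code from this
commit): crawler-commons 1.4 deprecates the entry point taking a
comma-separated agent-name string in favour of one taking a collection of
lowercased names. The robots.txt content, URL, and agent name "mybot" below
are invented for the example, assuming crawler-commons 1.4 on the classpath:

    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class ParseRulesSketch {
      public static void main(String[] args) {
        // Invented robots.txt content for the example
        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Disallow: /private/").getBytes();

        // New crawler-commons 1.4 entry point: agent names are passed as a
        // collection and are expected to be lowercase.
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("mybot"));

        System.out.println(rules.isAllowed("http://example.com/private/x")); // false
        System.out.println(rules.isAllowed("http://example.com/index.html")); // true
      }
    }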
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 379b5ef5d..e98bd5570 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -72,9 +72,18 @@
 <property>
   <name>http.agent.name</name>
   <value></value>
-  <description>HTTP 'User-Agent' request header. MUST NOT be empty -
+  <description>'User-Agent' name: a single word uniquely identifying your crawler.
+
+  The value is used to select the group of robots.txt rules addressing your
+  crawler. It is also sent as part of the HTTP 'User-Agent' request header.
+
+  This property MUST NOT be empty -
   please set this to a single word uniquely related to your organization.
 
+  Following RFC 9309 the 'User-Agent' name (aka. 'product token')
+  "MUST contain only uppercase and lowercase letters ('a-z' and
+  'A-Z'), underscores ('_'), and hyphens ('-')."
+
   NOTE: You should also check other related properties:
@@ -84,7 +93,6 @@ http.agent.version
 
   and set their values appropriately.
-
   </description>
 </property>
@@ -95,13 +103,13 @@
   parser would look for in robots.txt. Multiple agents can be provided using
   comma as a delimiter. eg. mybot,foo-spider,bar-crawler
 
-  The ordering of agents does NOT matter and the robots parser would make
-  decision based on the agent which matches first to the robots rules.
-  Also, there is NO need to add a wildcard (ie. "*") to this string as the
-  robots parser would smartly take care of a no-match situation.
+  The ordering of agents does NOT matter and the robots.txt parser combines
+  all rules to any of the agent names. Also, there is NO need to add
+  a wildcard (ie. "*") to this string as the robots parser would smartly
+  take care of a no-match situation.
 
   If no value is specified, by default HTTP agent (ie. 'http.agent.name')
-  would be used for user agent matching by the robots parser.
+  is used for user-agent matching by the robots parser.
   </description>
 </property>
@@ -166,9 +174,9 @@
 <property>
   <name>http.agent.url</name>
   <value></value>
-  <description>A URL to advertise in the User-Agent header.  This will
+  <description>A URL to advertise in the User-Agent header. This will
   appear in parenthesis after the agent name. Custom dictates that this
-  should be a URL of a page explaining the purpose and behavior of this
+  should be a URL to a page that explains the purpose and behavior of this
   crawler.
   </description>
 </property>
@@ -176,9 +184,9 @@
 <property>
   <name>http.agent.email</name>
   <value></value>
-  <description>An email address to advertise in the HTTP 'From' request
-  header and User-Agent header. A good practice is to mangle this
-  address (e.g. 'info at example dot com') to avoid spamming.
+  <description>An email address to advertise in the HTTP 'User-Agent' (and
+  'From') request headers. A good practice is to mangle this address
+  (e.g. 'info at example dot com') to avoid spamming.
   </description>
 </property>
@@ -202,7 +210,7 @@
   <name>http.agent.rotate.file</name>
   <value>agents.txt</value>
   <description>
-  File containing alternative user agent names to be used instead of
+  File containing alternative user-agent names to be used instead of
   http.agent.name on a rotating basis if http.agent.rotate is true.
   Each line of the file should contain exactly one agent
   specification including name, version, description, URL, etc.
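The reworded http.robots.agents description above reflects a behavioural
change: the rules of all matching agent groups are now combined, rather than
only the first matching group being applied. A small sketch of this behaviour
(not from the commit; robots.txt content and agent names are invented):

    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class CombinedGroupsSketch {
      public static void main(String[] args) {
        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Disallow: /a/",
            "",
            "User-agent: foo-spider",
            "Disallow: /b/").getBytes();

        // Both names match a group, so the rules of both groups apply.
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("mybot", "foo-spider"));

        System.out.println(rules.isAllowed("http://example.com/a/page")); // false
        System.out.println(rules.isAllowed("http://example.com/b/page")); // false
        System.out.println(rules.isAllowed("http://example.com/c/page")); // true
      }
    }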
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 1493bc292..562c2c694 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -24,12 +24,13 @@ import java.io.LineNumberReader;
 import java.lang.invoke.MethodHandles;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.Collection;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.LinkedHashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
-import java.util.StringTokenizer;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -96,7 +97,7 @@ public abstract class RobotRulesParser implements Tool {
   }
 
   protected Configuration conf;
-  protected String agentNames;
+  protected Set<String> agentNames;
 
   /** set of host names or IPs to be explicitly excluded from robots.txt checking */
   protected Set<String> allowList = new HashSet<>();
@@ -114,6 +115,7 @@ public abstract class RobotRulesParser implements Tool {
   /**
    * Set the {@link Configuration} object
    */
+  @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -122,26 +124,30 @@ public abstract class RobotRulesParser implements Tool {
     if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
       throw new RuntimeException("Agent name not configured!");
     }
-    agentNames = agentName;
+    agentNames = new LinkedHashSet<>();
+    if (!agentName.equals("*")) {
+      /*
+       * skip wildcard "*" - crawler-commons' SimpleRobotRulesParser expects an
+       * empty set of agent names to use the wildcard rules
+       */
+      agentNames.add(agentName.toLowerCase());
+    }
 
     // If there are any other agents specified, append those to the list of
     // agents
-    String otherAgents = conf.get("http.robots.agents");
-    if (otherAgents != null && !otherAgents.trim().isEmpty()) {
-      StringTokenizer tok = new StringTokenizer(otherAgents, ",");
-      StringBuilder sb = new StringBuilder(agentNames);
-      while (tok.hasMoreTokens()) {
-        String str = tok.nextToken().trim();
-        if (str.equals("*") || str.equals(agentName)) {
-          // skip wildcard "*" or agent name itself
-          // (required for backward compatibility, cf. NUTCH-1715 and
-          // NUTCH-1718)
+    String[] otherAgents = conf.getStrings("http.robots.agents");
+    if (otherAgents != null && otherAgents.length > 0) {
+      for (String otherAgent : otherAgents) {
+        otherAgent = otherAgent.toLowerCase();
+        if (otherAgent.equals("*") || otherAgent.equalsIgnoreCase(agentName)) {
+          /*
+           * skip wildcard "*" or agent name itself (required for backward
+           * compatibility, cf. NUTCH-1715 and NUTCH-1718)
+           */
         } else {
-          sb.append(",").append(str);
+          agentNames.add(otherAgent);
        }
      }
-
-      agentNames = sb.toString();
     }
 
     String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
@@ -166,6 +172,7 @@ public abstract class RobotRulesParser implements Tool {
   /**
    * Get the {@link Configuration} object
    */
+  @Override
   public Configuration getConf() {
     return conf;
   }
@@ -188,10 +195,10 @@ public abstract class RobotRulesParser implements Tool {
   /**
    * Parses the robots content using the {@link SimpleRobotRulesParser} from
-   * crawler commons
+   * crawler-commons
    *
    * @param url
-   *          A string containing url
+   *          The robots.txt URL
    * @param content
    *          Contents of the robots file in a byte array
    * @param contentType
@@ -201,11 +208,32 @@ public abstract class RobotRulesParser implements Tool {
    *          matching
    * @return BaseRobotRules object
    */
+  @Deprecated
   public BaseRobotRules parseRules(String url, byte[] content,
       String contentType, String robotName) {
     return robotParser.parseContent(url, content, contentType, robotName);
   }
 
+  /**
+   * Parses the robots content using the {@link SimpleRobotRulesParser} from
+   * crawler-commons
+   *
+   * @param url
+   *          The robots.txt URL
+   * @param content
+   *          Contents of the robots file in a byte array
+   * @param contentType
+   *          The content type of the robots file
+   * @param robotNames
+   *          A collection containing all the robots agent names used by parser
+   *          for matching
+   * @return BaseRobotRules object
+   */
+  public BaseRobotRules parseRules(String url, byte[] content,
+      String contentType, Collection<String> robotNames) {
+    return robotParser.parseContent(url, content, contentType, robotNames);
+  }
+
   /**
    * Fetch robots.txt (or it's protocol-specific equivalent) which applies to
    * the given URL, parse it and return the set of robot rules applicable for
@@ -274,8 +302,9 @@ public abstract class RobotRulesParser implements Tool {
         "\tit is allowed by the robots.txt rules. Other parts of the URLs",
         "\t(mainly the host) are ignored.",
         "",
-        "<agent-names>\tcomma-separated list of agent names",
+        "<agent-names>\tuser-agent name (aka. \"product token\")",
         "\tused to select rules from the robots.txt file.",
+        "\tMultiple agent names can be passed as comma-separated string.",
         "\tIf no agent name is given the properties http.agent.name",
         "\tand http.robots.agents are used.",
         "\tIf also http.agent.name and http.robots.agents are empty,",
@@ -353,7 +382,8 @@ public abstract class RobotRulesParser implements Tool {
       }
     }
 
-    System.out.println("Testing robots.txt for agent names: " + agentNames);
+    System.out.println("Testing robots.txt for agent names: "
+        + (agentNames.isEmpty() ? "* (any other agent)" : agentNames));
 
     LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
     String testPath;
@@ -393,6 +423,7 @@ public abstract class RobotRulesParser implements Tool {
    */
   private static class TestRobotRulesParser extends RobotRulesParser {
 
+    @Override
     public void setConf(Configuration conf) {
       /*
        * Make sure that agent name is not empty so that
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 265abf934..202d2d08b 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.protocol.http.api;
 
+import java.util.Set;
+
 import org.junit.Assert;
 import org.junit.Test;
@@ -94,6 +96,64 @@ public class TestRobotRulesParser {
     parser = new HttpRobotRulesParser();
   }
 
+  private void testRulesOnPaths(String agent, String[] paths,
+      boolean[] results) {
+    for (int counter = 0; counter < paths.length; counter++) {
+      boolean res = rules.isAllowed(paths[counter]);
+      Assert.assertTrue(
+          "testing on agent (" + agent + "), and " + "path " + paths[counter]
+              + " got " + res + ", expected " + results[counter],
+          res == results[counter]);
+    }
+  }
+
+  /**
+   * Test that the robots rules are interpreted correctly by the robots rules
+   * parser.
+   */
+  @Test
+  public void testRobotsAgent() {
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
+    testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);
+
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
+    testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);
+
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(MULTIPLE_AGENTS.toLowerCase().split("\\s*,\\s*")));
+    testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
+  }
+
+  /**
+   * Test that the crawl delay is extracted from the robots file for respective
+   * agent. If its not specified for a given agent, default value must be
+   * returned.
+   */
+  @Test
+  public void testCrawlDelay() {
+    // for SINGLE_AGENT1, the crawl delay of 10 seconds, i.e. 10000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT1.toLowerCase()));
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT1 + " : ",
+        (rules.getCrawlDelay() == 10000));
+
+    // for SINGLE_AGENT2, the crawl delay of 20 seconds, i.e. 20000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(SINGLE_AGENT2.toLowerCase()));
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT2 + " : ",
+        (rules.getCrawlDelay() == 20000));
+
+    // for UNKNOWN_AGENT, the default crawl delay must be returned.
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, Set.of(UNKNOWN_AGENT.toLowerCase()));
+    Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+        (rules.getCrawlDelay() == Long.MIN_VALUE));
+  }
+
   /**
    * Test that the robots rules are interpreted correctly by the robots rules
    * parser.
@@ -103,36 +163,15 @@ public class TestRobotRulesParser {
   public void testRobotsAgentDeprecatedAPIMethod() {
     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
         CONTENT_TYPE, SINGLE_AGENT1);
-
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agent (" + SINGLE_AGENT1 + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1[counter]);
-    }
+    testRulesOnPaths(SINGLE_AGENT1, TEST_PATHS, RESULTS_AGENT1);
 
     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
         CONTENT_TYPE, SINGLE_AGENT2);
-
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agent (" + SINGLE_AGENT2 + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT2[counter]);
-    }
+    testRulesOnPaths(SINGLE_AGENT2, TEST_PATHS, RESULTS_AGENT2);
 
     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
         CONTENT_TYPE, MULTIPLE_AGENTS);
-
-    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
-      Assert.assertTrue(
-          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
-              + TEST_PATHS[counter] + " got "
-              + rules.isAllowed(TEST_PATHS[counter]),
-          rules.isAllowed(TEST_PATHS[counter]) == RESULTS_AGENT1_AND_AGENT2[counter]);
-    }
+    testRulesOnPaths(MULTIPLE_AGENTS, TEST_PATHS, RESULTS_AGENT1_AND_AGENT2);
   }
 
   /**
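For completeness, the crawl-delay semantics exercised by testCrawlDelay()
above: the delay is reported in milliseconds, and Long.MIN_VALUE signals that
no delay is set for the matched agent. A sketch with invented robots.txt
content (the test's own ROBOTS_STRING and agent constants are not shown in
this excerpt):

    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class CrawlDelaySketch {
      public static void main(String[] args) {
        byte[] robotsTxt = String.join("\n",
            "User-agent: mybot",
            "Crawl-delay: 10").getBytes();

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

        // "Crawl-delay: 10" (seconds) is reported as 10000 milliseconds.
        BaseRobotRules rules = parser.parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("mybot"));
        System.out.println(rules.getCrawlDelay()); // 10000

        // No group matches "otherbot" and there is no wildcard group, so no
        // crawl delay is defined: Long.MIN_VALUE means "unset".
        rules = parser.parseContent(
            "http://example.com/robots.txt", robotsTxt, "text/plain",
            Set.of("otherbot"));
        System.out.println(rules.getCrawlDelay() == Long.MIN_VALUE); // true
      }
    }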