Author: totaro
Date: Wed May 27 00:10:34 2015
New Revision: 1681894
URL: http://svn.apache.org/r1681894
Log:
NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
Modified:
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1681894&r1=1681893&r2=1681894&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed May 27 00:10:34 2015
@@ -118,15 +118,6 @@
</property>
<property>
- <name>http.robot.rules.whitelist</name>
- <value></value>
- <description>Comma separated list of hostnames or IP addresses to ignore
- robot rules parsing for. Use with care and only if you are explicitly
- allowed by the site owner to ignore the site's robots.txt!
- </description>
-</property>
-
-<property>
<name>http.robot.rules.whitelist</name>
<value></value>
<description>Comma separated list of hostnames or IP addresses to ignore
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1681894&r1=1681893&r2=1681894&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed
May 27 00:10:34 2015
@@ -25,7 +25,6 @@ import java.io.InputStream;
import java.io.LineNumberReader;
import java.net.MalformedURLException;
import java.net.URL;
-import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
@@ -42,6 +41,7 @@ import org.apache.hadoop.util.StringUtil
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.SuffixStringMatcher;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
@@ -62,7 +62,7 @@ public abstract class RobotRulesParser i
.getLogger(RobotRulesParser.class);
protected static final Hashtable<String, BaseRobotRules> CACHE = new
Hashtable<String, BaseRobotRules>();
-
+
/**
* A {@link BaseRobotRules} object appropriate for use when the
* {@code robots.txt} file is empty or missing; all requests are allowed.
@@ -83,8 +83,10 @@ public abstract class RobotRulesParser i
protected String agentNames;
/** set of host names or IPs to be explicitly excluded from robots.txt
checking */
- protected Set<String> whiteList = new HashSet<String>();;
-
+ protected Set<String> whiteList = new HashSet<String>();
+
+ /* Matcher used for efficiently matching URLs against a set of suffixes. */
+ private SuffixStringMatcher matcher = null;
public RobotRulesParser() {
}
@@ -127,8 +129,17 @@ public abstract class RobotRulesParser i
}
String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
- if (confWhiteList != null && confWhiteList.length > 0) {
- whiteList.addAll(Arrays.asList(confWhiteList));
+
+ for (int i = 0; i < confWhiteList.length; i++) {
+ if (confWhiteList[i].isEmpty()) {
+ LOG.info("Empty whitelisted URL skipped!");
+ continue;
+ }
+ whiteList.add(confWhiteList[i]);
+ }
+
+ if (whiteList.size() > 0) {
+ matcher = new SuffixStringMatcher(whiteList);
LOG.info("Whitelisted hosts: " + whiteList);
}
}
@@ -140,12 +151,18 @@ public abstract class RobotRulesParser i
return conf;
}
-
/**
* Check whether a URL belongs to a whitelisted host.
*/
public boolean isWhiteListed(URL url) {
- return whiteList.contains(url.getHost());
+ boolean match = false;
+ String urlString = url.getHost();
+
+ if (matcher != null) {
+ match = matcher.matches(urlString);
+ }
+
+ return match;
}
/**