Author: totaro
Date: Wed May 27 18:09:37 2015
New Revision: 1682090
URL: http://svn.apache.org/r1682090
Log:
Fix for NUTCH-1995: The result of conf.getStrings("http.robot.rules.whitelist")
is now checked for null
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682090&r1=1682089&r2=1682090&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 18:09:37 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
+
* NUTCH-2013 Fetcher: missing logs "fetching ..." on stdout (snagel)
* NUTCH-2014 Fetcher hang-up on completion (snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1682090&r1=1682089&r2=1682090&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed May 27 18:09:37 2015
@@ -129,18 +129,22 @@ public abstract class RobotRulesParser i
}
String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
-
- for (int i = 0; i < confWhiteList.length; i++) {
- if (confWhiteList[i].isEmpty()) {
- LOG.info("Empty whitelisted URL skipped!");
- continue;
- }
- whiteList.add(confWhiteList[i]);
+ if (confWhiteList == null) {
+ LOG.info("robots.txt whitelist not configured.");
}
-
- if (whiteList.size() > 0) {
- matcher = new SuffixStringMatcher(whiteList);
- LOG.info("Whitelisted hosts: " + whiteList);
+ else {
+ for (int i = 0; i < confWhiteList.length; i++) {
+ if (confWhiteList[i].isEmpty()) {
+ LOG.info("Empty whitelisted URL skipped!");
+ continue;
+ }
+ whiteList.add(confWhiteList[i]);
+ }
+
+ if (whiteList.size() > 0) {
+ matcher = new SuffixStringMatcher(whiteList);
+ LOG.info("Whitelisted hosts: " + whiteList);
+ }
}
}