Author: totaro
Date: Wed May 27 18:09:37 2015
New Revision: 1682090

URL: http://svn.apache.org/r1682090
Log:
Fix for NUTCH-1995: The result of conf.getStrings("http.robot.rules.whitelist") 
is now checked for null, so an unset whitelist property no longer causes a 
NullPointerException when the whitelist is read

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682090&r1=1682089&r2=1682090&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 18:09:37 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
+
 * NUTCH-2013 Fetcher: missing logs "fetching ..." on stdout (snagel)
 
 * NUTCH-2014 Fetcher hang-up on completion (snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1682090&r1=1682089&r2=1682090&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed May 27 18:09:37 2015
@@ -129,18 +129,22 @@ public abstract class RobotRulesParser i
     }
 
     String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
-
-    for (int i = 0; i < confWhiteList.length; i++) {
-      if (confWhiteList[i].isEmpty()) {
-         LOG.info("Empty whitelisted URL skipped!");
-         continue;
-      }
-      whiteList.add(confWhiteList[i]);
+    if (confWhiteList == null) {
+      LOG.info("robots.txt whitelist not configured.");
     }
-    
-    if (whiteList.size() > 0) {
-      matcher = new SuffixStringMatcher(whiteList);
-      LOG.info("Whitelisted hosts: " + whiteList);
+    else {
+      for (int i = 0; i < confWhiteList.length; i++) {
+        if (confWhiteList[i].isEmpty()) {
+         LOG.info("Empty whitelisted URL skipped!");
+         continue;
+        }
+        whiteList.add(confWhiteList[i]);
+      }
+      
+      if (whiteList.size() > 0) {
+        matcher = new SuffixStringMatcher(whiteList);
+        LOG.info("Whitelisted hosts: " + whiteList);
+      }
     }
   }
 


Reply via email to