Author: totaro
Date: Wed May 27 00:10:34 2015
New Revision: 1681894
URL: http://svn.apache.org/r1681894
Log:
NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist
Modified:
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1681894&r1=1681893&r2=1681894&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed May 27 00:10:34 2015
@@ -118,15 +118,6 @@
</property>
<property>
- <name>http.robot.rules.whitelist</name>
- <value></value>
- <description>Comma separated list of hostnames or IP addresses to ignore
- robot rules parsing for. Use with care and only if you are explicitly
- allowed by the site owner to ignore the site's robots.txt!
- </description>
-</property>
-
-<property>
<name>http.robot.rules.whitelist</name>
<value></value>
<description>Comma separated list of hostnames or IP addresses to ignore
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1681894&r1=1681893&r2=1681894&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed
May 27 00:10:34 2015
@@ -25,7 +25,6 @@ import java.io.InputStream;
import java.io.LineNumberReader;
import java.net.MalformedURLException;
import java.net.URL;
-import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
@@ -42,6 +41,7 @@ import org.apache.hadoop.util.StringUtil
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.SuffixStringMatcher;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
@@ -62,7 +62,7 @@ public abstract class RobotRulesParser i
.getLogger(RobotRulesParser.class);
protected static final Hashtable<String, BaseRobotRules> CACHE = new
Hashtable<String, BaseRobotRules>();
-
+
/**
* A {@link BaseRobotRules} object appropriate for use when the
* {@code robots.txt} file is empty or missing; all requests are allowed.
@@ -83,8 +83,10 @@ public abstract class RobotRulesParser i
protected String agentNames;
/** set of host names or IPs to be explicitly excluded from robots.txt
checking */
- protected Set<String> whiteList = new HashSet<String>();;
-
+ protected Set<String> whiteList = new HashSet<String>();
+
+ /* Matcher used for efficiently matching URLs against a set of suffixes. */
+ private SuffixStringMatcher matcher = null;
public RobotRulesParser() {
}
@@ -127,8 +129,17 @@ public abstract class RobotRulesParser i
}
String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
- if (confWhiteList != null && confWhiteList.length > 0) {
- whiteList.addAll(Arrays.asList(confWhiteList));
+
+ for (int i = 0; i < confWhiteList.length; i++) {
+ if (confWhiteList[i].isEmpty()) {
+ LOG.info("Empty whitelisted URL skipped!");
+ continue;
+ }
+ whiteList.add(confWhiteList[i]);
+ }
+
+ if (whiteList.size() > 0) {
+ matcher = new SuffixStringMatcher(whiteList);
LOG.info("Whitelisted hosts: " + whiteList);
}
}
@@ -140,12 +151,18 @@ public abstract class RobotRulesParser i
return conf;
}
-
/**
* Check whether a URL belongs to a whitelisted host.
*/
public boolean isWhiteListed(URL url) {
- return whiteList.contains(url.getHost());
+ boolean match = false;
+ String urlString = url.getHost();
+
+ if (matcher != null) {
+ match = matcher.matches(urlString);
+ }
+
+ return match;
}
/**