Author: tejasp
Date: Mon Apr 29 20:26:52 2013
New Revision: 1477319

URL: http://svn.apache.org/r1477319
Log:
NUTCH-1031 Delegate parsing of robots.txt to crawler-commons

Added:
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
    
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Removed:
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/EmptyRobotRules.java
    
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/ivy/ivy.xml
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
    
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
    
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
    
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Apr 29 20:26:52 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)
+
 * NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via 
tejasp)
 
 * NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + 
lewismc)

Modified: nutch/branches/2.x/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Mon Apr 29 20:26:52 2013
@@ -70,6 +70,7 @@
     <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default" />
 
     <dependency org="com.google.guava" name="guava" rev="11.0.2" />
+    <dependency org="com.google.code.crawler-commons" name="crawler-commons" 
rev="0.2" />
 
     <!--Configuration: test -->
 

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java 
Mon Apr 29 20:26:52 2013
@@ -61,6 +61,8 @@ import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 
+import crawlercommons.robots.BaseRobotRules;
+
 public class FetcherReducer
 extends GoraReducer<IntWritable, FetchEntry, String, WebPage> {
 
@@ -152,9 +154,6 @@ extends GoraReducer<IntWritable, FetchEn
       return "FetchItem [queueID=" + queueID + ", url=" + url + ", u=" + u
           + ", page=" + page + "]";
     }
-    
-    
-
   }
 
   /**
@@ -489,8 +488,8 @@ extends GoraReducer<IntWritable, FetchEn
 
             // fetch the page
             final Protocol protocol = 
this.protocolFactory.getProtocol(fit.url);
-            final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);
-            if (!rules.isAllowed(fit.u)) {
+            final BaseRobotRules rules = protocol.getRobotRules(fit.url, 
fit.page);
+            if (!rules.isAllowed(fit.u.toString())) {
               // unblock
               fetchQueues.finishFetchItem(fit, true);
               if (LOG.isDebugEnabled()) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java Mon Apr 
29 20:26:52 2013
@@ -23,6 +23,8 @@ import org.apache.hadoop.conf.Configurab
 import org.apache.nutch.plugin.FieldPluggable;
 import org.apache.nutch.storage.WebPage;
 
+import crawlercommons.robots.BaseRobotRules;
+
 /** A retriever of url content.  Implemented by protocol extensions. */
 public interface Protocol extends FieldPluggable, Configurable {
   /** The name of the extension point. */
@@ -46,7 +48,8 @@ public interface Protocol extends FieldP
    */
   public final static String CHECK_ROBOTS = "protocol.plugin.check.robots";
 
-  /** Returns the {@link Content} for a fetchlist entry.
+  /*
+   * Returns the {@link Content} for a fetchlist entry.
    */
   ProtocolOutput getProtocolOutput(String url, WebPage page);
 
@@ -56,5 +59,5 @@ public interface Protocol extends FieldP
    * @param page
    * @return robot rules (specific for this url or default), never null
    */
-  RobotRules getRobotRules(String url, WebPage page);
+  BaseRobotRules getRobotRules(String url, WebPage page);
 }

Added: 
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1477319&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
(added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java 
Mon Apr 29 20:26:52 2013
@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// JDK imports
+import java.io.File;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.StringTokenizer;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+import com.google.common.io.Files;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+/**
+ * This class uses crawler-commons for handling the parsing of {@code 
robots.txt} files.
+ * It emits SimpleRobotRules objects, which describe the download permissions
+ * as described in SimpleRobotRulesParser.
+ */
+public abstract class RobotRulesParser implements Configurable {
+
+  public static final Logger LOG = 
LoggerFactory.getLogger(RobotRulesParser.class);
+
+  protected static final Hashtable<String, BaseRobotRules> CACHE = new 
Hashtable<String, BaseRobotRules> ();
+
+  /**
+   *  A {@link BaseRobotRules} object appropriate for use
+   *  when the {@code robots.txt} file is empty or missing;
+   *  all requests are allowed.
+   */
+  public static final BaseRobotRules EMPTY_RULES = new 
SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+
+  /**
+   *  A {@link BaseRobotRules} object appropriate for use when the 
+   *  {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+   *  response; all requests are disallowed. 
+   */
+  public static BaseRobotRules FORBID_ALL_RULES = new 
SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+
+  private static SimpleRobotRulesParser robotParser = new 
SimpleRobotRulesParser();
+  private Configuration conf;
+  protected String agentNames;
+
+  public RobotRulesParser() { }
+
+  public RobotRulesParser(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // Grab the agent names we advertise to robots files.
+    String agentName = conf.get("http.agent.name");
+    if (null == agentName) {
+      throw new RuntimeException("Agent name not configured!");
+    }
+
+    String agentNames = conf.get("http.robots.agents");
+    StringTokenizer tok = new StringTokenizer(agentNames, ",");
+    ArrayList<String> agents = new ArrayList<String>();
+    while (tok.hasMoreTokens()) {
+      agents.add(tok.nextToken().trim());
+    }
+
+    /**
+     * If there are no agents for robots-parsing, use the
+     * default agent-string. If both are present, our agent-string
+     * should be the first one we advertise to robots-parsing.
+     */
+    if (agents.size() == 0) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("No agents listed in 'http.robots.agents' property!");
+      }
+    } else { 
+      StringBuffer combinedAgentsString = new StringBuffer(agentName);
+      int index = 0;
+
+      if ((agents.get(0)).equalsIgnoreCase(agentName))
+        index++;
+      else if (LOG.isErrorEnabled()) {
+        LOG.error("Agent we advertise (" + agentName
+            + ") not listed first in 'http.robots.agents' property!");
+      }
+
+      // append all the agents from the http.robots.agents property
+      for(; index < agents.size(); index++) {
+        combinedAgentsString.append(", " + agents.get(index));
+      }
+
+      // always make sure "*" is included in the end
+      combinedAgentsString.append(", *");
+      this.agentNames = combinedAgentsString.toString();
+    }
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Parses the robots content using the {@link SimpleRobotRulesParser} from 
crawler commons
+   *    
+   * @param url A string containing url
+   * @param content Contents of the robots file in a byte array 
+   * @param contentType The content type (e.g. {@code text/plain}) of the robots file
+   * @param robotName A string containing the name(s) of the robot/agent to match rules for
+   * @return BaseRobotRules object 
+   */
+  public BaseRobotRules parseRules (String url, byte[] content, String 
contentType, String robotName) {
+    return robotParser.parseContent(url, content, contentType, robotName); 
+  }
+
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
+    URL u = null;
+    try {
+      u = new URL(url);
+    } catch (Exception e) {
+      return EMPTY_RULES;
+    }
+    return getRobotRulesSet(protocol, u);
+  }
+
+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+
+  /** command-line main for testing */
+  public static void main(String[] argv) {
+
+    if (argv.length < 3) {
+      System.err.println("Usage: RobotRulesParser <robots-file> <url-file> 
<agent-names>\n");
+      System.err.println("    <robots-file> - Input robots.txt file which will 
be parsed.");
+      System.err.println("    <url-file>    - Contains input URLs (1 per line) 
which are tested against the rules.");
+      System.err.println("    <agent-names> - Input agent name. Multiple agent 
names can be specified using spaces.");
+      System.exit(-1);
+    }
+
+    try {
+      StringBuilder agentNames = new StringBuilder();
+      for(int counter = 2; counter < argv.length; counter++) 
+        agentNames.append(argv[counter]).append(",");
+
+      agentNames.deleteCharAt(agentNames.length()-1);
+
+      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, 
"text/plain", agentNames.toString());
+
+      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+      String testPath = testsIn.readLine().trim();
+      while (testPath != null) {
+        System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not 
allowed") +
+            ":\t" + testPath);
+        testPath = testsIn.readLine();
+      }
+      testsIn.close();
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}

Modified: 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Mon Apr 29 20:26:52 2013
@@ -32,23 +32,21 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatusCodes;
 import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.DeflateUtils;
 import org.apache.nutch.util.MimeUtil;
 
-/**
- * @author J&eacute;r&ocirc;me Charron
- */
-public abstract class HttpBase implements Protocol {
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
 
+public abstract class HttpBase implements Protocol {
 
   public static final int BUFFER_SIZE = 8 * 1024;
 
   private static final byte[] EMPTY_CONTENT = new byte[0];
 
-  private RobotRulesParser robots = null;
+  private HttpRobotRulesParser robots = null;
 
   /** The proxy hostname. */
   protected String proxyHost = null;
@@ -102,7 +100,7 @@ public abstract class HttpBase implement
     if (logger != null) {
       this.logger = logger;
     }
-    robots = new RobotRulesParser();
+    robots = new HttpRobotRulesParser();
   }
 
   // Inherited Javadoc
@@ -128,13 +126,10 @@ public abstract class HttpBase implement
     return this.conf;
   }
 
-
-
   public ProtocolOutput getProtocolOutput(String url, WebPage page) {
 
     try {
       URL u = new URL(url);
-      String host = null;
       Response response = getResponse(u, page, false); // make a request
       int code = response.getCode();
       byte[] content = response.getContent();
@@ -145,7 +140,6 @@ public abstract class HttpBase implement
 
       if (code == 200) { // got a good response
         return new ProtocolOutput(c); // return it
-
       } else if (code == 410) { // page is gone
         return new ProtocolOutput(c,
             ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " 
+ code + " url=" + url));
@@ -206,8 +200,6 @@ public abstract class HttpBase implement
   /* -------------------------- *
    * </implementation:Protocol> *
    * -------------------------- */
-
-
   public String getProxyHost() {
     return proxyHost;
   }
@@ -367,10 +359,6 @@ public abstract class HttpBase implement
         url = args[i];
     }
 
-    //    if (verbose) {
-    //      LOGGER.setLevel(Level.FINE);
-    //    }
-
     ProtocolOutput out = http.getProtocolOutput(url, new WebPage());
     Content content = out.getContent();
 
@@ -383,17 +371,14 @@ public abstract class HttpBase implement
       String text = new String(content.getContent());
       System.out.println(text);
     }
-
   }
 
-
   protected abstract Response getResponse(URL url,
       WebPage page, boolean followRedirects)
   throws ProtocolException, IOException;
 
   @Override
-  public RobotRules getRobotRules(String url, WebPage page) {
+  public BaseRobotRules getRobotRules(String url, WebPage page) {
     return robots.getRobotRulesSet(this, url);
   }
-
 }

Added: 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1477319&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 Mon Apr 29 20:26:52 2013
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.storage.WebPage;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and contains 
+ * Http protocol specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+  
+  public static final Logger LOG = 
LoggerFactory.getLogger(HttpRobotRulesParser.class);
+  protected boolean allowForbidden = false;
+
+  HttpRobotRulesParser() { }
+
+  public HttpRobotRulesParser(Configuration conf) {
+    super(conf);
+    allowForbidden = conf.getBoolean("http.robots.403.allow", false);
+  }
+
+  /**
+   * For hosts whose robots rules have not yet been cached, this sends an
+   * HTTP request to the host corresponding to the {@link URL} passed,
+   * fetches the robots file, parses the rules, and caches the rules object
+   * to avoid re-work in the future.
+   * 
+   *  @param http The {@link Protocol} object
+   *  @param url URL 
+   *  
+   *  @return robotRules A {@link BaseRobotRules} object for the rules
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower 
case
+    String host = url.getHost().toLowerCase();          // normalize to lower 
case
+
+    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + 
host);
+
+    boolean cacheRule = true;
+    
+    if (robotRules == null) {                     // cache miss
+      URL redir = null;
+      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+      try {
+        Response response = ((HttpBase)http).getResponse(new URL(url, 
"/robots.txt"),
+                                             new WebPage(), true);
+        // try one level of redirection ?
+        if (response.getCode() == 301 || response.getCode() == 302) {
+          String redirection = response.getHeader("Location");
+          if (redirection == null) {
+            // some versions of MS IIS are known to mangle this header
+            redirection = response.getHeader("location");
+          }
+          if (redirection != null) {
+            if (!redirection.startsWith("http")) {
+              // RFC says it should be absolute, but apparently it isn't
+              redir = new URL(url, redirection);
+            } else {
+              redir = new URL(redirection);
+            }
+            
+            response = ((HttpBase)http).getResponse(redir, new WebPage(), 
true);
+          }
+        }
+
+        if (response.getCode() == 200)               // found rules: parse them
+          robotRules =  parseRules(url.toString(), response.getContent(), 
+                                   response.getHeader("Content-Type"), 
+                                   agentNames);
+
+        else if ( (response.getCode() == 403) && (!allowForbidden) )
+          robotRules = FORBID_ALL_RULES;            // use forbid all
+        else if (response.getCode() >= 500) {
+          cacheRule = false;
+          robotRules = EMPTY_RULES;
+        }else                                        
+          robotRules = EMPTY_RULES;                 // use default rules
+      } catch (Throwable t) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        }
+        cacheRule = false;
+        robotRules = EMPTY_RULES;
+      }
+
+      if (cacheRule) {
+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host
+        if (redir != null && !redir.getHost().equals(host)) {
+          // cache also for the redirected host
+          CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+        }
+      }
+    }
+    return robotRules;
+  }
+}

Modified: 
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 Mon Apr 29 20:26:52 2013
@@ -17,292 +17,100 @@
 
 package org.apache.nutch.protocol.http.api;
 
-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;
-
+import crawlercommons.robots.BaseRobotRules;
 import junit.framework.TestCase;
 
+/**
+ * JUnit test case which tests
+ * 1. that robots filtering is performed correctly as per the agent name
+ * 2. that crawl delay is extracted correctly from the robots file
+ *
+ */
 public class TestRobotRulesParser extends TestCase {
-  private static final String LF= "\n";
-  private static final String CR= "\r";
-  private static final String CRLF= "\r\n";
+
+  private static final String CONTENT_TYPE = "text/plain";
+  private static final String SINGLE_AGENT = "Agent1";
+  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+  private static final String UNKNOWN_AGENT = "AgentABC";
+  private static final String CR = "\r";
   
-  private static final boolean[] ACCEPT_ALL = {
-    true,   // "/a",         
-    true,   // "/a/",        
-    true,   // "/a/bloh/foo.html"
-    true,   // "/b",         
-    true,   // "/b/a",       
-    true,   // "/b/a/index.html",
-    true,   // "/b/b/foo.html",  
-    true,   // "/c",         
-    true,   // "/c/a",       
-    true,   // "/c/a/index.html",
-    true,   // "/c/b/foo.html",  
-    true,   // "/d",         
-    true,   // "/d/a",       
-    true,   // "/e/a/index.html",
-    true,   // "/e/d",       
-    true,   // "/e/d/foo.html",  
-    true,   // "/e/doh.html",    
-    true,   // "/f/index.html",  
-    true,   // "/foo/bar.html",  
-    true,   // "/f/",
-  };
+  private static final String ROBOTS_STRING = 
+      "User-Agent: Agent1 #foo" + CR 
+      + "Disallow: /a" + CR 
+      + "Disallow: /b/a" + CR 
+      + "#Disallow: /c" + CR 
+      + "Crawl-delay: 10" + CR  // set crawl delay for Agent1 as 10 sec
+      + "" + CR 
+      + "" + CR 
+      + "User-Agent: Agent2" + CR 
+      + "Disallow: /a/bloh" + CR 
+      + "Disallow: /c" + CR
+      + "Disallow: /foo" + CR
+      + "Crawl-delay: 20" + CR
+      + "" + CR 
+      + "User-Agent: *" + CR 
+      + "Disallow: /foo/bar/" + CR;   // no crawl delay for other agents
   
-  private static final String[] ROBOTS_STRINGS= new String[] {
-    "User-Agent: Agent1 #foo" + CR 
-    + "Disallow: /a" + CR 
-    + "Disallow: /b/a" + CR 
-    + "#Disallow: /c" + CR 
-    + "" + CR 
-    + "" + CR 
-    + "User-Agent: Agent2 Agent3#foo" + CR 
-    + "User-Agent: Agent4" + CR 
-    + "Disallow: /d" + CR 
-    + "Disallow: /e/d/" + CR
-    + "" + CR 
-    + "User-Agent: *" + CR 
-    + "Disallow: /foo/bar/" + CR,
-    null  // Used to test EMPTY_RULES
+  private static final String[] TEST_PATHS = new String[] {
    "http://example.com/a",
    "http://example.com/a/bloh/foo.html",
    "http://example.com/b",
    "http://example.com/c",
    "http://example.com/b/a/index.html",
    "http://example.com/foo/bar/baz.html"
+  };
+
+  private static final boolean[] RESULTS = new boolean[] {
+    false,  //  /a
+    false,  //  /a/bloh/foo.html
+    true,   //  /b
+    true,   //  /c
+    false,  //  /b/a/index.html
+    true    //  /foo/bar/baz.html
   };
 
-  private static final String[] AGENT_STRINGS= new String[] {
-    "Agent1",
-    "Agent2",
-    "Agent3",
-    "Agent4",
-    "Agent5",
-  };
-
-  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
-    { 
-      false, 
-      false,
-      false,
-      false,
-      true,
-    },
-    { 
-      false, 
-      false,
-      false,
-      false,
-      true,
-    }    
-  };
+  private HttpRobotRulesParser parser;
+  private BaseRobotRules rules;
 
-  private static final String[] TEST_PATHS= new String[] {
-    "/a",
-    "/a/",
-    "/a/bloh/foo.html",
-    "/b",
-    "/b/a",
-    "/b/a/index.html",
-    "/b/b/foo.html",
-    "/c",
-    "/c/a",
-    "/c/a/index.html",
-    "/c/b/foo.html",
-    "/d",
-    "/d/a",
-    "/e/a/index.html",
-    "/e/d",
-    "/e/d/foo.html",
-    "/e/doh.html",
-    "/f/index.html",
-    "/foo/bar/baz.html",  
-    "/f/",
-  };
-
-  private static final boolean[][][] ALLOWED= new boolean[][][] {
-    { // ROBOTS_STRINGS[0]
-      { // Agent1
-       false,  // "/a",              
-       false,  // "/a/",             
-       false,  // "/a/bloh/foo.html"
-       true,   // "/b",              
-       false,  // "/b/a",            
-       false,  // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       true,   // "/d",              
-       true,   // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       true,   // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      }, 
-      { // Agent2
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       false,  // "/d",              
-       false,  // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       false,  // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      },
-      { // Agent3
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       false,  // "/d",              
-       false,  // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       false,  // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      },
-      { // Agent4
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       false,  // "/d",              
-       false,  // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       false,  // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      },
-      { // Agent5/"*"
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       true,   // "/d",              
-       true,   // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       true,   // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       false,  // "/foo/bar.html",  
-       true,   // "/f/",  
-      }
-    },
-    { // ROBOTS_STRINGS[1]
-      ACCEPT_ALL, // Agent 1
-      ACCEPT_ALL, // Agent 2
-      ACCEPT_ALL, // Agent 3
-      ACCEPT_ALL, // Agent 4
-      ACCEPT_ALL, // Agent 5
-    }
-  };
- 
   public TestRobotRulesParser(String name) {
     super(name);
+    parser = new HttpRobotRulesParser();
   }
 
-  public void testRobotsOneAgent() {
-    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
-      for (int j= 0; j < AGENT_STRINGS.length; j++) {
-       testRobots(i, new String[] { AGENT_STRINGS[j] },
-                  TEST_PATHS, ALLOWED[i][j]);
-      }
+  /**
+  * Test that the robots rules are interpreted correctly by the robots rules 
parser. 
+  */
+  public void testRobotsAgent() {
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, SINGLE_AGENT);
+
+    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+      assertTrue("testing on agent (" + SINGLE_AGENT + "), and " 
+              + "path " + TEST_PATHS[counter] 
+              + " got " + rules.isAllowed(TEST_PATHS[counter]),
+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
-  }
 
-  public void testRobotsTwoAgents() {
-    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
-      for (int j= 0; j < AGENT_STRINGS.length; j++) {
-       for (int k= 0; k < AGENT_STRINGS.length; k++) {
-         int key= j;
-         if (NOT_IN_ROBOTS_STRING[i][j])
-           key= k;
-         testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
-                    TEST_PATHS, ALLOWED[i][key]);
-       }
-      }
-    }
-  }
-  
-  public void testCrawlDelay() {
-    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
-    String delayRule1 = "User-agent: nutchbot" + CR +
-                        "Crawl-delay: 10" + CR +
-                        "User-agent: foobot" + CR +
-                        "Crawl-delay: 20" + CR +
-                        "User-agent: *" + CR + 
-                        "Disallow:/baz" + CR;
-    String delayRule2 = "User-agent: foobot" + CR +
-                        "Crawl-delay: 20" + CR +
-                        "User-agent: *" + CR + 
-                        "Disallow:/baz" + CR;
-    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
-    long crawlDelay = rules.getCrawlDelay();
-    assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay 
== 10000));
-    rules = p.parseRules(delayRule2.getBytes());
-    crawlDelay = rules.getCrawlDelay();
-    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay 
== -1));
-  }
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, MULTIPLE_AGENTS);
 
-  // helper
-
-  public void testRobots(int robotsString, String[] agents, String[] paths, 
-                        boolean[] allowed) {
-    String agentsString= agents[0];
-    for (int i= 1; i < agents.length; i++)
-      agentsString= agentsString + "," + agents[i];
-    RobotRulesParser p= new RobotRulesParser(agents);
-    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
-                                     ? ROBOTS_STRINGS[robotsString].getBytes()
-                                     : null);
-    for (int i= 0; i < paths.length; i++) {
-      assertTrue("testing robots file "+robotsString+", on agents ("
-                + agentsString + "), and path " + TEST_PATHS[i] + "; got " 
-                + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
-                                  + rules,
-                rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
+    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+      assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " 
+              + "path " + TEST_PATHS[counter] 
+              + " got " + rules.isAllowed(TEST_PATHS[counter]),
+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
   }
 
-
-  
+  /**
+  * Test that the crawl delay is extracted from the robots file for respective 
agent. 
+  * If it's not specified for a given agent, the default value must be returned.
+  */
+  public void testCrawlDelay() {
+    // for SINGLE_AGENT, the crawl delay of 10 sec i.e. 10000 msec must be 
returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, SINGLE_AGENT);
+    assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", 
(rules.getCrawlDelay() == 10000));
+    
+    // for UNKNOWN_AGENT, the default crawl delay must be returned.
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, UNKNOWN_AGENT);
+    assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", 
(rules.getCrawlDelay() == Long.MIN_VALUE));
+  }
 }

Modified: 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Mon Apr 29 20:26:52 2013
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.file;
 
 import java.net.URL;
@@ -23,28 +22,30 @@ import java.util.HashSet;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatusCodes;
 import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
 import org.apache.nutch.storage.ProtocolStatus;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.NutchConfiguration;
 
-/************************************
- * File.java deals with file: scheme.
- * 
- * Configurable parameters are defined under "FILE properties" section in
- * ./conf/nutch-default.xml or similar.
- * 
- * @author John Xing
- ***********************************/
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is a protocol plugin used for file: scheme.
+ * It creates {@link FileResponse} object and gets the content of the url from 
it.
+ * Configurable parameters are {@code file.content.limit} and {@code 
file.crawl.parent} 
+ * in nutch-default.xml defined under "file properties" section.
+ */
 public class File implements Protocol {
 
   public static final Logger LOG = LoggerFactory.getLogger(File.class);
@@ -65,14 +66,40 @@ public class File implements Protocol {
   private Configuration conf;
 
   // constructor
-  public File() {
-  }
+  public File() { }
 
-  /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {
-    maxContentLength = length;
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
   }
-
+  
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+    
+  /** 
+   * Set the point at which content is truncated. 
+   */
+  public void setMaxContentLength(int maxContentLength) {
+    this.maxContentLength = maxContentLength;
+  }
+  
+  /** 
+   * Creates a {@link FileResponse} object corresponding to the url and 
+   * returns a {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url String containing the url
+   * @param page The WebPage object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the content of the file 
indicated by url
+   */
   public ProtocolOutput getProtocolOutput(String url, WebPage page) {
     String urlString = url.toString();
     try {
@@ -82,16 +109,11 @@ public class File implements Protocol {
 
       while (true) {
         FileResponse response;
-        response = new FileResponse(u, page, this, getConf()); // make
-        // a
-        // request
-
+        response = new FileResponse(u, page, this, getConf()); // make a 
request
         int code = response.getCode();
 
         if (code == 200) { // got a good response
-          return new ProtocolOutput(response.toContent()); // return
-          // it
-
+          return new ProtocolOutput(response.toContent()); // return it
         } else if (code >= 300 && code < 400) { // handle redirect
           if (redirects == MAX_REDIRECTS)
             throw new FileException("Too many redirects: " + url);
@@ -114,16 +136,13 @@ public class File implements Protocol {
   }
 
   @Override
-  public RobotRules getRobotRules(String url, WebPage page) {
-    return EmptyRobotRules.RULES;
-  }
-
-  @Override
   public Collection<Field> getFields() {
     return FIELDS;
   }
 
-  /** For debugging. */
+  /** 
+   * Quick way for running this class. Useful for debugging. 
+   */
   public static void main(String[] args) throws Exception {
     int maxContentLength = Integer.MIN_VALUE;
     boolean dumpContent = false;
@@ -154,9 +173,6 @@ public class File implements Protocol {
     if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
       file.setMaxContentLength(maxContentLength);
 
-    // set log level
-    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-
     Content content = file.getProtocolOutput(urlString, new WebPage())
         .getContent();
 
@@ -172,13 +188,11 @@ public class File implements Protocol {
     file = null;
   }
 
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
-    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
+  /** 
+   * No robots parsing is done for file protocol. 
+   * So this returns a set of empty rules which will allow every url.
+   */
+  public BaseRobotRules getRobotRules(String url, WebPage page) {
+    return RobotRulesParser.EMPTY_RULES;
+  }   
 }

Modified: 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
 Mon Apr 29 20:26:52 2013
@@ -28,23 +28,27 @@ import org.apache.commons.net.ftp.FTPFil
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatusCodes;
 import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.storage.ProtocolStatus;
 import org.apache.nutch.storage.WebPage;
 
-/************************************
- * Ftp.java deals with ftp: scheme.
- * 
- * Configurable parameters are defined under "FTP properties" section in
- * ./conf/nutch-default.xml or similar.
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is a protocol plugin used for ftp: scheme.
+ * It creates {@link FtpResponse} object and gets the content of the url from 
it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ *                             {@code ftp.content.limit}, {@code ftp.timeout}, 
+ *                             {@code ftp.server.timeout}, {@code 
ftp.password}, 
+ *                             {@code ftp.keep.connection} and {@code 
ftp.follow.talk}.
+ * For details see "FTP properties" section in {@code nutch-default.xml}.
  * 
  * @author John Xing
- ***********************************/
+ */
 public class Ftp implements Protocol {
 
   public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
@@ -109,6 +113,15 @@ public class Ftp implements Protocol {
     this.keepConnection = keepConnection;
   }
 
+  /** 
+   * Creates a {@link FtpResponse} object corresponding to the url and 
+   * returns a {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url String containing the ftp url
+   * @param page The WebPage object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the url
+   */
   public ProtocolOutput getProtocolOutput(String url, WebPage page) {
     try {
       URL u = new URL(url);
@@ -154,6 +167,9 @@ public class Ftp implements Protocol {
     }
   }
 
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
@@ -165,14 +181,13 @@ public class Ftp implements Protocol {
     this.followTalk = conf.getBoolean("ftp.follow.talk", false);
   }
 
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return this.conf;
   }
 
-  public RobotRules getRobotRules(String url, WebPage page) {
-    return EmptyRobotRules.RULES;
-  }
-
   /** For debugging. */
   public static void main(String[] args) throws Exception {
     int timeout = Integer.MIN_VALUE;
@@ -222,9 +237,6 @@ public class Ftp implements Protocol {
     if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
       ftp.setMaxContentLength(maxContentLength);
 
-    // set log level
-    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
-
     Content content = ftp.getProtocolOutput(urlString, new WebPage())
         .getContent();
 
@@ -244,4 +256,12 @@ public class Ftp implements Protocol {
     return FIELDS;
   }
 
+  /** 
+   * Currently, no robots parsing is done for the ftp protocol 
+   * and this returns a set of empty rules which will allow every url.
+   * There is a jira logged for this: NUTCH-1513.
+   */
+  public BaseRobotRules getRobotRules(String url, WebPage page) {
+    return RobotRulesParser.EMPTY_RULES;
+  }
 }

Modified: 
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1477319&r1=1477318&r2=1477319&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
 Mon Apr 29 20:26:52 2013
@@ -38,9 +38,9 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.protocol.RobotRulesParser;
 
 //JSCH imports
 import com.jcraft.jsch.ChannelSftp;
@@ -50,6 +50,8 @@ import com.jcraft.jsch.Session;
 import com.jcraft.jsch.SftpException;
 import com.jcraft.jsch.ChannelSftp.LsEntry;
 
+import crawlercommons.robots.BaseRobotRules;
+
 /**
  * This class uses the Jsch package to fetch content using the Sftp protocol.
  * 
@@ -224,10 +226,16 @@ public class Sftp implements Protocol {
     }
   }
 
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return configuration;
   }
 
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration arg0) {
     configuration = arg0;
 
@@ -288,34 +296,9 @@ public class Sftp implements Protocol {
     }
   }
 
-  /*
-   * (non-Javadoc)
-   * 
-   * @see org.apache.nutch.protocol.Protocol#getRobotRules(java.lang.String,
-   * org.apache.nutch.storage.WebPage)
-   */
   @Override
-  public RobotRules getRobotRules(String url, WebPage page) {
-    return new RobotRules() {
-
-      @Override
-      public boolean isAllowed(URL url) {
-        // they're all allowed for now.
-        return true;
-      }
-
-      @Override
-      public long getExpireTime() {
-        // set to 0 for never expire
-        return 0;
-      }
-
-      @Override
-      public long getCrawlDelay() {
-        // no delay
-        return 0;
-      }
-    };
+  public BaseRobotRules getRobotRules(String url, WebPage page) {
+    return RobotRulesParser.EMPTY_RULES;
   }
 
   /*
@@ -327,5 +310,4 @@ public class Sftp implements Protocol {
   public Collection<Field> getFields() {
     return Collections.emptySet();
   }
-
 }


Reply via email to