Author: tejasp
Date: Fri Apr  5 23:50:56 2013
New Revision: 1465159

URL: http://svn.apache.org/r1465159
Log:
NUTCH-1031 Delegate parsing of robots.txt to crawler-commons

Added:
    nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
    
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Removed:
    nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java
    
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
    
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
    
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr  5 23:50:56 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)
+
 * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)
 
 * NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)

Modified: nutch/trunk/ivy/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Fri Apr  5 23:50:56 2013
@@ -74,6 +74,7 @@
                <dependency org="oro" name="oro" rev="2.0.8" />
 
                <dependency org="com.google.guava" name="guava" rev="11.0.2" />
+                <dependency org="com.google.code.crawler-commons" 
name="crawler-commons" rev="0.2" />
 
                <!--Configuration: test -->
 

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Apr  5 
23:50:56 2013
@@ -51,6 +51,7 @@ import org.apache.nutch.scoring.ScoringF
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.*;
 
+import crawlercommons.robots.BaseRobotRules;
 
 /**
  * A queue-based fetcher.
@@ -671,8 +672,8 @@ public class Fetcher extends Configured 
               }
               redirecting = false;
               Protocol protocol = 
this.protocolFactory.getProtocol(fit.url.toString());
-              RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
-              if (!rules.isAllowed(fit.u)) {
+              BaseRobotRules rules = protocol.getRobotRules(fit.url, 
fit.datum);
+              if (!rules.isAllowed(fit.u.toString())) {
                 // unblock
                 fetchQueues.finishFetchItem(fit, true);
                 if (LOG.isDebugEnabled()) {

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Fri Apr  5 
23:50:56 2013
@@ -25,6 +25,8 @@ import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.plugin.Pluggable;
 
+import crawlercommons.robots.BaseRobotRules;
+
 
 /** A retriever of url content.  Implemented by protocol extensions. */
 public interface Protocol extends Pluggable, Configurable {
@@ -59,5 +61,6 @@ public interface Protocol extends Plugga
    * @param datum page datum
    * @return robot rules (specific for this url or default), never null
    */
-  RobotRules getRobotRules(Text url, CrawlDatum datum);
+  BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
 }
+

Added: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1465159&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri 
Apr  5 23:50:56 2013
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// JDK imports
+import java.io.File;
+import java.io.FileReader;
+import java.io.LineNumberReader;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.StringTokenizer;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+import com.google.common.io.Files;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+/**
+ * This class uses crawler-commons for handling the parsing of {@code 
robots.txt} files.
+ * It emits SimpleRobotRules objects, which describe the download permissions
+ * as described in SimpleRobotRulesParser.
+ */
+public abstract class RobotRulesParser implements Configurable {
+
+  public static final Logger LOG = 
LoggerFactory.getLogger(RobotRulesParser.class);
+
+  protected static final Hashtable<String, BaseRobotRules> CACHE = new 
Hashtable<String, BaseRobotRules> ();
+
+  /**
+   *  A {@link BaseRobotRules} object appropriate for use
+   *  when the {@code robots.txt} file is empty or missing;
+   *  all requests are allowed.
+   */
+  public static final BaseRobotRules EMPTY_RULES = new 
SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
+
+  /**
+   *  A {@link BaseRobotRules} object appropriate for use when the 
+   *  {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+   *  response; all requests are disallowed. 
+   */
+  public static BaseRobotRules FORBID_ALL_RULES = new 
SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
+
+  private static SimpleRobotRulesParser robotParser = new 
SimpleRobotRulesParser();
+  private Configuration conf;
+  protected String agentNames;
+
+  public RobotRulesParser() { }
+
+  public RobotRulesParser(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // Grab the agent names we advertise to robots files.
+    String agentName = conf.get("http.agent.name");
+    if (null == agentName) {
+      throw new RuntimeException("Agent name not configured!");
+    }
+
+    String agentNames = conf.get("http.robots.agents");
+    StringTokenizer tok = new StringTokenizer(agentNames, ",");
+    ArrayList<String> agents = new ArrayList<String>();
+    while (tok.hasMoreTokens()) {
+      agents.add(tok.nextToken().trim());
+    }
+
+    /**
+     * If there are no agents for robots-parsing, use the
+     * default agent-string. If both are present, our agent-string
+     * should be the first one we advertise to robots-parsing.
+     */
+    if (agents.size() == 0) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("No agents listed in 'http.robots.agents' property!");
+      }
+    } else { 
+      StringBuffer combinedAgentsString = new StringBuffer(agentName);
+      int index = 0;
+
+      if ((agents.get(0)).equalsIgnoreCase(agentName))
+        index++;
+      else if (LOG.isErrorEnabled()) {
+        LOG.error("Agent we advertise (" + agentName
+            + ") not listed first in 'http.robots.agents' property!");
+      }
+
+      // append all the agents from the http.robots.agents property
+      for(; index < agents.size(); index++) {
+        combinedAgentsString.append(", " + agents.get(index));
+      }
+
+      // always make sure "*" is included in the end
+      combinedAgentsString.append(", *");
+      this.agentNames = combinedAgentsString.toString();
+    }
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Parses the robots content using the {@link SimpleRobotRulesParser} from 
crawler commons
+   *    
+   * @param url A string containing url
+   * @param content Contents of the robots file in a byte array 
+   * @param contentType The content type of the fetched robots file
+   * @param robotName A string containing the agent name(s) to select rules for
+   * @return BaseRobotRules object 
+   */
+  public BaseRobotRules parseRules (String url, byte[] content, String 
contentType, String robotName) {
+    return robotParser.parseContent(url, content, contentType, robotName); 
+  }
+
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
+    URL u = null;
+    try {
+      u = new URL(url.toString());
+    } catch (Exception e) {
+      return EMPTY_RULES;
+    }
+    return getRobotRulesSet(protocol, u);
+  }
+
+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+
+  /** command-line main for testing */
+  public static void main(String[] argv) {
+
+    if (argv.length < 3) {
+      System.err.println("Usage: RobotRulesParser <robots-file> <url-file> 
<agent-names>\n");
+      System.err.println("\tThe <robots-file> will be parsed as a robots.txt 
file,");
+      System.err.println("\tusing the given <agent-name> to select rules.  
URLs ");
+      System.err.println("\twill be read (one per line) from <url-file>, and 
tested");
+      System.err.println("\tagainst the rules. Multiple agent names can be 
specified using spaces.");
+      System.exit(-1);
+    }
+
+    try {
+      StringBuilder agentNames = new StringBuilder();
+      for(int counter = 2; counter < argv.length; counter++) 
+        agentNames.append(argv[counter]).append(",");
+
+      agentNames.deleteCharAt(agentNames.length()-1);
+
+      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, 
"text/plain", agentNames.toString());
+
+      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+      String testPath = testsIn.readLine().trim();
+      while (testPath != null) {
+        System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not 
allowed") +
+            ":\t" + testPath);
+        testPath = testsIn.readLine();
+      }
+      testsIn.close();
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+}

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
 Fri Apr  5 23:50:56 2013
@@ -32,15 +32,16 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.DeflateUtils;
 
-
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
 /**
  * @author J&eacute;r&ocirc;me Charron
  */
@@ -51,7 +52,7 @@ public abstract class HttpBase implement
   
   private static final byte[] EMPTY_CONTENT = new byte[0];
 
-  private RobotRulesParser robots = null;
+  private HttpRobotRulesParser robots = null;
  
   /** The proxy hostname. */ 
   protected String proxyHost = null;
@@ -105,7 +106,7 @@ public abstract class HttpBase implement
     if (logger != null) {
       this.logger = logger;
     }
-    robots = new RobotRulesParser();
+    robots = new HttpRobotRulesParser();
   }
   
   // Inherited Javadoc
@@ -138,7 +139,6 @@ public abstract class HttpBase implement
     String urlString = url.toString();
     try {
       URL u = new URL(urlString);
-      String host = null;
       Response response = getResponse(u, datum, false); // make a request
       
       int code = response.getCode();
@@ -381,18 +381,16 @@ public abstract class HttpBase implement
       System.out.println("Content:");
       String text = new String(content.getContent());
       System.out.println(text);
-    }
-    
+    }  
   }
   
-  
   protected abstract Response getResponse(URL url,
                                           CrawlDatum datum,
                                           boolean followRedirects)
     throws ProtocolException, IOException;
 
-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return robots.getRobotRulesSet(this, url);
   }
-
 }
+

Added: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1465159&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 (added)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
 Fri Apr  5 23:50:56 2013
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and contains 
+ * Http protocol specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+  
+  public static final Logger LOG = 
LoggerFactory.getLogger(HttpRobotRulesParser.class);
+  protected boolean allowForbidden = false;
+
+  HttpRobotRulesParser() { }
+
+  public HttpRobotRulesParser(Configuration conf) {
+    super(conf);
+    allowForbidden = conf.getBoolean("http.robots.403.allow", false);
+  }
+
+  /**
+   * The hosts for which the caching of robots rules is yet to be done,
+   * it sends a Http request to the host corresponding to the {@link URL} 
+   * passed, gets robots file, parses the rules and caches the rules object
+   * to avoid re-work in future.
+   * 
+   *  @param http The {@link Protocol} object
+   *  @param url URL 
+   *  
+   *  @return robotRules A {@link BaseRobotRules} object for the rules
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower 
case
+    String host = url.getHost().toLowerCase();          // normalize to lower 
case
+
+    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + 
host);
+
+    boolean cacheRule = true;
+    
+    if (robotRules == null) {                     // cache miss
+      URL redir = null;
+      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
+      try {
+        Response response = ((HttpBase)http).getResponse(new URL(url, 
"/robots.txt"),
+                                             new CrawlDatum(), true);
+        // try one level of redirection ?
+        if (response.getCode() == 301 || response.getCode() == 302) {
+          String redirection = response.getHeader("Location");
+          if (redirection == null) {
+            // some versions of MS IIS are known to mangle this header
+            redirection = response.getHeader("location");
+          }
+          if (redirection != null) {
+            if (!redirection.startsWith("http")) {
+              // RFC says it should be absolute, but apparently it isn't
+              redir = new URL(url, redirection);
+            } else {
+              redir = new URL(redirection);
+            }
+            
+            response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), 
true);
+          }
+        }
+
+        if (response.getCode() == 200)               // found rules: parse them
+          robotRules =  parseRules(url.toString(), response.getContent(), 
+                                   response.getHeader("Content-Type"), 
+                                   agentNames);
+
+        else if ( (response.getCode() == 403) && (!allowForbidden) )
+          robotRules = FORBID_ALL_RULES;            // use forbid all
+        else if (response.getCode() >= 500) {
+          cacheRule = false;
+          robotRules = EMPTY_RULES;
+        }else                                        
+          robotRules = EMPTY_RULES;                 // use default rules
+      } catch (Throwable t) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        }
+        cacheRule = false;
+        robotRules = EMPTY_RULES;
+      }
+
+      if (cacheRule) {
+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host
+        if (redir != null && !redir.getHost().equals(host)) {
+          // cache also for the redirected host
+          CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+        }
+      }
+    }
+    return robotRules;
+  }
+}

Modified: 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
 Fri Apr  5 23:50:56 2013
@@ -17,292 +17,100 @@
 
 package org.apache.nutch.protocol.http.api;
 
-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;
-
+import crawlercommons.robots.BaseRobotRules;
 import junit.framework.TestCase;
 
+/**
+ * JUnit test case which tests
+ * 1. that robots filtering is performed correctly as per the agent name
+ * 2. that crawl delay is extracted correctly from the robots file
+ *
+ */
 public class TestRobotRulesParser extends TestCase {
-  private static final String LF= "\n";
-  private static final String CR= "\r";
-  private static final String CRLF= "\r\n";
+
+  private static final String CONTENT_TYPE = "text/plain";
+  private static final String SINGLE_AGENT = "Agent1";
+  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+  private static final String UNKNOWN_AGENT = "AgentABC";
+  private static final String CR = "\r";
   
-  private static final boolean[] ACCEPT_ALL = {
-    true,   // "/a",         
-    true,   // "/a/",        
-    true,   // "/a/bloh/foo.html"
-    true,   // "/b",         
-    true,   // "/b/a",       
-    true,   // "/b/a/index.html",
-    true,   // "/b/b/foo.html",  
-    true,   // "/c",         
-    true,   // "/c/a",       
-    true,   // "/c/a/index.html",
-    true,   // "/c/b/foo.html",  
-    true,   // "/d",         
-    true,   // "/d/a",       
-    true,   // "/e/a/index.html",
-    true,   // "/e/d",       
-    true,   // "/e/d/foo.html",  
-    true,   // "/e/doh.html",    
-    true,   // "/f/index.html",  
-    true,   // "/foo/bar.html",  
-    true,   // "/f/",
-  };
+  private static final String ROBOTS_STRING = 
+      "User-Agent: Agent1 #foo" + CR 
+      + "Disallow: /a" + CR 
+      + "Disallow: /b/a" + CR 
+      + "#Disallow: /c" + CR 
+      + "Crawl-delay: 10" + CR  // set crawl delay for Agent1 as 10 sec
+      + "" + CR 
+      + "" + CR 
+      + "User-Agent: Agent2" + CR 
+      + "Disallow: /a/bloh" + CR 
+      + "Disallow: /c" + CR
+      + "Disallow: /foo" + CR
+      + "Crawl-delay: 20" + CR
+      + "" + CR 
+      + "User-Agent: *" + CR 
+      + "Disallow: /foo/bar/" + CR;   // no crawl delay for other agents
   
-  private static final String[] ROBOTS_STRINGS= new String[] {
-    "User-Agent: Agent1 #foo" + CR 
-    + "Disallow: /a" + CR 
-    + "Disallow: /b/a" + CR 
-    + "#Disallow: /c" + CR 
-    + "" + CR 
-    + "" + CR 
-    + "User-Agent: Agent2 Agent3#foo" + CR 
-    + "User-Agent: Agent4" + CR 
-    + "Disallow: /d" + CR 
-    + "Disallow: /e/d/" + CR
-    + "" + CR 
-    + "User-Agent: *" + CR 
-    + "Disallow: /foo/bar/" + CR,
-    null  // Used to test EMPTY_RULES
+  private static final String[] TEST_PATHS = new String[] {
+    "http://example.com/a";,
+    "http://example.com/a/bloh/foo.html";,
+    "http://example.com/b";,
+    "http://example.com/c";,
+    "http://example.com/b/a/index.html";,
+    "http://example.com/foo/bar/baz.html";
+  };
+
+  private static final boolean[] RESULTS = new boolean[] {
+    false,  //  /a
+    false,  //  /a/bloh/foo.html
+    true,   //  /b
+    true,   //  /c
+    false,  //  /b/a/index.html
+    true    //  /foo/bar/baz.html
   };
 
-  private static final String[] AGENT_STRINGS= new String[] {
-    "Agent1",
-    "Agent2",
-    "Agent3",
-    "Agent4",
-    "Agent5",
-  };
-
-  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
-    { 
-      false, 
-      false,
-      false,
-      false,
-      true,
-    },
-    { 
-      false, 
-      false,
-      false,
-      false,
-      true,
-    }    
-  };
+  private HttpRobotRulesParser parser;
+  private BaseRobotRules rules;
 
-  private static final String[] TEST_PATHS= new String[] {
-    "/a",
-    "/a/",
-    "/a/bloh/foo.html",
-    "/b",
-    "/b/a",
-    "/b/a/index.html",
-    "/b/b/foo.html",
-    "/c",
-    "/c/a",
-    "/c/a/index.html",
-    "/c/b/foo.html",
-    "/d",
-    "/d/a",
-    "/e/a/index.html",
-    "/e/d",
-    "/e/d/foo.html",
-    "/e/doh.html",
-    "/f/index.html",
-    "/foo/bar/baz.html",  
-    "/f/",
-  };
-
-  private static final boolean[][][] ALLOWED= new boolean[][][] {
-    { // ROBOTS_STRINGS[0]
-      { // Agent1
-       false,  // "/a",              
-       false,  // "/a/",             
-       false,  // "/a/bloh/foo.html"
-       true,   // "/b",              
-       false,  // "/b/a",            
-       false,  // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       true,   // "/d",              
-       true,   // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       true,   // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      }, 
-      { // Agent2
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       false,  // "/d",              
-       false,  // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       false,  // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      },
-      { // Agent3
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       false,  // "/d",              
-       false,  // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       false,  // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      },
-      { // Agent4
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       false,  // "/d",              
-       false,  // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       false,  // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       true,   // "/foo/bar.html",  
-       true,   // "/f/",  
-      },
-      { // Agent5/"*"
-       true,   // "/a",              
-       true,   // "/a/",             
-       true,   // "/a/bloh/foo.html"
-       true,   // "/b",              
-       true,   // "/b/a",            
-       true,   // "/b/a/index.html",
-       true,   // "/b/b/foo.html",  
-       true,   // "/c",              
-       true,   // "/c/a",            
-       true,   // "/c/a/index.html",
-       true,   // "/c/b/foo.html",  
-       true,   // "/d",              
-       true,   // "/d/a",            
-       true,   // "/e/a/index.html",
-       true,   // "/e/d",            
-       true,   // "/e/d/foo.html",  
-       true,   // "/e/doh.html",    
-       true,   // "/f/index.html",  
-       false,  // "/foo/bar.html",  
-       true,   // "/f/",  
-      }
-    },
-    { // ROBOTS_STRINGS[1]
-      ACCEPT_ALL, // Agent 1
-      ACCEPT_ALL, // Agent 2
-      ACCEPT_ALL, // Agent 3
-      ACCEPT_ALL, // Agent 4
-      ACCEPT_ALL, // Agent 5
-    }
-  };
- 
   public TestRobotRulesParser(String name) {
     super(name);
+    parser = new HttpRobotRulesParser();
   }
 
-  public void testRobotsOneAgent() {
-    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
-      for (int j= 0; j < AGENT_STRINGS.length; j++) {
-       testRobots(i, new String[] { AGENT_STRINGS[j] },
-                  TEST_PATHS, ALLOWED[i][j]);
-      }
+  /**
+  * Test that the robots rules are interpreted correctly by the robots rules 
parser. 
+  */
+  public void testRobotsAgent() {
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, SINGLE_AGENT);
+
+    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+      assertTrue("testing on agent (" + SINGLE_AGENT + "), and " 
+              + "path " + TEST_PATHS[counter] 
+              + " got " + rules.isAllowed(TEST_PATHS[counter]),
+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
-  }
 
-  public void testRobotsTwoAgents() {
-    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
-      for (int j= 0; j < AGENT_STRINGS.length; j++) {
-       for (int k= 0; k < AGENT_STRINGS.length; k++) {
-         int key= j;
-         if (NOT_IN_ROBOTS_STRING[i][j])
-           key= k;
-         testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
-                    TEST_PATHS, ALLOWED[i][key]);
-       }
-      }
-    }
-  }
-  
-  public void testCrawlDelay() {
-    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
-    String delayRule1 = "User-agent: nutchbot" + CR +
-                        "Crawl-delay: 10" + CR +
-                        "User-agent: foobot" + CR +
-                        "Crawl-delay: 20" + CR +
-                        "User-agent: *" + CR + 
-                        "Disallow:/baz" + CR;
-    String delayRule2 = "User-agent: foobot" + CR +
-                        "Crawl-delay: 20" + CR +
-                        "User-agent: *" + CR + 
-                        "Disallow:/baz" + CR;
-    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
-    long crawlDelay = rules.getCrawlDelay();
-    assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay 
== 10000));
-    rules = p.parseRules(delayRule2.getBytes());
-    crawlDelay = rules.getCrawlDelay();
-    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay 
== -1));
-  }
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, MULTIPLE_AGENTS);
 
-  // helper
-
-  public void testRobots(int robotsString, String[] agents, String[] paths, 
-                        boolean[] allowed) {
-    String agentsString= agents[0];
-    for (int i= 1; i < agents.length; i++)
-      agentsString= agentsString + "," + agents[i];
-    RobotRulesParser p= new RobotRulesParser(agents);
-    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null
-                                     ? ROBOTS_STRINGS[robotsString].getBytes()
-                                     : null);
-    for (int i= 0; i < paths.length; i++) {
-      assertTrue("testing robots file "+robotsString+", on agents ("
-                + agentsString + "), and path " + TEST_PATHS[i] + "; got " 
-                + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF
-                                  + rules,
-                rules.isAllowed(TEST_PATHS[i]) == allowed[i]);
+    for(int counter = 0; counter < TEST_PATHS.length; counter++) {
+      assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " 
+              + "path " + TEST_PATHS[counter] 
+              + " got " + rules.isAllowed(TEST_PATHS[counter]),
+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
   }
 
-
-  
+  /**
+  * Test that the crawl delay is extracted from the robots file for respective 
agent. 
+  * If its not specified for a given agent, default value must be returned.
+  */
+  public void testCrawlDelay() {
+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be 
returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, SINGLE_AGENT);
+    assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", 
(rules.getCrawlDelay() == 10000));
+    
+    // for UNKNOWN_AGENT, the default crawl delay must be returned.
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), 
CONTENT_TYPE, UNKNOWN_AGENT);
+    assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", 
(rules.getCrawlDelay() == Long.MIN_VALUE));
+  }
 }

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
 Fri Apr  5 23:50:56 2013
@@ -17,35 +17,33 @@
 
 package org.apache.nutch.protocol.file;
 
+import java.net.URL;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
 import org.apache.nutch.util.NutchConfiguration;
 
-import java.net.URL;
+import crawlercommons.robots.BaseRobotRules;
 
-/************************************
- * File.java deals with file: scheme.
- *
- * Configurable parameters are defined under "FILE properties" section
- * in ./conf/nutch-default.xml or similar.
+/**
+ * This class is a protocol plugin used for file: scheme.
+ * It creates a {@link FileResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent}
+ * in nutch-default.xml defined under the "file properties" section.
  *
  * @author John Xing
- ***********************************/
+ */
 public class File implements Protocol {
 
   public static final Logger LOG = LoggerFactory.getLogger(File.class);
@@ -57,13 +55,40 @@ public class File implements Protocol {
 
   private Configuration conf;
 
-  // constructor
-  public File() {
-  }
+  public File() {}
 
-  /** Set the point at which content is truncated. */
-  public void setMaxContentLength(int length) {maxContentLength = length;}
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+  }
 
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+  
+  /** 
+   * Set the length after which content is truncated.
+   */
+  public void setMaxContentLength(int maxContentLength) {
+    this.maxContentLength = maxContentLength;
+  }
+
+  /** 
+   * Creates a {@link FileResponse} object corresponding to the url and 
+   * returns a {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url Text containing the url
+   * @param datum The CrawlDatum object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the content of the file indicated by url
+   */
   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     String urlString = url.toString();
     try {
@@ -99,11 +124,9 @@ public class File implements Protocol {
     }
   }
 
-//  protected void finalize () {
-//    // nothing here
-//  }
-
-  /** For debugging. */
+  /** 
+   * A quick way to run this class. Useful for debugging.
+   */
   public static void main(String[] args) throws Exception {
     int maxContentLength = Integer.MIN_VALUE;
     String logLevel = "info";
@@ -154,17 +177,12 @@ public class File implements Protocol {
     file = null;
   }
 
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
-    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return EmptyRobotRules.RULES;
+  /** 
+   * No robots parsing is done for the file protocol,
+   * so this returns a set of empty rules which will allow every url.
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return RobotRulesParser.EMPTY_RULES;
   }
 }
+

Modified: 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1465159&r1=1465158&r2=1465159&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
 Fri Apr  5 23:50:56 2013
@@ -24,30 +24,33 @@ import org.apache.commons.net.ftp.FTPFil
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.hadoop.io.Text;
-import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 
 import org.apache.hadoop.conf.Configuration;
 
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.EmptyRobotRules;
+import org.apache.nutch.protocol.RobotRulesParser;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRules;
+
+import crawlercommons.robots.BaseRobotRules;
 
 import java.net.URL;
 
 import java.io.IOException;
 
-/************************************
- * Ftp.java deals with ftp: scheme.
- *
- * Configurable parameters are defined under "FTP properties" section
- * in ./conf/nutch-default.xml or similar.
+/**
+ * This class is a protocol plugin used for ftp: scheme.
+ * It creates a {@link FtpResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ *                             {@code ftp.content.limit}, {@code ftp.timeout},
+ *                             {@code ftp.server.timeout},
+ *                             {@code ftp.keep.connection} and {@code ftp.follow.talk}.
+ * For details see the "FTP properties" section in {@code nutch-default.xml}.
  *
  * @author John Xing
- ***********************************/
+ */
 public class Ftp implements Protocol {
 
   public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
@@ -106,6 +109,15 @@ public class Ftp implements Protocol {
     this.keepConnection = keepConnection;
   }
 
+  /** 
+   * Creates a {@link FtpResponse} object corresponding to the url and 
+   * returns a {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url Text containing the ftp url
+   * @param datum The CrawlDatum object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the url
+   */
   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     String urlString = url.toString();
     try {
@@ -216,7 +228,9 @@ public class Ftp implements Protocol {
     ftp = null;
   }
 
-  
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
@@ -228,12 +242,20 @@ public class Ftp implements Protocol {
     this.followTalk = conf.getBoolean("ftp.follow.talk", false);
   }
 
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return this.conf;
   }
 
-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return EmptyRobotRules.RULES;
+  /** 
+   * Currently, no robots parsing is done for the ftp protocol,
+   * so this returns a set of empty rules which will allow every url.
+   * There is a JIRA issue logged for this: NUTCH-1513.
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return RobotRulesParser.EMPTY_RULES;
   }
-
 }
+


Reply via email to