Repository: nutch
Updated Branches:
  refs/heads/master d37b7ce13 -> 3fca1a590

Allow Fetcher to optionally store robots.txt content (if property
fetcher.store.robotstxt == true). Improved RobotRulesParser command-line
tool.


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6c9cca5e
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6c9cca5e
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6c9cca5e

Branch: refs/heads/master
Commit: 6c9cca5e55e43458cbc5e59b8591e4d27ac425a2
Parents: d37b7ce
Author: Sebastian Nagel <sna...@apache.org>
Authored: Wed May 25 14:24:11 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Fri Aug 19 12:07:06 2016 +0200

----------------------------------------------------------------------
 conf/nutch-default.xml                          |   8 +
 .../org/apache/nutch/fetcher/FetcherThread.java |  17 +-
 .../org/apache/nutch/protocol/Protocol.java     |  20 ++-
 .../apache/nutch/protocol/RobotRulesParser.java | 174 +++++++++++++++----
 .../nutch/protocol/http/api/HttpBase.java       |  29 ++--
 .../protocol/http/api/HttpRobotRulesParser.java |  52 +++++-
 .../org/apache/nutch/protocol/file/File.java    |  13 +-
 .../java/org/apache/nutch/protocol/ftp/Ftp.java |   9 +-
 .../nutch/protocol/ftp/FtpRobotRulesParser.java |  17 +-
 9 files changed, 265 insertions(+), 74 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 67326ee..8c329bc 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -982,6 +982,14 @@ fetcher.bandwidth.target. Defaults to 30 and must be at least 1.</description>
 </property>
 
+<property>
+  <name>fetcher.store.robotstxt</name>
+  <value>false</value>
+  <description>If true, fetcher will store the robots.txt response
+  content and status for debugging or archival purposes.
+  </description>
+</property>
+
 <!-- moreindexingfilter plugin properties -->
 <property>

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/fetcher/FetcherThread.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index e57e735..cac16ff 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -22,6 +22,7 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map.Entry;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -129,6 +130,8 @@ public class FetcherThread extends Thread {
 
   private AtomicLong bytes;
 
+  private List<Content> robotsTxtContent = null;
+
   //Used by the REST service
   private FetchNode fetchNode;
   private boolean reportToNutchServer;
@@ -188,6 +191,9 @@ public class FetcherThread extends Thread {
         "fetcher.follow.outlinks.num.links", 4);
     outlinksDepthDivisor = conf.getInt(
         "fetcher.follow.outlinks.depth.divisor", 2);
+    if (conf.getBoolean("fetcher.store.robotstxt", false)) {
+      robotsTxtContent = new LinkedList<Content>();
+    }
   }
 
   @SuppressWarnings("fallthrough")
@@ -256,7 +262,16 @@ public class FetcherThread extends Thread {
           redirecting = false;
           Protocol protocol = this.protocolFactory.getProtocol(fit.url
              .toString());
-          BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
+          BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
+          if (robotsTxtContent != null) {
+            for (Content robotsTxt : robotsTxtContent) {
+              LOG.debug("fetched and stored robots.txt {}",
+                  robotsTxt.getUrl());
+              output.collect(new Text(robotsTxt.getUrl()),
+                  new NutchWritable(robotsTxt));
+            }
+            robotsTxtContent.clear();
+          }
           if (!rules.isAllowed(fit.u.toString())) {
             // unblock
             ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/protocol/Protocol.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index efd0100..ddebffb 100755
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -17,6 +17,8 @@ package org.apache.nutch.protocol;
 
+import java.util.List;
+
 // Hadoop imports
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.io.Text;
@@ -38,13 +40,21 @@ public interface Protocol extends Pluggable, Configurable {
   ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
 
   /**
-   * Retrieve robot rules applicable for this url.
-   * 
+   * Retrieve robot rules applicable for this URL.
+   * 
    * @param url
-   *          url to check
+   *          URL to check
    * @param datum
    *          page datum
-   * @return robot rules (specific for this url or default), never null
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
+   * @return robot rules (specific for this URL or default), never null
    */
-  BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
+  BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent);
+
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/protocol/RobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 475aef4..d7eba92 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -27,6 +27,8 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Set;
 import java.util.StringTokenizer;
@@ -189,51 +191,124 @@ public abstract class RobotRulesParser implements Tool {
     return robotParser.parseContent(url, content, contentType, robotName);
   }
 
-  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
+  /**
+   * Fetch robots.txt (or its protocol-specific equivalent) which applies to
+   * the given URL, parse it and return the set of robot rules applicable for
+   * the configured agent name(s).
+   * 
+   * @param protocol
+   *          {@link Protocol}
+   * @param url
+   *          URL to check
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
+   * 
+   * @return robot rules (specific for this URL or default), never null
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url,
+      List<Content> robotsTxtContent) {
     URL u = null;
     try {
       u = new URL(url.toString());
     } catch (Exception e) {
       return EMPTY_RULES;
     }
-    return getRobotRulesSet(protocol, u);
+    return getRobotRulesSet(protocol, u, robotsTxtContent);
   }
 
   /**
   * Fetch robots.txt (or its protocol-specific equivalent) which applies to
    * the given URL, parse it and return the set of robot rules applicable for
    * the configured agent name(s).
-   * 
+   * 
    * @param protocol
-   *          protocol implementation
+   *          {@link Protocol}
    * @param url
-   *          URL to be checked whether fetching is allowed by robot rules
-   * @return robot rules
+   *          URL to check
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
+   * 
+   * @return robot rules (specific for this URL or default), never null
    */
-  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url,
+      List<Content> robotsTxtContent);
 
+  @Override
   public int run(String[] args) {
 
     if (args.length < 2) {
       String[] help = {
-          "Usage: RobotRulesParser <robots-file> <url-file> [<agent-names>]\n",
-          "\tThe <robots-file> will be parsed as a robots.txt file,",
-          "\tusing the given <agent-name> to select rules.",
-          "\tURLs will be read (one per line) from <url-file>,",
-          "\tand tested against the rules.",
-          "\tMultiple agent names can be provided using",
-          "\tcomma as a delimiter without any spaces.",
-          "\tIf no agent name is given the property http.agent.name",
-          "\tis used. If http.agent.name is empty, robots.txt is checked",
-          "\tfor rules assigned to the user agent `*' (meaning any other)." };
+          "Usage: RobotRulesParser [ -Dproperty=... ] <robots-file-or-url> <url-file> [<agent-names>]",
+          "",
+          "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
+          "\tIf <robots-file-or-url> starts with a protocol specification",
+          "\t(`http', `https', `ftp' or `file'), the robots.txt file is fetched",
+          "\tusing the specified protocol. Otherwise, a local file is assumed.",
+          "",
+          "<url-file>\tlocal file with URLs (one per line), for every URL",
+          "\tthe path part (including the query) is checked whether",
+          "\tit is allowed by the robots.txt rules. Other parts of the URLs",
+          "\t(mainly the host) are ignored.",
+          "",
+          "<agent-names>\tcomma-separated list of agent names",
+          "\tused to select rules from the robots.txt file.",
+          "\tIf no agent name is given the property http.agent.name is used.",
+          "\tIf http.agent.name is empty, robots.txt is checked for rules",
+          "\tassigned to the user agent `*' (meaning any other).",
+          "",
+          "Important properties:",
+          " -D fetcher.store.robotstxt=true",
+          "\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
+          " -D http.agent.name=...\tsame as argument <agent-names>",
+          " -D http.robots.agents=...\tadditional agent names",
+          " -D http.robot.rules.whitelist=..." };
       for (String s : help) {
         System.err.println(s);
       }
-      System.exit(-1);
+      return -1;
+    }
+
+    Protocol protocol = null;
+    URL robotsTxtUrl = null;
+    if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
+      try {
+        robotsTxtUrl = new URL(args[0]);
+      } catch (MalformedURLException e) {
+        LOG.warn("Not a valid URL, assuming local file: {}", args[0]);
+      }
+      ProtocolFactory factory = new ProtocolFactory(conf);
+      try {
+        protocol = factory.getProtocol(robotsTxtUrl.toString());
+      } catch (ProtocolNotFound e) {
+        LOG.error("No protocol found for {}: {}", args[0],
+            StringUtils.stringifyException(e));
+        return -1;
+      }
+    }
+
+    if (robotsTxtUrl == null) {
+      // try as local file
+      File robotsFile = new File(args[0]);
+      if (!robotsFile.exists()) {
+        LOG.error("File does not exist: {}", args[0]);
+        return -1;
+      } else {
+        try {
+          robotsTxtUrl = robotsFile.toURI().toURL();
+        } catch (MalformedURLException e) {
+        }
+      }
     }
 
-    File robotsFile = new File(args[0]);
     File urlFile = new File(args[1]);
 
     if (args.length > 2) {
@@ -243,13 +318,30 @@ public abstract class RobotRulesParser implements Tool {
       setConf(conf);
     }
 
+    List<Content> robotsTxtContent = null;
+    if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
+      robotsTxtContent = new LinkedList<Content>();
+    }
+
     try {
-      BaseRobotRules rules = getRobotRulesSet(null, robotsFile.toURI().toURL());
+
+      BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl,
+          robotsTxtContent);
+
+      if (robotsTxtContent != null) {
+        for (Content robotsTxt : robotsTxtContent) {
+          LOG.info("fetched robots.txt {}:",
+              robotsTxt.getUrl());
+          LOG.info(robotsTxt.toString());
+        }
+      }
+
+      System.out.println("Testing robots.txt for agent names: " + agentNames);
+
       LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
       String testPath;
-      testPath = testsIn.readLine().trim();
+      testPath = testsIn.readLine();
       while (testPath != null) {
+        testPath = testPath.trim();
         try {
           // testPath can be just a path or a complete URL
           URL url = new URL(testPath);
@@ -263,6 +355,7 @@ public abstract class RobotRulesParser implements Tool {
           }
           System.out.println(status + ":\t" + testPath);
         } catch (MalformedURLException e) {
+          LOG.warn("Not a valid URL: {}", testPath);
         }
         testPath = testsIn.readLine();
       }
@@ -292,24 +385,33 @@ public abstract class RobotRulesParser implements Tool {
   }
 
   /**
-   * @param protocol (ignored)
+   * @param protocol
+   *          (if not null) protocol used to get robot rules,
+   *          (if null) the URL is read via {@link URLConnection}
    * @param url
    *          location of the robots.txt file
-   * 
-   */
-  public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
+   */
+  @Override
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url,
+      List<Content> robotsTxtContent) {
     BaseRobotRules rules;
-    try {
-      int contentLength = url.openConnection().getContentLength();
-      byte[] robotsBytes = new byte[contentLength];
-      InputStream openStream = url.openStream();
-      openStream.read(robotsBytes);
-      openStream.close();
-      rules = robotParser.parseContent(url.toString(), robotsBytes,
-          "text/plain", this.conf.get("http.agent.name"));
-    } catch (IOException e) {
-      LOG.error("Failed to open robots.txt file " + url
-          + StringUtils.stringifyException(e));
-      rules = EMPTY_RULES;
+    if (protocol != null) {
+      rules = protocol.getRobotRules(new Text(url.toString()), null,
+          robotsTxtContent);
+    } else {
+      try {
+        int contentLength = url.openConnection().getContentLength();
+        byte[] robotsBytes = new byte[contentLength];
+        InputStream openStream = url.openStream();
+        openStream.read(robotsBytes);
+        openStream.close();
+        rules = robotParser.parseContent(url.toString(), robotsBytes,
+            "text/plain", this.conf.get("http.agent.name"));
+      } catch (IOException e) {
+        LOG.error("Failed to open robots.txt file " + url
+            + StringUtils.stringifyException(e));
+        rules = EMPTY_RULES;
+      }
     }
     return rules;
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 9f616fe..4d1a0cc 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -21,10 +21,11 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.net.URL;
-import java.util.*;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ThreadLocalRandom;
 // Logging imports
@@ -518,7 +519,6 @@ public abstract class HttpBase implements Protocol {
   }
 
   protected static void main(HttpBase http, String[] args) throws Exception {
-    boolean verbose = false;
     String url = null;
 
     String usage = "Usage: Http [-verbose] [-timeout N] url";
@@ -532,7 +532,6 @@ public abstract class HttpBase implements Protocol {
       if (args[i].equals("-timeout")) { // found -timeout option
         http.timeout = Integer.parseInt(args[++i]) * 1000;
       } else if (args[i].equals("-verbose")) { // found -verbose option
-        verbose = true;
       } else if (i != args.length - 1) {
         System.err.println(usage);
         System.exit(-1);
@@ -541,10 +540,6 @@ public abstract class HttpBase implements Protocol {
         url = args[i];
     }
 
-    // if (verbose) {
-    // LOGGER.setLevel(Level.FINE);
-    // }
-
     ProtocolOutput out = http
         .getProtocolOutput(new Text(url), new CrawlDatum());
     Content content = out.getContent();
@@ -563,8 +558,10 @@ public abstract class HttpBase implements Protocol {
   protected abstract Response getResponse(URL url, CrawlDatum datum,
       boolean followRedirects) throws ProtocolException, IOException;
 
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return robots.getRobotRulesSet(this, url);
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
   /**
@@ -572,14 +569,14 @@ public abstract class HttpBase implements Protocol {
    * @param input String[]
    * @return a new HashMap
    */
-  private HashMap arrayToMap(String[]input){
-    if (input==null ||input.length==0) {
-      return new HashMap();
+  private HashMap<String, String> arrayToMap(String[] input) {
+    if (input == null || input.length == 0) {
+      return new HashMap<String, String>();
     }
-    HashMap hm=new HashMap();
-    for (int i=0;i<input.length;i++){
-      if (!"".equals(input[i].trim())){
-        hm.put(input[i],input[i]);
+    HashMap<String, String> hm = new HashMap<String, String>();
+    for (int i = 0; i < input.length; i++) {
+      if (!"".equals(input[i].trim())) {
+        hm.put(input[i], input[i]);
       }
     }
     return hm;

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 185ca15..494ae0d 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -18,6 +18,7 @@ package org.apache.nutch.protocol.http.api;
 
 import java.net.URL;
+import java.util.List;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,6 +26,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.RobotRulesParser;
@@ -80,11 +82,19 @@ public class HttpRobotRulesParser extends RobotRulesParser {
    * @param http
    *          The {@link Protocol} object
    * @param url
-   *          URL robots.txt applies to
-   * 
-   * @return {@link BaseRobotRules} holding the rules from robots.txt
+   *          URL
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. + * + * @return robotRules A {@link BaseRobotRules} object for the rules */ - public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { + @Override + public BaseRobotRules getRobotRulesSet(Protocol http, URL url, + List<Content> robotsTxtContent) { if (LOG.isTraceEnabled() && isWhiteListed(url)) { LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); @@ -112,8 +122,12 @@ public class HttpRobotRulesParser extends RobotRulesParser { } else { try { - Response response = ((HttpBase) http).getResponse(new URL(url, - "/robots.txt"), new CrawlDatum(), true); + URL robotsUrl = new URL(url, "/robots.txt"); + Response response = ((HttpBase) http).getResponse(robotsUrl, + new CrawlDatum(), true); + if (robotsTxtContent != null) { + addRobotsContent(robotsTxtContent, robotsUrl, response); + } // try one level of redirection ? if (response.getCode() == 301 || response.getCode() == 302) { String redirection = response.getHeader("Location"); @@ -131,6 +145,9 @@ public class HttpRobotRulesParser extends RobotRulesParser { response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true); + if (robotsTxtContent != null) { + addRobotsContent(robotsTxtContent, robotsUrl, response); + } } } @@ -164,4 +181,27 @@ public class HttpRobotRulesParser extends RobotRulesParser { return robotRules; } + + /** + * Append {@link Content} of robots.txt to {@literal robotsTxtContent} + * + * @param robotsTxtContent + * container to store robots.txt response content + * @param robotsUrl + * robots.txt URL + * @param robotsResponse + * response object to be stored + */ + protected void addRobotsContent(List<Content> robotsTxtContent, + URL robotsUrl, Response robotsResponse) { + byte[] robotsBytes = robotsResponse.getContent(); + if (robotsBytes == null) + robotsBytes = new byte[0]; + Content content = new Content(robotsUrl.toString(), + robotsUrl.toString(), robotsBytes, + robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), + getConf()); + robotsTxtContent.add(content); + } + } http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index 2712218..2efb140 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -18,6 +18,7 @@ package org.apache.nutch.protocol.file; import java.net.URL; +import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -161,11 +162,10 @@ public class File implements Protocol { */ public static void main(String[] args) throws Exception { int maxContentLength = Integer.MIN_VALUE; - String logLevel = "info"; boolean dumpContent = false; String urlString = null; - String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url"; + String usage = "Usage: File [-maxContentLength L] [-dumpContent] url"; if (args.length == 0) { System.err.println(usage); @@ -173,9 +173,7 @@ public class File implements Protocol { } for (int i = 0; i < args.length; i++) { - if (args[i].equals("-logLevel")) { - logLevel = args[++i]; - } else if (args[i].equals("-maxContentLength")) { + if 
+      if (args[i].equals("-maxContentLength")) {
         maxContentLength = Integer.parseInt(args[++i]);
       } else if (args[i].equals("-dumpContent")) {
         dumpContent = true;
@@ -222,7 +220,10 @@ public class File implements Protocol {
    * No robots parsing is done for file protocol. So this returns a set of empty
    * rules which will allow every url.
    */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
     return RobotRulesParser.EMPTY_RULES;
   }
+
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 772f3bb..a4051ed 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -36,7 +36,7 @@ import org.apache.nutch.protocol.ProtocolStatus;
 import crawlercommons.robots.BaseRobotRules;
 
 import java.net.URL;
-
+import java.util.List;
 import java.io.IOException;
 
 /**
@@ -257,11 +257,14 @@ public class Ftp implements Protocol {
   /**
    * Get the robots rules for a given url
    */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return robots.getRobotRulesSet(this, url);
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
   public int getBufferSize() {
     return BUFFER_SIZE;
   }
+
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
index 3764864..482acdf 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -18,10 +18,12 @@ package org.apache.nutch.protocol.ftp;
 
 import java.net.URL;
+import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
@@ -60,10 +62,18 @@ public class FtpRobotRulesParser extends RobotRulesParser {
    *          The {@link Protocol} object
    * @param url
    *          URL
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
    * 
    * @return robotRules A {@link BaseRobotRules} object for the rules
    */
-  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
+  @Override
+  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url,
+      List<Content> robotsTxtContent) {
 
     String protocol = url.getProtocol().toLowerCase(); // normalize to lower
                                                        // case
@@ -97,6 +107,10 @@ public class FtpRobotRulesParser extends RobotRulesParser {
             new CrawlDatum());
         ProtocolStatus status = output.getStatus();
 
+        if (robotsTxtContent != null) {
+          robotsTxtContent.add(output.getContent());
+        }
+
         if (status.getCode() == ProtocolStatus.SUCCESS) {
          robotRules = parseRules(url.toString(), output.getContent()
              .getContent(), CONTENT_TYPE, agentNames);
@@ -118,4 +132,5 @@ public class FtpRobotRulesParser extends RobotRulesParser {
 
     return robotRules;
   }
+
 }
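
----------------------------------------------------------------------

For illustration, here is a minimal sketch (not taken from the commit) of how
a client of the extended Protocol API could capture robots.txt responses. The
class name and the example URL are hypothetical, and it assumes the Nutch
runtime and a matching protocol plugin are on the classpath.

import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

import crawlercommons.robots.BaseRobotRules;

// Hypothetical client of the new three-argument getRobotRules() signature.
public class RobotsTxtCaptureSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    String url = "http://example.com/some/page.html"; // illustrative URL

    // Passing a non-null list asks the protocol implementation to append
    // every response fetched while retrieving robots.txt (possibly a
    // redirect or an error page), mirroring what FetcherThread does when
    // fetcher.store.robotstxt == true.
    List<Content> robotsTxtContent = new LinkedList<Content>();

    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    BaseRobotRules rules = protocol.getRobotRules(new Text(url),
        new CrawlDatum(), robotsTxtContent);

    for (Content robotsTxt : robotsTxtContent) {
      System.out.println("captured robots.txt response: "
          + robotsTxt.getUrl());
    }
    System.out.println(url + " is "
        + (rules.isAllowed(url) ? "allowed" : "disallowed"));
  }
}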
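The improved command-line tool can be exercised in a similar way. A
hypothetical session follows; the URL, file name and agent name are
placeholders, and it assumes bin/nutch accepts a fully qualified class name as
its command:

  $ echo "/search?q=nutch" > test-urls.txt
  $ bin/nutch org.apache.nutch.protocol.RobotRulesParser \
      -D fetcher.store.robotstxt=true \
      http://example.com/robots.txt test-urls.txt mybot

With fetcher.store.robotstxt=true the tool logs the fetched robots.txt
content and metadata before printing one status line per test URL.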