Repository: nutch
Updated Branches:
  refs/heads/master d37b7ce13 -> 3fca1a590

Allow Fetcher to optionally store robots.txt content (if property
fetcher.store.robotstxt == true). Improved RobotRulesParser command-line
tool.


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6c9cca5e
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6c9cca5e
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6c9cca5e

Branch: refs/heads/master
Commit: 6c9cca5e55e43458cbc5e59b8591e4d27ac425a2
Parents: d37b7ce
Author: Sebastian Nagel <sna...@apache.org>
Authored: Wed May 25 14:24:11 2016 +0200
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Fri Aug 19 12:07:06 2016 +0200

----------------------------------------------------------------------
 conf/nutch-default.xml                          |   8 +
 .../org/apache/nutch/fetcher/FetcherThread.java |  17 +-
 .../org/apache/nutch/protocol/Protocol.java     |  20 ++-
 .../apache/nutch/protocol/RobotRulesParser.java | 174 +++++++++++++++----
 .../nutch/protocol/http/api/HttpBase.java       |  29 ++--
 .../protocol/http/api/HttpRobotRulesParser.java |  52 +++++-
 .../org/apache/nutch/protocol/file/File.java    |  13 +-
 .../java/org/apache/nutch/protocol/ftp/Ftp.java |   9 +-
 .../nutch/protocol/ftp/FtpRobotRulesParser.java |  17 +-
 9 files changed, 265 insertions(+), 74 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 67326ee..8c329bc 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -982,6 +982,14 @@ fetcher.bandwidth.target. Defaults to 30 and must be at least 1.</description>
 </property>
 
+<property>
+  <name>fetcher.store.robotstxt</name>
+  <value>false</value>
+  <description>If true, fetcher will store the robots.txt response
+  content and status for debugging or archival purposes.
+  </description>
+</property>
+
 <!-- moreindexingfilter plugin properties -->
 <property>

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/fetcher/FetcherThread.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index e57e735..cac16ff 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -22,6 +22,7 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map.Entry;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -129,6 +130,8 @@ public class FetcherThread extends Thread {
 
   private AtomicLong bytes;
 
+  private List<Content> robotsTxtContent = null;
+
   //Used by the REST service
   private FetchNode fetchNode;
   private boolean reportToNutchServer;
@@ -188,6 +191,9 @@ public class FetcherThread extends Thread {
         "fetcher.follow.outlinks.num.links", 4);
     outlinksDepthDivisor = conf.getInt(
         "fetcher.follow.outlinks.depth.divisor", 2);
+    if (conf.getBoolean("fetcher.store.robotstxt", false)) {
+      robotsTxtContent = new LinkedList<Content>();
+    }
   }
 
   @SuppressWarnings("fallthrough")
@@ -256,7 +262,16 @@ public class FetcherThread extends Thread {
           redirecting = false;
           Protocol protocol = this.protocolFactory.getProtocol(fit.url
              .toString());
-          BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
+          BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
+          if (robotsTxtContent != null) {
+            for (Content robotsTxt : robotsTxtContent) {
+              LOG.debug("fetched and stored robots.txt {}",
+                  robotsTxt.getUrl());
+              output.collect(new Text(robotsTxt.getUrl()),
+                  new NutchWritable(robotsTxt));
+            }
+            robotsTxtContent.clear();
+          }
           if (!rules.isAllowed(fit.u.toString())) {
             // unblock
             ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/protocol/Protocol.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index efd0100..ddebffb 100755
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -17,6 +17,8 @@ package org.apache.nutch.protocol;
 
+import java.util.List;
+
 // Hadoop imports
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.io.Text;
@@ -38,13 +40,21 @@ public interface Protocol extends Pluggable, Configurable {
   ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
 
   /**
-   * Retrieve robot rules applicable for this url.
-   * 
+   * Retrieve robot rules applicable for this URL.
+   * 
    * @param url
-   *          url to check
+   *          URL to check
    * @param datum
    *          page datum
-   * @return robot rules (specific for this url or default), never null
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
+   * @return robot rules (specific for this URL or default), never null
    */
-  BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
+  BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent);
+
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/java/org/apache/nutch/protocol/RobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 475aef4..d7eba92 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -27,6 +27,8 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashSet;
 import java.util.Hashtable;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Set;
 import java.util.StringTokenizer;
@@ -189,51 +191,124 @@ public abstract class RobotRulesParser implements Tool {
     return robotParser.parseContent(url, content, contentType, robotName);
   }
 
-  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
+  /**
+   * Fetch robots.txt (or its protocol-specific equivalent) which applies to
+   * the given URL, parse it and return the set of robot rules applicable for
+   * the configured agent name(s).
+   * 
+   * @param protocol
+   *          {@link Protocol}
+   * @param url
+   *          URL to check
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
+   * 
+   * @return robot rules (specific for this URL or default), never null
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url,
+      List<Content> robotsTxtContent) {
     URL u = null;
     try {
       u = new URL(url.toString());
     } catch (Exception e) {
       return EMPTY_RULES;
     }
-    return getRobotRulesSet(protocol, u);
+    return getRobotRulesSet(protocol, u, robotsTxtContent);
   }
 
   /**
   * Fetch robots.txt (or its protocol-specific equivalent) which applies to
    * the given URL, parse it and return the set of robot rules applicable for
    * the configured agent name(s).
-   * 
+   * 
    * @param protocol
-   *          protocol implementation
+   *          {@link Protocol}
    * @param url
-   *          URL to be checked whether fetching is allowed by robot rules
-   * @return robot rules
+   *          URL to check
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
+   * 
+   * @return robot rules (specific for this URL or default), never null
    */
-  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url,
+      List<Content> robotsTxtContent);
 
+  @Override
   public int run(String[] args) {
 
     if (args.length < 2) {
       String[] help = {
-          "Usage: RobotRulesParser <robots-file> <url-file> [<agent-names>]\n",
-          "\tThe <robots-file> will be parsed as a robots.txt file,",
-          "\tusing the given <agent-name> to select rules.",
-          "\tURLs will be read (one per line) from <url-file>,",
-          "\tand tested against the rules.",
-          "\tMultiple agent names can be provided using",
-          "\tcomma as a delimiter without any spaces.",
-          "\tIf no agent name is given the property http.agent.name",
-          "\tis used. If http.agent.name is empty, robots.txt is checked",
-          "\tfor rules assigned to the user agent `*' (meaning any other)." };
+          "Usage: RobotRulesParser [ -Dproperty=... ] <robots-file-or-url> <url-file> [<agent-names>]",
+          "",
+          "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
+          "\tIf <robots-file-or-url> starts with a protocol specification",
+          "\t(`http', `https', `ftp' or `file'), the robots.txt file is fetched",
+          "\tusing the specified protocol. Otherwise, a local file is assumed.",
+          "",
+          "<url-file>\tlocal file with URLs (one per line), for every URL",
+          "\tthe path part (including the query) is checked whether",
+          "\tit is allowed by the robots.txt rules. Other parts of the URLs",
+          "\t(mainly the host) are ignored.",
+          "",
+          "<agent-names>\tcomma-separated list of agent names",
+          "\tused to select rules from the robots.txt file.",
+          "\tIf no agent name is given the property http.agent.name is used.",
+          "\tIf http.agent.name is empty, robots.txt is checked for rules",
+          "\tassigned to the user agent `*' (meaning any other).",
+          "",
+          "Important properties:",
+          " -D fetcher.store.robotstxt=true",
+          "\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
+          " -D http.agent.name=...\tsame as argument <agent-names>",
+          " -D http.robots.agents=...\tadditional agent names",
+          " -D http.robot.rules.whitelist=..." };
       for (String s : help) {
         System.err.println(s);
       }
-      System.exit(-1);
+      return -1;
+    }
+
+    Protocol protocol = null;
+    URL robotsTxtUrl = null;
+    if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
+      try {
+        robotsTxtUrl = new URL(args[0]);
+      } catch (MalformedURLException e) {
+        LOG.warn("Not a valid URL, assuming local file: {}", args[0]);
+      }
+      ProtocolFactory factory = new ProtocolFactory(conf);
+      try {
+        protocol = factory.getProtocol(robotsTxtUrl.toString());
+      } catch (ProtocolNotFound e) {
+        LOG.error("No protocol found for {}: {}", args[0],
+            StringUtils.stringifyException(e));
+        return -1;
+      }
+    }
+
+    if (robotsTxtUrl == null) {
+      // try as local file
+      File robotsFile = new File(args[0]);
+      if (!robotsFile.exists()) {
+        LOG.error("File does not exist: {}", args[0]);
+        return -1;
+      } else {
+        try {
+          robotsTxtUrl = robotsFile.toURI().toURL();
+        } catch (MalformedURLException e) {
+        }
+      }
     }
 
-    File robotsFile = new File(args[0]);
     File urlFile = new File(args[1]);
 
     if (args.length > 2) {
@@ -243,13 +318,30 @@ public abstract class RobotRulesParser implements Tool {
       setConf(conf);
     }
 
+    List<Content> robotsTxtContent = null;
+    if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
+      robotsTxtContent = new LinkedList<Content>();
+    }
+
     try {
-      BaseRobotRules rules = getRobotRulesSet(null, robotsFile.toURI().toURL());
+
+      BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl,
+          robotsTxtContent);
+
+      if (robotsTxtContent != null) {
+        for (Content robotsTxt : robotsTxtContent) {
+          LOG.info("fetched robots.txt {}:",
+              robotsTxt.getUrl());
+          LOG.info(robotsTxt.toString());
+        }
+      }
+
+      System.out.println("Testing robots.txt for agent names: " + agentNames);
+
       LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
       String testPath;
-      testPath = testsIn.readLine().trim();
+      testPath = testsIn.readLine();
       while (testPath != null) {
+        testPath = testPath.trim();
         try {
           // testPath can be just a path or a complete URL
           URL url = new URL(testPath);
@@ -263,6 +355,7 @@ public abstract class RobotRulesParser implements Tool {
           }
           System.out.println(status + ":\t" + testPath);
         } catch (MalformedURLException e) {
+          LOG.warn("Not a valid URL: {}", testPath);
         }
         testPath = testsIn.readLine();
       }
@@ -292,24 +385,33 @@ public abstract class RobotRulesParser implements Tool {
   }
 
   /**
-   * @param protocol (ignored)
+   * @param protocol
+   *          (if not null) protocol used to get robot rules,
+   *          (if null) the URL is read via {@link URLConnection}
    * @param url
    *          location of the robots.txt file
-   * 
-   */
-  public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
+   */
+  @Override
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url,
+      List<Content> robotsTxtContent) {
     BaseRobotRules rules;
-    try {
-      int contentLength = url.openConnection().getContentLength();
-      byte[] robotsBytes = new byte[contentLength];
-      InputStream openStream = url.openStream();
-      openStream.read(robotsBytes);
-      openStream.close();
-      rules = robotParser.parseContent(url.toString(), robotsBytes,
-          "text/plain", this.conf.get("http.agent.name"));
-    } catch (IOException e) {
-      LOG.error("Failed to open robots.txt file " + url
-          + StringUtils.stringifyException(e));
-      rules = EMPTY_RULES;
+    if (protocol != null) {
+      rules = protocol.getRobotRules(new Text(url.toString()), null,
+          robotsTxtContent);
+    } else {
+      try {
+        int contentLength = url.openConnection().getContentLength();
+        byte[] robotsBytes = new byte[contentLength];
+        InputStream openStream = url.openStream();
+        openStream.read(robotsBytes);
+        openStream.close();
+        rules = robotParser.parseContent(url.toString(), robotsBytes,
+            "text/plain", this.conf.get("http.agent.name"));
+      } catch (IOException e) {
+        LOG.error("Failed to open robots.txt file " + url
+            + StringUtils.stringifyException(e));
+        rules = EMPTY_RULES;
+      }
     }
     return rules;
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 9f616fe..4d1a0cc 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -21,10 +21,11 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.net.URL;
-import java.util.*;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ThreadLocalRandom;
 // Logging imports
@@ -518,7 +519,6 @@ public abstract class HttpBase implements Protocol {
   }
 
   protected static void main(HttpBase http, String[] args) throws Exception {
-    boolean verbose = false;
     String url = null;
 
     String usage = "Usage: Http [-verbose] [-timeout N] url";
@@ -532,7 +532,6 @@ public abstract class HttpBase implements Protocol {
       if (args[i].equals("-timeout")) { // found -timeout option
         http.timeout = Integer.parseInt(args[++i]) * 1000;
       } else if (args[i].equals("-verbose")) { // found -verbose option
-        verbose = true;
       } else if (i != args.length - 1) {
         System.err.println(usage);
         System.exit(-1);
@@ -541,10 +540,6 @@ public abstract class HttpBase implements Protocol {
         url = args[i];
     }
 
-    // if (verbose) {
-    // LOGGER.setLevel(Level.FINE);
-    // }
-
     ProtocolOutput out = http
         .getProtocolOutput(new Text(url), new CrawlDatum());
     Content content = out.getContent();
@@ -563,8 +558,10 @@ public abstract class HttpBase implements Protocol {
   protected abstract Response getResponse(URL url, CrawlDatum datum,
       boolean followRedirects) throws ProtocolException, IOException;
 
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return robots.getRobotRulesSet(this, url);
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
   /**
@@ -572,14 +569,14 @@ public abstract class HttpBase implements Protocol {
    * @param input String[]
    * @return a new HashMap
    */
-  private HashMap arrayToMap(String[]input){
-    if (input==null ||input.length==0) {
-      return new HashMap();
+  private HashMap<String, String> arrayToMap(String[] input) {
+    if (input == null || input.length == 0) {
+      return new HashMap<String, String>();
     }
-    HashMap hm=new HashMap();
-    for (int i=0;i<input.length;i++){
-      if (!"".equals(input[i].trim())){
-        hm.put(input[i],input[i]);
+    HashMap<String, String> hm = new HashMap<String, String>();
+    for (int i = 0; i < input.length; i++) {
+      if (!"".equals(input[i].trim())) {
+        hm.put(input[i], input[i]);
       }
     }
     return hm;

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 185ca15..494ae0d 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -18,6 +18,7 @@ package org.apache.nutch.protocol.http.api;
 
 import java.net.URL;
+import java.util.List;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,6 +26,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.RobotRulesParser;
@@ -80,11 +82,19 @@ public class HttpRobotRulesParser extends RobotRulesParser {
    * @param http
    *          The {@link Protocol} object
    * @param url
-   *          URL robots.txt applies to
-   * 
-   * @return {@link BaseRobotRules} holding the rules from robots.txt
+   *          URL
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
Response + * {@link Content} is appended to the passed list. If null is passed + * nothing is stored. + * + * @return robotRules A {@link BaseRobotRules} object for the rules */ - public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { + @Override + public BaseRobotRules getRobotRulesSet(Protocol http, URL url, + List<Content> robotsTxtContent) { if (LOG.isTraceEnabled() && isWhiteListed(url)) { LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url); @@ -112,8 +122,12 @@ public class HttpRobotRulesParser extends RobotRulesParser { } else { try { - Response response = ((HttpBase) http).getResponse(new URL(url, - "/robots.txt"), new CrawlDatum(), true); + URL robotsUrl = new URL(url, "/robots.txt"); + Response response = ((HttpBase) http).getResponse(robotsUrl, + new CrawlDatum(), true); + if (robotsTxtContent != null) { + addRobotsContent(robotsTxtContent, robotsUrl, response); + } // try one level of redirection ? if (response.getCode() == 301 || response.getCode() == 302) { String redirection = response.getHeader("Location"); @@ -131,6 +145,9 @@ public class HttpRobotRulesParser extends RobotRulesParser { response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true); + if (robotsTxtContent != null) { + addRobotsContent(robotsTxtContent, robotsUrl, response); + } } } @@ -164,4 +181,27 @@ public class HttpRobotRulesParser extends RobotRulesParser { return robotRules; } + + /** + * Append {@link Content} of robots.txt to {@literal robotsTxtContent} + * + * @param robotsTxtContent + * container to store robots.txt response content + * @param robotsUrl + * robots.txt URL + * @param robotsResponse + * response object to be stored + */ + protected void addRobotsContent(List<Content> robotsTxtContent, + URL robotsUrl, Response robotsResponse) { + byte[] robotsBytes = robotsResponse.getContent(); + if (robotsBytes == null) + robotsBytes = new byte[0]; + Content content = new Content(robotsUrl.toString(), + robotsUrl.toString(), robotsBytes, + robotsResponse.getHeader("Content-Type"), robotsResponse.getHeaders(), + getConf()); + robotsTxtContent.add(content); + } + } http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index 2712218..2efb140 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -18,6 +18,7 @@ package org.apache.nutch.protocol.file; import java.net.URL; +import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -161,11 +162,10 @@ public class File implements Protocol { */ public static void main(String[] args) throws Exception { int maxContentLength = Integer.MIN_VALUE; - String logLevel = "info"; boolean dumpContent = false; String urlString = null; - String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url"; + String usage = "Usage: File [-maxContentLength L] [-dumpContent] url"; if (args.length == 0) { System.err.println(usage); @@ -173,9 +173,7 @@ public class File implements Protocol { } for (int i = 0; i < args.length; i++) { - if (args[i].equals("-logLevel")) { - logLevel = args[++i]; - } else if (args[i].equals("-maxContentLength")) { + if 
+      if (args[i].equals("-maxContentLength")) {
         maxContentLength = Integer.parseInt(args[++i]);
       } else if (args[i].equals("-dumpContent")) {
         dumpContent = true;
@@ -222,7 +220,10 @@ public class File implements Protocol {
    * No robots parsing is done for file protocol. So this returns a set of empty
    * rules which will allow every url.
    */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
     return RobotRulesParser.EMPTY_RULES;
   }
+
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 772f3bb..a4051ed 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -36,7 +36,7 @@ import org.apache.nutch.protocol.ProtocolStatus;
 import crawlercommons.robots.BaseRobotRules;
 
 import java.net.URL;
-
+import java.util.List;
 import java.io.IOException;
 
 /**
@@ -257,11 +257,14 @@ public class Ftp implements Protocol {
   /**
    * Get the robots rules for a given url
    */
-  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return robots.getRobotRulesSet(this, url);
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
   public int getBufferSize() {
     return BUFFER_SIZE;
   }
+
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/6c9cca5e/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
index 3764864..482acdf 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -18,10 +18,12 @@ package org.apache.nutch.protocol.ftp;
 
 import java.net.URL;
+import java.util.List;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
@@ -60,10 +62,18 @@ public class FtpRobotRulesParser extends RobotRulesParser {
    *          The {@link Protocol} object
    * @param url
    *          URL
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file for
+   *          debugging or archival purposes. Instead of a robots.txt file, it
+   *          may include redirects or an error page (404, etc.). Response
+   *          {@link Content} is appended to the passed list. If null is passed
+   *          nothing is stored.
    * 
    * @return robotRules A {@link BaseRobotRules} object for the rules
    */
-  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
+  @Override
+  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url,
+      List<Content> robotsTxtContent) {
 
     String protocol = url.getProtocol().toLowerCase(); // normalize to lower
                                                        // case
@@ -97,6 +107,10 @@ public class FtpRobotRulesParser extends RobotRulesParser {
             new CrawlDatum());
         ProtocolStatus status = output.getStatus();
 
+        if (robotsTxtContent != null) {
+          robotsTxtContent.add(output.getContent());
+        }
+
         if (status.getCode() == ProtocolStatus.SUCCESS) {
          robotRules = parseRules(url.toString(), output.getContent()
              .getContent(), CONTENT_TYPE, agentNames);
@@ -118,4 +132,5 @@ public class FtpRobotRulesParser extends RobotRulesParser {
 
     return robotRules;
   }
+
 }
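
----------------------------------------------------------------------

For illustration, here is a minimal sketch (not taken from the commit) of how
a client of the extended Protocol API could capture robots.txt responses. The
class name and the example URL are hypothetical, and it assumes the Nutch
runtime and a matching protocol plugin are on the classpath.

import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

import crawlercommons.robots.BaseRobotRules;

// Hypothetical client of the new three-argument getRobotRules() signature.
public class RobotsTxtCaptureSketch {

  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    String url = "http://example.com/some/page.html"; // illustrative URL

    // Passing a non-null list asks the protocol implementation to append
    // every response fetched while retrieving robots.txt (possibly a
    // redirect or an error page), mirroring what FetcherThread does when
    // fetcher.store.robotstxt == true.
    List<Content> robotsTxtContent = new LinkedList<Content>();

    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    BaseRobotRules rules = protocol.getRobotRules(new Text(url),
        new CrawlDatum(), robotsTxtContent);

    for (Content robotsTxt : robotsTxtContent) {
      System.out.println("captured robots.txt response: "
          + robotsTxt.getUrl());
    }
    System.out.println(url + " is "
        + (rules.isAllowed(url) ? "allowed" : "disallowed"));
  }
}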
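The improved command-line tool can be exercised in a similar way. A
hypothetical session follows; the URL, file name and agent name are
placeholders, and it assumes bin/nutch accepts a fully qualified class name as
its command:

  $ echo "/search?q=nutch" > test-urls.txt
  $ bin/nutch org.apache.nutch.protocol.RobotRulesParser \
      -D fetcher.store.robotstxt=true \
      http://example.com/robots.txt test-urls.txt mybot

With fetcher.store.robotstxt=true the tool logs the fetched robots.txt
content and metadata before printing one status line per test URL.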