This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit aed6fa71fa7cd07740235e4c4aeca8380ddb9b48
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Thu Apr 30 12:58:05 2020 +0200

    NUTCH-2002 parse and index checkers to check robots.txt
    - applied Julien's patch to recent code base
    - also check redirects whether they are allowed
    - add command-line parameter `-checkRobotsTxt` enabling this check
---
 .../nutch/indexer/IndexingFiltersChecker.java      | 21 ++++++++++++++++-----
 src/java/org/apache/nutch/parse/ParserChecker.java | 22 +++++++++++++++++-----
 .../org/apache/nutch/util/AbstractChecker.java     | 15 ++++++++++++++-
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 4f849a0..84d9f6d 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -58,6 +58,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
   protected URLNormalizers normalizers = null;
   protected boolean dumpText = false;
   protected boolean followRedirects = false;
+  protected boolean checkRobotsTxt = false;
   protected boolean doIndex = false;
   // used to simulate the metadata propagated from injection
   protected HashMap<String, String> metadata = new HashMap<>();
@@ -82,6 +83,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
         + " \t before other command-specific options)\n"
         + " -normalize \tnormalize URLs\n" //
         + " -followRedirects\tfollow redirects when fetching URL\n" //
+        + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
         + " -dumpText \tshow the entire plain-text content,\n" //
         + " \tnot only the first 100 characters\n" //
         + " -doIndex \tpass document to configured index writers\n" //
@@ -103,6 +105,8 @@ public class IndexingFiltersChecker extends AbstractChecker {
         normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
       } else if (args[i].equals("-followRedirects")) {
         followRedirects = true;
+      } else if (args[i].equals("-checkRobotsTxt")) {
+        checkRobotsTxt = true;
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
       } else if (args[i].equals("-doIndex")) {
@@ -164,13 +168,15 @@ public class IndexingFiltersChecker extends AbstractChecker {
       }
     }
 
-    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+    ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+        checkRobotsTxt);
     Text turl = new Text(url);
-
+
     // Following redirects and not reached maxRedirects?
     int numRedirects = 0;
-    while (!protocolOutput.getStatus().isSuccess() && followRedirects
-        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+        && followRedirects && protocolOutput.getStatus().isRedirect()
+        && maxRedirects >= numRedirects) {
       String[] stuff = protocolOutput.getStatus().getArgs();
       url = stuff[0];
       LOG.info("Follow redirect to {}", url);
@@ -182,10 +188,15 @@ public class IndexingFiltersChecker extends AbstractChecker {
       turl.set(url);
 
       // try again
-      protocolOutput = getProtocolOutput(url, datum);
+      protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
       numRedirects++;
     }
 
+    if (checkRobotsTxt && protocolOutput == null) {
+      System.err.println("Fetch disallowed by robots.txt");
+      return -1;
+    }
+
     if (!protocolOutput.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: "
           + protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 2a976ba..4dbfcfa 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -69,6 +69,7 @@ public class ParserChecker extends AbstractChecker {
   protected URLNormalizers normalizers = null;
   protected boolean dumpText = false;
   protected boolean followRedirects = false;
+  protected boolean checkRobotsTxt = false;
   // used to simulate the metadata propagated from injection
   protected HashMap<String, String> metadata = new HashMap<>();
   protected String forceAsContentType = null;
@@ -94,9 +95,11 @@ public class ParserChecker extends AbstractChecker {
         + " \t before other command-specific options)\n"
         + " -normalize \tnormalize URLs\n" //
         + " -followRedirects\tfollow redirects when fetching URL\n" //
+        + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
         + " -dumpText \talso show the plain-text extracted by parsers\n" //
         + " -forceAs <mimeType>\tforce parsing as <mimeType>\n" //
         + " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
+
     // Print help when no args given
     if (args.length < 1) {
       System.err.println(usage);
@@ -109,6 +112,8 @@ public class ParserChecker extends AbstractChecker {
         normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
       } else if (args[i].equals("-followRedirects")) {
         followRedirects = true;
+      } else if (args[i].equals("-checkRobotsTxt")) {
+        checkRobotsTxt = true;
       } else if (args[i].equals("-forceAs")) {
         forceAsContentType = args[++i];
       } else if (args[i].equals("-dumpText")) {
@@ -172,13 +177,15 @@ public class ParserChecker extends AbstractChecker {
       }
     }
 
-    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+    ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+        checkRobotsTxt);
     Text turl = new Text(url);
-
+
     // Following redirects and not reached maxRedirects?
     int numRedirects = 0;
-    while (!protocolOutput.getStatus().isSuccess() && followRedirects
-        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+        && followRedirects && protocolOutput.getStatus().isRedirect()
+        && maxRedirects >= numRedirects) {
       String[] stuff = protocolOutput.getStatus().getArgs();
       url = stuff[0];
       LOG.info("Follow redirect to {}", url);
@@ -190,10 +197,15 @@ public class ParserChecker extends AbstractChecker {
       turl.set(url);
 
       // try again
-      protocolOutput = getProtocolOutput(url, datum);
+      protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
       numRedirects++;
     }
 
+    if (checkRobotsTxt && protocolOutput == null) {
+      System.err.println("Fetch disallowed by robots.txt");
+      return -1;
+    }
+
     if (!protocolOutput.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: "
           + protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index b41bbc9..616e3dd 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -36,6 +36,8 @@ import org.apache.nutch.protocol.ProtocolOutput;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import crawlercommons.robots.BaseRobotRules;
+
 /**
  * Scaffolding class for the various Checker implementations. Can process cmdline input, stdin and TCP connections.
  *
@@ -188,10 +190,21 @@ public abstract class AbstractChecker extends Configured implements Tool {
     }
   }
 
-  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum,
+      boolean checkRobotsTxt) throws Exception {
    ProtocolFactory factory = new ProtocolFactory(getConf());
     Protocol protocol = factory.getProtocol(url);
     Text turl = new Text(url);
+    if (checkRobotsTxt) {
+      System.err.print("Checking robots.txt ...");
+      BaseRobotRules rules = protocol.getRobotRules(turl, datum, null);
+      if (rules.isAllowed(url)) {
+        System.err.println(" (allowed)");
+      } else {
+        System.err.println("\nDenied by robots.txt: " + url);
+        return null;
+      }
+    }
     return protocol.getProtocolOutput(turl, datum);
   }
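
For reference, a quick way to exercise the new flag from the command line (a sketch; the URL is a placeholder, assuming a local Nutch build where bin/nutch exposes the parsechecker and indexchecker commands):

  # abort early if robots.txt disallows fetching the URL
  bin/nutch parsechecker -checkRobotsTxt https://example.org/page.html

  # with -followRedirects, each redirect target is checked against robots.txt as well
  bin/nutch indexchecker -checkRobotsTxt -followRedirects https://example.org/page.html

When the fetch is disallowed, the checker prints "Fetch disallowed by robots.txt" to stderr and returns -1 instead of parsing or indexing the document.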