This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 46db3ed NUTCH-2002 parse and index checkers to check robots.txt - applied Julien's patch to recent code base - also check whether redirects are allowed - add command-line parameter `-checkRobotsTxt` enabling this check
new 40472c1 Merge pull request #521 from sebastian-nagel/NUTCH-2002-checkers-robotstxt
46db3ed is described below
commit 46db3ed71355fefda42a008ece75094f51859ab2
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Apr 30 12:58:05 2020 +0200
NUTCH-2002 parse and index checkers to check robots.txt
- applied Julien's patch to recent code base
- also check whether redirects are allowed
- add command-line parameter `-checkRobotsTxt` enabling this check
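For example, assuming a local Nutch build, the new check can be exercised
through the parsechecker command (the URL is hypothetical):
  bin/nutch parsechecker -checkRobotsTxt -dumpText https://example.org/page.html
If robots.txt disallows the URL, the checker reports "Fetch disallowed by
robots.txt" on stderr and returns -1 instead of fetching and parsing the page.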
---
.../nutch/indexer/IndexingFiltersChecker.java | 21 ++++++++++++++++-----
src/java/org/apache/nutch/parse/ParserChecker.java | 22 +++++++++++++++++-----
.../org/apache/nutch/util/AbstractChecker.java | 15 ++++++++++++++-
3 files changed, 47 insertions(+), 11 deletions(-)
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 4f849a0..84d9f6d 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -58,6 +58,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
protected boolean doIndex = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
@@ -82,6 +83,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \tshow the entire plain-text content,\n" //"
+ " \tnot only the first 100 characters\n" //
+ " -doIndex \tpass document to configured index writers\n" //
@@ -103,6 +105,8 @@ public class IndexingFiltersChecker extends AbstractChecker {
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-doIndex")) {
@@ -164,13 +168,15 @@ public class IndexingFiltersChecker extends AbstractChecker {
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -182,10 +188,15 @@ public class IndexingFiltersChecker extends AbstractChecker {
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
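The same flag is wired into the index checker, e.g. (hypothetical URL):
  bin/nutch indexchecker -checkRobotsTxt -followRedirects https://example.org/
Note that getProtocolOutput is re-invoked inside the redirect loop with
checkRobotsTxt, so every redirect target is checked against robots.txt as
well; a disallowed hop yields a null protocolOutput and takes the same error
path as a disallowed start URL.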
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 2a976ba..4dbfcfa 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -69,6 +69,7 @@ public class ParserChecker extends AbstractChecker {
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
protected String forceAsContentType = null;
@@ -94,9 +95,11 @@ public class ParserChecker extends AbstractChecker {
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \talso show the plain-text extracted by
parsers\n" //
+ " -forceAs <mimeType>\tforce parsing as <mimeType>\n" //
+ " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
+
// Print help when no args given
if (args.length < 1) {
System.err.println(usage);
@@ -109,6 +112,8 @@ public class ParserChecker extends AbstractChecker {
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-forceAs")) {
forceAsContentType = args[++i];
} else if (args[i].equals("-dumpText")) {
@@ -172,13 +177,15 @@ public class ParserChecker extends AbstractChecker {
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -190,10 +197,15 @@ public class ParserChecker extends AbstractChecker {
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index b41bbc9..616e3dd 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -36,6 +36,8 @@ import org.apache.nutch.protocol.ProtocolOutput;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import crawlercommons.robots.BaseRobotRules;
+
/**
* Scaffolding class for the various Checker implementations. Can process
* cmdline input, stdin and TCP connections.
*
@@ -188,10 +190,21 @@ public abstract class AbstractChecker extends Configured implements Tool {
}
}
- protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+ protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum,
+ boolean checkRobotsTxt) throws Exception {
ProtocolFactory factory = new ProtocolFactory(getConf());
Protocol protocol = factory.getProtocol(url);
Text turl = new Text(url);
+ if (checkRobotsTxt) {
+ System.err.print("Checking robots.txt ...");
+ BaseRobotRules rules = protocol.getRobotRules(turl, datum, null);
+ if (rules.isAllowed(url)) {
+ System.err.println(" (allowed)");
+ } else {
+ System.err.println("\nDenied by robots.txt: " + url);
+ return null;
+ }
+ }
return protocol.getProtocolOutput(turl, datum);
}
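The allow/deny decision itself comes from the crawler-commons library
(BaseRobotRules, imported above). As a rough self-contained sketch of that
API - using the stock SimpleRobotRulesParser and made-up agent name and URLs,
whereas Nutch obtains its rules via protocol.getRobotRules as shown:
  import java.nio.charset.StandardCharsets;
  import crawlercommons.robots.BaseRobotRules;
  import crawlercommons.robots.SimpleRobotRulesParser;
  public class RobotsCheckDemo {
    public static void main(String[] args) {
      byte[] robotsTxt = "User-agent: *\nDisallow: /private/\n"
          .getBytes(StandardCharsets.UTF_8);
      // Parse the robots.txt content on behalf of a (made-up) agent "mybot"
      BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
          "https://example.org/robots.txt", robotsTxt, "text/plain", "mybot");
      // The same per-URL decision the checkers now make before fetching
      System.out.println(rules.isAllowed("https://example.org/index.html")); // true
      System.out.println(rules.isAllowed("https://example.org/private/x")); // false
    }
  }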