This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit aed6fa71fa7cd07740235e4c4aeca8380ddb9b48
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Thu Apr 30 12:58:05 2020 +0200

    NUTCH-2002 parse and index checkers to check robots.txt
    - applied Julien's patch to recent code base
    - also check redirects whether they are allowed
    - add command-line parameter `-checkRobotsTxt` enabling this check
---
 .../nutch/indexer/IndexingFiltersChecker.java      | 21 ++++++++++++++++-----
 src/java/org/apache/nutch/parse/ParserChecker.java | 22 +++++++++++++++++-----
 .../org/apache/nutch/util/AbstractChecker.java     | 15 ++++++++++++++-
 3 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 4f849a0..84d9f6d 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -58,6 +58,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
   protected URLNormalizers normalizers = null;
   protected boolean dumpText = false;
   protected boolean followRedirects = false;
+  protected boolean checkRobotsTxt = false;
   protected boolean doIndex = false;
   // used to simulate the metadata propagated from injection
   protected HashMap<String, String> metadata = new HashMap<>();
@@ -82,6 +83,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
         + " \t before other command-specific options)\n"
         + " -normalize \tnormalize URLs\n" //
         + " -followRedirects\tfollow redirects when fetching URL\n" //
+        + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
         + " -dumpText \tshow the entire plain-text content,\n" //
         + " \tnot only the first 100 characters\n" //
         + " -doIndex \tpass document to configured index writers\n" //
@@ -103,6 +105,8 @@ public class IndexingFiltersChecker extends AbstractChecker {
         normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
       } else if (args[i].equals("-followRedirects")) {
         followRedirects = true;
+      } else if (args[i].equals("-checkRobotsTxt")) {
+        checkRobotsTxt = true;
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
       } else if (args[i].equals("-doIndex")) {
@@ -164,13 +168,15 @@ public class IndexingFiltersChecker extends AbstractChecker {
       }
     }
 
-    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+    ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+        checkRobotsTxt);
     Text turl = new Text(url);
-
+
     // Following redirects and not reached maxRedirects?
     int numRedirects = 0;
-    while (!protocolOutput.getStatus().isSuccess() && followRedirects
-        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+        && followRedirects && protocolOutput.getStatus().isRedirect()
+        && maxRedirects >= numRedirects) {
       String[] stuff = protocolOutput.getStatus().getArgs();
       url = stuff[0];
       LOG.info("Follow redirect to {}", url);
@@ -182,10 +188,15 @@ public class IndexingFiltersChecker extends AbstractChecker {
       turl.set(url);
 
       // try again
-      protocolOutput = getProtocolOutput(url, datum);
+      protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
       numRedirects++;
     }
 
+    if (checkRobotsTxt && protocolOutput == null) {
+      System.err.println("Fetch disallowed by robots.txt");
+      return -1;
+    }
+
     if (!protocolOutput.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: "
           + protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 2a976ba..4dbfcfa 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -69,6 +69,7 @@ public class ParserChecker extends AbstractChecker {
   protected URLNormalizers normalizers = null;
   protected boolean dumpText = false;
   protected boolean followRedirects = false;
+  protected boolean checkRobotsTxt = false;
   // used to simulate the metadata propagated from injection
   protected HashMap<String, String> metadata = new HashMap<>();
   protected String forceAsContentType = null;
@@ -94,9 +95,11 @@ public class ParserChecker extends AbstractChecker {
         + " \t before other command-specific options)\n"
         + " -normalize \tnormalize URLs\n" //
         + " -followRedirects\tfollow redirects when fetching URL\n" //
+        + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
         + " -dumpText \talso show the plain-text extracted by parsers\n" //
         + " -forceAs <mimeType>\tforce parsing as <mimeType>\n" //
         + " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
+
     // Print help when no args given
     if (args.length < 1) {
       System.err.println(usage);
@@ -109,6 +112,8 @@ public class ParserChecker extends AbstractChecker {
         normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
       } else if (args[i].equals("-followRedirects")) {
         followRedirects = true;
+      } else if (args[i].equals("-checkRobotsTxt")) {
+        checkRobotsTxt = true;
       } else if (args[i].equals("-forceAs")) {
         forceAsContentType = args[++i];
       } else if (args[i].equals("-dumpText")) {
@@ -172,13 +177,15 @@ public class ParserChecker extends AbstractChecker {
       }
     }
 
-    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+    ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+        checkRobotsTxt);
     Text turl = new Text(url);
-
+
     // Following redirects and not reached maxRedirects?
     int numRedirects = 0;
-    while (!protocolOutput.getStatus().isSuccess() && followRedirects
-        && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+        && followRedirects && protocolOutput.getStatus().isRedirect()
+        && maxRedirects >= numRedirects) {
       String[] stuff = protocolOutput.getStatus().getArgs();
       url = stuff[0];
       LOG.info("Follow redirect to {}", url);
@@ -190,10 +197,15 @@ public class ParserChecker extends AbstractChecker {
       turl.set(url);
 
       // try again
-      protocolOutput = getProtocolOutput(url, datum);
+      protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
       numRedirects++;
     }
 
+    if (checkRobotsTxt && protocolOutput == null) {
+      System.err.println("Fetch disallowed by robots.txt");
+      return -1;
+    }
+
     if (!protocolOutput.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: "
           + protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index b41bbc9..616e3dd 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -36,6 +36,8 @@ import org.apache.nutch.protocol.ProtocolOutput;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import crawlercommons.robots.BaseRobotRules;
+
 /**
  * Scaffolding class for the various Checker implementations. Can process cmdline input, stdin and TCP connections.
  *
@@ -188,10 +190,21 @@ public abstract class AbstractChecker extends Configured implements Tool {
     }
   }
 
-  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum,
+      boolean checkRobotsTxt) throws Exception {
    ProtocolFactory factory = new ProtocolFactory(getConf());
     Protocol protocol = factory.getProtocol(url);
     Text turl = new Text(url);
+    if (checkRobotsTxt) {
+      System.err.print("Checking robots.txt ...");
+      BaseRobotRules rules = protocol.getRobotRules(turl, datum, null);
+      if (rules.isAllowed(url)) {
+        System.err.println(" (allowed)");
+      } else {
+        System.err.println("\nDenied by robots.txt: " + url);
+        return null;
+      }
+    }
     return protocol.getProtocolOutput(turl, datum);
   }
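
For reference, a quick way to exercise the new flag from the command line (a sketch; the URL is a placeholder, assuming a local Nutch build where bin/nutch exposes the parsechecker and indexchecker commands):

  # abort early if robots.txt disallows fetching the URL
  bin/nutch parsechecker -checkRobotsTxt https://example.org/page.html

  # with -followRedirects, each redirect target is checked against robots.txt as well
  bin/nutch indexchecker -checkRobotsTxt -followRedirects https://example.org/page.html

When the fetch is disallowed, the checker prints "Fetch disallowed by robots.txt" to stderr and returns -1 instead of parsing or indexing the document.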