Author: markus Date: Wed Jan 13 12:17:03 2016 New Revision: 1724409 URL: http://svn.apache.org/viewvc?rev=1724409&view=rev Log: NUTCH-2195 IndexingFiltersChecker to optionally follow N redirects
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724409&r1=1724408&r2=1724409&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 13 12:17:03 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) + * NUTCH-2190 Protocol normalizer (markus) * NUTCH-1838 Host and domain based regex and automaton filtering (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724409&r1=1724408&r2=1724409&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 12:17:03 2016 @@ -70,8 +70,9 @@ public class IndexingFiltersChecker exte String contentType = null; String url = null; boolean dumpText = false; + boolean followRedirects = false; - String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] <url>"; + String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] <url>"; if (args.length == 0) { System.err.println(usage); @@ -82,7 +83,9 @@ public class IndexingFiltersChecker exte HashMap<String, String> metadata = new HashMap<String, String>(); for (int i = 0; i < args.length; i++) { - if (args[i].equals("-dumpText")) { + if (args[i].equals("-followRedirects")) { + followRedirects = true; + } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (args[i].equals("-md")) { String k = null, v = null; @@ -116,11 +119,22 @@ public class 
IndexingFiltersChecker exte } IndexingFilters indexers = new IndexingFilters(getConf()); + + int maxRedirects = 3; - ProtocolFactory factory = new ProtocolFactory(getConf()); - Protocol protocol = factory.getProtocol(url); + ProtocolOutput output = getProtocolOutput(url, datum); Text turl = new Text(url); - ProtocolOutput output = protocol.getProtocolOutput(turl, datum); + + // Following redirects and not reached maxRedirects? + while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) { + String[] stuff = output.getStatus().getArgs(); + url = stuff[0]; + turl.set(url); + + // try again + output = getProtocolOutput(url, datum); + maxRedirects--; + } if (!output.getStatus().isSuccess()) { System.out.println("Fetch failed with protocol status: " @@ -224,6 +238,14 @@ public class IndexingFiltersChecker exte return 0; } + + protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception { + ProtocolFactory factory = new ProtocolFactory(getConf()); + Protocol protocol = factory.getProtocol(url); + Text turl = new Text(url); + ProtocolOutput output = protocol.getProtocolOutput(turl, datum); + return output; + } public static void main(String[] args) throws Exception { final int res = ToolRunner.run(NutchConfiguration.create(),