Author: markus
Date: Wed Jan 13 13:10:19 2016
New Revision: 1724418

URL: http://svn.apache.org/viewvc?rev=1724418&view=rev
Log:
NUTCH-2196 IndexingFilterChecker to optionally normalize

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724418&r1=1724417&r2=1724418&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 13 13:10:19 2016
@@ -1,5 +1,7 @@
 Nutch Change Log

+* NUTCH-2196 IndexingFilterChecker to optionally normalize (markus)
+
 * NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus)

 * NUTCH-2190 Protocol normalizer (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724418&r1=1724417&r2=1724418&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 13:10:19 2016
@@ -32,6 +32,7 @@ import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseSegment;
@@ -43,7 +44,6 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -69,10 +69,11 @@ public class IndexingFiltersChecker exte
   public int run(String[] args) throws Exception {
     String contentType = null;
     String url = null;
+    URLNormalizers normalizers = null;
     boolean dumpText = false;
     boolean followRedirects = false;

-    String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] <url>";
+    String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] <url>";

     if (args.length == 0) {
       System.err.println(usage);
@@ -83,7 +84,9 @@ public class IndexingFiltersChecker exte
     HashMap<String, String> metadata = new HashMap<String, String>();

     for (int i = 0; i < args.length; i++) {
-      if (args[i].equals("-followRedirects")) {
+      if (args[i].equals("-normalize")) {
+        normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+      } else if (args[i].equals("-followRedirects")) {
         followRedirects = true;
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
@@ -101,9 +104,13 @@ public class IndexingFiltersChecker exte
         System.err.println(usage);
         System.exit(-1);
       } else {
-        url = URLUtil.toASCII(args[i]);
+        url =args[i];
       }
     }
+
+    if (normalizers != null) {
+      url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+    }

     LOG.info("fetching: " + url);

@@ -129,6 +136,11 @@ public class IndexingFiltersChecker exte
     while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) {
       String[] stuff = output.getStatus().getArgs();
       url = stuff[0];
+
+      if (normalizers != null) {
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+      }
+
       turl.set(url);

       // try again