Author: markus Date: Thu Jun 7 18:48:58 2012 New Revision: 1347755 URL: http://svn.apache.org/viewvc?rev=1347755&view=rev Log: NUTCH-1320 IndexChecker and ParseChecker choke on IDN's
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347755&r1=1347754&r2=1347755&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jun 7 18:48:58 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus) + * NUTCH-1351 DomainStatistics to aggregate by TLD (markus) * NUTCH-1381 Allow to override default subcollection field name (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1347755&r1=1347754&r2=1347755&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Jun 7 18:48:58 2012 @@ -40,47 +40,47 @@ import org.apache.nutch.protocol.Content import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; /** * Reads and parses a URL and run the indexers on it. Displays the fields obtained and the first * 100 characters of their value - * + * * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker http://www.lemonde.fr * @author Julien Nioche **/ public class IndexingFiltersChecker extends Configured implements Tool { - + public static final Logger LOG = LoggerFactory.getLogger(IndexingFiltersChecker.class); - + public IndexingFiltersChecker() { } - + public int run(String[] args) throws Exception { - String contentType = null; String url = null; - + String usage = "Usage: IndexingFiltersChecker <url>"; - + if (args.length != 1) { System.err.println(usage); System.exit(-1); } - - url = args[0]; - + + url = URLUtil.toASCII(args[0]); + if (LOG.isInfoEnabled()) { LOG.info("fetching: " + url); } - + IndexingFilters indexers = new IndexingFilters(conf); - + ProtocolFactory factory = new ProtocolFactory(conf); Protocol protocol = factory.getProtocol(url); CrawlDatum datum = new CrawlDatum(); - + Content content = protocol.getProtocolOutput(new Text(url), datum) .getContent(); @@ -91,20 +91,20 @@ public class IndexingFiltersChecker exte System.out.println("No content for " + url); return 0; } - + contentType = content.getContentType(); - + if (contentType == null) { return -1; } - + if (LOG.isInfoEnabled()) { LOG.info("parsing: " + url); LOG.info("contentType: " + contentType); } ParseResult parseResult = new ParseUtil(conf).parse(content); - + NutchDocument doc = new NutchDocument(); Text urlText = new Text(url); @@ -128,19 +128,19 @@ public class IndexingFiltersChecker exte } return 0; } - + public static void main(String[] args) throws Exception { final int res = ToolRunner.run(NutchConfiguration.create(), new IndexingFiltersChecker(), args); System.exit(res); } - + Configuration conf; - + public Configuration getConf() { return conf; } - + @Override public void setConf(Configuration arg0) { conf = arg0; Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1347755&r1=1347754&r2=1347755&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Jun 7 18:48:58 2012 @@ -29,6 +29,7 @@ import org.apache.nutch.protocol.Content import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.URLUtil; import org.apache.nutch.util.StringUtil; /** @@ -69,7 +70,7 @@ public class ParserChecker implements To System.err.println(usage); System.exit(-1); } else { - url = args[i]; + url = URLUtil.toASCII(args[i]); } } Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1347755&r1=1347754&r2=1347755&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Jun 7 18:48:58 2012 @@ -465,6 +465,43 @@ public class URLUtil { } } + public static String toASCII(String url) { + try { + URL u = new URL(url); + URI p = new URI(u.getProtocol(), + null, + IDN.toASCII(u.getHost()), + u.getPort(), + u.getPath(), + u.getQuery(), + u.getRef()); + + return p.toString(); + } + catch (Exception e) { + return null; + } + } + + public static String toUNICODE(String url) { + try { + URL u = new URL(url); + URI p = new URI(u.getProtocol(), + null, + IDN.toUnicode(u.getHost()), + u.getPort(), + u.getPath(), + u.getQuery(), + u.getRef()); + + return p.toString(); + } + catch (Exception e) { + return null; + } + } + + /** For testing */ public static void main(String[] args){