Author: markus
Date: Thu Jun  7 18:48:58 2012
New Revision: 1347755

URL: http://svn.apache.org/viewvc?rev=1347755&view=rev
Log:
NUTCH-1320 IndexChecker and ParseChecker choke on IDN's

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun  7 18:48:58 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus)
+
 * NUTCH-1351 DomainStatistics to aggregate by TLD (markus)
 
 * NUTCH-1381 Allow to override default subcollection field name (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Jun  7 18:48:58 2012
@@ -40,47 +40,47 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
 
 /**
  * Reads and parses a URL and run the indexers on it. Displays the fields obtained and the first
  * 100 characters of their value
- * 
+ *
  * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker http://www.lemonde.fr
  * @author Julien Nioche
  **/
 
 public class IndexingFiltersChecker extends Configured implements Tool {
-  
+
   public static final Logger LOG = LoggerFactory.getLogger(IndexingFiltersChecker.class);
-  
+
   public IndexingFiltersChecker() {
 
   }
-  
+
   public int run(String[] args) throws Exception {
-    
     String contentType = null;
     String url = null;
-    
+
     String usage = "Usage: IndexingFiltersChecker <url>";
-    
+
     if (args.length != 1) {
       System.err.println(usage);
       System.exit(-1);
     }
-    
-    url = args[0];
-    
+
+    url = URLUtil.toASCII(args[0]);
+
     if (LOG.isInfoEnabled()) {
       LOG.info("fetching: " + url);
     }
-        
+
     IndexingFilters indexers = new IndexingFilters(conf);
-    
+
     ProtocolFactory factory = new ProtocolFactory(conf);
     Protocol protocol = factory.getProtocol(url);
     CrawlDatum datum = new CrawlDatum();
-    
+
     Content content = protocol.getProtocolOutput(new Text(url), datum)
         .getContent();
 
@@ -91,20 +91,20 @@ public class IndexingFiltersChecker exte
       System.out.println("No content for " + url);
       return 0;
     }
-    
+
     contentType = content.getContentType();
-    
+
     if (contentType == null) {
       return -1;
     }
-    
+
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);
     }
 
     ParseResult parseResult = new ParseUtil(conf).parse(content);
-    
+
     NutchDocument doc = new NutchDocument();
     Text urlText = new Text(url);
 
@@ -128,19 +128,19 @@ public class IndexingFiltersChecker exte
     }
     return 0;
   }
-  
+
   public static void main(String[] args) throws Exception {
     final int res = ToolRunner.run(NutchConfiguration.create(),
         new IndexingFiltersChecker(), args);
     System.exit(res);
   }
-  
+
   Configuration conf;
-  
+
   public Configuration getConf() {
     return conf;
   }
-  
+
   @Override
   public void setConf(Configuration arg0) {
     conf = arg0;

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Jun  7 18:48:58 2012
@@ -29,6 +29,7 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.StringUtil;
 
 /**
@@ -69,7 +70,7 @@ public class ParserChecker implements To
         System.err.println(usage);
         System.exit(-1);
       } else {
-        url = args[i];
+        url = URLUtil.toASCII(args[i]);
       }
     }
 

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Jun  7 18:48:58 2012
@@ -465,6 +465,43 @@ public class URLUtil {
     }
   }
 
+  public static String toASCII(String url) {
+    try {
+      URL u = new URL(url);
+      URI p = new URI(u.getProtocol(),
+        null,
+        IDN.toASCII(u.getHost()),
+        u.getPort(),
+        u.getPath(),
+        u.getQuery(),
+        u.getRef());
+
+      return p.toString();
+    }
+    catch (Exception e) {
+      return null;
+    }
+  }
+
+  public static String toUNICODE(String url) {
+    try {
+      URL u = new URL(url);
+      URI p = new URI(u.getProtocol(),
+        null,
+        IDN.toUnicode(u.getHost()),
+        u.getPort(),
+        u.getPath(),
+        u.getQuery(),
+        u.getRef());
+
+      return p.toString();
+    }
+    catch (Exception e) {
+      return null;
+    }
+  }
+
+
   /** For testing */
   public static void main(String[] args){
     


Reply via email to