Author: markus
Date: Mon Dec 23 14:17:40 2013
New Revision: 1553115

URL: http://svn.apache.org/r1553115
Log:
NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
    nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Dec 23 14:17:40 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel via markus)
+
 * NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche)
 
 * NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23 14:17:40 2013
@@ -481,7 +481,7 @@ public class URLUtil {
     try {
       URL u = new URL(url);
       URI p = new URI(u.getProtocol(),
-        null,
+        u.getUserInfo(),
         IDN.toASCII(u.getHost()),
         u.getPort(),
         u.getPath(),
@@ -498,15 +498,25 @@ public class URLUtil {
   public static String toUNICODE(String url) {
     try {
       URL u = new URL(url);
-      URI p = new URI(u.getProtocol(),
-        null,
-        IDN.toUnicode(u.getHost()),
-        u.getPort(),
-        u.getPath(),
-        u.getQuery(),
-        u.getRef());
+      StringBuilder sb = new StringBuilder();
+      sb.append(u.getProtocol());
+      sb.append("://");
+      if (u.getUserInfo() != null) {
+        sb.append(u.getUserInfo());
+        sb.append('@');
+      }
+      sb.append(IDN.toUnicode(u.getHost()));
+      if (u.getPort() != -1) {
+        sb.append(':');
+        sb.append(u.getPort());
+      }
+      sb.append(u.getFile()); // includes query
+      if (u.getRef() != null) {
+        sb.append('#');
+        sb.append(u.getRef());
+      }
 
-      return p.toString();
+      return sb.toString();
     }
     catch (Exception e) {
       return null;

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec 23 14:17:40 2013
@@ -258,5 +258,22 @@ public class TestURLUtil
       assertEquals(targets[i][1], targets[i][1], u.toString());
     }
   }
+  
+  public void testToUNICODE() throws Exception {
+    assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+    assertEquals("http://uni-tübingen.de/", URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+    assertEquals(
+        "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
+        URLUtil.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+    
+  }
+  
+  public void testToASCII() throws Exception {
+    assertEquals("http://www.xn--evir-zoa.com", URLUtil.toASCII("http://www.çevir.com"));
+    assertEquals("http://xn--uni-tbingen-xhb.de/", URLUtil.toASCII("http://uni-tübingen.de/"));
+    assertEquals(
+        "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+        URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
 
+  }
 
 }


Reply via email to