Author: lewismc
Date: Mon Dec 23 15:06:41 2013
New Revision: 1553125

URL: http://svn.apache.org/r1553125
Log:
NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
    nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1553125&r1=1553124&r2=1553125&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Dec 23 15:06:41 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel, markus via lewismc)
+
 * NUTCH-1673 Title isn't reset in MoreIndexingFilter (Nguyen Manh Tien via lewismc)
 
 * NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1553125&r1=1553124&r2=1553125&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23 15:06:41 2013
@@ -337,7 +337,7 @@ public class URLUtil {
     try {
       URL u = new URL(url);
       URI p = new URI(u.getProtocol(),
-        null,
+        u.getUserInfo(),
         IDN.toASCII(u.getHost()),
         u.getPort(),
         u.getPath(),
@@ -354,15 +354,25 @@ public class URLUtil {
   public static String toUNICODE(String url) {
     try {
       URL u = new URL(url);
-      URI p = new URI(u.getProtocol(),
-        null,
-        IDN.toUnicode(u.getHost()),
-        u.getPort(),
-        u.getPath(),
-        u.getQuery(),
-        u.getRef());
+      StringBuilder sb = new StringBuilder();
+      sb.append(u.getProtocol());
+      sb.append("://");
+      if (u.getUserInfo() != null) {
+        sb.append(u.getUserInfo());
+        sb.append('@');
+      }
+      sb.append(IDN.toUnicode(u.getHost()));
+      if (u.getPort() != -1) {
+        sb.append(':');
+        sb.append(u.getPort());
+      }
+      sb.append(u.getFile()); // includes query
+      if (u.getRef() != null) {
+        sb.append('#');
+        sb.append(u.getRef());
+      }
 
-      return p.toString();
+      return sb.toString();
     }
     catch (Exception e) {
       return null;

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553125&r1=1553124&r2=1553125&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec 23 15:06:41 2013
@@ -210,5 +210,24 @@ public class TestURLUtil {
     // *www.a.com -> www.news.a.com
     assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
   }
+  
+  @Test
+  public void testToUNICODE() throws Exception {
+    assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+    assertEquals("http://uni-tübingen.de/", URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+    assertEquals(
+        "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
+        URLUtil.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+    
+  }
+  
+  @Test
+  public void testToASCII() throws Exception {
+    assertEquals("http://www.xn--evir-zoa.com", URLUtil.toASCII("http://www.çevir.com"));
+    assertEquals("http://xn--uni-tbingen-xhb.de/", URLUtil.toASCII("http://uni-tübingen.de/"));
+    assertEquals(
+        "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+        URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
 
+  }
 
 }


Reply via email to