Author: markus
Date: Mon Dec 23 14:17:40 2013
New Revision: 1553115

URL: http://svn.apache.org/r1553115
Log:
NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
    nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Dec 23 14:17:40 2013
@@ -2,6 +2,8 @@ Nutch Change Log

 Nutch Development Trunk

+* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel via markus)
+
 * NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche)

 * NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23 14:17:40 2013
@@ -481,7 +481,7 @@ public class URLUtil {
     try {
       URL u = new URL(url);
       URI p = new URI(u.getProtocol(),
-        null,
+        u.getUserInfo(),
         IDN.toASCII(u.getHost()),
         u.getPort(),
         u.getPath(),
@@ -498,15 +498,25 @@ public class URLUtil {
   public static String toUNICODE(String url) {
     try {
       URL u = new URL(url);
-      URI p = new URI(u.getProtocol(),
-        null,
-        IDN.toUnicode(u.getHost()),
-        u.getPort(),
-        u.getPath(),
-        u.getQuery(),
-        u.getRef());
+      StringBuilder sb = new StringBuilder();
+      sb.append(u.getProtocol());
+      sb.append("://");
+      if (u.getUserInfo() != null) {
+        sb.append(u.getUserInfo());
+        sb.append('@');
+      }
+      sb.append(IDN.toUnicode(u.getHost()));
+      if (u.getPort() != -1) {
+        sb.append(':');
+        sb.append(u.getPort());
+      }
+      sb.append(u.getFile()); // includes query
+      if (u.getRef() != null) {
+        sb.append('#');
+        sb.append(u.getRef());
+      }

-      return p.toString();
+      return sb.toString();
     }
     catch (Exception e) {
       return null;

Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec 23 14:17:40 2013
@@ -258,5 +258,22 @@ public class TestURLUtil
       assertEquals(targets[i][1], targets[i][1], u.toString());
     }
   }
+
+  public void testToUNICODE() throws Exception {
+    assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+    assertEquals("http://uni-tübingen.de/", URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+    assertEquals(
+        "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
+        URLUtil.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+
+  }
+
+  public void testToASCII() throws Exception {
+    assertEquals("http://www.xn--evir-zoa.com", URLUtil.toASCII("http://www.çevir.com"));
+    assertEquals("http://xn--uni-tbingen-xhb.de/", URLUtil.toASCII("http://uni-tübingen.de/"));
+    assertEquals(
+        "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+        URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
+  }

 }