Author: markus
Date: Mon Dec 23 14:17:40 2013
New Revision: 1553115
URL: http://svn.apache.org/r1553115
Log:
NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Dec 23 14:17:40 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel via markus)
+
* NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche)
* NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23 14:17:40 2013
@@ -481,7 +481,7 @@ public class URLUtil {
try {
URL u = new URL(url);
URI p = new URI(u.getProtocol(),
- null,
+ u.getUserInfo(),
IDN.toASCII(u.getHost()),
u.getPort(),
u.getPath(),
@@ -498,15 +498,25 @@ public class URLUtil {
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
- URI p = new URI(u.getProtocol(),
- null,
- IDN.toUnicode(u.getHost()),
- u.getPort(),
- u.getPath(),
- u.getQuery(),
- u.getRef());
+ StringBuilder sb = new StringBuilder();
+ sb.append(u.getProtocol());
+ sb.append("://");
+ if (u.getUserInfo() != null) {
+ sb.append(u.getUserInfo());
+ sb.append('@');
+ }
+ sb.append(IDN.toUnicode(u.getHost()));
+ if (u.getPort() != -1) {
+ sb.append(':');
+ sb.append(u.getPort());
+ }
+ sb.append(u.getFile()); // includes query
+ if (u.getRef() != null) {
+ sb.append('#');
+ sb.append(u.getRef());
+ }
- return p.toString();
+ return sb.toString();
}
catch (Exception e) {
return null;
Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553115&r1=1553114&r2=1553115&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec 23 14:17:40 2013
@@ -258,5 +258,22 @@ public class TestURLUtil
assertEquals(targets[i][1], targets[i][1], u.toString());
}
}
+
+ public void testToUNICODE() throws Exception {
+ assertEquals("http://www.çevir.com", URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+ assertEquals("http://uni-tübingen.de/", URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+ assertEquals(
+ "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
+ URLUtil.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+
+ }
+
+ public void testToASCII() throws Exception {
+ assertEquals("http://www.xn--evir-zoa.com", URLUtil.toASCII("http://www.çevir.com"));
+ assertEquals("http://xn--uni-tbingen-xhb.de/", URLUtil.toASCII("http://uni-tübingen.de/"));
+ assertEquals(
+ "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+ URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
+ }
}