Author: lewismc
Date: Mon Dec 23 15:06:41 2013
New Revision: 1553125
URL: http://svn.apache.org/r1553125
Log:
NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1553125&r1=1553124&r2=1553125&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Dec 23 15:06:41 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly
(İlhami KALKAN, snagel, markus via lewismc)
+
* NUTCH-1673 Title isn't reset in MoreIndexingFilter (Nguyen Manh Tien via
lewismc)
* NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1553125&r1=1553124&r2=1553125&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23
15:06:41 2013
@@ -337,7 +337,7 @@ public class URLUtil {
try {
URL u = new URL(url);
URI p = new URI(u.getProtocol(),
- null,
+ u.getUserInfo(),
IDN.toASCII(u.getHost()),
u.getPort(),
u.getPath(),
@@ -354,15 +354,25 @@ public class URLUtil {
public static String toUNICODE(String url) {
try {
URL u = new URL(url);
- URI p = new URI(u.getProtocol(),
- null,
- IDN.toUnicode(u.getHost()),
- u.getPort(),
- u.getPath(),
- u.getQuery(),
- u.getRef());
+ StringBuilder sb = new StringBuilder();
+ sb.append(u.getProtocol());
+ sb.append("://");
+ if (u.getUserInfo() != null) {
+ sb.append(u.getUserInfo());
+ sb.append('@');
+ }
+ sb.append(IDN.toUnicode(u.getHost()));
+ if (u.getPort() != -1) {
+ sb.append(':');
+ sb.append(u.getPort());
+ }
+ sb.append(u.getFile()); // includes query
+ if (u.getRef() != null) {
+ sb.append('#');
+ sb.append(u.getRef());
+ }
- return p.toString();
+ return sb.toString();
}
catch (Exception e) {
return null;
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553125&r1=1553124&r2=1553125&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
(original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec
23 15:06:41 2013
@@ -210,5 +210,24 @@ public class TestURLUtil {
// *www.a.com -> www.news.a.com
assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
}
+
+ @Test
+ public void testToUNICODE() throws Exception {
+ assertEquals("http://www.çevir.com",
URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+ assertEquals("http://uni-tübingen.de/",
URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+ assertEquals(
+ "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
+
URLUtil.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+
+ }
+
+ @Test
+ public void testToASCII() throws Exception {
+ assertEquals("http://www.xn--evir-zoa.com",
URLUtil.toASCII("http://www.çevir.com"));
+ assertEquals("http://xn--uni-tbingen-xhb.de/",
URLUtil.toASCII("http://uni-tübingen.de/"));
+ assertEquals(
+ "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+
URLUtil.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
+ }
}