Repository: nutch
Updated Branches:
  refs/heads/master 6e051f2cc -> f351790d7


NUTCH-2337 urlnormalizer-basic to strip empty port, closes #160
- make sure that URLs which contain anything other than the host
  in the authority (incl. empty port) are marked as changed
- always use root locale for case conversion


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f351790d
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f351790d
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f351790d

Branch: refs/heads/master
Commit: f351790d7f496561aeae5e214d1b33975ca34cf2
Parents: 6e051f2
Author: Sebastian Nagel <[email protected]>
Authored: Fri Dec 9 11:45:54 2016 +0100
Committer: Sebastian Nagel <[email protected]>
Committed: Tue Dec 13 14:03:16 2016 +0100

----------------------------------------------------------------------
 .../net/urlnormalizer/basic/BasicURLNormalizer.java      | 11 ++++++++---
 .../net/urlnormalizer/basic/TestBasicURLNormalizer.java  |  5 ++++-
 2 files changed, 12 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/f351790d/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 3e00346..5287d2d 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -112,10 +112,14 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
         || "ftp".equals(protocol)) {
 
       if (host != null) {
-        String newHost = host.toLowerCase(); // lowercase host
+        String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
         if (!host.equals(newHost)) {
           host = newHost;
           changed = true;
+        } else if (!url.getAuthority().equals(newHost)) {
+          // authority (http://<...>/) contains other elements (port, user,
+          // etc.) which will likely cause a change if left away
+          changed = true;
         }
       }
 
@@ -247,7 +251,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
         sb.append('%');
         
         // Get this byte's hexadecimal representation 
-        String hex = Integer.toHexString(b & 0xFF).toUpperCase();
+        String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT);
         
         // Do we need to prepend a zero?
         if (hex.length() % 2 != 0 ) {
@@ -275,7 +279,8 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
       System.out.println("Scope: " + scope);
     }
     String line, normUrl;
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    BufferedReader in = new BufferedReader(
+        new InputStreamReader(System.in, utf8));
     while ((line = in.readLine()) != null) {
       try {
         normUrl = normalizer.normalize(line, scope);

http://git-wip-us.apache.org/repos/asf/nutch/blob/f351790d/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 9a0f8c4..d62a3a9 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -116,6 +116,10 @@ public class TestBasicURLNormalizer {
     // check that port number is normalized
     normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
     normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+    // check that empty port is removed
+    normalizeTest("http://example.com:/", "http://example.com/");
+    normalizeTest("https://example.com:/foobar.html",
+        "https://example.com/foobar.html");
 
     // check that null path is normalized
     normalizeTest("http://foo.com", "http://foo.com/");
@@ -127,7 +131,6 @@ public class TestBasicURLNormalizer {
     // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
-
     normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
     normalizeTest("http://foo.com/aa/../", "http://foo.com/");
     normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");

Reply via email to