Repository: nutch
Updated Branches:
  refs/heads/2.x 0ea78907d -> 6e3c34db1


NUTCH-2337 urlnormalizer-basic to strip empty port
- make sure that URLs which contain anything other than the host
  in the authority (incl. empty port) are marked as changed


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6e3c34db
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6e3c34db
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6e3c34db

Branch: refs/heads/2.x
Commit: 6e3c34db16e385b0dadbe6444c2685283c863350
Parents: 0ea7890
Author: Sebastian Nagel <sna...@apache.org>
Authored: Tue Dec 13 14:27:55 2016 +0100
Committer: Sebastian Nagel <sna...@apache.org>
Committed: Tue Dec 13 14:29:31 2016 +0100

----------------------------------------------------------------------
 .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java       | 4 ++++
 .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java   | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 3652d47..b648293 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -83,6 +83,10 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
         if (!host.equals(newHost)) {
           host = newHost;
           changed = true;
+        } else if (!url.getAuthority().equals(newHost)) {
+          // authority (http://<...>/) contains other elements (port, user,
+          // etc.) which will likely cause a change if left away
+          changed = true;
         }
       }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 0974b49..006c1a3 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -52,6 +52,10 @@ public class TestBasicURLNormalizer {
     // check that port number is normalized
     normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
     normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+    // check that empty port is removed
+    normalizeTest("http://example.com:/", "http://example.com/");
+    normalizeTest("https://example.com:/foobar.html",
+        "https://example.com/foobar.html");
 
     // check that null path is normalized
     normalizeTest("http://foo.com", "http://foo.com/");
@@ -63,7 +67,6 @@ public class TestBasicURLNormalizer {
     // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
-
     normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
     normalizeTest("http://foo.com/aa/../", "http://foo.com/");
     normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");

Reply via email to