Repository: nutch Updated Branches: refs/heads/2.x 0ea78907d -> 6e3c34db1
NUTCH-2337 urlnormalizer-basic to strip empty port - make sure that URLs which contain anything else than the host in the authority (incl. empty port) are marked as changed Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6e3c34db Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6e3c34db Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6e3c34db Branch: refs/heads/2.x Commit: 6e3c34db16e385b0dadbe6444c2685283c863350 Parents: 0ea7890 Author: Sebastian Nagel <sna...@apache.org> Authored: Tue Dec 13 14:27:55 2016 +0100 Committer: Sebastian Nagel <sna...@apache.org> Committed: Tue Dec 13 14:29:31 2016 +0100 ---------------------------------------------------------------------- .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 4 ++++ .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 3652d47..b648293 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -83,6 +83,10 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { if (!host.equals(newHost)) { host = newHost; changed = true; + } else if (!url.getAuthority().equals(newHost)) { + // authority (http://<...>/) contains other elements (port, user, + // etc.) which will likely cause a change if left away + changed = true; } } http://git-wip-us.apache.org/repos/asf/nutch/blob/6e3c34db/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 0974b49..006c1a3 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -52,6 +52,10 @@ public class TestBasicURLNormalizer { // check that port number is normalized normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); normalizeTest("http://foo.com:81/", "http://foo.com:81/"); + // check that empty port is removed + normalizeTest("http://example.com:/", "http://example.com/"); + normalizeTest("https://example.com:/foobar.html", + "https://example.com/foobar.html"); // check that null path is normalized normalizeTest("http://foo.com", "http://foo.com/"); @@ -63,7 +67,6 @@ public class TestBasicURLNormalizer { // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); // check that unnecessary "../" are removed - normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); normalizeTest("http://foo.com/aa/../", "http://foo.com/"); normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");