Repository: nutch Updated Branches: refs/heads/master 6e051f2cc -> f351790d7
NUTCH-2337 urlnormalizer-basic to strip empty port, closes #160 - make sure that URLs which contain anything else than the host in the authority (incl. empty port) are marked as changed - always use root locale for case conversion Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f351790d Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f351790d Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f351790d Branch: refs/heads/master Commit: f351790d7f496561aeae5e214d1b33975ca34cf2 Parents: 6e051f2 Author: Sebastian Nagel <[email protected]> Authored: Fri Dec 9 11:45:54 2016 +0100 Committer: Sebastian Nagel <[email protected]> Committed: Tue Dec 13 14:03:16 2016 +0100 ---------------------------------------------------------------------- .../net/urlnormalizer/basic/BasicURLNormalizer.java | 11 ++++++++--- .../net/urlnormalizer/basic/TestBasicURLNormalizer.java | 5 ++++- 2 files changed, 12 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/f351790d/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 3e00346..5287d2d 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -112,10 +112,14 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { || "ftp".equals(protocol)) { if (host != null) { - String newHost = host.toLowerCase(); // lowercase host + String newHost = host.toLowerCase(Locale.ROOT); // lowercase host if (!host.equals(newHost)) { host = newHost; changed = true; + } else if (!url.getAuthority().equals(newHost)) { + // authority (http://<...>/) contains other elements (port, user, + // etc.) which will likely cause a change if left away + changed = true; } } @@ -247,7 +251,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { sb.append('%'); // Get this byte's hexadecimal representation - String hex = Integer.toHexString(b & 0xFF).toUpperCase(); + String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT); // Do we need to prepend a zero? if (hex.length() % 2 != 0 ) { @@ -275,7 +279,8 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { System.out.println("Scope: " + scope); } String line, normUrl; - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader( + new InputStreamReader(System.in, utf8)); while ((line = in.readLine()) != null) { try { normUrl = normalizer.normalize(line, scope); http://git-wip-us.apache.org/repos/asf/nutch/blob/f351790d/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 9a0f8c4..d62a3a9 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -116,6 +116,10 @@ public class TestBasicURLNormalizer { // check that port number is normalized normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); normalizeTest("http://foo.com:81/", "http://foo.com:81/"); + // check that empty port is removed + normalizeTest("http://example.com:/", "http://example.com/"); + normalizeTest("https://example.com:/foobar.html", + "https://example.com/foobar.html"); // check that null path is normalized normalizeTest("http://foo.com", "http://foo.com/"); @@ -127,7 +131,6 @@ public class TestBasicURLNormalizer { // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); // check that unnecessary "../" are removed - normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html"); normalizeTest("http://foo.com/aa/../", "http://foo.com/"); normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
