Repository: nutch
Updated Branches:
  refs/heads/master 2b93a66f0 -> 76aedcb78
NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority
- check whether URL.getAuthority() returns null
- recompose URLs without authority with empty authority/host

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/1a718e0c
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/1a718e0c
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/1a718e0c

Branch: refs/heads/master
Commit: 1a718e0cc9a0c3811111e40f4bf8351e26f73522
Parents: f351790
Author: Sebastian Nagel <[email protected]>
Authored: Wed Jan 11 15:46:46 2017 +0100
Committer: Sebastian Nagel <[email protected]>
Committed: Wed Jan 11 16:20:33 2017 +0100

----------------------------------------------------------------------
 .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java     | 5 ++++-
 .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/1a718e0c/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 5287d2d..5c05636 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -111,7 +111,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
 if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) {
- if (host != null) {
+ if (host != null && url.getAuthority() != null) {
String newHost = host.toLowerCase(Locale.ROOT); // lowercase host if (!host.equals(newHost)) { host = newHost; @@ -121,6 +121,9 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { // etc.) which will likely cause a change if left away changed = true; } + } else { + // no host or authority: recompose the URL from components + changed = true; } if (port == url.getDefaultPort()) { // uses default port http://git-wip-us.apache.org/repos/asf/nutch/blob/1a718e0c/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index d62a3a9..2625ea3 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -164,6 +164,12 @@ public class TestBasicURLNormalizer { "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa?referer=http://bar.com", "http://foo.com/aa?referer=http://bar.com"); + // check for NPEs when normalizing URLs without host (authority) + normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt"); + normalizeTest("ftp:/", "ftp:/"); + normalizeTest("http:", "http:/"); + normalizeTest("http:////", "http:/"); + normalizeTest("http:///////", "http:/"); } private void normalizeTest(String weird, String normal) throws Exception {
