Repository: nutch Updated Branches: refs/heads/2.x 022ed5c03 -> 700857d16
NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority - check whether URL.getAuthority() returns null - recompose URLs without authority with empty authority/host Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/700857d1 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/700857d1 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/700857d1 Branch: refs/heads/2.x Commit: 700857d16c9e1517ddb9868ed41171d91e5c9116 Parents: 022ed5c Author: Sebastian Nagel <[email protected]> Authored: Wed Feb 1 11:51:04 2017 +0100 Committer: Sebastian Nagel <[email protected]> Committed: Wed Feb 1 11:51:04 2017 +0100 ---------------------------------------------------------------------- .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 5 ++++- .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index e17b19a..15a1de0 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -79,7 +79,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) { - if (host != null) { + if (host != null && url.getAuthority() != null) { String newHost = host.toLowerCase(Locale.ROOT); // lowercase host if (!host.equals(newHost)) { host = newHost; @@ -89,6 +89,9 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { // etc.) which will likely cause a change if left away changed = true; } + } else { + // no host or authority: recompose the URL from components + changed = true; } if (port == url.getDefaultPort()) { // uses default port http://git-wip-us.apache.org/repos/asf/nutch/blob/700857d1/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 006c1a3..1d5d99e 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -100,6 +100,12 @@ public class TestBasicURLNormalizer { "http://foo.com/aa/bb/foo.html"); normalizeTest("http://foo.com/aa?referer=http://bar.com", "http://foo.com/aa?referer=http://bar.com"); + // check for NPEs when normalizing URLs without host (authority) + normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt"); + normalizeTest("ftp:/", "ftp:/"); + normalizeTest("http:", "http:/"); + normalizeTest("http:////", "http:/"); + normalizeTest("http:///////", "http:/"); } private void normalizeTest(String weird, String normal) throws Exception {
