Repository: nutch
Updated Branches:
  refs/heads/master 2b93a66f0 -> 76aedcb78


NUTCH-2349 urlnormalizer-basic: NPE for URLs without authority
- check whether URL.getAuthority() returns null
- recompose URLs without authority with empty authority/host


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/1a718e0c
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/1a718e0c
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/1a718e0c

Branch: refs/heads/master
Commit: 1a718e0cc9a0c3811111e40f4bf8351e26f73522
Parents: f351790
Author: Sebastian Nagel <[email protected]>
Authored: Wed Jan 11 15:46:46 2017 +0100
Committer: Sebastian Nagel <[email protected]>
Committed: Wed Jan 11 16:20:33 2017 +0100

----------------------------------------------------------------------
 .../nutch/net/urlnormalizer/basic/BasicURLNormalizer.java      | 5 ++++-
 .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java  | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/1a718e0c/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 5287d2d..5c05636 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -111,7 +111,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
     if ("http".equals(protocol) || "https".equals(protocol)
         || "ftp".equals(protocol)) {
 
-      if (host != null) {
+      if (host != null && url.getAuthority() != null) {
         String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
         if (!host.equals(newHost)) {
           host = newHost;
@@ -121,6 +121,9 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer {
           // etc.) which will likely cause a change if left away
           changed = true;
         }
+      } else {
+        // no host or authority: recompose the URL from components
+        changed = true;
       }
 
       if (port == url.getDefaultPort()) { // uses default port

http://git-wip-us.apache.org/repos/asf/nutch/blob/1a718e0c/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index d62a3a9..2625ea3 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -164,6 +164,12 @@ public class TestBasicURLNormalizer {
        "http://foo.com/aa/bb/foo.html");
    normalizeTest("http://foo.com/aa?referer=http://bar.com",
        "http://foo.com/aa?referer=http://bar.com");
+    // check for NPEs when normalizing URLs without host (authority)
+    normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt");
+    normalizeTest("ftp:/", "ftp:/");
+    normalizeTest("http:", "http:/");
+    normalizeTest("http:////", "http:/");
+    normalizeTest("http:///////", "http:/");
   }
 
   private void normalizeTest(String weird, String normal) throws Exception {

Reply via email to