This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new a077ffc82 NUTCH-3087 BasicURLNormalizer to keep userinfo for protocols which might require it
a077ffc82 is described below

commit a077ffc8254c32156461a11a91d777e985cb0db3
Author: Sebastian Nagel <[email protected]>
AuthorDate: Wed Dec 4 20:27:32 2024 +0100

    NUTCH-3087 BasicURLNormalizer to keep userinfo for protocols which might require it
    
    - strip the userinfo from the authority only for HTTP and HTTPS
---
 .../urlnormalizer/basic/BasicURLNormalizer.java    | 27 ++++++++++++++++++----
 .../basic/TestBasicURLNormalizer.java              | 26 +++++++++++++++++++++
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 48c4a666a..b350af9a1 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -242,9 +242,11 @@ public class BasicURLNormalizer implements URLNormalizer {
     if (normalizePath) {
       // check for unnecessary use of "/../", "/./", and "//"
       if (changed) {
-        url = new URL(protocol, host, port, file);
+        URL u = new URL(protocol, host, port, file);
+        file2 = getFileWithNormalizedPath(u);
+      } else {
+        file2 = getFileWithNormalizedPath(url);
       }
-      file2 = getFileWithNormalizedPath(url);
       if (!file.equals(file2)) {
         changed = true;
         file = file2;
@@ -252,8 +254,25 @@ public class BasicURLNormalizer implements URLNormalizer {
     }
 
     if (changed) {
-      url = new URL(protocol, host, port, file);
-      urlString = url.toString();
+      if (protocol.equals("http") || protocol.equals("https")
+          || url.getUserInfo() == null) {
+        url = new URL(protocol, host, port, file);
+        urlString = url.toString();
+      } else {
+        /*
+         * NUTCH-3087 - userinfo is required for protocols with frequent
+         * authentication. Note: need to build the URL string directly, because
+         * there is no URL constructor which takes the userinfo as parameter.
+         */
+        StringBuilder sb = new StringBuilder();
+        sb.append(protocol).append("://").append(url.getUserInfo()).append('@')
+            .append(host);
+        if (port != -1) {
+          sb.append(':').append(port);
+        }
+        sb.append(file);
+        urlString = sb.toString();
+      }
     }
 
     return urlString;
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 8f3a1fdaa..d48e58510 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -216,6 +216,32 @@ public class TestBasicURLNormalizer {
     normalizeTest("file:/var/www/html/////./bar/index.html",
         "file:/var/www/html/bar/index.html");
   }
+
+  @Test
+  public void testNUTCH3087() throws Exception {
+    // NUTCH-3087 userinfo to be kept in URLs with protocols usually requiring
+    // authentication
+    normalizeTest("ftp://[email protected]/path/file.txt",
+        "ftp://[email protected]/path/file.txt");
+    normalizeTest("ftp://[email protected]/",
+        "ftp://[email protected]/");
+    normalizeTest("ftp://user:[email protected]/path/file.txt",
+        "ftp://user:[email protected]/path/file.txt");
+    // But for HTTP(S) the userinfo should be removed.
+    // (example from https://en.wikipedia.org/wiki/Uniform_Resource_Identifier)
+    normalizeTest(
+        "https://john.doe@www.example.com:1234/forum/questions/?tag=networking&order=newest#top",
+        "https://www.example.com:1234/forum/questions/?tag=networking&order=newest");
+    // URLs with IPv6 address
+    normalizeTest("ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/../path/file.txt",
+        "ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/file.txt");
+    normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/",
+        "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/");
+    normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]:443/",
+        "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/");
+    normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/../to/index.html",
+        "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/to/index.html");
+  }
   
   @Test
   public void testCurlyBraces() throws Exception {

Reply via email to