This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new a077ffc82 NUTCH-3087 BasicURLNormalizer to keep userinfo for protocols
which might require it
a077ffc82 is described below
commit a077ffc8254c32156461a11a91d777e985cb0db3
Author: Sebastian Nagel <[email protected]>
AuthorDate: Wed Dec 4 20:27:32 2024 +0100
NUTCH-3087 BasicURLNormalizer to keep userinfo for protocols which might
require it
- strip the userinfo from the authority only for HTTP and HTTPS
---
.../urlnormalizer/basic/BasicURLNormalizer.java | 27 ++++++++++++++++++----
.../basic/TestBasicURLNormalizer.java | 26 +++++++++++++++++++++
2 files changed, 49 insertions(+), 4 deletions(-)
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 48c4a666a..b350af9a1 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -242,9 +242,11 @@ public class BasicURLNormalizer implements URLNormalizer {
if (normalizePath) {
// check for unnecessary use of "/../", "/./", and "//"
if (changed) {
- url = new URL(protocol, host, port, file);
+ URL u = new URL(protocol, host, port, file);
+ file2 = getFileWithNormalizedPath(u);
+ } else {
+ file2 = getFileWithNormalizedPath(url);
}
- file2 = getFileWithNormalizedPath(url);
if (!file.equals(file2)) {
changed = true;
file = file2;
@@ -252,8 +254,25 @@ public class BasicURLNormalizer implements URLNormalizer {
}
if (changed) {
- url = new URL(protocol, host, port, file);
- urlString = url.toString();
+ if (protocol.equals("http") || protocol.equals("https")
+ || url.getUserInfo() == null) {
+ url = new URL(protocol, host, port, file);
+ urlString = url.toString();
+ } else {
+ /*
+ * NUTCH-3087 - userinfo is required for protocols with frequent
+ * authentication. Note: need to build the URL string directly, because
+ * there is no URL constructor which takes the userinfo as parameter.
+ */
+ StringBuilder sb = new StringBuilder();
+ sb.append(protocol).append("://").append(url.getUserInfo()).append('@')
+ .append(host);
+ if (port != -1) {
+ sb.append(':').append(port);
+ }
+ sb.append(file);
+ urlString = sb.toString();
+ }
}
return urlString;
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 8f3a1fdaa..d48e58510 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -216,6 +216,32 @@ public class TestBasicURLNormalizer {
normalizeTest("file:/var/www/html/////./bar/index.html",
"file:/var/www/html/bar/index.html");
}
+
+ @Test
+ public void testNUTCH3087() throws Exception {
+ // NUTCH-3087 userinfo to be kept in URLs with protocols usually requiring
+ // authentication
+ normalizeTest("ftp://user@ftp.example.org/path/file.txt",
+ "ftp://user@ftp.example.org/path/file.txt");
+ normalizeTest("ftp://user@ftp.example.org/",
+ "ftp://user@ftp.example.org/");
+ normalizeTest("ftp://user:password@ftp.example.org/path/file.txt",
+ "ftp://user:password@ftp.example.org/path/file.txt");
+ // But for HTTP(S) the userinfo should be removed.
+ // (example from https://en.wikipedia.org/wiki/Uniform_Resource_Identifier)
+ normalizeTest(
+ "https://john.doe@www.example.com:1234/forum/questions/?tag=networking&order=newest#top",
+ "https://www.example.com:1234/forum/questions/?tag=networking&order=newest");
+ // URLs with IPv6 address
+ normalizeTest("ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/../path/file.txt",
+ "ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/file.txt");
+ normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/",
+ "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/");
+ normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]:443/",
+ "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/");
+ normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/../to/index.html",
+ "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/to/index.html");
+ }
@Test
public void testCurlyBraces() throws Exception {