This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 2702c6f NUTCH-2868 urlnormalizer-protocol fails with StringIndexOutOfBoundsException when reading invalid line in configuration file - log invalid line and skip over it - more verbose logging which configuration file is read - add unit test to prove that invalid configuration lines are skipped new abb6927 Merge pull request #649 from sebastian-nagel/NUTCH-2868-urlnormalizer-protocol-exception-reading-config-file 2702c6f is described below commit 2702c6fc68e79a83ae393a820ab49040eecb1934 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Thu Jun 10 11:44:22 2021 +0200 NUTCH-2868 urlnormalizer-protocol fails with StringIndexOutOfBoundsException when reading invalid line in configuration file - log invalid line and skip over it - more verbose logging which configuration file is read - add unit test to prove that invalid configuration lines are skipped --- .../urlnormalizer-protocol/data/protocols.txt | 10 ++++++++++ .../protocol/ProtocolURLNormalizer.java | 23 ++++++++++++++++++++-- .../protocol/TestProtocolURLNormalizer.java | 9 +++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt b/src/plugin/urlnormalizer-protocol/data/protocols.txt index 7e49703..fc7d86c 100644 --- a/src/plugin/urlnormalizer-protocol/data/protocols.txt +++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt @@ -19,3 +19,13 @@ example.io https example.nl https *.example.com https + +# invalid input to verify whether this is handled nicely by the configuration file reader +# no host/domain + https +# no protocol +invalid-rule1.example.top +# more than two fields (skip rule) +invalid-rule2.example.top https http +# invalid protocol, not following RFC 1630 (skip rule) +invalid-rule3.example.top @mail diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java 
b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java index d747858..f2b475a 100644 --- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java @@ -27,6 +27,7 @@ import java.net.URL; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; +import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; @@ -69,9 +70,14 @@ public class ProtocolURLNormalizer implements URLNormalizer { // Map of domain suffixes and protocol to be used for all hosts below this domain private final Map<String,String> domainProtocolsMap = new HashMap<>(); + // Matcher for domain suffixes private SuffixStringMatcher domainMatcher = null; + // validator for protocols/schemes following RFC 1630 + private final static Pattern PROTOCOL_VALIDATOR = Pattern.compile( + "^[a-z](?:[a-z0-9$\\-_@.&!*\"'(),]|%[0-9a-f]{2})*$", + Pattern.CASE_INSENSITIVE); private synchronized void readConfiguration(Reader configReader) throws IOException { if (protocolsMap.size() > 0) { @@ -82,19 +88,31 @@ public class ProtocolURLNormalizer implements URLNormalizer { String line, host; String protocol; int delimiterIndex; + int lineNumber = 0; while ((line = reader.readLine()) != null) { + lineNumber++; + line = line.trim(); if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { - line = line.trim(); delimiterIndex = line.indexOf(" "); // try tabulator if (delimiterIndex == -1) { delimiterIndex = line.indexOf("\t"); } + if (delimiterIndex == -1) { + LOG.warn("Invalid line {}, no delimiter between <host/domain> and <protocol> found: {}", lineNumber, line); + continue; + } host = line.substring(0, delimiterIndex); protocol = line.substring(delimiterIndex + 1).trim(); + if (!PROTOCOL_VALIDATOR.matcher(protocol).matches()) { 
+ LOG.warn("Skipping rule with protocol not following RFC 1630 in line {}: {}", + lineNumber, line); + continue; + } + /* * dedup protocol values to reduce memory footprint of map: equal * strings are represented by the same string object @@ -172,13 +190,14 @@ public class ProtocolURLNormalizer implements URLNormalizer { if (stringRules != null && !stringRules.isEmpty()) { // takes precedence over files reader = new StringReader(stringRules); } else { - LOG.info("Reading {} rules file {}", pluginName, file); + LOG.info("Reading {} rules file {} from Java class path", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { if (reader == null) { Path path = new Path(file); FileSystem fs = path.getFileSystem(conf); + LOG.info("Reading {} rules file {}", pluginName, path.toUri()); reader = new InputStreamReader(fs.open(path)); } readConfiguration(reader); diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java index de9f77a..9775250 100644 --- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java @@ -83,5 +83,14 @@ public class TestProtocolURLNormalizer extends TestCase { normalizer.normalize( "http://www.subdomain.example.com:8080/path?q=uery", URLNormalizers.SCOPE_DEFAULT)); + + // No change because of invalid rules in protocols.txt + // (verify that these rules are skipped) + assertEquals("http://invalid-rule3.example.top/", normalizer + .normalize("http://invalid-rule3.example.top/", URLNormalizers.SCOPE_DEFAULT)); + assertEquals("http://invalid-rule2.example.top/", normalizer + .normalize("http://invalid-rule2.example.top/", URLNormalizers.SCOPE_DEFAULT)); + 
assertEquals("http://invalid-rule3.example.top/", normalizer + .normalize("http://invalid-rule3.example.top/", URLNormalizers.SCOPE_DEFAULT)); } }