This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 83011a08b98c55406583eb068d516ccb9f137266 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Wed May 13 14:39:15 2020 +0200 NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file - simplify selection of rule file (from property or attribute in plugin.xml) --- .../org/apache/nutch/parsefilter/regex/RegexParseFilter.java | 8 +------- .../java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java | 9 ++------- .../urlfilter/domainblacklist/DomainBlacklistURLFilter.java | 9 ++------- .../java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java | 7 +------ .../java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java | 7 +------ .../apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java | 7 +------ .../nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java | 7 +------ .../apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java | 7 +------ 8 files changed, 10 insertions(+), 51 deletions(-) diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index c84f27c..6e86fc6 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -120,14 +120,8 @@ public class RegexParseFilter implements HtmlParseFilter { } } - // domain file and attribute "file" take precedence if defined - String file = conf.get("parsefilter.regex.file"); + String file = conf.get("parsefilter.regex.file", attributeFile); String stringRules = conf.get("parsefilter.regex.rules"); - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index fac02af..f629262 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -59,7 +59,7 @@ import org.apache.nutch.util.domain.DomainSuffix; * such as lucene.apache.org and hadoop.apache.org. The third line would allow * only URLs from www.apache.org. There is no specific ordering to entries. The * entries are from more general to more specific with the more general - * overridding the more specific. + * overriding the more specific. * </p> * * The domain file defaults to domain-urlfilter.txt in the classpath but can be @@ -130,16 +130,11 @@ public class DomainURLFilter implements URLFilter { // 2. rule file name defined by `urlfilter.domain.file` // 3. rule file name defined in plugin.xml (`attributeFile`) String stringRules = conf.get("urlfilter.domain.rules"); - String file = conf.get("urlfilter.domain.file"); + String file = conf.get("urlfilter.domain.file", attributeFile); Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } diff --git a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java index 56b11e9..77c238b 100644 --- a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java +++ b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java @@ -59,7 +59,7 @@ import org.apache.nutch.util.domain.DomainSuffix; * such as lucene.apache.org and hadoop.apache.org. The third line would exclude * only URLs from www.apache.org. There is no specific ordering to entries. The * entries are from more general to more specific with the more general - * overridding the more specific. + * overriding the more specific. * </p> * * The domain file defaults to domainblacklist-urlfilter.txt in the classpath @@ -131,16 +131,11 @@ public class DomainBlacklistURLFilter implements URLFilter { // 2. rule file name defined by `urlfilter.domainblacklist.file` // 3. rule file name defined in plugin.xml (`attributeFile`) String stringRules = conf.get("urlfilter.domainblacklist.rules"); - String file = conf.get("urlfilter.domainblacklist.file"); + String file = conf.get("urlfilter.domainblacklist.file", attributeFile); Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java index eeef9cf..61c6f17 100644 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -142,17 +142,12 @@ public class PrefixURLFilter implements URLFilter { // 1. string rules defined by `urlfilter.domainblacklist.rules` // 2. rule file name defined by `urlfilter.domainblacklist.file` // 3. rule file name defined in plugin.xml (`attributeFile`) - String file = conf.get("urlfilter.prefix.file"); + String file = conf.get("urlfilter.prefix.file", attributeFile); String stringRules = conf.get("urlfilter.prefix.rules"); Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index 55382cc..3833f3c 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -276,17 +276,12 @@ public class SuffixURLFilter implements URLFilter { // 1. string rules defined by `urlfilter.domainblacklist.rules` // 2. rule file name defined by `urlfilter.domainblacklist.file` // 3. rule file name defined in plugin.xml (`attributeFile`) - String file = conf.get("urlfilter.suffix.file"); + String file = conf.get("urlfilter.suffix.file", attributeFile); String stringRules = conf.get("urlfilter.suffix.rules"); Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java index 4506c85..3a3c8a4 100644 --- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java +++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java @@ -118,13 +118,8 @@ public class HostURLNormalizer implements URLNormalizer { // 1. string rules defined by `urlnormalizer.hosts.rules` // 2. rule file name defined by `urlnormalizer.hosts.file"` // 3. rule file name defined in plugin.xml (`attributeFile`) - String file = conf.get("urlnormalizer.hosts.file"); + String file = conf.get("urlnormalizer.hosts.file", attributeFile); String stringRules = conf.get("urlnormalizer.hosts.rules"); - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java index f18ac65..f60c291 100644 --- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java @@ -124,13 +124,8 @@ public class ProtocolURLNormalizer implements URLNormalizer { // 1. string rules defined by `urlnormalizer.protocols.rules` // 2. rule file name defined by `urlnormalizer.protocols.file"` // 3. rule file name defined in plugin.xml (`attributeFile`) - String file = conf.get("urlnormalizer.protocols.file"); + String file = conf.get("urlnormalizer.protocols.file", attributeFile); String stringRules = conf.get("urlnormalizer.protocols.rules"); - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java index 6e8b7b9..2570427 100644 --- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java +++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java @@ -130,13 +130,8 @@ public class SlashURLNormalizer implements URLNormalizer { // 1. string rules defined by `urlnormalizer.slashes.rules` // 2. rule file name defined by `urlnormalizer.slashes.file"` // 3. rule file name defined in plugin.xml (`attributeFile`) - String file = conf.get("urlnormalizer.slashes.file"); + String file = conf.get("urlnormalizer.slashes.file", attributeFile); String stringRules = conf.get("urlnormalizer.slashes.rules"); - if (file != null) { - // take file - } else if (attributeFile != null) { - file = attributeFile; - } Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules);