This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 79f3c0ad54025c4d3f87c625faecc807be2a04b9 Author: Sebastian Nagel <[email protected]> AuthorDate: Fri Sep 27 22:51:29 2019 +0200 NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file - fix urlfilter-domain, urlfilter-domainblacklist, urlfilter-prefix and urlfilter-suffix - always prefer the configured rule file (urlfilter.domain.file, urlfilter.domainblacklist.file, urlfilter.prefix.file, urlfilter.suffix.file) over the file defined in plugin.xml - remove constructors taking rule file as argument (used only in unit tests and now obsolete because we can override the rule file via configuration) - update Java API doc comments --- .../nutch/parsefilter/regex/RegexParseFilter.java | 16 +---- .../parsefilter/regex/TestRegexParseFilter.java | 6 +- .../nutch/urlfilter/domain/DomainURLFilter.java | 66 +++++++------------- .../urlfilter/domain/TestDomainURLFilter.java | 6 +- .../domainblacklist/DomainBlacklistURLFilter.java | 71 ++++++++-------------- .../TestDomainBlacklistURLFilter.java | 4 +- .../nutch/urlfilter/prefix/PrefixURLFilter.java | 42 ++++++------- .../nutch/urlfilter/suffix/SuffixURLFilter.java | 41 ++++++------- .../net/urlnormalizer/host/HostURLNormalizer.java | 21 +++---- .../urlnormalizer/host/TestHostURLNormalizer.java | 3 +- .../protocol/ProtocolURLNormalizer.java | 23 +++---- .../protocol/TestProtocolURLNormalizer.java | 3 +- .../urlnormalizer/slash/SlashURLNormalizer.java | 25 +++----- .../slash/TestSlashURLNormalizer.java | 3 +- 14 files changed, 137 insertions(+), 193 deletions(-) diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index 3c43cf5..c84f27c 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ 
b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -51,20 +51,11 @@ public class RegexParseFilter implements HtmlParseFilter { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); private static String attributeFile = null; - private String regexFile = null; private Configuration conf; private static final Map<String,RegexRule> rules = new HashMap<>(); - public RegexParseFilter() { - //default constructor - } - - public RegexParseFilter(String regexFile) { - this.regexFile = regexFile; - } - public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { Parse parse = parseResult.get(content.getUrl()); String html = new String(content.getContent()); @@ -132,10 +123,9 @@ public class RegexParseFilter implements HtmlParseFilter { // domain file and attribute "file" take precedence if defined String file = conf.get("parsefilter.regex.file"); String stringRules = conf.get("parsefilter.regex.rules"); - if (regexFile != null) { - file = regexFile; - } - else if (attributeFile != null) { + if (file != null) { + // take file + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; diff --git a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java index 238d300..64fa7f6 100644 --- a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java +++ b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java @@ -35,7 +35,8 @@ public class TestRegexParseFilter extends TestCase { Configuration conf = NutchConfiguration.create(); String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; - RegexParseFilter filter = new RegexParseFilter(file); + conf.set("parsefilter.regex.file", file); + 
RegexParseFilter filter = new RegexParseFilter(); filter.setConf(conf); String url = "http://nutch.apache.org/"; @@ -56,7 +57,8 @@ public class TestRegexParseFilter extends TestCase { Configuration conf = NutchConfiguration.create(); String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; - RegexParseFilter filter = new RegexParseFilter(file); + conf.set("parsefilter.regex.file", file); + RegexParseFilter filter = new RegexParseFilter(); filter.setConf(conf); String url = "http://nutch.apache.org/"; diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index 9e2e2e7..fac02af 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -38,24 +38,26 @@ import org.apache.nutch.util.domain.DomainSuffix; /** * <p> * Filters URLs based on a file containing domain suffixes, domain names, and - * hostnames. Only a url that matches one of the suffixes, domains, or hosts + * hostnames. Only a URL that matches one of the suffixes, domains, or hosts * present in the file is allowed. * </p> * * <p> - * Urls are checked in order of domain suffix, domain name, and hostname against + * URLs are checked in order of domain suffix, domain name, and hostname against * entries in the domain file. The domain file would be setup as follows with * one entry per line: * * <pre> - * com apache.org www.apache.org + * com + * apache.org + * www.apache.org * </pre> * * <p> * The first line is an example of a filter that would allow all .com domains. - * The second line allows all urls from apache.org and all of its subdomains + * The second line allows all URLs from apache.org and all of its subdomains * such as lucene.apache.org and hadoop.apache.org. 
The third line would allow - * only urls from www.apache.org. There is no specific ordering to entries. The + * only URLs from www.apache.org. There is no specific ordering to entries. The * entries are from more general to more specific with the more general * overridding the more specific. * </p> @@ -72,7 +74,6 @@ import org.apache.nutch.util.domain.DomainSuffix; * </li> * </ul> * - * the attribute "file" has higher precedence if defined. */ public class DomainURLFilter implements URLFilter { @@ -82,7 +83,6 @@ public class DomainURLFilter implements URLFilter { // read in attribute "file" of this plugin. private static String attributeFile = null; private Configuration conf; - private String domainFile = null; private Set<String> domainSet = new LinkedHashSet<String>(); private void readConfiguration(Reader configReader) throws IOException { @@ -99,23 +99,6 @@ public class DomainURLFilter implements URLFilter { } /** - * Default constructor. - */ - public DomainURLFilter() { - - } - - /** - * Constructor that specifies the domain file to use. - * - * @param domainFile - * The domain file, overrides domain-urlfilter.text default. - */ - public DomainURLFilter(String domainFile) { - this.domainFile = domainFile; - } - - /** * Sets the configuration. 
*/ public void setConf(Configuration conf) { @@ -133,44 +116,41 @@ public class DomainURLFilter implements URLFilter { } } - // handle blank non empty input - if (attributeFile != null && attributeFile.trim().equals("")) { + if (attributeFile != null && attributeFile.trim().isEmpty()) { attributeFile = null; } if (attributeFile != null) { - if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); - } - } else { - if (LOG.isWarnEnabled()) { - LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); - } + LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile); } - // domain file and attribute "file" take precedence if defined - String file = conf.get("urlfilter.domain.file"); + // precedence hierarchy for definition of filter rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlfilter.domain.rules` + // 2. rule file name defined by `urlfilter.domain.file` + // 3. 
rule file name defined in plugin.xml (`attributeFile`) String stringRules = conf.get("urlfilter.domain.rules"); - if (domainFile != null) { - file = domainFile; - } else if (attributeFile != null) { - file = attributeFile; - } + String file = conf.get("urlfilter.domain.file"); Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + if (file != null) { + // take file + } else if (attributeFile != null) { + file = attributeFile; + } + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { if (reader == null) { + // read local file reader = new FileReader(file); } readConfiguration(reader); } catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } diff --git a/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java index 0be1e31..7878aa1 100644 --- a/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java @@ -31,7 +31,8 @@ public class TestDomainURLFilter { String domainFile = SAMPLES + SEPARATOR + "hosts.txt"; Configuration conf = NutchConfiguration.create(); - DomainURLFilter domainFilter = new DomainURLFilter(domainFile); + conf.set("urlfilter.domain.file", domainFile); + DomainURLFilter domainFilter = new DomainURLFilter(); domainFilter.setConf(conf); Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); @@ -50,7 +51,8 @@ public class TestDomainURLFilter { // https://issues.apache.org/jira/browse/NUTCH-2189 String domainFile = SAMPLES + SEPARATOR + 
"this-file-does-not-exist.txt"; Configuration conf = NutchConfiguration.create(); - DomainURLFilter domainFilter = new DomainURLFilter(domainFile); + conf.set("urlfilter.domain.file", domainFile); + DomainURLFilter domainFilter = new DomainURLFilter(); domainFilter.setConf(conf); Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org")); Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org")); diff --git a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java index 452f6d4..56b11e9 100644 --- a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java +++ b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java @@ -38,24 +38,26 @@ import org.apache.nutch.util.domain.DomainSuffix; /** * <p> * Filters URLs based on a file containing domain suffixes, domain names, and - * hostnames. A url that matches one of the suffixes, domains, or hosts present + * hostnames. A URL that matches one of the suffixes, domains, or hosts present * in the file is filtered out. * </p> * * <p> - * Urls are checked in order of domain suffix, domain name, and hostname against + * URLs are checked in order of domain suffix, domain name, and hostname against * entries in the domain file. The domain file would be setup as follows with * one entry per line: * * <pre> - * com apache.org www.apache.org + * com + * apache.org + * www.apache.org * </pre> * * <p> - * The first line is an example of a filter that would allow all .com domains. - * The second line allows all urls from apache.org and all of its subdomains - * such as lucene.apache.org and hadoop.apache.org. The third line would allow - * only urls from www.apache.org. 
There is no specific ordering to entries. The + * The first line is an example of a filter that would exclude all .com domains. + * The second line excludes all URLs from apache.org and all of its subdomains + * such as lucene.apache.org and hadoop.apache.org. The third line would exclude + * only URLs from www.apache.org. There is no specific ordering to entries. The * entries are from more general to more specific with the more general * overridding the more specific. * </p> @@ -72,7 +74,6 @@ import org.apache.nutch.util.domain.DomainSuffix; * </li> * </ul> * - * the attribute "file" has higher precedence if defined. */ public class DomainBlacklistURLFilter implements URLFilter { @@ -82,7 +83,6 @@ public class DomainBlacklistURLFilter implements URLFilter { // read in attribute "file" of this plugin. private static String attributeFile = null; private Configuration conf; - private String domainFile = null; private Set<String> domainSet = new LinkedHashSet<String>(); private void readConfiguration(Reader configReader) throws IOException { @@ -99,23 +99,6 @@ public class DomainBlacklistURLFilter implements URLFilter { } /** - * Default constructor. - */ - public DomainBlacklistURLFilter() { - - } - - /** - * Constructor that specifies the domain file to use. - * - * @param domainFile - * The domain file, overrides domainblacklist-urlfilter.text default. - */ - public DomainBlacklistURLFilter(String domainFile) { - this.domainFile = domainFile; - } - - /** * Sets the configuration. 
*/ public void setConf(Configuration conf) { @@ -133,44 +116,42 @@ public class DomainBlacklistURLFilter implements URLFilter { } } - // handle blank non empty input - if (attributeFile != null && attributeFile.trim().equals("")) { + if (attributeFile != null && attributeFile.trim().isEmpty()) { attributeFile = null; } if (attributeFile != null) { - if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); - } - } else { - if (LOG.isWarnEnabled()) { - LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " - + pluginName); - } + LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, + attributeFile); } - // domain file and attribute "file" take precedence if defined - String file = conf.get("urlfilter.domainblacklist.file"); + // precedence hierarchy for definition of filter rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlfilter.domainblacklist.rules` + // 2. rule file name defined by `urlfilter.domainblacklist.file` + // 3. 
rule file name defined in plugin.xml (`attributeFile`) String stringRules = conf.get("urlfilter.domainblacklist.rules"); - if (domainFile != null) { - file = domainFile; - } else if (attributeFile != null) { - file = attributeFile; - } + String file = conf.get("urlfilter.domainblacklist.file"); Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + if (file != null) { + // take file + } else if (attributeFile != null) { + file = attributeFile; + } + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { if (reader == null) { + // read local file reader = new FileReader(file); } readConfiguration(reader); } catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } diff --git a/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java index d253867..9ab207a 100644 --- a/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java +++ b/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java @@ -31,8 +31,8 @@ public class TestDomainBlacklistURLFilter { String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt"; Configuration conf = NutchConfiguration.create(); - DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter( - domainBlacklistFile); + conf.set("urlfilter.domainblacklist.file", domainBlacklistFile); + DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(); domainBlacklistFilter.setConf(conf); 
Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org")); Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org")); diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java index d1d5caa..eeef9cf 100644 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -39,9 +39,8 @@ import java.util.ArrayList; /** * Filters URLs based on a file of URL prefixes. The file is named by (1) - * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2) - * attribute "file" in plugin.xml of this plugin Attribute "file" has higher - * precedence if defined. + * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, or (2) + * the attribute "file" in plugin.xml of this plugin. * * <p> * The format of this file is one URL prefix per line. @@ -129,43 +128,44 @@ public class PrefixURLFilter implements URLFilter { break; } } - if (attributeFile != null && attributeFile.trim().equals("")) + + if (attributeFile != null && attributeFile.trim().isEmpty()) { attributeFile = null; + } + if (attributeFile != null) { - if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); - } - } else { - // if (LOG.isWarnEnabled()) { - // LOG.warn("Attribute \"file\" is not defined in plugin.xml for - // plugin "+pluginName); - // } + LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile); } + // precedence hierarchy for definition of filter rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlfilter.prefix.rules` + // 2. rule file name defined by `urlfilter.prefix.file` + // 3. 
rule file name defined in plugin.xml (`attributeFile`) String file = conf.get("urlfilter.prefix.file"); String stringRules = conf.get("urlfilter.prefix.rules"); - // attribute "file" takes precedence if defined - if (attributeFile != null) - file = attributeFile; Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + if (file != null) { + // take file + } else if (attributeFile != null) { + file = attributeFile; + } + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } if (reader == null) { + LOG.warn("Missing {} rule file '{}': all URLs will be rejected!", + pluginName, file); trie = new PrefixStringMatcher(new String[0]); } else { try { trie = readConfiguration(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.getMessage()); - } - // TODO [email protected]: throw Exception? Because broken api. - throw new RuntimeException(e.getMessage(), e); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } } diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index df5a5d8..55382cc 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -47,8 +47,7 @@ import java.net.MalformedURLException; * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li> * <li>attribute "file" in plugin.xml of this plugin</li> * </ol> - * Attribute "file" has higher precedence if defined. If the config file is - * missing, all URLs will be rejected. + * If the config file is missing, all URLs will be rejected. 
* * <p> * This filter can be configured to work in one of two modes: @@ -177,9 +176,7 @@ public class SuffixURLFilter implements URLFilter { // handle missing config file if (reader == null) { - if (LOG.isWarnEnabled()) { - LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!"); - } + LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!"); suffixes = new SuffixStringMatcher(new String[0]); modeAccept = false; ignoreCase = false; @@ -265,39 +262,39 @@ public class SuffixURLFilter implements URLFilter { break; } } - if (attributeFile != null && attributeFile.trim().equals("")) + + if (attributeFile != null && attributeFile.trim().isEmpty()) { attributeFile = null; + } + if (attributeFile != null) { - if (LOG.isInfoEnabled()) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); - } - } else { - // if (LOG.isWarnEnabled()) { - // LOG.warn("Attribute \"file\" is not defined in plugin.xml for - // plugin "+pluginName); - // } + LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile); } + // precedence hierarchy for definition of filter rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlfilter.suffix.rules` + // 2. rule file name defined by `urlfilter.suffix.file` + // 3. 
rule file name defined in plugin.xml (`attributeFile`) String file = conf.get("urlfilter.suffix.file"); String stringRules = conf.get("urlfilter.suffix.rules"); - // attribute "file" takes precedence if defined - if (attributeFile != null) - file = attributeFile; Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + if (file != null) { + // take file + } else if (attributeFile != null) { + file = attributeFile; + } + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { readConfiguration(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.getMessage()); - } - throw new RuntimeException(e.getMessage(), e); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java index ef83284..4506c85 100644 --- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java +++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java @@ -53,16 +53,8 @@ public class HostURLNormalizer implements URLNormalizer { .getLogger(MethodHandles.lookup().lookupClass()); private static String attributeFile = null; - private String hostsFile = null; private static final HashMap<String, String> hostsMap = new HashMap<String, String>(); - public HostURLNormalizer() { - } - - public HostURLNormalizer(String hostsFile) { - this.hostsFile = hostsFile; - } - private synchronized void readConfiguration(Reader configReader) throws IOException { if (hostsMap.size() > 0) { @@ -121,11 +113,15 @@ public class HostURLNormalizer implements URLNormalizer { } } - // domain file and attribute "file" take precedence if defined + // 
precedence hierarchy for definition of normalizer rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlnormalizer.hosts.rules` + // 2. rule file name defined by `urlnormalizer.hosts.file` + // 3. rule file name defined in plugin.xml (`attributeFile`) String file = conf.get("urlnormalizer.hosts.file"); String stringRules = conf.get("urlnormalizer.hosts.rules"); - if (hostsFile != null) { - file = hostsFile; + if (file != null) { + // take file } else if (attributeFile != null) { file = attributeFile; } @@ -133,6 +129,7 @@ public class HostURLNormalizer implements URLNormalizer { if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { @@ -141,7 +138,7 @@ public class HostURLNormalizer implements URLNormalizer { } readConfiguration(reader); } catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } diff --git a/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java index c9e1a2c..68cb50a 100644 --- a/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java +++ b/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java @@ -32,7 +32,8 @@ public class TestHostURLNormalizer { Configuration conf = NutchConfiguration.create(); String hostsFile = SAMPLES + SEPARATOR + "hosts.txt"; - HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile); + conf.set("urlnormalizer.hosts.file", hostsFile); + HostURLNormalizer normalizer = new HostURLNormalizer(); normalizer.setConf(conf); // Force www. 
sub domain when hitting link without sub domain diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java index 12ecbf4..f18ac65 100644 --- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java @@ -50,18 +50,11 @@ public class ProtocolURLNormalizer implements URLNormalizer { private static final String PROTOCOL_DELIMITER = "://"; private static String attributeFile = null; - private String protocolsFile = null; // We record a map of hosts and boolean, the boolean denotes whether the host should // have slashes after URL paths. True means slash, false means remove the slash private static final Map<String,String> protocolsMap = new HashMap<String,String>(); - public ProtocolURLNormalizer() {} - - public ProtocolURLNormalizer(String protocolsFile) { - this.protocolsFile = protocolsFile; - } - private synchronized void readConfiguration(Reader configReader) throws IOException { if (protocolsMap.size() > 0) { return; @@ -126,19 +119,23 @@ public class ProtocolURLNormalizer implements URLNormalizer { } } - // domain file and attribute "file" take precedence if defined + // precedence hierarchy for definition of normalizer rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlnormalizer.protocols.rules` + // 2. rule file name defined by `urlnormalizer.protocols.file` + // 3. 
rule file name defined in plugin.xml (`attributeFile`) String file = conf.get("urlnormalizer.protocols.file"); String stringRules = conf.get("urlnormalizer.protocols.rules"); - if (protocolsFile != null) { - file = protocolsFile; - } - else if (attributeFile != null) { + if (file != null) { + // take file + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { @@ -148,7 +145,7 @@ public class ProtocolURLNormalizer implements URLNormalizer { readConfiguration(reader); } catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java index 22005ce..1b9760b 100644 --- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java @@ -31,7 +31,8 @@ public class TestProtocolURLNormalizer extends TestCase { Configuration conf = NutchConfiguration.create(); String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt"; - ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile); + conf.set("urlnormalizer.protocols.file", protocolsFile); + ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(); normalizer.setConf(conf); // No change diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java 
b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java index 8d05f5e..6e8b7b9 100644 --- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java +++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java @@ -52,20 +52,11 @@ public class SlashURLNormalizer implements URLNormalizer { private static final String PROTOCOL_DELIMITER = "://"; private static String attributeFile = null; - private String slashesFile = null; // We record a map of hosts and boolean, the boolean denotes whether the host should // have slashes after URL paths. True means slash, false means remove the slash private static final Map<String,Boolean> slashesMap = new HashMap<>(); - public SlashURLNormalizer() { - //default constructor - } - - public SlashURLNormalizer(String slashesFile) { - this.slashesFile = slashesFile; - } - private synchronized void readConfiguration(Reader configReader) throws IOException { if (slashesMap.size() > 0) { return; @@ -134,19 +125,23 @@ public class SlashURLNormalizer implements URLNormalizer { } } - // domain file and attribute "file" take precedence if defined + // precedence hierarchy for definition of normalizer rules + // (first non-empty definition takes precedence): + // 1. string rules defined by `urlnormalizer.slashes.rules` + // 2. rule file name defined by `urlnormalizer.slashes.file` + // 3. 
rule file name defined in plugin.xml (`attributeFile`) String file = conf.get("urlnormalizer.slashes.file"); String stringRules = conf.get("urlnormalizer.slashes.rules"); - if (slashesFile != null) { - file = slashesFile; - } - else if (attributeFile != null) { + if (file != null) { + // take file + } else if (attributeFile != null) { file = attributeFile; } Reader reader = null; if (stringRules != null) { // takes precedence over files reader = new StringReader(stringRules); } else { + LOG.info("Reading {} rules file {}", pluginName, file); reader = conf.getConfResourceAsReader(file); } try { @@ -156,7 +151,7 @@ public class SlashURLNormalizer implements URLNormalizer { readConfiguration(reader); } catch (IOException e) { - LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + LOG.error("Error reading " + pluginName + " rule file " + file, e); } } diff --git a/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java index c5b3897..54af2bf 100644 --- a/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java +++ b/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java @@ -31,7 +31,8 @@ public class TestSlashURLNormalizer extends TestCase { Configuration conf = NutchConfiguration.create(); String slashesFile = SAMPLES + SEPARATOR + "slashes.txt"; - SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile); + conf.set("urlnormalizer.slashes.file", slashesFile); + SlashURLNormalizer normalizer = new SlashURLNormalizer(); normalizer.setConf(conf); // No change
