[nutch] 09/35: NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file

snagel Sun, 16 Aug 2020 12:04:43 -0700

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


commit 79f3c0ad54025c4d3f87c625faecc807be2a04b9
Author: Sebastian Nagel <[email protected]>
AuthorDate: Fri Sep 27 22:51:29 2019 +0200

    NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file
    
    - fix urlfilter-domain, urlfilter-domainblacklist, urlfilter-prefix
      and urlfilter-suffix
    
    - always prefer the configured rule file (urlfilter.domain.file,
      urlfilter.domainblacklist.file, urlfilter.prefix.file,
      urlfilter.suffix.file) over the file defined in plugin.xml
    
    - remove constructors taking rule file as argument
      (used only in unit tests and now obsolete because we can override the
       rule file via configuration)
    
    - update Java API doc comments
---
 .../nutch/parsefilter/regex/RegexParseFilter.java  | 16 +----
 .../parsefilter/regex/TestRegexParseFilter.java    |  6 +-
 .../nutch/urlfilter/domain/DomainURLFilter.java    | 66 +++++++-------------
 .../urlfilter/domain/TestDomainURLFilter.java      |  6 +-
 .../domainblacklist/DomainBlacklistURLFilter.java  | 71 ++++++++--------------
 .../TestDomainBlacklistURLFilter.java              |  4 +-
 .../nutch/urlfilter/prefix/PrefixURLFilter.java    | 42 ++++++-------
 .../nutch/urlfilter/suffix/SuffixURLFilter.java    | 41 ++++++-------
 .../net/urlnormalizer/host/HostURLNormalizer.java  | 21 +++----
 .../urlnormalizer/host/TestHostURLNormalizer.java  |  3 +-
 .../protocol/ProtocolURLNormalizer.java            | 23 +++----
 .../protocol/TestProtocolURLNormalizer.java        |  3 +-
 .../urlnormalizer/slash/SlashURLNormalizer.java    | 25 +++-----
 .../slash/TestSlashURLNormalizer.java              |  3 +-
 14 files changed, 137 insertions(+), 193 deletions(-)

diff --git 
a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
 
b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index 3c43cf5..c84f27c 100644
--- 
a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ 
b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -51,20 +51,11 @@ public class RegexParseFilter implements HtmlParseFilter {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
   private static String attributeFile = null;
-  private String regexFile = null;
   
   private Configuration conf;
   
   private static final Map<String,RegexRule> rules = new HashMap<>();
   
-  public RegexParseFilter() {
-    //default constructor
-  }
-  
-  public RegexParseFilter(String regexFile) {
-    this.regexFile = regexFile;
-  }
-
   public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
     Parse parse = parseResult.get(content.getUrl());
     String html = new String(content.getContent());
@@ -132,10 +123,9 @@ public class RegexParseFilter implements HtmlParseFilter {
     // domain file and attribute "file" take precedence if defined
     String file = conf.get("parsefilter.regex.file");
     String stringRules = conf.get("parsefilter.regex.rules");
-    if (regexFile != null) {
-      file = regexFile;
-    }
-    else if (attributeFile != null) {
+    if (file != null) {
+      // take file
+    } else if (attributeFile != null) {
       file = attributeFile;
     }
     Reader reader = null;
diff --git 
a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
 
b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
index 238d300..64fa7f6 100644
--- 
a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
+++ 
b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
@@ -35,7 +35,8 @@ public class TestRegexParseFilter extends TestCase {
     Configuration conf = NutchConfiguration.create();
 
     String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
-    RegexParseFilter filter = new RegexParseFilter(file);
+    conf.set("parsefilter.regex.file", file);
+    RegexParseFilter filter = new RegexParseFilter();
     filter.setConf(conf);
 
     String url = "http://nutch.apache.org/";
@@ -56,7 +57,8 @@ public class TestRegexParseFilter extends TestCase {
     Configuration conf = NutchConfiguration.create();
 
     String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
-    RegexParseFilter filter = new RegexParseFilter(file);
+    conf.set("parsefilter.regex.file", file);
+    RegexParseFilter filter = new RegexParseFilter();
     filter.setConf(conf);
 
     String url = "http://nutch.apache.org/";
diff --git 
a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
 
b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index 9e2e2e7..fac02af 100644
--- 
a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ 
b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -38,24 +38,26 @@ import org.apache.nutch.util.domain.DomainSuffix;
 /**
  * <p>
  * Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * hostnames. Only a URL that matches one of the suffixes, domains, or hosts
  * present in the file is allowed.
  * </p>
  * 
  * <p>
- * Urls are checked in order of domain suffix, domain name, and hostname 
against
+ * URLs are checked in order of domain suffix, domain name, and hostname 
against
  * entries in the domain file. The domain file would be setup as follows with
  * one entry per line:
  * 
  * <pre>
- * com apache.org www.apache.org
+ * com
+ * apache.org
+ * www.apache.org
  * </pre>
  * 
  * <p>
  * The first line is an example of a filter that would allow all .com domains.
- * The second line allows all urls from apache.org and all of its subdomains
+ * The second line allows all URLs from apache.org and all of its subdomains
  * such as lucene.apache.org and hadoop.apache.org. The third line would allow
- * only urls from www.apache.org. There is no specific ordering to entries. The
+ * only URLs from www.apache.org. There is no specific ordering to entries. The
  * entries are from more general to more specific with the more general
  * overridding the more specific.
  * </p>
@@ -72,7 +74,6 @@ import org.apache.nutch.util.domain.DomainSuffix;
  * </li>
  * </ul>
  * 
- * the attribute "file" has higher precedence if defined.
  */
 public class DomainURLFilter implements URLFilter {
 
@@ -82,7 +83,6 @@ public class DomainURLFilter implements URLFilter {
   // read in attribute "file" of this plugin.
   private static String attributeFile = null;
   private Configuration conf;
-  private String domainFile = null;
   private Set<String> domainSet = new LinkedHashSet<String>();
 
   private void readConfiguration(Reader configReader) throws IOException {
@@ -99,23 +99,6 @@ public class DomainURLFilter implements URLFilter {
   }
 
   /**
-   * Default constructor.
-   */
-  public DomainURLFilter() {
-
-  }
-
-  /**
-   * Constructor that specifies the domain file to use.
-   * 
-   * @param domainFile
-   *          The domain file, overrides domain-urlfilter.text default.
-   */
-  public DomainURLFilter(String domainFile) {
-    this.domainFile = domainFile;
-  }
-
-  /**
    * Sets the configuration.
    */
   public void setConf(Configuration conf) {
@@ -133,44 +116,41 @@ public class DomainURLFilter implements URLFilter {
       }
     }
 
-    // handle blank non empty input
-    if (attributeFile != null && attributeFile.trim().equals("")) {
+    if (attributeFile != null && attributeFile.trim().isEmpty()) {
       attributeFile = null;
     }
 
     if (attributeFile != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-            + " as " + attributeFile);
-      }
-    } else {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
-            + pluginName);
-      }
+      LOG.info("Attribute \"file\" is defined for plugin {} as {}", 
pluginName, attributeFile);
     }
 
-    // domain file and attribute "file" take precedence if defined
-    String file = conf.get("urlfilter.domain.file");
+    // precedence hierarchy for definition of filter rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlfilter.domain.rules`
+    // 2. rule file name defined by `urlfilter.domain.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String stringRules = conf.get("urlfilter.domain.rules");
-    if (domainFile != null) {
-      file = domainFile;
-    } else if (attributeFile != null) {
-      file = attributeFile;
-    }
+    String file = conf.get("urlfilter.domain.file");
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      if (file != null) {
+        // take file
+      } else if (attributeFile != null) {
+        file = attributeFile;
+      }
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
     try {
       if (reader == null) {
+        // read local file
         reader = new FileReader(file);
       }
       readConfiguration(reader);
     } catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
 
diff --git 
a/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
 
b/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
index 0be1e31..7878aa1 100644
--- 
a/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
+++ 
b/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
@@ -31,7 +31,8 @@ public class TestDomainURLFilter {
 
     String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
     Configuration conf = NutchConfiguration.create();
-    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    conf.set("urlfilter.domain.file", domainFile);
+    DomainURLFilter domainFilter = new DomainURLFilter();
     domainFilter.setConf(conf);
     Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
     Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
@@ -50,7 +51,8 @@ public class TestDomainURLFilter {
     // https://issues.apache.org/jira/browse/NUTCH-2189
     String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
     Configuration conf = NutchConfiguration.create();
-    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    conf.set("urlfilter.domain.file", domainFile);
+    DomainURLFilter domainFilter = new DomainURLFilter();
     domainFilter.setConf(conf);
     Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
     Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
diff --git 
a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
 
b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
index 452f6d4..56b11e9 100644
--- 
a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
+++ 
b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -38,24 +38,26 @@ import org.apache.nutch.util.domain.DomainSuffix;
 /**
  * <p>
  * Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * hostnames. A URL that matches one of the suffixes, domains, or hosts present
  * in the file is filtered out.
  * </p>
  * 
  * <p>
- * Urls are checked in order of domain suffix, domain name, and hostname 
against
+ * URLs are checked in order of domain suffix, domain name, and hostname 
against
  * entries in the domain file. The domain file would be setup as follows with
  * one entry per line:
  * 
  * <pre>
- * com apache.org www.apache.org
+ * com
+ * apache.org
+ * www.apache.org
  * </pre>
  * 
  * <p>
- * The first line is an example of a filter that would allow all .com domains.
- * The second line allows all urls from apache.org and all of its subdomains
- * such as lucene.apache.org and hadoop.apache.org. The third line would allow
- * only urls from www.apache.org. There is no specific ordering to entries. The
+ * The first line is an example of a filter that would exclude all .com 
domains.
+ * The second line excludes all URLs from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would 
exclude
+ * only URLs from www.apache.org. There is no specific ordering to entries. The
  * entries are from more general to more specific with the more general
  * overridding the more specific.
  * </p>
@@ -72,7 +74,6 @@ import org.apache.nutch.util.domain.DomainSuffix;
  * </li>
  * </ul>
  * 
- * the attribute "file" has higher precedence if defined.
  */
 public class DomainBlacklistURLFilter implements URLFilter {
 
@@ -82,7 +83,6 @@ public class DomainBlacklistURLFilter implements URLFilter {
   // read in attribute "file" of this plugin.
   private static String attributeFile = null;
   private Configuration conf;
-  private String domainFile = null;
   private Set<String> domainSet = new LinkedHashSet<String>();
 
   private void readConfiguration(Reader configReader) throws IOException {
@@ -99,23 +99,6 @@ public class DomainBlacklistURLFilter implements URLFilter {
   }
 
   /**
-   * Default constructor.
-   */
-  public DomainBlacklistURLFilter() {
-
-  }
-
-  /**
-   * Constructor that specifies the domain file to use.
-   * 
-   * @param domainFile
-   *          The domain file, overrides domainblacklist-urlfilter.text 
default.
-   */
-  public DomainBlacklistURLFilter(String domainFile) {
-    this.domainFile = domainFile;
-  }
-
-  /**
    * Sets the configuration.
    */
   public void setConf(Configuration conf) {
@@ -133,44 +116,42 @@ public class DomainBlacklistURLFilter implements 
URLFilter {
       }
     }
 
-    // handle blank non empty input
-    if (attributeFile != null && attributeFile.trim().equals("")) {
+    if (attributeFile != null && attributeFile.trim().isEmpty()) {
       attributeFile = null;
     }
 
     if (attributeFile != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-            + " as " + attributeFile);
-      }
-    } else {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
-            + pluginName);
-      }
+      LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
+          attributeFile);
     }
 
-    // domain file and attribute "file" take precedence if defined
-    String file = conf.get("urlfilter.domainblacklist.file");
+    // precedence hierarchy for definition of filter rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlfilter.domainblacklist.rules`
+    // 2. rule file name defined by `urlfilter.domainblacklist.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String stringRules = conf.get("urlfilter.domainblacklist.rules");
-    if (domainFile != null) {
-      file = domainFile;
-    } else if (attributeFile != null) {
-      file = attributeFile;
-    }
+    String file = conf.get("urlfilter.domainblacklist.file");
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      if (file != null) {
+        // take file
+      } else if (attributeFile != null) {
+        file = attributeFile;
+      }
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
     try {
       if (reader == null) {
+        // read local file
         reader = new FileReader(file);
       }
       readConfiguration(reader);
     } catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
 
diff --git 
a/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
 
b/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
index d253867..9ab207a 100644
--- 
a/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
+++ 
b/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
@@ -31,8 +31,8 @@ public class TestDomainBlacklistURLFilter {
 
     String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
     Configuration conf = NutchConfiguration.create();
-    DomainBlacklistURLFilter domainBlacklistFilter = new 
DomainBlacklistURLFilter(
-        domainBlacklistFile);
+    conf.set("urlfilter.domainblacklist.file", domainBlacklistFile);
+    DomainBlacklistURLFilter domainBlacklistFilter = new 
DomainBlacklistURLFilter();
     domainBlacklistFilter.setConf(conf);
     
Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
     
Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
diff --git 
a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
 
b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index d1d5caa..eeef9cf 100644
--- 
a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ 
b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -39,9 +39,8 @@ import java.util.ArrayList;
 
 /**
  * Filters URLs based on a file of URL prefixes. The file is named by (1)
- * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
- * attribute "file" in plugin.xml of this plugin Attribute "file" has higher
- * precedence if defined.
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, or (2)
+ * the attribute "file" in plugin.xml of this plugin.
  * 
  * <p>
  * The format of this file is one URL prefix per line.
@@ -129,43 +128,44 @@ public class PrefixURLFilter implements URLFilter {
         break;
       }
     }
-    if (attributeFile != null && attributeFile.trim().equals(""))
+
+    if (attributeFile != null && attributeFile.trim().isEmpty()) {
       attributeFile = null;
+    }
+
     if (attributeFile != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-            + " as " + attributeFile);
-      }
-    } else {
-      // if (LOG.isWarnEnabled()) {
-      // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
-      // plugin "+pluginName);
-      // }
+      LOG.info("Attribute \"file\" is defined for plugin {} as {}", 
pluginName, attributeFile);
     }
 
+    // precedence hierarchy for definition of filter rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlfilter.prefix.rules`
+    // 2. rule file name defined by `urlfilter.prefix.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String file = conf.get("urlfilter.prefix.file");
     String stringRules = conf.get("urlfilter.prefix.rules");
-    // attribute "file" takes precedence if defined
-    if (attributeFile != null)
-      file = attributeFile;
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      if (file != null) {
+        // take file
+      } else if (attributeFile != null) {
+        file = attributeFile;
+      }
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
 
     if (reader == null) {
+      LOG.warn("Missing {} rule file '{}': all URLs will be rejected!",
+          pluginName, file);
       trie = new PrefixStringMatcher(new String[0]);
     } else {
       try {
         trie = readConfiguration(reader);
       } catch (IOException e) {
-        if (LOG.isErrorEnabled()) {
-          LOG.error(e.getMessage());
-        }
-        // TODO [email protected]: throw Exception? Because broken api.
-        throw new RuntimeException(e.getMessage(), e);
+        LOG.error("Error reading " + pluginName + " rule file " + file, e);
       }
     }
   }
diff --git 
a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
 
b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index df5a5d8..55382cc 100644
--- 
a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ 
b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -47,8 +47,7 @@ import java.net.MalformedURLException;
  * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li>
  * <li>attribute "file" in plugin.xml of this plugin</li>
  * </ol>
- * Attribute "file" has higher precedence if defined. If the config file is
- * missing, all URLs will be rejected.
+ * If the config file is missing, all URLs will be rejected.
  * 
  * <p>
  * This filter can be configured to work in one of two modes:
@@ -177,9 +176,7 @@ public class SuffixURLFilter implements URLFilter {
 
     // handle missing config file
     if (reader == null) {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
-      }
+      LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
       suffixes = new SuffixStringMatcher(new String[0]);
       modeAccept = false;
       ignoreCase = false;
@@ -265,39 +262,39 @@ public class SuffixURLFilter implements URLFilter {
         break;
       }
     }
-    if (attributeFile != null && attributeFile.trim().equals(""))
+
+    if (attributeFile != null && attributeFile.trim().isEmpty()) {
       attributeFile = null;
+    }
+
     if (attributeFile != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-            + " as " + attributeFile);
-      }
-    } else {
-      // if (LOG.isWarnEnabled()) {
-      // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
-      // plugin "+pluginName);
-      // }
+      LOG.info("Attribute \"file\" is defined for plugin {} as {}", 
pluginName, attributeFile);
     }
 
+    // precedence hierarchy for definition of filter rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlfilter.suffix.rules`
+    // 2. rule file name defined by `urlfilter.suffix.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String file = conf.get("urlfilter.suffix.file");
     String stringRules = conf.get("urlfilter.suffix.rules");
-    // attribute "file" takes precedence if defined
-    if (attributeFile != null)
-      file = attributeFile;
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      if (file != null) {
+        // take file
+      } else if (attributeFile != null) {
+        file = attributeFile;
+      }
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
 
     try {
       readConfiguration(reader);
     } catch (IOException e) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error(e.getMessage());
-      }
-      throw new RuntimeException(e.getMessage(), e);
+      LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
 
diff --git 
a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
 
b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index ef83284..4506c85 100644
--- 
a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -53,16 +53,8 @@ public class HostURLNormalizer implements URLNormalizer {
       .getLogger(MethodHandles.lookup().lookupClass());
 
   private static String attributeFile = null;
-  private String hostsFile = null;
   private static final HashMap<String, String> hostsMap = new HashMap<String, 
String>();
 
-  public HostURLNormalizer() {
-  }
-
-  public HostURLNormalizer(String hostsFile) {
-    this.hostsFile = hostsFile;
-  }
-
   private synchronized void readConfiguration(Reader configReader)
       throws IOException {
     if (hostsMap.size() > 0) {
@@ -121,11 +113,15 @@ public class HostURLNormalizer implements URLNormalizer {
       }
     }
 
-    // domain file and attribute "file" take precedence if defined
+    // precedence hierarchy for definition of normalizer rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlnormalizer.hosts.rules`
+    // 2. rule file name defined by `urlnormalizer.hosts.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String file = conf.get("urlnormalizer.hosts.file");
     String stringRules = conf.get("urlnormalizer.hosts.rules");
-    if (hostsFile != null) {
-      file = hostsFile;
+    if (file != null) {
+      // take file
     } else if (attributeFile != null) {
       file = attributeFile;
     }
@@ -133,6 +129,7 @@ public class HostURLNormalizer implements URLNormalizer {
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
     try {
@@ -141,7 +138,7 @@ public class HostURLNormalizer implements URLNormalizer {
       }
       readConfiguration(reader);
     } catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
 
diff --git 
a/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
 
b/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
index c9e1a2c..68cb50a 100644
--- 
a/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
@@ -32,7 +32,8 @@ public class TestHostURLNormalizer {
     Configuration conf = NutchConfiguration.create();
 
     String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
-    HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile);
+    conf.set("urlnormalizer.hosts.file", hostsFile);
+    HostURLNormalizer normalizer = new HostURLNormalizer();
     normalizer.setConf(conf);
 
     // Force www. sub domain when hitting link without sub domain
diff --git 
a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
 
b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index 12ecbf4..f18ac65 100644
--- 
a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -50,18 +50,11 @@ public class ProtocolURLNormalizer implements URLNormalizer 
{
   private static final String PROTOCOL_DELIMITER = "://";
 
   private static String attributeFile = null;
-  private String protocolsFile = null;
   
   // We record a map of hosts and boolean, the boolean denotes whether the 
host should
   // have slashes after URL paths. True means slash, false means remove the 
slash
   private static final Map<String,String> protocolsMap = new 
HashMap<String,String>();
 
-  public ProtocolURLNormalizer() {}
-
-  public ProtocolURLNormalizer(String protocolsFile) {
-    this.protocolsFile = protocolsFile;
-  }
-
   private synchronized void readConfiguration(Reader configReader) throws 
IOException {
     if (protocolsMap.size() > 0) {
       return;
@@ -126,19 +119,23 @@ public class ProtocolURLNormalizer implements 
URLNormalizer {
       }
     }
 
-    // domain file and attribute "file" take precedence if defined
+    // precedence hierarchy for definition of normalizer rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlnormalizer.protocols.rules`
+    // 2. rule file name defined by `urlnormalizer.protocols.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String file = conf.get("urlnormalizer.protocols.file");
     String stringRules = conf.get("urlnormalizer.protocols.rules");
-    if (protocolsFile != null) {
-      file = protocolsFile;
-    }
-    else if (attributeFile != null) {
+    if (file != null) {
+      // take file
+    } else if (attributeFile != null) {
       file = attributeFile;
     }
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
     try {
@@ -148,7 +145,7 @@ public class ProtocolURLNormalizer implements URLNormalizer 
{
       readConfiguration(reader);
     }
     catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
   
diff --git 
a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
 
b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
index 22005ce..1b9760b 100644
--- 
a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -31,7 +31,8 @@ public class TestProtocolURLNormalizer extends TestCase {
     Configuration conf = NutchConfiguration.create();
 
     String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt";
-    ProtocolURLNormalizer normalizer = new 
ProtocolURLNormalizer(protocolsFile);
+    conf.set("urlnormalizer.protocols.file", protocolsFile);
+    ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer();
     normalizer.setConf(conf);
 
     // No change
diff --git 
a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
 
b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index 8d05f5e..6e8b7b9 100644
--- 
a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -52,20 +52,11 @@ public class SlashURLNormalizer implements URLNormalizer {
   private static final String PROTOCOL_DELIMITER = "://";
 
   private static String attributeFile = null;
-  private String slashesFile = null;
   
   // We record a map of hosts and boolean, the boolean denotes whether the 
host should
   // have slashes after URL paths. True means slash, false means remove the 
slash
   private static final Map<String,Boolean> slashesMap = new HashMap<>();
 
-  public SlashURLNormalizer() {
-    //default constructor
-  }
-
-  public SlashURLNormalizer(String slashesFile) {
-    this.slashesFile = slashesFile;
-  }
-
   private synchronized void readConfiguration(Reader configReader) throws 
IOException {
     if (slashesMap.size() > 0) {
       return;
@@ -134,19 +125,23 @@ public class SlashURLNormalizer implements URLNormalizer {
       }
     }
 
-    // domain file and attribute "file" take precedence if defined
+    // precedence hierarchy for definition of normalizer rules
+    // (first non-empty definition takes precedence):
+    // 1. string rules defined by `urlnormalizer.slashes.rules`
+    // 2. rule file name defined by `urlnormalizer.slashes.file`
+    // 3. rule file name defined in plugin.xml (`attributeFile`)
     String file = conf.get("urlnormalizer.slashes.file");
     String stringRules = conf.get("urlnormalizer.slashes.rules");
-    if (slashesFile != null) {
-      file = slashesFile;
-    }
-    else if (attributeFile != null) {
+    if (file != null) {
+      // take file
+    } else if (attributeFile != null) {
       file = attributeFile;
     }
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
+      LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
     try {
@@ -156,7 +151,7 @@ public class SlashURLNormalizer implements URLNormalizer {
       readConfiguration(reader);
     }
     catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
   
diff --git 
a/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
 
b/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
index c5b3897..54af2bf 100644
--- 
a/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
@@ -31,7 +31,8 @@ public class TestSlashURLNormalizer extends TestCase {
     Configuration conf = NutchConfiguration.create();
 
     String slashesFile = SAMPLES + SEPARATOR + "slashes.txt";
-    SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile);
+    conf.set("urlnormalizer.slashes.file", slashesFile);
+    SlashURLNormalizer normalizer = new SlashURLNormalizer();
     normalizer.setConf(conf);
 
     // No change

[nutch] 09/35: NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file

Reply via email to