This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 83011a08b98c55406583eb068d516ccb9f137266
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Wed May 13 14:39:15 2020 +0200

    NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file
    
    - simplify selection of rule file (from property or attribute in plugin.xml)
---
 .../org/apache/nutch/parsefilter/regex/RegexParseFilter.java     | 8 +-------
 .../java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java  | 9 ++-------
 .../urlfilter/domainblacklist/DomainBlacklistURLFilter.java      | 9 ++-------
 .../java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java  | 7 +------
 .../java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java  | 7 +------
 .../apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java   | 7 +------
 .../nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java  | 7 +------
 .../apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java | 7 +------
 8 files changed, 10 insertions(+), 51 deletions(-)

diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index c84f27c..6e86fc6 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -120,14 +120,8 @@ public class RegexParseFilter implements HtmlParseFilter {
       }
     }
 
-    // domain file and attribute "file" take precedence if defined
-    String file = conf.get("parsefilter.regex.file");
+    String file = conf.get("parsefilter.regex.file", attributeFile);
     String stringRules = conf.get("parsefilter.regex.rules");
-    if (file != null) {
-      // take file
-    } else if (attributeFile != null) {
-      file = attributeFile;
-    }
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index fac02af..f629262 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -59,7 +59,7 @@ import org.apache.nutch.util.domain.DomainSuffix;
  * such as lucene.apache.org and hadoop.apache.org. The third line would allow
  * only URLs from www.apache.org. There is no specific ordering to entries. The
  * entries are from more general to more specific with the more general
- * overridding the more specific.
+ * overriding the more specific.
  * </p>
  * 
  * The domain file defaults to domain-urlfilter.txt in the classpath but can be
@@ -130,16 +130,11 @@ public class DomainURLFilter implements URLFilter {
     // 2. rule file name defined by `urlfilter.domain.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
     String stringRules = conf.get("urlfilter.domain.rules");
-    String file = conf.get("urlfilter.domain.file");
+    String file = conf.get("urlfilter.domain.file", attributeFile);
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
-      if (file != null) {
-        // take file
-      } else if (attributeFile != null) {
-        file = attributeFile;
-      }
       LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
diff --git a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
index 56b11e9..77c238b 100644
--- a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
+++ b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -59,7 +59,7 @@ import org.apache.nutch.util.domain.DomainSuffix;
  * such as lucene.apache.org and hadoop.apache.org. The third line would exclude
  * only URLs from www.apache.org. There is no specific ordering to entries. The
  * entries are from more general to more specific with the more general
- * overridding the more specific.
+ * overriding the more specific.
  * </p>
  * 
  * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
@@ -131,16 +131,11 @@ public class DomainBlacklistURLFilter implements URLFilter {
     // 2. rule file name defined by `urlfilter.domainblacklist.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
     String stringRules = conf.get("urlfilter.domainblacklist.rules");
-    String file = conf.get("urlfilter.domainblacklist.file");
+    String file = conf.get("urlfilter.domainblacklist.file", attributeFile);
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
-      if (file != null) {
-        // take file
-      } else if (attributeFile != null) {
-        file = attributeFile;
-      }
       LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index eeef9cf..61c6f17 100644
--- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -142,17 +142,12 @@ public class PrefixURLFilter implements URLFilter {
     // 1. string rules defined by `urlfilter.prefix.rules`
     // 2. rule file name defined by `urlfilter.prefix.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
-    String file = conf.get("urlfilter.prefix.file");
+    String file = conf.get("urlfilter.prefix.file", attributeFile);
     String stringRules = conf.get("urlfilter.prefix.rules");
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
-      if (file != null) {
-        // take file
-      } else if (attributeFile != null) {
-        file = attributeFile;
-      }
       LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index 55382cc..3833f3c 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -276,17 +276,12 @@ public class SuffixURLFilter implements URLFilter {
     // 1. string rules defined by `urlfilter.suffix.rules`
     // 2. rule file name defined by `urlfilter.suffix.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
-    String file = conf.get("urlfilter.suffix.file");
+    String file = conf.get("urlfilter.suffix.file", attributeFile);
     String stringRules = conf.get("urlfilter.suffix.rules");
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
-      if (file != null) {
-        // take file
-      } else if (attributeFile != null) {
-        file = attributeFile;
-      }
       LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index 4506c85..3a3c8a4 100644
--- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -118,13 +118,8 @@ public class HostURLNormalizer implements URLNormalizer {
     // 1. string rules defined by `urlnormalizer.hosts.rules`
     // 2. rule file name defined by `urlnormalizer.hosts.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
-    String file = conf.get("urlnormalizer.hosts.file");
+    String file = conf.get("urlnormalizer.hosts.file", attributeFile);
     String stringRules = conf.get("urlnormalizer.hosts.rules");
-    if (file != null) {
-      // take file
-    } else if (attributeFile != null) {
-      file = attributeFile;
-    }
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index f18ac65..f60c291 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -124,13 +124,8 @@ public class ProtocolURLNormalizer implements URLNormalizer {
     // 1. string rules defined by `urlnormalizer.protocols.rules`
     // 2. rule file name defined by `urlnormalizer.protocols.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
-    String file = conf.get("urlnormalizer.protocols.file");
+    String file = conf.get("urlnormalizer.protocols.file", attributeFile);
     String stringRules = conf.get("urlnormalizer.protocols.rules");
-    if (file != null) {
-      // take file
-    } else if (attributeFile != null) {
-      file = attributeFile;
-    }
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);
diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index 6e8b7b9..2570427 100644
--- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -130,13 +130,8 @@ public class SlashURLNormalizer implements URLNormalizer {
     // 1. string rules defined by `urlnormalizer.slashes.rules`
     // 2. rule file name defined by `urlnormalizer.slashes.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
-    String file = conf.get("urlnormalizer.slashes.file");
+    String file = conf.get("urlnormalizer.slashes.file", attributeFile);
     String stringRules = conf.get("urlnormalizer.slashes.rules");
-    if (file != null) {
-      // take file
-    } else if (attributeFile != null) {
-      file = attributeFile;
-    }
     Reader reader = null;
     if (stringRules != null) { // takes precedence over files
       reader = new StringReader(stringRules);

Reply via email to