This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 9d138ffb2 NUTCH-3061 URL filters to log name of the rules file
9d138ffb2 is described below

commit 9d138ffb281d694d8fbe578a96fa671bbfd34640
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Jul 9 16:50:40 2024 +0200

    NUTCH-3061 URL filters to log name of the rules file
---
 .../nutch/urlfilter/api/RegexURLFilterBase.java       |  4 ++--
 .../nutch/urlfilter/automaton/AutomatonURLFilter.java | 16 ++++++++++------
 .../apache/nutch/urlfilter/fast/FastURLFilter.java    |  4 ++++
 .../nutch/urlfilter/fast/TestFastURLFilter.java       |  1 -
 .../apache/nutch/urlfilter/regex/RegexURLFilter.java  | 19 +++++++++----------
 5 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index 0ddb69855..fd59fb9cc 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -52,8 +52,6 @@ import org.apache.nutch.util.URLUtil;
  * where plus (<code>+</code>)means go ahead and index it and minus (
  * <code>-</code>)means no.
  * </p>
- * 
- * @author J&eacute;r&ocirc;me Charron
  */
 public abstract class RegexURLFilterBase implements URLFilter {
 
@@ -283,6 +281,8 @@ public abstract class RegexURLFilterBase implements URLFilter {
       RegexRule rule = createRule(sign, regex, hostOrDomain);
       rules.add(rule);
     }
+    LOG.info("Read {} regex rules ({})", rules.size(),
+        this.getClass().getName());
     return rules;
   }
 
diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
index 89c1e2dfe..cc7188ecd 100644
--- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
+++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -19,6 +19,7 @@ package org.apache.nutch.urlfilter.automaton;
 import java.io.Reader;
 import java.io.IOException;
 import java.io.StringReader;
+import java.lang.invoke.MethodHandles;
 import java.util.regex.PatternSyntaxException;
 
 import org.apache.hadoop.conf.Configuration;
@@ -27,19 +28,24 @@ import dk.brics.automaton.RegExp;
 import dk.brics.automaton.RunAutomaton;
 import org.apache.nutch.urlfilter.api.RegexRule;
 import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * RegexURLFilterBase implementation based on the <a
  * href="https://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
  * Automata for Java<sup>TM</sup>.
  * 
- * @author J&eacute;r&ocirc;me Charron
  * @see <a href="https://www.brics.dk/automaton/">dk.brics.automaton</a>
  */
 public class AutomatonURLFilter extends RegexURLFilterBase {
+
   public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file";
   public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules";
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   public AutomatonURLFilter() {
     super();
   }
@@ -54,11 +60,6 @@ public class AutomatonURLFilter extends RegexURLFilterBase {
     super(reader);
   }
 
-  /*
-   * ----------------------------------- * <implementation:RegexURLFilterBase> *
-   * -----------------------------------
-   */
-
   /**
    * Rules specified as a config property will override rules specified as a
    * config file.
@@ -67,9 +68,12 @@ public class AutomatonURLFilter extends RegexURLFilterBase {
   protected Reader getRulesReader(Configuration conf) throws IOException {
     String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
     if (stringRules != null) {
+      LOG.info("Reading urlfilter-automaton string rules from property: {}",
+          URLFILTER_AUTOMATON_RULES);
       return new StringReader(stringRules);
     }
     String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
+    LOG.info("Reading urlfilter-automaton rules file: {}", fileRules);
     return conf.getConfResourceAsReader(fileRules);
   }
 
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index b1e589a0e..cbc08f4c3 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -236,6 +236,7 @@ public class FastURLFilter implements URLFilter {
 
   public void reloadRules() throws IOException {
     String fileRules = conf.get(URLFILTER_FAST_FILE);
+    LOG.info("Reading urlfilter-fast rules file: {}", fileRules);
     InputStream is;
 
     Path fileRulesPath = new Path(fileRules);
@@ -328,6 +329,9 @@ public class FastURLFilter implements URLFilter {
           }
         }
       }
+      LOG.info(
+          "Read {} lines, {} host and {} domain rules from urlfilter-fast rules file",
+          lineno, hostRules.size(), domainRules.size());
     } catch (IOException e) {
       LOG.warn("Caught exception while reading rules file at line {}: {}",
           lineno, e.getMessage());
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
index 75b37250e..2c9ceea6f 100644
--- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.urlfilter.fast;
 
-import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
index 19d14c33c..5b283b76f 100644
--- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
+++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -19,6 +19,7 @@ package org.apache.nutch.urlfilter.regex;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.lang.invoke.MethodHandles;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
@@ -26,6 +27,8 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.urlfilter.api.RegexRule;
 import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
 import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Filters URLs based on a file of regular expressions using the
@@ -36,6 +39,9 @@ public class RegexURLFilter extends RegexURLFilterBase {
   public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
   public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   public RegexURLFilter() {
     super();
   }
@@ -49,11 +55,6 @@ public class RegexURLFilter extends RegexURLFilterBase {
     super(reader);
   }
 
-  /*
-   * ----------------------------------- * <implementation:RegexURLFilterBase> *
-   * -----------------------------------
-   */
-
   /**
    * Rules specified as a config property will override rules specified as a
    * config file.
@@ -62,9 +63,12 @@ public class RegexURLFilter extends RegexURLFilterBase {
   protected Reader getRulesReader(Configuration conf) throws IOException {
     String stringRules = conf.get(URLFILTER_REGEX_RULES);
     if (stringRules != null) {
+      LOG.info("Reading urlfilter-regex string rules from property: {}",
+          URLFILTER_REGEX_RULES);
       return new StringReader(stringRules);
     }
     String fileRules = conf.get(URLFILTER_REGEX_FILE);
+    LOG.info("Reading urlfilter-regex rules file: {}", fileRules);
     return conf.getConfResourceAsReader(fileRules);
   }
 
@@ -81,11 +85,6 @@ public class RegexURLFilter extends RegexURLFilterBase {
   
   
 
-  /*
-   * ------------------------------------ * </implementation:RegexURLFilterBase>
-   * * ------------------------------------
-   */
-
   public static void main(String args[]) throws IOException {
     RegexURLFilter filter = new RegexURLFilter();
     filter.setConf(NutchConfiguration.create());

Reply via email to