This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 9d138ffb2 NUTCH-3061 URL filters to log name of the rules file
9d138ffb2 is described below
commit 9d138ffb281d694d8fbe578a96fa671bbfd34640
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Jul 9 16:50:40 2024 +0200
NUTCH-3061 URL filters to log name of the rules file
---
.../nutch/urlfilter/api/RegexURLFilterBase.java | 4 ++--
.../nutch/urlfilter/automaton/AutomatonURLFilter.java | 16 ++++++++++------
.../apache/nutch/urlfilter/fast/FastURLFilter.java | 4 ++++
.../nutch/urlfilter/fast/TestFastURLFilter.java | 1 -
.../apache/nutch/urlfilter/regex/RegexURLFilter.java | 19 +++++++++----------
5 files changed, 25 insertions(+), 19 deletions(-)
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index 0ddb69855..fd59fb9cc 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -52,8 +52,6 @@ import org.apache.nutch.util.URLUtil;
* where plus (<code>+</code>)means go ahead and index it and minus (
* <code>-</code>)means no.
* </p>
- *
- * @author Jérôme Charron
*/
public abstract class RegexURLFilterBase implements URLFilter {
@@ -283,6 +281,8 @@ public abstract class RegexURLFilterBase implements URLFilter {
RegexRule rule = createRule(sign, regex, hostOrDomain);
rules.add(rule);
}
+ LOG.info("Read {} regex rules ({})", rules.size(),
+ this.getClass().getName());
return rules;
}
diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
index 89c1e2dfe..cc7188ecd 100644
--- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
+++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -19,6 +19,7 @@ package org.apache.nutch.urlfilter.automaton;
import java.io.Reader;
import java.io.IOException;
import java.io.StringReader;
+import java.lang.invoke.MethodHandles;
import java.util.regex.PatternSyntaxException;
import org.apache.hadoop.conf.Configuration;
@@ -27,19 +28,24 @@ import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;
import org.apache.nutch.urlfilter.api.RegexRule;
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* RegexURLFilterBase implementation based on the <a
* href="https://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
* Automata for Java<sup>TM</sup>.
*
- * @author Jérôme Charron
* @see <a href="https://www.brics.dk/automaton/">dk.brics.automaton</a>
*/
public class AutomatonURLFilter extends RegexURLFilterBase {
+
public static final String URLFILTER_AUTOMATON_FILE =
"urlfilter.automaton.file";
public static final String URLFILTER_AUTOMATON_RULES =
"urlfilter.automaton.rules";
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
public AutomatonURLFilter() {
super();
}
@@ -54,11 +60,6 @@ public class AutomatonURLFilter extends RegexURLFilterBase {
super(reader);
}
- /*
- * ----------------------------------- * <implementation:RegexURLFilterBase> *
- * -----------------------------------
- */
-
/**
* Rules specified as a config property will override rules specified as a
* config file.
@@ -67,9 +68,12 @@ public class AutomatonURLFilter extends RegexURLFilterBase {
protected Reader getRulesReader(Configuration conf) throws IOException {
String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
if (stringRules != null) {
+ LOG.info("Reading urlfilter-automaton string rules from property: {}",
+ URLFILTER_AUTOMATON_RULES);
return new StringReader(stringRules);
}
String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
+ LOG.info("Reading urlfilter-automaton rules file: {}", fileRules);
return conf.getConfResourceAsReader(fileRules);
}
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index b1e589a0e..cbc08f4c3 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -236,6 +236,7 @@ public class FastURLFilter implements URLFilter {
public void reloadRules() throws IOException {
String fileRules = conf.get(URLFILTER_FAST_FILE);
+ LOG.info("Reading urlfilter-fast rules file: {}", fileRules);
InputStream is;
Path fileRulesPath = new Path(fileRules);
@@ -328,6 +329,9 @@ public class FastURLFilter implements URLFilter {
}
}
}
+ LOG.info(
+ "Read {} lines, {} host and {} domain rules from urlfilter-fast rules file",
+ lineno, hostRules.size(), domainRules.size());
} catch (IOException e) {
LOG.warn("Caught exception while reading rules file at line {}: {}",
lineno, e.getMessage());
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
index 75b37250e..2c9ceea6f 100644
--- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.urlfilter.fast;
-import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
index 19d14c33c..5b283b76f 100644
--- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
+++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -19,6 +19,7 @@ package org.apache.nutch.urlfilter.regex;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
+import java.lang.invoke.MethodHandles;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -26,6 +27,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.urlfilter.api.RegexRule;
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Filters URLs based on a file of regular expressions using the
@@ -36,6 +39,9 @@ public class RegexURLFilter extends RegexURLFilterBase {
public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
public RegexURLFilter() {
super();
}
@@ -49,11 +55,6 @@ public class RegexURLFilter extends RegexURLFilterBase {
super(reader);
}
- /*
- * ----------------------------------- * <implementation:RegexURLFilterBase> *
- * -----------------------------------
- */
-
/**
* Rules specified as a config property will override rules specified as a
* config file.
@@ -62,9 +63,12 @@ public class RegexURLFilter extends RegexURLFilterBase {
protected Reader getRulesReader(Configuration conf) throws IOException {
String stringRules = conf.get(URLFILTER_REGEX_RULES);
if (stringRules != null) {
+ LOG.info("Reading urlfilter-regex string rules from property: {}",
+ URLFILTER_REGEX_RULES);
return new StringReader(stringRules);
}
String fileRules = conf.get(URLFILTER_REGEX_FILE);
+ LOG.info("Reading urlfilter-regex rules file: {}", fileRules);
return conf.getConfResourceAsReader(fileRules);
}
@@ -81,11 +85,6 @@ public class RegexURLFilter extends RegexURLFilterBase {
- /*
- * ------------------------------------ * </implementation:RegexURLFilterBase>
- * * ------------------------------------
- */
-
public static void main(String args[]) throws IOException {
RegexURLFilter filter = new RegexURLFilter();
filter.setConf(NutchConfiguration.create());