This is an automated email from the ASF dual-hosted git repository. lewismc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 8abc78a65 NUTCH-3041 Address confusing logging in o.a.n.net.URLExemptionFilters (#813) 8abc78a65 is described below commit 8abc78a653eb7970def10031d732fb4c7aa0fb6f Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com> AuthorDate: Wed May 15 20:07:15 2024 -0700 NUTCH-3041 Address confusing logging in o.a.n.net.URLExemptionFilters (#813) --- .../org/apache/nutch/net/URLExemptionFilters.java | 7 +++++-- src/plugin/urlfilter-ignoreexempt/README.md | 18 +++++++--------- .../urlfilter/ignoreexempt/ExemptionUrlFilter.java | 24 +++++++++++++--------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/java/org/apache/nutch/net/URLExemptionFilters.java b/src/java/org/apache/nutch/net/URLExemptionFilters.java index c730228e4..ed401053e 100644 --- a/src/java/org/apache/nutch/net/URLExemptionFilters.java +++ b/src/java/org/apache/nutch/net/URLExemptionFilters.java @@ -24,6 +24,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.lang.invoke.MethodHandles; +import java.util.Arrays; /** Creates and caches {@link URLExemptionFilter} implementing plugins. */ public class URLExemptionFilters { @@ -44,8 +45,10 @@ public class URLExemptionFilters { throw new IllegalStateException(e); } } - LOG.info("Found {} extensions at point:'{}'", filters.length, - URLExemptionFilter.X_POINT_ID); + if (filters.length > 0) { + LOG.info("Found {} URLExemptionFilter implementations: '{}'", filters.length, + Arrays.toString(filters)); + } } /** diff --git a/src/plugin/urlfilter-ignoreexempt/README.md b/src/plugin/urlfilter-ignoreexempt/README.md index a8f932e75..374b29abd 100644 --- a/src/plugin/urlfilter-ignoreexempt/README.md +++ b/src/plugin/urlfilter-ignoreexempt/README.md @@ -17,8 +17,8 @@ urlfilter-ignoreexempt ====================== - This plugin allows certain urls to be exempted when the external links are configured to be ignored. - This is useful when focused crawl is setup but some resources like static files are linked from CDNs (external domains). +This plugin allows certain urls to be exempted when the external links are configured to be ignored. +This is useful when focused crawl is setup but some resources like static files are linked from CDNs (external domains). # How to enable ? Add `urlfilter-ignoreexempt` value to `plugin.includes` property @@ -36,25 +36,21 @@ open `conf/db-ignore-external-exemptions.txt` and add the regex rules. ## Format : The format is same same as `regex-urlfilter.txt`. - Each non-comment, non-blank line contains a regular expression - prefixed by '+' or '-'. The first matching pattern in the file - determines whether a URL is exempted or ignored. If no pattern - matches, the URL is ignored. - +Each non-comment, non-blank line contains a regular expression +prefixed by '+' or '-'. The first matching pattern in the file +determines whether a URL is exempted or ignored. If no pattern +matches, the URL is ignored. ## Example : - To exempt urls ending with image extensions, use this rule +To exempt urls ending with image extensions, use this rule `+(?i)\.(jpg|png|gif)$` - - ## Testing the Rules : After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run: `bin/nutch plugin urlfilter-ignoreexempt org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here` - This should print `true` for urls which are accepted by configured rules. \ No newline at end of file diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java index 96ca9b4ac..8028e3672 100644 --- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java +++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java @@ -25,21 +25,25 @@ import java.io.Reader; import java.util.regex.Pattern; import java.util.List; - /** - * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration - * to check if URL is eligible for exemption from 'db.ignore.external'. - * When this filter is enabled, the external urls will be checked against configured sequence of regex rules. + * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} + * uses regex configuration to check if URL is eligible for exemption from + * the <code>db.ignore.external.links</code> configuration property. + * When this filter is enabled, the external urls will be checked + * against configured sequence of regex rules. *<p> - * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be - * overridden using the property <code>"db.ignore.external.exemptions.file" in ./conf/nutch-*.xml</code> + * The exemption rule file defaults to + * <code>db-ignore-external-exemptions.txt</code> in the classpath but + * can be overridden using the configuration property + * <code>db.ignore.external.exemptions.file</code>. *</p> * - * The exemption rules are specified in plain text file where each line is a rule. - * The format is same same as `regex-urlfilter.txt`. + * The exemption rules are specified in plain text file where each line + * is a rule. + * The format is same same as <code>regex-urlfilter.txt</code>. * Each non-comment, non-blank line contains a regular expression - * prefixed by '+' or '-'. The first matching pattern in the file - * determines whether a URL is exempted or ignored. If no pattern + * prefixed by <b>+</b> or <b>-</b>. The first matching pattern in the file + * determines whether a URL is exempted or ignored. If no pattern * matches, the URL is ignored. * * @since Feb 10, 2016