This is an automated email from the ASF dual-hosted git repository.

lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 8abc78a65 NUTCH-3041 Address confusing logging in o.a.n.net.URLExemptionFilters (#813)
8abc78a65 is described below

commit 8abc78a653eb7970def10031d732fb4c7aa0fb6f
Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com>
AuthorDate: Wed May 15 20:07:15 2024 -0700

    NUTCH-3041 Address confusing logging in o.a.n.net.URLExemptionFilters (#813)
---
 .../org/apache/nutch/net/URLExemptionFilters.java  |  7 +++++--
 src/plugin/urlfilter-ignoreexempt/README.md        | 18 +++++++---------
 .../urlfilter/ignoreexempt/ExemptionUrlFilter.java | 24 +++++++++++++---------
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/src/java/org/apache/nutch/net/URLExemptionFilters.java b/src/java/org/apache/nutch/net/URLExemptionFilters.java
index c730228e4..ed401053e 100644
--- a/src/java/org/apache/nutch/net/URLExemptionFilters.java
+++ b/src/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -24,6 +24,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
 
 /** Creates and caches {@link URLExemptionFilter} implementing plugins. */
 public class URLExemptionFilters {
@@ -44,8 +45,10 @@ public class URLExemptionFilters {
         throw new IllegalStateException(e);
       }
     }
-    LOG.info("Found {} extensions at point:'{}'", filters.length,
-        URLExemptionFilter.X_POINT_ID);
+    if (filters.length > 0) {
+      LOG.info("Found {} URLExemptionFilter implementations: '{}'", filters.length,
+        Arrays.toString(filters));
+    }
   }
 
   /**
diff --git a/src/plugin/urlfilter-ignoreexempt/README.md b/src/plugin/urlfilter-ignoreexempt/README.md
index a8f932e75..374b29abd 100644
--- a/src/plugin/urlfilter-ignoreexempt/README.md
+++ b/src/plugin/urlfilter-ignoreexempt/README.md
@@ -17,8 +17,8 @@
 
 urlfilter-ignoreexempt
 ======================
-  This plugin allows certain urls to be exempted when the external links are configured to be ignored.
-  This is useful when focused crawl is setup but some resources like static files are linked from CDNs (external domains).
+This plugin allows certain urls to be exempted when the external links are configured to be ignored.
+This is useful when focused crawl is setup but some resources like static files are linked from CDNs (external domains).
 
 # How to enable ?
 Add `urlfilter-ignoreexempt` value to `plugin.includes` property
@@ -36,25 +36,21 @@ open `conf/db-ignore-external-exemptions.txt` and add the regex rules.
 ## Format :
 
 The format is same same as `regex-urlfilter.txt`.
- Each non-comment, non-blank line contains a regular expression
- prefixed by '+' or '-'.  The first matching pattern in the file
- determines whether a URL is exempted or ignored.  If no pattern
- matches, the URL is ignored.
-
+Each non-comment, non-blank line contains a regular expression
+prefixed by '+' or '-'.  The first matching pattern in the file
+determines whether a URL is exempted or ignored.  If no pattern
+matches, the URL is ignored.
 
 ## Example :
 
- To exempt urls ending with image extensions, use this rule
+To exempt urls ending with image extensions, use this rule
 
 `+(?i)\.(jpg|png|gif)$`
 
-   
-   
 ## Testing the Rules :
 
 After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run:
    
 `bin/nutch plugin urlfilter-ignoreexempt  org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here`
 
-
 This should print `true` for urls which are accepted by configured rules.
\ No newline at end of file
diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
index 96ca9b4ac..8028e3672 100644
--- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -25,21 +25,25 @@ import java.io.Reader;
 import java.util.regex.Pattern;
 import java.util.List;
 
-
 /**
- * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration
- * to check if URL is eligible for exemption from 'db.ignore.external'.
- * When this filter is enabled, the external urls will be checked against configured sequence of regex rules.
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} 
+ * uses regex configuration to check if URL is eligible for exemption from 
+ * the <code>db.ignore.external.links</code> configuration property.
+ * When this filter is enabled, the external urls will be checked 
+ * against configured sequence of regex rules.
  *<p>
- * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
- * overridden using the property  <code>"db.ignore.external.exemptions.file" in ./conf/nutch-*.xml</code>
+ * The exemption rule file defaults to 
+ * <code>db-ignore-external-exemptions.txt</code> in the classpath but 
+ * can be overridden using the configuration property 
+ * <code>db.ignore.external.exemptions.file</code>.
  *</p>
  *
- * The exemption rules are specified in plain text file where each line is a rule.
- * The format is same same as `regex-urlfilter.txt`.
+ * The exemption rules are specified in plain text file where each line 
+ * is a rule.
+ * The format is same same as <code>regex-urlfilter.txt</code>.
  * Each non-comment, non-blank line contains a regular expression
- * prefixed by '+' or '-'.  The first matching pattern in the file
- * determines whether a URL is exempted or ignored.  If no pattern
+ * prefixed by <b>+</b> or <b>-</b>. The first matching pattern in the file
+ * determines whether a URL is exempted or ignored. If no pattern
  * matches, the URL is ignored.
  *
  * @since Feb 10, 2016

Reply via email to