This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 89c41e1b5a245322b27e8dd0728b543faa171e9d Author: Markus Jelsma <[email protected]> AuthorDate: Fri Feb 22 16:44:25 2019 +0100 NUTCH-2692 Subcollection to support case-insensitive white and black lists --- conf/nutch-default.xml | 8 ++++++++ .../src/java/org/apache/nutch/collection/Subcollection.java | 13 ++++++++++++- .../indexer/subcollection/SubcollectionIndexingFilter.java | 6 ++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a42e6a9..69fbb7d 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2407,6 +2407,14 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> +<property> + <name>subcollection.case.insensitive</name> + <value>false</value> + <description> + Whether the URL prefixes are to be treated case insensitive. + </description> +</property> + <!-- Headings plugin properties --> <property> diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java index 13064eb..8478390 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java @@ -69,6 +69,11 @@ public class Subcollection extends Configured implements URLFilter { * SubCollection blacklist as String */ String blString; + + /** + * Whether the white and black lists are case sensitive + */ + boolean caseInsensitive = false; /** * public Constructor @@ -95,10 +100,12 @@ public class Subcollection extends Configured implements URLFilter { this.id = id; this.key = key; this.name = name; + caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } public Subcollection(Configuration conf) { super(conf); + caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } /** @@ -231,7 +238,11 @@ public class Subcollection extends Configured implements URLFilter { while (st.hasMoreElements()) { String line = (String) st.nextElement(); - list.add(line.trim()); + line = line.trim(); + if (caseInsensitive) { + line = line.toLowerCase(); + } + list.add(line); } } diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java index 898d314..767d54d 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java @@ -36,6 +36,7 @@ public class SubcollectionIndexingFilter extends Configured implements IndexingFilter { private Configuration conf; + private boolean caseInsensitive = false; public SubcollectionIndexingFilter() { super(NutchConfiguration.create()); @@ -52,7 +53,9 @@ public class SubcollectionIndexingFilter extends Configured implements this.conf = conf; fieldName = conf.get("subcollection.default.fieldname", "subcollection"); metadataSource = conf.get("subcollection.metadata.source", "subcollection"); + caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } + /** * @return Configuration @@ -102,6 +105,9 @@ public class SubcollectionIndexingFilter extends Configured implements } String sUrl = url.toString(); + if (caseInsensitive) { + sUrl = sUrl.toLowerCase(); + } addSubCollectionField(doc, sUrl); return doc; }
