This is an automated email from the ASF dual-hosted git repository.
markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 9fb7d6c NUTCH-2068 Allow subcollection overrides via metadata
9fb7d6c is described below
commit 9fb7d6c2e61ce36375722b16842b694621f3b053
Author: Markus Jelsma <[email protected]>
AuthorDate: Thu Mar 16 11:51:39 2017 +0100
NUTCH-2068 Allow subcollection overrides via metadata
---
.../subcollection/SubcollectionIndexingFilter.java | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index c4b8b31..df12e4f 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -57,6 +57,7 @@ public class SubcollectionIndexingFilter extends Configured
implements
public void setConf(Configuration conf) {
this.conf = conf;
fieldName = conf.get("subcollection.default.fieldname", "subcollection");
+ metadataSource = conf.get("subcollection.metadata.source",
"subcollection");
}
/**
@@ -70,6 +71,11 @@ public class SubcollectionIndexingFilter extends Configured
implements
* Doc field name
*/
public static String fieldName = "subcollection";
+
+ /**
+ * Metadata source field name
+ */
+ public static String metadataSource = "subcollection";
/**
* Logger
@@ -96,6 +102,17 @@ public class SubcollectionIndexingFilter extends Configured
implements
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ // Check for subcollection override in HTML metadata
+ String subcollection = parse.getData().getMeta(metadataSource);
+ if (subcollection != null) {
+ subcollection = subcollection.trim();
+
+ if (subcollection.length() > 0) {
+ doc.add(fieldName, subcollection);
+ return doc;
+ }
+ }
+
String sUrl = url.toString();
addSubCollectionField(doc, sUrl);
return doc;
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.