Author: markus
Date: Thu Jun 20 09:07:12 2013
New Revision: 1494894
URL: http://svn.apache.org/r1494894
Log:
NUTCH-1583 Headings plugin to support multivalued headings
Modified:
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1494894&r1=1494893&r2=1494894&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jun 20 09:07:12 2013
@@ -17,7 +17,8 @@
package org.apache.nutch.parse.headings;
-// Nutch imports
+import java.util.ArrayList;
+import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
@@ -25,8 +26,6 @@ import org.apache.nutch.parse.HtmlParseF
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
-
-// W3C imports
import org.w3c.dom.*;
/**
@@ -37,21 +36,25 @@ public class HeadingsParseFilter impleme
private Configuration conf;
private DocumentFragment doc;
private String[] headings;
+ private boolean multiValued = false;
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
this.doc = doc;
- String heading;
Parse parse = parseResult.get(content.getUrl());
for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
- heading = getElement(headings[i]);
-
- if (heading != null) {
- heading.trim();
+ List<String> discoveredHeadings = getElement(headings[i]);
- if (heading.length() > 0) {
- parse.getData().getParseMeta().set(headings[i], heading);
+ if (discoveredHeadings.size() > 0) {
+ for (String heading : discoveredHeadings) {
+ if (heading != null) {
+ heading.trim();
+
+ if (heading.length() > 0) {
+ parse.getData().getParseMeta().add(headings[i], heading);
+ }
+ }
}
}
}
@@ -63,6 +66,7 @@ public class HeadingsParseFilter impleme
this.conf = conf;
headings = conf.getStrings("headings");
+ multiValued = conf.getBoolean("headings.multivalued", false);
}
public Configuration getConf() {
@@ -72,7 +76,8 @@ public class HeadingsParseFilter impleme
/**
* Finds the specified element and returns its value
*/
- protected String getElement(String element) {
+ protected List<String> getElement(String element) {
+ List<String> headings = new ArrayList<String>();
NodeWalker walker = new NodeWalker(doc);
while (walker.hasNext()) {
@@ -80,13 +85,18 @@ public class HeadingsParseFilter impleme
if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
if (element.equalsIgnoreCase(currentNode.getNodeName())) {
- return getNodeValue(currentNode);
+ headings.add(getNodeValue(currentNode));
+
+ // Check for multiValued here, if disabled we don't need
+ // to discover more headings.
+ if (!multiValued) {
+ break;
+ }
}
}
}
- // Seems nothing is found
- return null;
+ return headings;
}
/**
@@ -105,5 +115,4 @@ public class HeadingsParseFilter impleme
return buffer.toString();
}
-
}