Author: markus
Date: Thu Jun 20 09:07:12 2013
New Revision: 1494894

URL: http://svn.apache.org/r1494894
Log:
NUTCH-1583 Headings plugin to support multivalued headings

Modified:
    nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java

Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1494894&r1=1494893&r2=1494894&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jun 20 09:07:12 2013
@@ -17,7 +17,8 @@
 
 package org.apache.nutch.parse.headings;
 
-// Nutch imports
+import java.util.ArrayList;
+import java.util.List;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
@@ -25,8 +26,6 @@ import org.apache.nutch.parse.HtmlParseF
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NodeWalker;
-
-// W3C imports
 import org.w3c.dom.*;
 
 /**
@@ -37,21 +36,25 @@ public class HeadingsParseFilter impleme
   private Configuration conf;
   private DocumentFragment doc;
   private String[] headings;
+  private boolean multiValued = false;
 
   public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
     this.doc = doc;
 
-    String heading;
     Parse parse = parseResult.get(content.getUrl());
 
     for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
-      heading = getElement(headings[i]);
-
-      if (heading != null) {
-        heading.trim();
+      List<String> discoveredHeadings = getElement(headings[i]);
 
-        if (heading.length() > 0) {
-          parse.getData().getParseMeta().set(headings[i], heading);
+      if (discoveredHeadings.size() > 0) {
+        for (String heading : discoveredHeadings) {
+          if (heading != null) {
+            heading.trim();
+
+            if (heading.length() > 0) {
+              parse.getData().getParseMeta().add(headings[i], heading);
+            }
+          }
         }
       }
     }
@@ -63,6 +66,7 @@ public class HeadingsParseFilter impleme
     this.conf = conf;
 
     headings = conf.getStrings("headings");
+    multiValued = conf.getBoolean("headings.multivalued", false);
   }
 
   public Configuration getConf() {
@@ -72,7 +76,8 @@ public class HeadingsParseFilter impleme
   /**
    * Finds the specified element and returns its value
    */
-  protected String getElement(String element) {
+  protected List<String> getElement(String element) {
+    List<String> headings = new ArrayList<String>();
     NodeWalker walker = new NodeWalker(doc);
 
     while (walker.hasNext()) {
@@ -80,13 +85,18 @@ public class HeadingsParseFilter impleme
 
       if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
         if (element.equalsIgnoreCase(currentNode.getNodeName())) {
-          return getNodeValue(currentNode);
+          headings.add(getNodeValue(currentNode));
+          
+          // Check for multiValued here, if disabled we don't need
+          // to discover more headings.
+          if (!multiValued) {
+            break;
+          }
         }
       }
     }
 
-    // Seems nothing is found
-    return null;
+    return headings;
   }
 
   /**
@@ -105,5 +115,4 @@ public class HeadingsParseFilter impleme
 
     return buffer.toString();
   }
-
 }


Reply via email to