Author: jnioche
Date: Mon Mar  1 15:08:05 2010
New Revision: 917557

URL: http://svn.apache.org/viewvc?rev=917557&view=rev
Log:
NUTCH-782: Ability to order htmlparsefilters

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=917557&r1=917556&r2=917557&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar  1 15:08:05 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-782 Ability to order htmlparsefilters (jnioche)
+
 * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) 
 
 * NUTCH-790 Some external javadoc links are broken (siren)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=917557&r1=917556&r2=917557&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar  1 15:08:05 2010
@@ -996,6 +996,18 @@
   for most people would be "img,script,link".</description>
 </property>
 
+<property>
+  <name>htmlparsefilter.order</name>
+  <value></value>
+  <description>The order by which HTMLParse filters are applied.
+  If empty, all available HTMLParse filters (as dictated by properties
+  plugin-includes and plugin-excludes above) are loaded and applied in system
+  defined order. If not empty, only named filters are loaded and applied
+  in given order.
+  HTMLParse filter ordering MAY have an impact
+  on end result, as some filters could rely on the metadata generated by a previous filter.
+  </description>
+</property>
 
 <!-- urlfilter plugin properties -->
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=917557&r1=917556&r2=917557&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Mar  1 15:08:05 2010
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.parse;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 
 import org.apache.nutch.protocol.Content;
@@ -30,12 +31,23 @@
 public class HtmlParseFilters {
 
   private HtmlParseFilter[] htmlParseFilters;
+  
+  public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
 
   public HtmlParseFilters(Configuration conf) {
+        String order = conf.get(HTMLPARSEFILTER_ORDER);
         ObjectCache objectCache = ObjectCache.get(conf);
         this.htmlParseFilters = (HtmlParseFilter[]) 
objectCache.getObject(HtmlParseFilter.class.getName());
         if (htmlParseFilters == null) {
-            HashMap<String, HtmlParseFilter> filters =
+          /*
+           * If ordered filters are required, prepare array of filters based on
+           * property
+           */
+          String[] orderedFilters = null;
+          if (order != null && !order.trim().equals("")) {
+            orderedFilters = order.split("\\s+");
+          }
+            HashMap<String, HtmlParseFilter> filterMap =
               new HashMap<String, HtmlParseFilter>();
             try {
                 ExtensionPoint point = 
PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID);
@@ -45,12 +57,31 @@
                 for (int i = 0; i < extensions.length; i++) {
                     Extension extension = extensions[i];
                     HtmlParseFilter parseFilter = (HtmlParseFilter) 
extension.getExtensionInstance();
-                    if 
(!filters.containsKey(parseFilter.getClass().getName())) {
-                        filters.put(parseFilter.getClass().getName(), 
parseFilter);
+                    if 
(!filterMap.containsKey(parseFilter.getClass().getName())) {
+                        filterMap.put(parseFilter.getClass().getName(), 
parseFilter);
                     }
                 }
-                HtmlParseFilter[] htmlParseFilters = 
filters.values().toArray(new HtmlParseFilter[filters.size()]);
-                objectCache.setObject(HtmlParseFilter.class.getName(), 
htmlParseFilters);
+                HtmlParseFilter[] htmlParseFilters = 
filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]);
+                /*
+                 * If no ordered filters required, just get the filters in an
+                 * indeterminate order
+                 */
+                if (orderedFilters == null) {
+                  objectCache.setObject(HtmlParseFilter.class.getName(), 
htmlParseFilters);
+                }
+                /* Otherwise run the filters in the required order */
+                else {
+                  ArrayList<HtmlParseFilter> filters = new 
ArrayList<HtmlParseFilter>();
+                  for (int i = 0; i < orderedFilters.length; i++) {
+                    HtmlParseFilter filter = filterMap
+                        .get(orderedFilters[i]);
+                    if (filter != null) {
+                      filters.add(filter);
+                    }
+                  }
+                  objectCache.setObject(HtmlParseFilter.class.getName(), 
filters
+                      .toArray(new HtmlParseFilter[filters.size()]));
+                }
             } catch (PluginRuntimeException e) {
                 throw new RuntimeException(e);
             }


Reply via email to