Author: jnioche
Date: Mon Mar 1 15:08:05 2010
New Revision: 917557

URL: http://svn.apache.org/viewvc?rev=917557&view=rev
Log:
NUTCH-782: Ability to order htmlparsefilters

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=917557&r1=917556&r2=917557&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Mar 1 15:08:05 2010 @@ -2,6 +2,8 @@ Unreleased Changes +* NUTCH-782 Ability to order htmlparsefilters (jnioche) + * NUTCH-719 fetchQueues.totalSize incorrect in Fetcher (Steven Denny via jnioche) * NUTCH-790 Some external javadoc links are broken (siren) Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=917557&r1=917556&r2=917557&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Mar 1 15:08:05 2010 @@ -996,6 +996,18 @@ for most people would be "img,script,link".</description> </property> +<property> + <name>htmlparsefilter.order</name> + <value></value> + <description>The order by which HTMLParse filters are applied. + If empty, all available HTMLParse filters (as dictated by properties + plugin-includes and plugin-excludes above) are loaded and applied in system + defined order. If not empty, only named filters are loaded and applied + in given order. + HTMLParse filter ordering MAY have an impact + on end result, as some filters could rely on the metadata generated by a previous filter. 
+ </description> +</property> <!-- urlfilter plugin properties --> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=917557&r1=917556&r2=917557&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Mar 1 15:08:05 2010 @@ -17,6 +17,7 @@ package org.apache.nutch.parse; +import java.util.ArrayList; import java.util.HashMap; import org.apache.nutch.protocol.Content; @@ -30,12 +31,23 @@ public class HtmlParseFilters { private HtmlParseFilter[] htmlParseFilters; + + public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order"; public HtmlParseFilters(Configuration conf) { + String order = conf.get(HTMLPARSEFILTER_ORDER); ObjectCache objectCache = ObjectCache.get(conf); this.htmlParseFilters = (HtmlParseFilter[]) objectCache.getObject(HtmlParseFilter.class.getName()); if (htmlParseFilters == null) { - HashMap<String, HtmlParseFilter> filters = + /* + * If ordered filters are required, prepare array of filters based on + * property + */ + String[] orderedFilters = null; + if (order != null && !order.trim().equals("")) { + orderedFilters = order.split("\\s+"); + } + HashMap<String, HtmlParseFilter> filterMap = new HashMap<String, HtmlParseFilter>(); try { ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID); @@ -45,12 +57,31 @@ for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; HtmlParseFilter parseFilter = (HtmlParseFilter) extension.getExtensionInstance(); - if (!filters.containsKey(parseFilter.getClass().getName())) { - filters.put(parseFilter.getClass().getName(), parseFilter); + if (!filterMap.containsKey(parseFilter.getClass().getName())) 
{ + filterMap.put(parseFilter.getClass().getName(), parseFilter); } } - HtmlParseFilter[] htmlParseFilters = filters.values().toArray(new HtmlParseFilter[filters.size()]); - objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters); + HtmlParseFilter[] htmlParseFilters = filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]); + /* + * If no ordered filters required, just get the filters in an + * indeterminate order + */ + if (orderedFilters == null) { + objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters); + } + /* Otherwise run the filters in the required order */ + else { + ArrayList<HtmlParseFilter> filters = new ArrayList<HtmlParseFilter>(); + for (int i = 0; i < orderedFilters.length; i++) { + HtmlParseFilter filter = filterMap + .get(orderedFilters[i]); + if (filter != null) { + filters.add(filter); + } + } + objectCache.setObject(HtmlParseFilter.class.getName(), filters + .toArray(new HtmlParseFilter[filters.size()])); + } } catch (PluginRuntimeException e) { throw new RuntimeException(e); }