Author: jnioche
Date: Mon Oct 7 10:08:43 2013
New Revision: 1529813
URL: http://svn.apache.org/r1529813
Log:
NUTCH-1562
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Oct 7 10:08:43 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1562 Order of execution for scoring filters (jnioche, snagel)
+
* NUTCH-1640 Reuse ParseUtil instance in ParseSegment (Mitesh Singh Jat via
jnioche)
* NUTCH-1639 bin/crawl fails on mac os (various contributors via snagel)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Oct 7 10:08:43 2013
@@ -1172,11 +1172,10 @@
<property>
<name>scoring.filter.order</name>
<value></value>
- <description>The order in which scoring filters are applied.
- This may be left empty (in which case all available scoring
- filters will be applied in the order defined in plugin-includes
- and plugin-excludes), or a space separated list of implementation
- classes.
+ <description>The order in which scoring filters are applied. This
+ may be left empty (in which case all available scoring filters will
+ be applied in system defined order), or a space separated list of
+ implementation classes.
</description>
</property>
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Mon Oct
7 10:08:43 2013
@@ -17,16 +17,12 @@
package org.apache.nutch.indexer;
-import java.util.ArrayList;
-import java.util.HashMap;
-
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.nutch.plugin.*;
+import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.util.ObjectCache;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -42,65 +38,10 @@ public class IndexingFilters {
private IndexingFilter[] indexingFilters;
public IndexingFilters(Configuration conf) {
- /* Get indexingfilter.order property */
- String order = conf.get(INDEXINGFILTER_ORDER);
- ObjectCache objectCache = ObjectCache.get(conf);
- this.indexingFilters = (IndexingFilter[]) objectCache
- .getObject(IndexingFilter.class.getName());
- if (this.indexingFilters == null) {
- /*
- * If ordered filters are required, prepare array of filters based on
- * property
- */
- String[] orderedFilters = null;
- if (order != null && !order.trim().equals("")) {
- orderedFilters = order.trim().split("\\s+");
- }
- try {
- ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
- IndexingFilter.X_POINT_ID);
- if (point == null)
- throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found.");
- Extension[] extensions = point.getExtensions();
- HashMap<String, IndexingFilter> filterMap =
- new HashMap<String, IndexingFilter>();
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- IndexingFilter filter = (IndexingFilter) extension
- .getExtensionInstance();
- LOG.info("Adding " + filter.getClass().getName());
- if (!filterMap.containsKey(filter.getClass().getName())) {
- filterMap.put(filter.getClass().getName(), filter);
- }
- }
- /*
- * If no ordered filters required, just get the filters in an
- * indeterminate order
- */
- if (orderedFilters == null) {
- objectCache.setObject(IndexingFilter.class.getName(),
- filterMap.values().toArray(
- new IndexingFilter[0]));
- /* Otherwise run the filters in the required order */
- } else {
- ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>();
- for (int i = 0; i < orderedFilters.length; i++) {
- IndexingFilter filter = filterMap
- .get(orderedFilters[i]);
- if (filter != null) {
- filters.add(filter);
- }
- }
- objectCache.setObject(IndexingFilter.class.getName(), filters
- .toArray(new IndexingFilter[filters.size()]));
- }
- } catch (PluginRuntimeException e) {
- throw new RuntimeException(e);
- }
- this.indexingFilters = (IndexingFilter[]) objectCache
- .getObject(IndexingFilter.class.getName());
- }
- }
+ indexingFilters = (IndexingFilter[]) PluginRepository.get(conf)
+ .getOrderedPlugins(IndexingFilter.class, IndexingFilter.X_POINT_ID,
+ INDEXINGFILTER_ORDER);
+ }
/** Run all defined filters. */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum,
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Mon Oct 7
10:08:43 2013
@@ -17,17 +17,9 @@
package org.apache.nutch.net;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.plugin.PluginRepository;
-import org.apache.nutch.util.ObjectCache;
-import org.apache.hadoop.conf.Configuration;
/** Creates and caches {@link URLFilter} implementing plugins.*/
public class URLFilters {
@@ -35,49 +27,9 @@ public class URLFilters {
private URLFilter[] filters;
public URLFilters(Configuration conf) {
- String order = conf.get(URLFILTER_ORDER);
- ObjectCache objectCache = ObjectCache.get(conf);
- this.filters = (URLFilter[])
objectCache.getObject(URLFilter.class.getName());
-
- if (this.filters == null) {
- String[] orderedFilters = null;
- if (order != null && !order.trim().equals("")) {
- orderedFilters = order.trim().split("\\s+");
- }
-
- try {
- ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
- URLFilter.X_POINT_ID);
- if (point == null)
- throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
- Extension[] extensions = point.getExtensions();
- Map<String, URLFilter> filterMap = new HashMap<String, URLFilter>();
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- URLFilter filter = (URLFilter) extension.getExtensionInstance();
- if (!filterMap.containsKey(filter.getClass().getName())) {
- filterMap.put(filter.getClass().getName(), filter);
- }
- }
- if (orderedFilters == null) {
- objectCache.setObject(URLFilter.class.getName(),
filterMap.values().toArray(
- new URLFilter[0]));
- } else {
- ArrayList<URLFilter> filters = new ArrayList<URLFilter>();
- for (int i = 0; i < orderedFilters.length; i++) {
- URLFilter filter = filterMap.get(orderedFilters[i]);
- if (filter != null) {
- filters.add(filter);
- }
- }
- objectCache.setObject(URLFilter.class.getName(), filters
- .toArray(new URLFilter[filters.size()]));
- }
- } catch (PluginRuntimeException e) {
- throw new RuntimeException(e);
- }
- this.filters = (URLFilter[])
objectCache.getObject(URLFilter.class.getName());
- }
+ this.filters = (URLFilter[]) PluginRepository.get(conf)
+ .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID,
+ URLFILTER_ORDER);
}
/** Run all defined filters. Assume logical AND. */
Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Oct
7 10:08:43 2013
@@ -17,12 +17,8 @@
package org.apache.nutch.parse;
-import java.util.ArrayList;
-import java.util.HashMap;
-
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.plugin.*;
-import org.apache.nutch.util.ObjectCache;
+import org.apache.nutch.plugin.PluginRepository;
import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.DocumentFragment;
@@ -35,59 +31,10 @@ public class HtmlParseFilters {
public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
public HtmlParseFilters(Configuration conf) {
- String order = conf.get(HTMLPARSEFILTER_ORDER);
- ObjectCache objectCache = ObjectCache.get(conf);
- this.htmlParseFilters = (HtmlParseFilter[])
objectCache.getObject(HtmlParseFilter.class.getName());
- if (htmlParseFilters == null) {
- /*
- * If ordered filters are required, prepare array of filters based on
- * property
- */
- String[] orderedFilters = null;
- if (order != null && !order.trim().equals("")) {
- orderedFilters = order.trim().split("\\s+");
- }
- HashMap<String, HtmlParseFilter> filterMap =
- new HashMap<String, HtmlParseFilter>();
- try {
- ExtensionPoint point =
PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID);
- if (point == null)
- throw new RuntimeException(HtmlParseFilter.X_POINT_ID + " not found.");
- Extension[] extensions = point.getExtensions();
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- HtmlParseFilter parseFilter = (HtmlParseFilter)
extension.getExtensionInstance();
- if (!filterMap.containsKey(parseFilter.getClass().getName())) {
- filterMap.put(parseFilter.getClass().getName(), parseFilter);
- }
- }
- HtmlParseFilter[] htmlParseFilters =
filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]);
- /*
- * If no ordered filters required, just get the filters in an
- * indeterminate order
- */
- if (orderedFilters == null) {
- objectCache.setObject(HtmlParseFilter.class.getName(),
htmlParseFilters);
- }
- /* Otherwise run the filters in the required order */
- else {
- ArrayList<HtmlParseFilter> filters = new
ArrayList<HtmlParseFilter>();
- for (int i = 0; i < orderedFilters.length; i++) {
- HtmlParseFilter filter = filterMap
- .get(orderedFilters[i]);
- if (filter != null) {
- filters.add(filter);
- }
- }
- objectCache.setObject(HtmlParseFilter.class.getName(),
filters
- .toArray(new HtmlParseFilter[filters.size()]));
- }
- } catch (PluginRuntimeException e) {
- throw new RuntimeException(e);
- }
- this.htmlParseFilters = (HtmlParseFilter[])
objectCache.getObject(HtmlParseFilter.class.getName());
- }
- }
+ htmlParseFilters = (HtmlParseFilter[]) PluginRepository.get(conf)
+ .getOrderedPlugins(HtmlParseFilter.class, HtmlParseFilter.X_POINT_ID,
+ HTMLPARSEFILTER_ORDER);
+ }
/** Run all defined filters. */
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Mon Oct
7 10:08:43 2013
@@ -16,10 +16,12 @@
*/
package org.apache.nutch.plugin;
+import java.lang.reflect.Array;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.WeakHashMap;
import java.util.List;
@@ -29,6 +31,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
/**
* The plugin repository is a registry of all plugins.
@@ -371,6 +374,81 @@ public class PluginRepository {
}
return map;
}
+
+ /**
+ * Get ordered list of plugins. Filter and normalization plugins are applied
+ * in a configurable "pipeline" order, e.g., if one plugin depends on the
+ * output of another plugin. This method loads the plugins in the order
+ * defined by orderProperty. If orderProperty is empty or unset, all active
+ * plugins of the given interface and extension point are loaded.
+ *
+ * @param clazz
+ * interface class implemented by required plugins
+ * @param xPointId
+ * extension point id of required plugins
+ * @param orderProperty
+ * property name defining plugin order
+ * @return array of plugin instances
+ */
+ public synchronized Object[] getOrderedPlugins(Class<?> clazz, String xPointId,
+ String orderProperty) {
+ Object[] filters;
+ ObjectCache objectCache = ObjectCache.get(conf);
+ filters = (Object[]) objectCache.getObject(clazz.getName());
+
+ if (filters == null) {
+ String order = conf.get(orderProperty);
+ List<String> orderOfFilters = new ArrayList<String>();
+ boolean userDefinedOrder = false;
+ if (order != null && !order.trim().isEmpty()) {
+ orderOfFilters = Arrays.asList(order.trim().split("\\s+"));
+ userDefinedOrder = true;
+ }
+
+ try {
+ ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+ xPointId);
+ if (point == null)
+ throw new RuntimeException(xPointId + " not found.");
+ Extension[] extensions = point.getExtensions();
+ HashMap<String, Object> filterMap = new HashMap<String, Object>();
+ for (int i = 0; i < extensions.length; i++) {
+ Extension extension = extensions[i];
+ Object filter = extension.getExtensionInstance();
+ if (!filterMap.containsKey(filter.getClass().getName())) {
+ filterMap.put(filter.getClass().getName(), filter);
+ if (!userDefinedOrder)
+ orderOfFilters.add(filter.getClass().getName());
+ }
+ }
+ List<Object> sorted = new ArrayList<Object>();
+ for (String orderedFilter : orderOfFilters) {
+ Object f = filterMap.get(orderedFilter);
+ if (f == null) {
+ LOG.error(clazz.getSimpleName() + " : " + orderedFilter
+ + " declared in configuration property " + orderProperty
+ + " but not found in an active plugin - ignoring.");
+ continue;
+ }
+ sorted.add(f);
+ }
+ Object[] filter = (Object[]) Array.newInstance(clazz, sorted.size());
+ for (int i = 0; i < sorted.size(); i++) {
+ filter[i] = sorted.get(i);
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
+ + filter[i].getClass());
+ }
+ }
+ objectCache.setObject(clazz.getName(), filter);
+ } catch (PluginRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+
+ filters = (Object[]) objectCache.getObject(clazz.getName());
+ }
+ return filters;
+ }
/**
* Loads all necessary dependencies for a selected plugin, and then runs one
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Mon Oct
7 10:08:43 2013
@@ -18,25 +18,19 @@
package org.apache.nutch.scoring;
import java.util.Collection;
-import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.ExtensionPoint;
-import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.ObjectCache;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.Text;
/**
* Creates and caches {@link ScoringFilter} implementing plugins.
@@ -49,43 +43,9 @@ public class ScoringFilters extends Conf
public ScoringFilters(Configuration conf) {
super(conf);
- ObjectCache objectCache = ObjectCache.get(conf);
- String order = conf.get("scoring.filter.order");
- this.filters = (ScoringFilter[])
objectCache.getObject(ScoringFilter.class.getName());
-
- if (this.filters == null) {
- String[] orderedFilters = null;
- if (order != null && !order.trim().equals("")) {
- orderedFilters = order.trim().split("\\s+");
- }
-
- try {
- ExtensionPoint point =
PluginRepository.get(conf).getExtensionPoint(ScoringFilter.X_POINT_ID);
- if (point == null) throw new RuntimeException(ScoringFilter.X_POINT_ID + " not found.");
- Extension[] extensions = point.getExtensions();
- HashMap<String, ScoringFilter> filterMap =
- new HashMap<String, ScoringFilter>();
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- ScoringFilter filter = (ScoringFilter)
extension.getExtensionInstance();
- if (!filterMap.containsKey(filter.getClass().getName())) {
- filterMap.put(filter.getClass().getName(), filter);
- }
- }
- if (orderedFilters == null) {
- objectCache.setObject(ScoringFilter.class.getName(),
filterMap.values().toArray(new ScoringFilter[0]));
- } else {
- ScoringFilter[] filter = new ScoringFilter[orderedFilters.length];
- for (int i = 0; i < orderedFilters.length; i++) {
- filter[i] = filterMap.get(orderedFilters[i]);
- }
- objectCache.setObject(ScoringFilter.class.getName(), filter);
- }
- } catch (PluginRuntimeException e) {
- throw new RuntimeException(e);
- }
- this.filters = (ScoringFilter[])
objectCache.getObject(ScoringFilter.class.getName());
- }
+ this.filters = (ScoringFilter[]) PluginRepository.get(conf)
+ .getOrderedPlugins(ScoringFilter.class, ScoringFilter.X_POINT_ID,
+ "scoring.filter.order");
}
/** Calculate a sort value for Generate. */