Author: jerome
Date: Fri Sep 30 15:11:19 2005
New Revision: 292865
URL: http://svn.apache.org/viewcvs?rev=292865&view=rev
Log:
NUTCH-88, Second step implementation:
* Add a configuration property for the parse-plugins.xml file location
* ParserFactory now returns an ordered list of Parsers
* Improve logging
* Improve Parser selection policy
* Unit Tests added
Added:
lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml
(with props)
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=292865&r1=292864&r2=292865&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Sep 30 15:11:19 2005
@@ -612,6 +612,15 @@
</description>
</property>
+<!-- parser properties -->
+
+<property>
+ <name>parse.plugin.file</name>
+ <value>parse-plugins.xml</value>
+ <description>The name of the file that defines the associations between
+ content-types and parsers.</description>
+</property>
+
<property>
<name>parser.character.encoding.default</name>
<value>windows-1252</value>
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=292865&r1=292864&r2=292865&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
Fri Sep 30 15:11:19 2005
@@ -22,6 +22,12 @@
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.io.InputStream;
+import java.io.IOException;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
@@ -76,8 +82,35 @@
Document document = null;
InputSource inputSource = null;
- inputSource = new InputSource(NutchConf.get()
- .getConfResourceAsInputStream(fParsePluginsFile));
+ //check to see if the Nutch conf property
+ //parse.plugin.file is defined
+ String parsePluginFileUrl = NutchConf.get().get("parse.plugin.file");
+
+ InputStream ppInputStream = null;
+
+ if (parsePluginFileUrl != null) {
+ URL parsePluginUrl = null;
+
+ try {
+ parsePluginUrl = new URL(parsePluginFileUrl);
+ ppInputStream = parsePluginUrl.openStream();
+ } catch (MalformedURLException e) {
+ LOG.log(Level.SEVERE,
+ "Unable to load parse plugins file from URL ["
+ + parsePluginFileUrl + "]", e);
+ return null;
+ } catch (IOException e) {
+ LOG.log(Level.SEVERE,
+ "Unable to load parse plugins file from URL ["
+ + parsePluginFileUrl + "]", e);
+ return null;
+ }
+ } else {
+ ppInputStream = NutchConf.get().getConfResourceAsInputStream(
+ fParsePluginsFile);
+ }
+
+ inputSource = new InputSource(ppInputStream);
try {
factory = DocumentBuilderFactory.newInstance();
@@ -154,6 +187,12 @@
public static void main(String[] args) throws Exception {
String parsePluginFile = null;
String usage = "ParsePluginsReader [--file <parse plugin file location>]";
+
+ if (( args.length != 0 && args.length != 2 )
+ || (args.length == 2 && !"--file".equals(args[0]))) {
+ System.err.println(usage);
+ System.exit(1);
+ }
for (int i = 0; i < args.length; i++) {
if (args[i].equals("--file")) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=292865&r1=292864&r2=292865&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Fri
Sep 30 15:11:19 2005
@@ -16,9 +16,11 @@
package org.apache.nutch.parse;
// JDK imports
+import java.util.Collections;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
+import java.util.Vector;
import java.util.logging.Logger;
// Nutch imports
@@ -35,14 +37,19 @@
public static final Logger LOG =
LogFormatter.getLogger(ParserFactory.class.getName());
+ /** Wildcard for default plugins. */
public static final String DEFAULT_PLUGIN = "*";
+ /** Extension point. */
private static final ExtensionPoint X_POINT =
PluginRepository.getInstance().getExtensionPoint(Parser.X_POINT_ID);
+ /** List of parser plugins. */
private static final ParsePluginList PARSE_PLUGIN_LIST =
new ParsePluginsReader().parse();
+ /** Empty extension list for caching purposes. */
+ private static final List EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;
static {
if (X_POINT == null) {
@@ -53,100 +60,299 @@
}
}
+ //cache mapping mimeType->List of Extensions
private static final Hashtable CACHE = new Hashtable();
+ //cache mapping parser plugin id->Parser instance
+ private static final Hashtable PARSER_CACHE = new Hashtable();
private ParserFactory() {} // no public ctor
+
/**
* Returns the appropriate {@link Parser} implementation given a content
* type and url.
- *
- * <p>Parser extensions should define the attributes"contentType" and/or
- * "pathSuffix". Content type has priority: the first plugin found whose
+ *
+ * @deprecated Since the addition of NUTCH-88, this method is replaced by
+ * taking the highest priority {@link Parser} returned from
+ * {@link #getParsers(String, String)}.
+ *
+ * Parser extensions should define the attributes "contentType" and/or
+ * "pathSuffix". Content type has priority: the first plugin found whose
* "contentType" attribute matches the beginning of the content's type is
- * used. If none match, then the first whose "pathSuffix" attribute matches
+ * used. If none match, then the first whose "pathSuffix" attribute matches
* the end of the url's path is used. If neither of these match, then the
* first plugin whose "pathSuffix" is the empty string is used.
*/
public static Parser getParser(String contentType, String url)
throws ParserNotFound {
- try {
- Extension extension = getExtension(contentType);
- if (extension != null) {
- return (Parser) extension.getExtensionInstance();
- }
- // TODO once the MimeTypes is available
- // extension = getExtension(MimeUtils.map(contentType));
- // if (extension != null) {
- // return (Parser) extension.getExtensionInstance();
- // }
- // Last Chance: Guess content-type from file url...
- // extension = getExtension(MimeUtils.getMimeType(url));
- throw new ParserNotFound(url, contentType);
- } catch (PluginRuntimeException e) {
- throw new ParserNotFound(url, contentType, e.toString());
+ Parser[] parsers = getParsers(contentType, url);
+
+ if(parsers != null){
+ //give the user the highest priority parser available
+ for(int i = 0; i < parsers.length; i++ ){
+ Parser p = parsers[i];
+ if(p != null){
+ return p;
+ }
+ }
+
+ throw new ParserNotFound(url, contentType);
+
+ }
+ else{
+ throw new ParserNotFound(url, contentType);
}
}
+
+ /**
+ * Function returns an array of {@link Parser}s for a given content type.
+ *
+ * The function consults the internal {@link ParsePluginList} for the
+ * ParserFactory to determine the list of pluginIds, then gets the
+ * appropriate extension points to instantiate as {Parser}s.
+ *
+ * @param contentType The contentType to return the <code>Array</code>
+ * of {Parser}s for.
+ * @param url The url for the content that may allow us to get the type from
+ * the file suffix.
+ * @return An <code>Array</code> of {@link Parser}s for the given contentType.
+ * If there were plugins mapped to a contentType via the
+ * <code>parse-plugins.xml</code> file, but never enabled via
+ * the <code>plugin.includes</code> Nutch conf, then those plugins
+ * won't be part of this array, i.e., they will be skipped.
+ * So, if the ordered list of parsing plugins for
+ * <code>text/plain</code> was <code>[parse-text,parse-html,
+ * parse-rtf]</code>, and only <code>parse-html</code> and
+ * <code>parse-rtf</code> were enabled via
+ * <code>plugin.includes</code>, then this ordered Array would
+ * consist of two {@link Parser} interfaces,
+ * <code>[parse-html, parse-rtf]</code>.
+ */
+ public static Parser[] getParsers(String contentType, String url)
+ throws ParserNotFound {
+
+ List parsers = null;
+ List parserExts = null;
- protected static Extension getExtension(String contentType)
- throws PluginRuntimeException {
-
- Extension extension = (Extension) CACHE.get(contentType);
- if (extension == null) {
- extension = findExtension(contentType);
- // TODO: For null extension, add a fake extension in the CACHE
- // in order to avoid trying to find each time
- // an unavailable extension
- if (extension != null) {
- CACHE.put(contentType, extension);
+ // TODO once the MimeTypes is available
+ // parsers = getExtensions(MimeUtils.map(contentType));
+ // if (parsers != null) {
+ // return parsers;
+ // }
+ // Last Chance: Guess content-type from file url...
+ // parsers = getExtensions(MimeUtils.getMimeType(url));
+
+ parserExts = getExtensions(contentType);
+ if (parserExts == null) {
+ throw new ParserNotFound(url, contentType);
+ }
+
+ parsers = new Vector(parserExts.size());
+ for (Iterator i=parserExts.iterator(); i.hasNext(); ){
+ Extension ext = (Extension) i.next();
+ Parser p = null;
+ try {
+ //check to see if we've cached this parser instance yet
+ p = (Parser) PARSER_CACHE.get(ext.getDescriptor().getPluginId());
+ if (p == null) {
+ // go ahead and instantiate it and then cache it
+ p = (Parser) ext.getExtensionInstance();
+ PARSER_CACHE.put(ext.getDescriptor().getPluginId(),p);
+ }
+ parsers.add(p);
+ } catch (PluginRuntimeException e) {
+ LOG.warning("ParserFactory:PluginRuntimeException when "
+ + "initializing parser plugin "
+ + ext.getDescriptor().getPluginId()
+ + " instance in getParsers "
+ + "function: attempting to continue instantiating parsers");
}
}
- return extension;
+ return (Parser[]) parsers.toArray(new Parser[]{});
}
- private static Extension findExtension(String contentType)
- throws PluginRuntimeException{
+ /**
+ * finds the best-suited parse plugin for a given contentType.
+ *
+ * @param contentType Content-Type for which we seek a parse plugin.
+ * @return List - List of extensions to be used for this contentType.
+ * If none, returns null.
+ */
+ protected static List getExtensions(String contentType){
+
+ List extensions = (List)CACHE.get(contentType);
+
+ // Just compare the reference:
+ // if this is the empty list, we know we will find no extension.
+ if (extensions == EMPTY_EXTENSION_LIST) {
+ return null;
+ }
+
+ if (extensions == null) {
+ extensions = findExtensions(contentType);
+ if (extensions != null) {
+ CACHE.put(contentType, extensions);
+ } else {
+ // Put the empty extension list into cache
+ // to remember we don't know any related extension.
+ CACHE.put(contentType, EMPTY_EXTENSION_LIST);
+ }
+ }
+ return extensions;
+ }
+
+ /**
+ * searches a list of suitable parse plugins for the given contentType.
+ * <p>It first looks for a preferred plugin defined in the parse-plugin
+ * file. If none is found, it returns a list of default plugins.
+ *
+ * @param contentType Content-Type for which we seek a parse plugin.
+ * @return List - List of extensions to be used for this contentType.
+ * If none, returns null.
+ */
+ private static List findExtensions(String contentType){
Extension[] extensions = X_POINT.getExtensions();
// Look for a preferred plugin.
List parsePluginList = PARSE_PLUGIN_LIST.getPluginList(contentType);
- Extension extension = matchExtension(parsePluginList, extensions,
contentType);
- if (extension != null) {
- return extension;
+ List extensionList = matchExtensions(parsePluginList, extensions,
contentType);
+ if (extensionList != null) {
+ return extensionList;
}
// If none found, look for a default plugin.
parsePluginList = PARSE_PLUGIN_LIST.getPluginList(DEFAULT_PLUGIN);
- return matchExtension(parsePluginList, extensions, DEFAULT_PLUGIN);
+ return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN);
}
- private static Extension matchExtension(List plugins,
- Extension[] extensions,
- String contentType) {
-
- // Preliminary check
- if (plugins == null) { return null; }
-
- Iterator iter = plugins.iterator();
- while (iter.hasNext()) {
- String pluginId = (String) iter.next();
- if (pluginId != null) {
- for (int i=0; i<extensions.length; i++) {
- if (match(extensions[i], pluginId, contentType)) {
- return extensions[i];
+ /**
+ * Tries to find a suitable parser for the given contentType.
+ * <ol>
+ * <li>It checks if a parser which accepts the contentType
+ * can be found in the <code>plugins</code> list;</li>
+ * <li>If this list is empty, it tries to find amongst the loaded
+ * extensions whether some of them might suit and warns the user.</li>
+ * </ol>
+ * @param plugins List of candidate plugins.
+ * @param extensions Array of loaded extensions.
+ * @param contentType Content-Type for which we seek a parse plugin.
+ * @return List - List of extensions to be used for this contentType.
+ * If none, returns null.
+ */
+ private static List matchExtensions(List plugins,
+ Extension[] extensions,
+ String contentType) {
+
+ List extList = null;
+ if (plugins != null) {
+ extList = new Vector(plugins.size());
+
+ for (Iterator i = plugins.iterator(); i.hasNext();) {
+ String parsePluginId = (String) i.next();
+
+ Extension ext = getExtensionByIdAndType(extensions,
+ parsePluginId,
+ contentType);
+ // the extension returned may be null
+ // that means that it was not enabled in the plugin.includes
+ // nutch conf property, but it was mapped in the
+ // parse-plugins.xml
+ // file.
+ // OR it was enabled in plugin.includes, but the plugin's plugin.xml
+ // file does not claim that the plugin supports the specified mimeType
+ // in either case, LOG the appropriate error message to SEVERE level
+
+ if (ext == null) {
+ //try to get it just by its pluginId
+ ext = getExtensionById(extensions, parsePluginId);
+ if (ext != null) {
+ // plugin was enabled via plugin.includes
+ // its plugin.xml just doesn't claim to support that
+ // particular mimeType
+ LOG.warning("ParserFactory:Plugin: " + parsePluginId +
+ " mapped to contentType " + contentType +
+ " via parse-plugins.xml, but " + "its plugin.xml " +
+ "file does not claim to support contentType: " +
+ contentType);
+
+ //go ahead and load the extension anyways, though
+ extList.add(ext);
+
+ } else{
+ // plugin wasn't enabled via plugin.includes
+ LOG.severe("ParserFactory: Plugin: " + parsePluginId +
+ " mapped to contentType " + contentType +
+ " via parse-plugins.xml, but not enabled via " +
+ "plugin.includes in nutch-default.xml");
}
+
+ } else{
+ // add it to the list
+ extList.add(ext);
}
}
+
+ return extList;
+ } else {
+ // okay, there were no list of plugins defined for
+ // this mimeType, however, there may be plugins registered
+ // via the plugin.includes nutch conf property that claim
+ // via their plugin.xml file to support this contentType
+ // so, iterate through the list of extensions and if you find
+ // any extensions where this is the case, throw a
+ // NotMappedParserException
+
+ List unmappedPlugins = new Vector();
+
+ for (int i = 0; i < extensions.length; i++) {
+ if (extensions[i].getAttribute("contentType") != null
+ && extensions[i].getAttribute("contentType").equals(
+ contentType)) {
+ unmappedPlugins.add(extensions[i].getDescriptor()
+ .getPluginId());
+ }
+ }
+
+ if (unmappedPlugins.size() > 0) {
+ LOG.info("The parsing plugins: " + unmappedPlugins +
+ " are enabled via the plugin.includes system " +
+ "property, and all claim to support the content type " +
+ contentType + ", but they are not mapped to it in the " +
+ "parse-plugins.xml file");
+ } else {
+ LOG.fine("ParserFactory:No parse plugins mapped or enabled for " +
+ "contentType " + contentType);
+ }
+ return null;
}
- return null;
}
private static boolean match(Extension extension, String id, String type) {
return (id.equals(extension.getDescriptor().getPluginId())) &&
- (type.equals(extension.getAttribute("contentType")) ||
- (type.equals(DEFAULT_PLUGIN)));
+ (type.equals(extension.getAttribute("contentType")) ||
+ (type.equals(DEFAULT_PLUGIN)));
+ }
+
+ private static Extension getExtensionByIdAndType(Extension[] extList,
+ String plugId,
+ String contentType) {
+ for (int i = 0; i < extList.length; i++) {
+ if (match(extList[i], plugId, contentType)) {
+ return extList[i];
+ }
+ }
+ return null;
+ }
+
+ private static Extension getExtensionById(Extension[] extList, String
plugId) {
+ for(int i = 0; i < extList.length; i++){
+ if(plugId.equals(extList[i].getDescriptor().getPluginId())){
+ return extList[i];
+ }
+ }
+ return null;
}
}
Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=292865&r1=292864&r2=292865&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
Fri Sep 30 15:11:19 2005
@@ -21,6 +21,12 @@
// Nutch imports
import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.util.NutchConf;
+
+// JDK imports
+import java.io.File;
+
+import java.net.MalformedURLException;
/**
@@ -30,9 +36,32 @@
* @version 1.0
*/
public class TestParserFactory extends TestCase {
+
+ private NutchConf conf = null;
public TestParserFactory(String name) { super(name); }
+ private void initNutchConf(String testFile) {
+ // set the Nutch Conf property for parse.plugin.file.url
+ // to ${test.src.dir}/org/apache/nutch/parse/parse-plugin-test.xml
+ String testParsePluginFileUrl = null;
+ try{
+ testParsePluginFileUrl = new File(System.getProperty("test.src.dir")
+ +"/org/apache/nutch/parse/" + testFile).toURL().toString();
+ NutchConf.get().set("parse.plugin.file.url",testParsePluginFileUrl);
+ this.conf = NutchConf.get();
+ }
+ catch(MalformedURLException e){
+ throw new RuntimeException("Unable to load parse-plugins.xml file from
URL: "+testParsePluginFileUrl);
+ }
+ }
+
+ /** Inits the Test Case: loads the Nutch Conf instance. */
+ protected void setUp() throws Exception {
+ if (conf == null) {
+ initNutchConf("parse-plugin-test.xml");
+ }
+ }
/** Unit test for <code>getParser(String, String)</code> method. */
public void testGetParser() throws Exception {
@@ -42,12 +71,46 @@
assertNotNull(parser);
}
- /** Unit test for <code>getExtension(String)</code> method. */
- public void testGetExtension() throws Exception {
- Extension ext = ParserFactory.getExtension("text/html");
+ /** Unit test for <code>getExtensions(String)</code> method. */
+ public void testGetExtensions() throws Exception {
+ Extension ext = (Extension)ParserFactory.getExtensions("text/html").get(0);
assertEquals("parse-html", ext.getDescriptor().getPluginId());
- ext = ParserFactory.getExtension("foo/bar");
+ ext = (Extension)ParserFactory.getExtensions("foo/bar").get(0);
assertEquals("parse-text", ext.getDescriptor().getPluginId());
}
+ /** Unit test to check <code>getParsers</code> method */
+ public void testGetParsers() throws Exception {
+ Parser [] parsers = ParserFactory.getParsers("text/html",
"http://foo.com");
+ assertNotNull(parsers);
+ assertEquals(1, parsers.length);
+ assertEquals("org.apache.nutch.parse.html.HtmlParser",
+ parsers[0].getClass().getName());
+
+ parsers = ParserFactory.getParsers("application/x-javascript",
+ "http://foo.com");
+ assertNotNull(parsers);
+ assertEquals(1, parsers.length);
+ assertEquals("org.apache.nutch.parse.js.JSParseFilter",
+ parsers[0].getClass().getName());
+
+ parsers = ParserFactory.getParsers("text/plain", "http://foo.com");
+ assertNotNull(parsers);
+ assertEquals(1, parsers.length);
+ assertEquals("org.apache.nutch.parse.text.TextParser",
+ parsers[0].getClass().getName());
+
+ Parser parser1 = ParserFactory.getParsers("text/plain",
"http://foo.com")[0];
+ Parser parser2 = ParserFactory.getParsers("*", "http://foo.com")[0];
+
+ assertEquals("Different instances!", parser1.hashCode(),
parser2.hashCode());
+
+ //test and make sure that the rss parser is loaded even though its plugin.xml
+ //doesn't claim to support text/rss, only application/rss+xml
+ parsers = ParserFactory.getParsers("text/rss","http://foo.com");
+ assertNotNull(parsers);
+ assertEquals(1,parsers.length);
+
assertEquals("org.apache.nutch.parse.rss.RSSParser",parsers[0].getClass().getName());
+ }
+
}
Added: lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml?rev=292865&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml
(added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml
Fri Sep 30 15:11:19 2005
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2005 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ Author : mattmann
+ Description: Test parse-plugins.xml file.
+-->
+
+<parse-plugins>
+
+ <!-- by default if the mimeType is set to *, or
+ can't be determined, use parse-text -->
+ <mimeType name="*">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <!-- test these 4 plugins -->
+ <mimeType name="text/html">
+ <plugin id="parse-html"/>
+ </mimeType>
+
+ <mimeType name="text/plain">
+ <plugin id="parse-text"/>
+ </mimeType>
+
+ <mimeType name="application/x-javascript">
+ <plugin id="parse-js"/>
+ </mimeType>
+
+ <mimeType name="text/rss">
+ <plugin id="parse-rss"/>
+ </mimeType>
+
+</parse-plugins>
Propchange:
lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml
------------------------------------------------------------------------------
svn:eol-style = native