Author: jerome Date: Tue Feb 21 01:54:21 2006 New Revision: 379403 URL: http://svn.apache.org/viewcvs?rev=379403&view=rev Log: NUTCH-140, parse-plugins.xml can now use extension-id and plugin-id
Modified: lucene/nutch/trunk/conf/parse-plugins.dtd lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml Modified: lucene/nutch/trunk/conf/parse-plugins.dtd URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403&r1=379402&r2=379403&view=diff 
============================================================================== --- lucene/nutch/trunk/conf/parse-plugins.dtd (original) +++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006 @@ -1,7 +1,12 @@ -<!ELEMENT parse-plugins (mimeType+)> +<!ELEMENT parse-plugins (mimeType+,aliases)> <!ELEMENT mimeType (plugin+)> <!ATTLIST mimeType name CDATA #REQUIRED> <!ELEMENT plugin EMPTY> <!ATTLIST plugin id CDATA #REQUIRED> -<!ATTLIST plugin order CDATA ''> \ No newline at end of file +<!ATTLIST plugin order CDATA ''> + +<!ELEMENT aliases (alias+)> +<!ELEMENT alias EMPTY> +<!ATTLIST alias name CDATA #REQUIRED> +<!ATTLIST alias extension-id CDATA #REQUIRED> \ No newline at end of file Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006 @@ -218,4 +218,33 @@ <plugin id="parse-ext" /> </mimeType> + <!-- alias mappings for parse-xxx names to the actual extension implementation + ids described in each plugin's plugin.xml file --> + <aliases> + <alias name="parse-ext" extension-id="ExtParser" /> + <alias name="parse-html" + extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-js" extension-id="JSParser" /> + <alias name="parse-mp3" + extension-id="org.apache.nutch.parse.mp3.MP3Parser" /> + <alias name="parse-msexcel" + extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> + <alias name="parse-mspowerpoint" + extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" /> + <alias name="parse-msword" + extension-id="org.apache.nutch.parse.msword.MSWordParser" /> + <alias name="parse-pdf" + extension-id="org.apache.nutch.parse.pdf.PdfParser" /> + <alias name="parse-rss" + 
extension-id="org.apache.nutch.parse.rss.RSSParser" /> + <alias name="parse-rtf" + extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" /> + <alias name="parse-swf" + extension-id="org.apache.nutch.parse.swf.SWFParser" /> + <alias name="parse-text" + extension-id="org.apache.nutch.parse.text.TextParser" /> + <alias name="parse-zip" + extension-id="org.apache.nutch.parse.zip.ZipParser" /> + </aliases> + </parse-plugins> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Tue Feb 21 01:54:21 2006 @@ -19,6 +19,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; /** @@ -30,27 +31,40 @@ * @author mattmann * @version 1.0 */ -public class ParsePluginList { +class ParsePluginList { /* a map to link mimeType to an ordered list of parsing plugins */ - private HashMap fMimeTypeToPluginMap = null; + private Map fMimeTypeToPluginMap = null; + + /* A list of aliases */ + private Map aliases = null; + /** * Constructs a new ParsePluginList */ - public ParsePluginList() { + ParsePluginList() { fMimeTypeToPluginMap = new HashMap(); + aliases = new HashMap(); } - public List getPluginList(String mimeType) { + List getPluginList(String mimeType) { return (List) fMimeTypeToPluginMap.get(mimeType); } + + void setAliases(Map aliases) { + this.aliases = aliases; + } + + Map getAliases() { + return aliases; + } - public void setPluginList(String mimeType, List l) { + void setPluginList(String mimeType, List l) { fMimeTypeToPluginMap.put(mimeType, l); } - public List getSupportedMimeTypes() { + List getSupportedMimeTypes() 
{ return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray( new String[] {})); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Tue Feb 21 01:54:21 2006 @@ -16,12 +16,14 @@ package org.apache.nutch.parse; // JDK imports +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Vector; +import java.util.Map; import java.util.logging.Logger; -import java.io.InputStream; -import java.net.URL; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; @@ -53,7 +55,7 @@ /** The property name of the parse-plugins location */ private static final String PP_FILE_PROP = "parse.plugin.file"; - /* the parse-plugins file */ + /** the parse-plugins file */ private String fParsePluginsFile = null; @@ -111,8 +113,12 @@ Element parsePlugins = document.getDocumentElement(); + // build up the alias hash map + Map aliases = getAliases(parsePlugins); + // And store it on the parse plugin list + pList.setAliases(aliases); + // get all the mime type nodes - NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType"); // iterate through the mime types @@ -125,30 +131,29 @@ // iterate through the plugins, add them in order read // OR if they have a special order="" attribute, then hold those in - // a - // separate list, and then insert them into the final list at the - // order - // specified - + // a separate list, and then insert them into the final list at the + // order specified if (pluginList != 
null && pluginList.getLength() > 0) { - List plugList = new Vector(pluginList.getLength()); + List plugList = new ArrayList(pluginList.getLength()); - for (int j = 0; j < pluginList.getLength(); j++) { + for (int j = 0; j<pluginList.getLength(); j++) { Element plugin = (Element) pluginList.item(j); String pluginId = plugin.getAttribute("id"); - + String extId = (String) aliases.get(pluginId); + if (extId == null) { + // Assume an extension id is directly specified + extId = pluginId; + } String orderStr = plugin.getAttribute("order"); int order = -1; - try { order = Integer.parseInt(orderStr); } catch (NumberFormatException ignore) { } - if (order != -1) { - plugList.add(order - 1, pluginId); + plugList.add(order - 1, extId); } else { - plugList.add(pluginId); + plugList.add(extId); } } @@ -202,7 +207,7 @@ System.out.println("MIMETYPE: " + mimeType); List plugList = prefs.getPluginList(mimeType); - System.out.println("PLUGINS:"); + System.out.println("EXTENSION IDs:"); for (Iterator j = plugList.iterator(); j.hasNext();) { System.out.println((String) j.next()); @@ -224,6 +229,39 @@ */ public void setFParsePluginsFile(String parsePluginsFile) { fParsePluginsFile = parsePluginsFile; + } + + private Map getAliases(Element parsePluginsRoot) { + + Map aliases = new HashMap(); + NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases"); + + if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) { + LOG.warning("No aliases defined in parse-plugins.xml!"); + return aliases; + } + + if (aliasRoot.getLength() > 1) { + // log a warning, but try and continue processing + LOG.warning("There should only be one \"aliases\" tag in parse-plugins.xml"); + } + + Element aliasRootElem = (Element)aliasRoot.item(0); + NodeList aliasElements = aliasRootElem.getElementsByTagName("alias"); + + if (aliasElements != null && aliasElements.getLength() > 0) { + for (int i=0; i<aliasElements.getLength(); i++) { + Element aliasElem = 
(Element)aliasElements.item(i); + String parsePluginId = aliasElem.getAttribute("name"); + String extensionId = aliasElem.getAttribute("extension-id"); + LOG.finest("Found alias: plugin-id: " + parsePluginId + + ", extension-id: " + extensionId); + if (parsePluginId != null && extensionId != null) { + aliases.put(parsePluginId, extensionId); + } + } + } + return aliases; } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Tue Feb 21 01:54:21 2006 @@ -36,8 +36,8 @@ public class ParseUtil { /* our log stream */ - public static final Logger LOG = LogFormatter.getLogger(ParseUtil.class - .getName()); + public static final Logger LOG = + LogFormatter.getLogger(ParseUtil.class.getName()); private Configuration conf; private ParserFactory parserFactory; @@ -84,33 +84,36 @@ " of type " + content.getContentType()); ParseStatus ps = (parse.getData() != null) ? parse.getData().getStatus() : null; - return (ps == null) ? new ParseStatus().getEmptyParse(this.conf) : ps.getEmptyParse(this.conf); + return (ps == null) ? new ParseStatus().getEmptyParse(this.conf) + : ps.getEmptyParse(this.conf); } - + /** * Method parses a [EMAIL PROTECTED] Content} object using the [EMAIL PROTECTED] Parser} specified - * by the parameter <code>parserId</code>. If a suitable [EMAIL PROTECTED] Parser} is not - * found, then a <code>WARNING</code> level message is logged, and a - * ParseException is thrown. 
- * If the parse is uncessful for any other reason, then a <code>WARNING</code> - * level message is logged, and a <code>ParseStatus.getEmptyParse() is + * by the parameter <code>extId</code>, i.e., the Parser's extension ID. + * If a suitable [EMAIL PROTECTED] Parser} is not found, then a <code>WARNING</code> + * level message is logged, and a ParseException is thrown. If the parse is + * uncessful for any other reason, then a <code>WARNING</code> level + * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is * returned. * - * @param parserId The ID of the [EMAIL PROTECTED] Parser} to use to parse the specified - * content. + * @param extId The extension implementation ID of the [EMAIL PROTECTED] Parser} to use + * to parse the specified content. * @param content The content to parse. + * * @return A [EMAIL PROTECTED] Parse} object if the parse is successful, otherwise, * a <code>ParseStatus.getEmptyParse()</code>. + * * @throws ParseException If there is no suitable [EMAIL PROTECTED] Parser} found * to perform the parse. 
*/ - public Parse parseByParserId(String parserId, Content content) + public Parse parseByExtensionId(String extId, Content content) throws ParseException { Parse parse = null; Parser p = null; try { - p = this.parserFactory.getParserById(parserId); + p = this.parserFactory.getParserById(extId); } catch (ParserNotFound e) { LOG.warning("No suitable parser found when trying to parse content " + content); @@ -126,6 +129,6 @@ " of type " + content.getContentType()); return new ParseStatus().getEmptyParse(this.conf); } - } + } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Tue Feb 21 01:54:21 2006 @@ -16,9 +16,11 @@ package org.apache.nutch.parse; // JDK imports +import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Vector; import java.util.logging.Logger; @@ -32,6 +34,7 @@ import org.apache.nutch.util.mime.MimeType; import org.apache.nutch.util.mime.MimeTypeException; + /** Creates and caches [EMAIL PROTECTED] Parser} plugins.*/ public final class ParserFactory { @@ -63,43 +66,6 @@ } } - - /** - * Returns the appropriate [EMAIL PROTECTED] Parser} implementation given a content type - * and url. - * - * @deprecated Since the addition of NUTCH-88, this method is replaced by - * taking the highest priority [EMAIL PROTECTED] Parser} returned from - * [EMAIL PROTECTED] #getParsers(String, String)}. - * - * Parser extensions should define the attributes "contentType" and/or - * "pathSuffix". 
Content type has priority: the first plugin found whose - * "contentType" attribute matches the beginning of the content's type is - * used. If none match, then the first whose "pathSuffix" attribute matches - * the end of the url's path is used. If neither of these match, then the - * first plugin whose "pathSuffix" is the empty string is used. - */ - public Parser getParser(String contentType, String url) - throws ParserNotFound { - - Parser[] parsers = getParsers(contentType, url); - - if(parsers != null){ - //give the user the highest priority parser available - for(int i = 0; i < parsers.length; i++ ){ - Parser p = parsers[i]; - if(p != null){ - return p; - } - } - - throw new ParserNotFound(url, contentType); - - } - else{ - throw new ParserNotFound(url, contentType); - } - } /** * Function returns an array of [EMAIL PROTECTED] Parser}s for a given content type. @@ -150,11 +116,11 @@ Parser p = null; try { //check to see if we've cached this parser instance yet - p = (Parser) this.conf.getObject(ext.getDescriptor().getPluginId()); + p = (Parser) this.conf.getObject(ext.getId()); if (p == null) { // go ahead and instantiate it and then cache it p = (Parser) ext.getExtensionInstance(); - this.conf.setObject(ext.getDescriptor().getPluginId(),p); + this.conf.setObject(ext.getId(),p); } parsers.add(p); } catch (PluginRuntimeException e) { @@ -168,79 +134,79 @@ } return (Parser[]) parsers.toArray(new Parser[]{}); } - + /** - * <p> * Function returns a [EMAIL PROTECTED] Parser} instance with the specified - * <code>parserId</code>. If the Parser instance isn't found, then the - * function throws a <code>ParserNotFound</code> exception. If the function - * is able to find the [EMAIL PROTECTED] Parser} in the internal - * <code>PARSER_CACHE</code> then it will return the already instantiated - * Parser. Otherwise, if it has to instantiate the Parser itself , then this - * function will cache that Parser in the internal <code>PARSER_CACHE</code>. 
+ * <code>extId</code>, representing its extension ID. If the Parser + * instance isn't found, then the function throws a + * <code>ParserNotFound</code> exception. If the function is able to find + * the [EMAIL PROTECTED] Parser} in the internal <code>PARSER_CACHE</code> then it + * will return the already instantiated Parser. Otherwise, if it has to + * instantiate the Parser itself , then this function will cache that Parser + * in the internal <code>PARSER_CACHE</code>. * - * @param parserId - * The string ID (e.g., "parse-text", "parse-msword") of the - * [EMAIL PROTECTED] Parser} implementation to return. + * @param extId The string extension ID (e.g., + * "org.apache.nutch.parse.rss.RSSParser", + * "org.apache.nutch.parse.rtf.RTFParseFactory") of the [EMAIL PROTECTED] Parser} + * implementation to return. * @return A [EMAIL PROTECTED] Parser} implementation specified by the parameter - * <code>parserId</code>. - * @throws ParserNotFound - * If the Parser is not found (i.e., registered with the extension - * point), or if the there a [EMAIL PROTECTED] PluginRuntimeException} - * instantiating the [EMAIL PROTECTED] Parser}. + * <code>extId</code>. + * @throws ParserNotFound If the Parser is not found (i.e., registered with + * the extension point), or if the there a + * [EMAIL PROTECTED] PluginRuntimeException} instantiating the [EMAIL PROTECTED] Parser}. 
*/ - public Parser getParserById(String parserId) throws ParserNotFound { - // first check the cache + public Parser getParserById(String id) throws ParserNotFound { - if (this.conf.getObject(parserId) != null) { - return (Parser) this.conf.getObject(parserId); - } else { - // get the list of registered parsing extensions - // then find the right one by Id + Extension[] extensions = this.extensionPoint.getExtensions(); + Extension parserExt = null; - Extension[] extensions = this.extensionPoint.getExtensions(); - Extension parserExt = getExtensionById(extensions, parserId); + if (id != null) { + parserExt = getExtension(extensions, id); + } + if (parserExt == null) { + parserExt = getExtensionFromAlias(extensions, id); + } - if (parserExt == null) { - throw new ParserNotFound("No Parser Found for parserId: " + parserId - + "!"); - } else { - // instantiate the Parser - try { - Parser p = null; - p = (Parser) parserExt.getExtensionInstance(); - this.conf.setObject(parserId, p); - return p; - } catch (PluginRuntimeException e) { - LOG.warning("ParserFactory:PluginRuntimeException when " - + "initializing parser plugin " - + parserExt.getDescriptor().getPluginId() - + " instance in getParserById"); - throw new ParserNotFound("No Parser Found for parserId: " + parserId - + "!"); - } + if (parserExt == null) { + throw new ParserNotFound("No Parser Found for id [" + id + "]"); + } + + // first check the cache + if (this.conf.getObject(parserExt.getId()) != null) { + return (Parser) this.conf.getObject(parserExt.getId()); + + // if not found in cache, instantiate the Parser + } else { + try { + Parser p = (Parser) parserExt.getExtensionInstance(); + this.conf.setObject(parserExt.getId(), p); + return p; + } catch (PluginRuntimeException e) { + LOG.warning("Canno initialize parser " + + parserExt.getDescriptor().getPluginId() + + " (cause: " + e.toString()); + throw new ParserNotFound("Cannot init parser for id [" + id + "]"); } } } /** - * finds the best-suited parse 
plugin for a given contentType. + * Finds the best-suited parse plugin for a given contentType. * - * @param contentType - * Content-Type for which we seek a parse plugin. - * @return List - List of extensions to be used for this contentType. If none, - * returns null. + * @param contentType Content-Type for which we seek a parse plugin. + * @return a list of extensions to be used for this contentType. + * If none, returns <code>null</code>. */ protected List getExtensions(String contentType) { // First of all, tries to clean the content-type String type = null; try { - type = MimeType.clean(contentType); + type = MimeType.clean(contentType); } catch (MimeTypeException mte) { - LOG.info("Could not clean the content-type [" + contentType + - "], Reason is [" + mte + "]. Using its raw version..."); - type = contentType; + LOG.fine("Could not clean the content-type [" + contentType + + "], Reason is [" + mte + "]. Using its raw version..."); + type = contentType; } List extensions = (List) this.conf.getObject(type); @@ -304,19 +270,16 @@ * If none, returns null. 
*/ private List matchExtensions(List plugins, - Extension[] extensions, - String contentType) { + Extension[] extensions, + String contentType) { - List extList = null; + List extList = new ArrayList(); if (plugins != null) { - extList = new Vector(plugins.size()); for (Iterator i = plugins.iterator(); i.hasNext();) { String parsePluginId = (String) i.next(); - Extension ext = getExtensionByIdAndType(extensions, - parsePluginId, - contentType); + Extension ext = getExtension(extensions, parsePluginId, contentType); // the extension returned may be null // that means that it was not enabled in the plugin.includes // nutch conf property, but it was mapped in the @@ -327,8 +290,9 @@ // in either case, LOG the appropriate error message to WARN level if (ext == null) { - //try to get it just by its pluginId - ext = getExtensionById(extensions, parsePluginId); + //try to get it just by its pluginId + ext = getExtension(extensions, parsePluginId); + if (ext != null) { // plugin was enabled via plugin.includes // its plugin.xml just doesn't claim to support that @@ -338,25 +302,21 @@ " via parse-plugins.xml, but " + "its plugin.xml " + "file does not claim to support contentType: " + contentType); - - //go ahead and load the extension anyways, though - extList.add(ext); - - } else{ + } else { // plugin wasn't enabled via plugin.includes LOG.warning("ParserFactory: Plugin: " + parsePluginId + " mapped to contentType " + contentType + " via parse-plugins.xml, but not enabled via " + "plugin.includes in nutch-default.xml"); } - - } else{ + } + + if (ext != null) { // add it to the list extList.add(ext); } } - return extList; } else { // okay, there were no list of plugins defined for // this mimeType, however, there may be plugins registered @@ -366,19 +326,16 @@ // any extensions where this is the case, throw a // NotMappedParserException - List unmappedPlugins = new Vector(); - - for (int i = 0; i < extensions.length; i++) { + for (int i=0; i<extensions.length; i++) { if 
(extensions[i].getAttribute("contentType") != null && extensions[i].getAttribute("contentType").equals( contentType)) { - unmappedPlugins.add(extensions[i].getDescriptor() - .getPluginId()); + extList.add(extensions[i].getId()); } } - if (unmappedPlugins.size() > 0) { - LOG.info("The parsing plugins: " + unmappedPlugins + + if (extList.size() > 0) { + LOG.info("The parsing plugins: " + extList + " are enabled via the plugin.includes system " + "property, and all claim to support the content type " + contentType + ", but they are not mapped to it in the " + @@ -387,33 +344,38 @@ LOG.fine("ParserFactory:No parse plugins mapped or enabled for " + "contentType " + contentType); } - return null; } + + return (extList.size() > 0) ? extList : null; } private boolean match(Extension extension, String id, String type) { - return (id.equals(extension.getDescriptor().getPluginId())) && - (type.equals(extension.getAttribute("contentType")) || - (type.equals(DEFAULT_PLUGIN))); + return ((id.equals(extension.getId())) && + (type.equals(extension.getAttribute("contentType")) || + type.equals(DEFAULT_PLUGIN))); } - private Extension getExtensionByIdAndType(Extension[] extList, - String plugId, - String contentType) { - for (int i = 0; i < extList.length; i++) { - if (match(extList[i], plugId, contentType)) { - return extList[i]; + /** Get an extension from its id and supported content-type. 
*/ + private Extension getExtension(Extension[] list, String id, String type) { + for (int i=0; i<list.length; i++) { + if (match(list[i], id, type)) { + return list[i]; } } return null; } - - private Extension getExtensionById(Extension[] extList, String plugId) { - for(int i = 0; i < extList.length; i++){ - if(plugId.equals(extList[i].getDescriptor().getPluginId())){ - return extList[i]; + + private Extension getExtension(Extension[] list, String id) { + for (int i=0; i<list.length; i++) { + if (id.equals(list[i].getId())) { + return list[i]; } } return null; } + + private Extension getExtensionFromAlias(Extension[] list, String id) { + return getExtension(list, (String) parsePluginList.getAliases().get(id)); + } + } Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Feb 21 01:54:21 2006 @@ -61,7 +61,7 @@ Content content = new Content(url, url, bytes, contentType, new Metadata(), conf); - Parse parse = new ParseUtil(conf).parseByParserId("parse-html",content); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content); Metadata metadata = parse.getData().getParseMeta(); assertEquals(license, metadata.get("License-Url")); Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Tue Feb 21 01:54:21 2006 @@ -37,8 +37,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.hadoop.io.UTF8; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParserNotFound; import org.apache.nutch.protocol.Content; @@ -341,9 +340,7 @@ try { protocol = new ProtocolFactory(conf).getProtocol(url); Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); - String contentType = content.getContentType(); - Parser parser = new ParserFactory(conf).getParser(contentType, url); - Parse parse = parser.getParse(content); + Parse parse = new ParseUtil(conf).parse(content); System.out.println("text:" + parse.getText()); return parse.getText(); Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Tue Feb 21 01:54:21 2006 @@ -23,8 +23,8 @@ // Nutch imports import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; @@ -48,16 +48,12 @@ public void testMetaHTMLParsing() { try { - + ParseUtil parser = new ParseUtil(NutchConfiguration.create()); /* loop through the test documents and validate result */ for (int t = 0; t < docs.length; t++) { - Content content = getContent(docs[t]); - Parser parser = new ParserFactory(NutchConfiguration.create()).getParser("text/html", URL); - Parse parse = parser.getParse(content); - + Parse parse = parser.parse(content); assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE)); - } } catch (Exception e) { e.printStackTrace(System.out); Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Tue Feb 21 01:54:21 2006 @@ -111,13 +111,13 @@ // check external parser that does 'cat' contentType = "application/vnd.nutch.example.cat"; content.setContentType(contentType); - parse = new ParseUtil(conf).parseByParserId("parse-ext", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content); assertEquals(expectedText,parse.getText()); 
// check external parser that does 'md5sum' contentType = "application/vnd.nutch.example.md5sum"; content.setContentType(contentType); - parse = new ParseUtil(conf).parseByParserId("parse-ext", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content); assertTrue(parse.getText().startsWith(expectedMD5sum)); } } Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Tue Feb 21 01:54:21 2006 @@ -73,7 +73,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); Metadata metadata = parse.getData().getParseMeta(); assertEquals("postgresql comment id3v2", metadata.get("COMM-Text")); assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text")); @@ -105,7 +105,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); Metadata metadata = parse.getData().getParseMeta(); assertEquals("postgresql comment id3v1", metadata.get("COMM-Text")); Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Tue Feb 21 01:54:21 2006 @@ -63,7 +63,7 @@ protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = parser.parseByParserId("parse-msexcel", content); + parse = parser.parseByExtensionId("parse-msexcel", content); assertTrue(parse.getText().equals(expectedText)); } Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Tue Feb 21 01:54:21 2006 @@ -126,7 +126,8 @@ */ public void testContent() throws Exception { - Parse parse = new ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",this.content); + Parse parse = new ParseUtil(NutchConfiguration.create()) + .parseByExtensionId("parse-mspowerpoint", this.content); ParseData data = parse.getData(); String text = parse.getText(); @@ -163,7 +164,8 @@ */ public void testMeta() throws Exception { - Parse parse = new 
ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",content); + Parse parse = new ParseUtil(NutchConfiguration.create()) + .parseByExtensionId("parse-mspowerpoint", content); ParseData data = parse.getData(); Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Tue Feb 21 01:54:21 2006 @@ -69,7 +69,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-msword",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content); assertTrue(parse.getText().startsWith(expectedText)); } Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Tue Feb 21 01:54:21 2006 @@ -69,7 +69,7 @@ Configuration conf = NutchConfiguration.create(); protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new 
UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-pdf",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content); int index = parse.getText().indexOf(expectedText); assertTrue(index > 0); Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Tue Feb 21 01:54:21 2006 @@ -87,7 +87,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-rss",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content); //check that there are 3 outlinks: //http://test.channel.com Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Tue Feb 21 01:54:21 2006 @@ -74,7 +74,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) .getContent(); - parse = new 
ParseUtil(conf).parseByParserId("parse-rtf", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content); String text = parse.getText(); assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Tue Feb 21 01:54:21 2006 @@ -69,7 +69,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-zip",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content); assertTrue(parse.getText().equals(expectedText)); } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Tue Feb 21 01:54:21 2006 @@ -42,18 +42,10 @@ conf = NutchConfiguration.create(); conf.set("plugin.includes", ".*"); conf.set("parse.plugin.file", - "org/apache/nutch/parse/parse-plugin-test.xml"); + "org/apache/nutch/parse/parse-plugin-test.xml"); parserFactory = new ParserFactory(conf); } - - /** Unit test for <code>getParser(String, 
String)</code> method. */ - public void testGetParser() throws Exception { - Parser parser = parserFactory.getParser("text/html", "http://foo.com/"); - assertNotNull(parser); - parser = parserFactory.getParser("foo/bar", "http://foo.com/"); - assertNotNull(parser); - } - + /** Unit test for <code>getExtensions(String)</code> method. */ public void testGetExtensions() throws Exception { Extension ext = (Extension)parserFactory.getExtensions("text/html").get(0); @@ -70,27 +62,27 @@ assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.html.HtmlParser", - parsers[0].getClass().getName()); + parsers[0].getClass().getName()); - parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", "http://foo.com"); + parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", + "http://foo.com"); assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.html.HtmlParser", - parsers[0].getClass().getName()); - + parsers[0].getClass().getName()); parsers = parserFactory.getParsers("application/x-javascript", - "http://foo.com"); + "http://foo.com"); assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.js.JSParseFilter", - parsers[0].getClass().getName()); + parsers[0].getClass().getName()); parsers = parserFactory.getParsers("text/plain", "http://foo.com"); assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.text.TextParser", - parsers[0].getClass().getName()); + parsers[0].getClass().getName()); Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0]; Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0]; @@ -102,7 +94,8 @@ parsers = parserFactory.getParsers("text/rss","http://foo.com"); assertNotNull(parsers); assertEquals(1,parsers.length); - assertEquals("org.apache.nutch.parse.rss.RSSParser",parsers[0].getClass().getName()); + 
assertEquals("org.apache.nutch.parse.rss.RSSParser", + parsers[0].getClass().getName()); } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml Tue Feb 21 01:54:21 2006 @@ -1,46 +1,64 @@ <?xml version="1.0" encoding="UTF-8"?> <!-- - Copyright 2005 The Apache Software Foundation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - Author : mattmann - Description: Test parse-plugins.xml file. + Copyright 2005 The Apache Software Foundation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : mattmann + Description: Test parse-plugins.xml file. 
--> <parse-plugins> - <!-- by default if the mimeType is set to *, or - can't be determined, use parse-text --> - <mimeType name="*"> - <plugin id="parse-text" /> - </mimeType> - - <!-- test these 4 plugins --> - <mimeType name="text/html"> - <plugin id="parse-html"/> - </mimeType> - - <mimeType name="text/plain"> - <plugin id="parse-text"/> - </mimeType> + <!-- by default if the mimeType is set to *, or + can't be determined, use parse-text --> + <mimeType name="*"> + <plugin id="parse-text" /> + </mimeType> + + <!-- test these 4 plugins --> + <mimeType name="text/html"> + <!-- + ! Test that if a parser cannot be instanciated, + ! it should not block the process and then the next one is used + !--> + <plugin id="parse-plugin-that-not-exist"/> + <plugin id="parse-html"/> + </mimeType> + + <mimeType name="text/plain"> + <!-- Test that an extension-id can be directly used here --> + <plugin id="org.apache.nutch.parse.text.TextParser"/> + </mimeType> - <mimeType name="application/x-javascript"> - <plugin id="parse-js"/> - </mimeType> + <mimeType name="application/x-javascript"> + <plugin id="parse-js"/> + </mimeType> - <mimeType name="text/rss"> - <plugin id="parse-rss"/> - </mimeType> + <mimeType name="text/rss"> + <plugin id="parse-rss"/> + </mimeType> + <!-- alias mappings for parse-xxx names to the actual extension implementation + ids described in each plugin's plugin.xml file --> + <aliases> + <alias name="parse-html" + extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-js" + extension-id="JSParser" /> + <alias name="parse-rss" + extension-id="org.apache.nutch.parse.rss.RSSParser" /> + <alias name="parse-text" + extension-id="org.apache.nutch.parse.text.TextParser" /> + </aliases> </parse-plugins>