Author: jerome Date: Tue Feb 21 01:54:21 2006 New Revision: 379403 URL: http://svn.apache.org/viewcvs?rev=379403&view=rev Log: NUTCH-140, parse-plugin.xml can now use extension-id and plugin-id
Modified: lucene/nutch/trunk/conf/parse-plugins.dtd lucene/nutch/trunk/conf/parse-plugins.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml Modified: lucene/nutch/trunk/conf/parse-plugins.dtd URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403&r1=379402&r2=379403&view=diff 
============================================================================== --- lucene/nutch/trunk/conf/parse-plugins.dtd (original) +++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006 @@ -1,7 +1,12 @@ -<!ELEMENT parse-plugins (mimeType+)> +<!ELEMENT parse-plugins (mimeType+,aliases)> <!ELEMENT mimeType (plugin+)> <!ATTLIST mimeType name CDATA #REQUIRED> <!ELEMENT plugin EMPTY> <!ATTLIST plugin id CDATA #REQUIRED> -<!ATTLIST plugin order CDATA ''> \ No newline at end of file +<!ATTLIST plugin order CDATA ''> + +<!ELEMENT aliases (alias+)> +<!ELEMENT alias EMPTY> +<!ATTLIST alias name CDATA #REQUIRED> +<!ATTLIST alias extension-id CDATA #REQUIRED> \ No newline at end of file Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006 @@ -218,4 +218,33 @@ <plugin id="parse-ext" /> </mimeType> + <!-- alias mappings for parse-xxx names to the actual extension implementation + ids described in each plugin's plugin.xml file --> + <aliases> + <alias name="parse-ext" extension-id="ExtParser" /> + <alias name="parse-html" + extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-js" extension-id="JSParser" /> + <alias name="parse-mp3" + extension-id="org.apache.nutch.parse.mp3.MP3Parser" /> + <alias name="parse-msexcel" + extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" /> + <alias name="parse-mspowerpoint" + extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" /> + <alias name="parse-msword" + extension-id="org.apache.nutch.parse.msword.MSWordParser" /> + <alias name="parse-pdf" + extension-id="org.apache.nutch.parse.pdf.PdfParser" /> + <alias name="parse-rss" + 
extension-id="org.apache.nutch.parse.rss.RSSParser" /> + <alias name="parse-rtf" + extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" /> + <alias name="parse-swf" + extension-id="org.apache.nutch.parse.swf.SWFParser" /> + <alias name="parse-text" + extension-id="org.apache.nutch.parse.text.TextParser" /> + <alias name="parse-zip" + extension-id="org.apache.nutch.parse.zip.ZipParser" /> + </aliases> + </parse-plugins> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Tue Feb 21 01:54:21 2006 @@ -19,6 +19,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; /** @@ -30,27 +31,40 @@ * @author mattmann * @version 1.0 */ -public class ParsePluginList { +class ParsePluginList { /* a map to link mimeType to an ordered list of parsing plugins */ - private HashMap fMimeTypeToPluginMap = null; + private Map fMimeTypeToPluginMap = null; + + /* A list of aliases */ + private Map aliases = null; + /** * Constructs a new ParsePluginList */ - public ParsePluginList() { + ParsePluginList() { fMimeTypeToPluginMap = new HashMap(); + aliases = new HashMap(); } - public List getPluginList(String mimeType) { + List getPluginList(String mimeType) { return (List) fMimeTypeToPluginMap.get(mimeType); } + + void setAliases(Map aliases) { + this.aliases = aliases; + } + + Map getAliases() { + return aliases; + } - public void setPluginList(String mimeType, List l) { + void setPluginList(String mimeType, List l) { fMimeTypeToPluginMap.put(mimeType, l); } - public List getSupportedMimeTypes() { + List getSupportedMimeTypes() 
{ return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray( new String[] {})); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Tue Feb 21 01:54:21 2006 @@ -16,12 +16,14 @@ package org.apache.nutch.parse; // JDK imports +import java.io.InputStream; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Vector; +import java.util.Map; import java.util.logging.Logger; -import java.io.InputStream; -import java.net.URL; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; @@ -53,7 +55,7 @@ /** The property name of the parse-plugins location */ private static final String PP_FILE_PROP = "parse.plugin.file"; - /* the parse-plugins file */ + /** the parse-plugins file */ private String fParsePluginsFile = null; @@ -111,8 +113,12 @@ Element parsePlugins = document.getDocumentElement(); + // build up the alias hash map + Map aliases = getAliases(parsePlugins); + // And store it on the parse plugin list + pList.setAliases(aliases); + // get all the mime type nodes - NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType"); // iterate through the mime types @@ -125,30 +131,29 @@ // iterate through the plugins, add them in order read // OR if they have a special order="" attribute, then hold those in - // a - // separate list, and then insert them into the final list at the - // order - // specified - + // a separate list, and then insert them into the final list at the + // order specified if (pluginList != 
null && pluginList.getLength() > 0) { - List plugList = new Vector(pluginList.getLength()); + List plugList = new ArrayList(pluginList.getLength()); - for (int j = 0; j < pluginList.getLength(); j++) { + for (int j = 0; j<pluginList.getLength(); j++) { Element plugin = (Element) pluginList.item(j); String pluginId = plugin.getAttribute("id"); - + String extId = (String) aliases.get(pluginId); + if (extId == null) { + // Assume an extension id is directly specified + extId = pluginId; + } String orderStr = plugin.getAttribute("order"); int order = -1; - try { order = Integer.parseInt(orderStr); } catch (NumberFormatException ignore) { } - if (order != -1) { - plugList.add(order - 1, pluginId); + plugList.add(order - 1, extId); } else { - plugList.add(pluginId); + plugList.add(extId); } } @@ -202,7 +207,7 @@ System.out.println("MIMETYPE: " + mimeType); List plugList = prefs.getPluginList(mimeType); - System.out.println("PLUGINS:"); + System.out.println("EXTENSION IDs:"); for (Iterator j = plugList.iterator(); j.hasNext();) { System.out.println((String) j.next()); @@ -224,6 +229,39 @@ */ public void setFParsePluginsFile(String parsePluginsFile) { fParsePluginsFile = parsePluginsFile; + } + + private Map getAliases(Element parsePluginsRoot) { + + Map aliases = new HashMap(); + NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases"); + + if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) { + LOG.warning("No aliases defined in parse-plugins.xml!"); + return aliases; + } + + if (aliasRoot.getLength() > 1) { + // log a warning, but try and continue processing + LOG.warning("There should only be one \"aliases\" tag in parse-plugins.xml"); + } + + Element aliasRootElem = (Element)aliasRoot.item(0); + NodeList aliasElements = aliasRootElem.getElementsByTagName("alias"); + + if (aliasElements != null && aliasElements.getLength() > 0) { + for (int i=0; i<aliasElements.getLength(); i++) { + Element aliasElem = 
(Element)aliasElements.item(i); + String parsePluginId = aliasElem.getAttribute("name"); + String extensionId = aliasElem.getAttribute("extension-id"); + LOG.finest("Found alias: plugin-id: " + parsePluginId + + ", extension-id: " + extensionId); + if (parsePluginId != null && extensionId != null) { + aliases.put(parsePluginId, extensionId); + } + } + } + return aliases; } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Tue Feb 21 01:54:21 2006 @@ -36,8 +36,8 @@ public class ParseUtil { /* our log stream */ - public static final Logger LOG = LogFormatter.getLogger(ParseUtil.class - .getName()); + public static final Logger LOG = + LogFormatter.getLogger(ParseUtil.class.getName()); private Configuration conf; private ParserFactory parserFactory; @@ -84,33 +84,36 @@ " of type " + content.getContentType()); ParseStatus ps = (parse.getData() != null) ? parse.getData().getStatus() : null; - return (ps == null) ? new ParseStatus().getEmptyParse(this.conf) : ps.getEmptyParse(this.conf); + return (ps == null) ? new ParseStatus().getEmptyParse(this.conf) + : ps.getEmptyParse(this.conf); } - + /** * Method parses a {@link Content} object using the {@link Parser} specified - * by the parameter <code>parserId</code>. If a suitable {@link Parser} is not - * found, then a <code>WARNING</code> level message is logged, and a - * ParseException is thrown. 
- * If the parse is uncessful for any other reason, then a <code>WARNING</code> - * level message is logged, and a <code>ParseStatus.getEmptyParse() is + * by the parameter <code>extId</code>, i.e., the Parser's extension ID. + * If a suitable {@link Parser} is not found, then a <code>WARNING</code> + * level message is logged, and a ParseException is thrown. If the parse is + * uncessful for any other reason, then a <code>WARNING</code> level + * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is * returned. * - * @param parserId The ID of the {@link Parser} to use to parse the specified - * content. + * @param extId The extension implementation ID of the {@link Parser} to use + * to parse the specified content. * @param content The content to parse. + * * @return A {@link Parse} object if the parse is successful, otherwise, * a <code>ParseStatus.getEmptyParse()</code>. + * * @throws ParseException If there is no suitable {@link Parser} found * to perform the parse. 
*/ - public Parse parseByParserId(String parserId, Content content) + public Parse parseByExtensionId(String extId, Content content) throws ParseException { Parse parse = null; Parser p = null; try { - p = this.parserFactory.getParserById(parserId); + p = this.parserFactory.getParserById(extId); } catch (ParserNotFound e) { LOG.warning("No suitable parser found when trying to parse content " + content); @@ -126,6 +129,6 @@ " of type " + content.getContentType()); return new ParseStatus().getEmptyParse(this.conf); } - } + } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Tue Feb 21 01:54:21 2006 @@ -16,9 +16,11 @@ package org.apache.nutch.parse; // JDK imports +import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Vector; import java.util.logging.Logger; @@ -32,6 +34,7 @@ import org.apache.nutch.util.mime.MimeType; import org.apache.nutch.util.mime.MimeTypeException; + /** Creates and caches {@link Parser} plugins.*/ public final class ParserFactory { @@ -63,43 +66,6 @@ } } - - /** - * Returns the appropriate {@link Parser} implementation given a content type - * and url. - * - * @deprecated Since the addition of NUTCH-88, this method is replaced by - * taking the highest priority {@link Parser} returned from - * {@link #getParsers(String, String)}. - * - * Parser extensions should define the attributes "contentType" and/or - * "pathSuffix". 
Content type has priority: the first plugin found whose - * "contentType" attribute matches the beginning of the content's type is - * used. If none match, then the first whose "pathSuffix" attribute matches - * the end of the url's path is used. If neither of these match, then the - * first plugin whose "pathSuffix" is the empty string is used. - */ - public Parser getParser(String contentType, String url) - throws ParserNotFound { - - Parser[] parsers = getParsers(contentType, url); - - if(parsers != null){ - //give the user the highest priority parser available - for(int i = 0; i < parsers.length; i++ ){ - Parser p = parsers[i]; - if(p != null){ - return p; - } - } - - throw new ParserNotFound(url, contentType); - - } - else{ - throw new ParserNotFound(url, contentType); - } - } /** * Function returns an array of {@link Parser}s for a given content type. @@ -150,11 +116,11 @@ Parser p = null; try { //check to see if we've cached this parser instance yet - p = (Parser) this.conf.getObject(ext.getDescriptor().getPluginId()); + p = (Parser) this.conf.getObject(ext.getId()); if (p == null) { // go ahead and instantiate it and then cache it p = (Parser) ext.getExtensionInstance(); - this.conf.setObject(ext.getDescriptor().getPluginId(),p); + this.conf.setObject(ext.getId(),p); } parsers.add(p); } catch (PluginRuntimeException e) { @@ -168,79 +134,79 @@ } return (Parser[]) parsers.toArray(new Parser[]{}); } - + /** - * <p> * Function returns a {@link Parser} instance with the specified - * <code>parserId</code>. If the Parser instance isn't found, then the - * function throws a <code>ParserNotFound</code> exception. If the function - * is able to find the {@link Parser} in the internal - * <code>PARSER_CACHE</code> then it will return the already instantiated - * Parser. Otherwise, if it has to instantiate the Parser itself , then this - * function will cache that Parser in the internal <code>PARSER_CACHE</code>. 
+ * <code>extId</code>, representing its extension ID. If the Parser + * instance isn't found, then the function throws a + * <code>ParserNotFound</code> exception. If the function is able to find + * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it + * will return the already instantiated Parser. Otherwise, if it has to + * instantiate the Parser itself , then this function will cache that Parser + * in the internal <code>PARSER_CACHE</code>. * - * @param parserId - * The string ID (e.g., "parse-text", "parse-msword") of the - * {@link Parser} implementation to return. + * @param extId The string extension ID (e.g., + * "org.apache.nutch.parse.rss.RSSParser", + * "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser} + * implementation to return. * @return A {@link Parser} implementation specified by the parameter - * <code>parserId</code>. - * @throws ParserNotFound - * If the Parser is not found (i.e., registered with the extension - * point), or if the there a {@link PluginRuntimeException} - * instantiating the {@link Parser}. + * <code>extId</code>. + * @throws ParserNotFound If the Parser is not found (i.e., registered with + * the extension point), or if the there a + * {@link PluginRuntimeException} instantiating the {@link Parser}. 
*/ - public Parser getParserById(String parserId) throws ParserNotFound { - // first check the cache + public Parser getParserById(String id) throws ParserNotFound { - if (this.conf.getObject(parserId) != null) { - return (Parser) this.conf.getObject(parserId); - } else { - // get the list of registered parsing extensions - // then find the right one by Id + Extension[] extensions = this.extensionPoint.getExtensions(); + Extension parserExt = null; - Extension[] extensions = this.extensionPoint.getExtensions(); - Extension parserExt = getExtensionById(extensions, parserId); + if (id != null) { + parserExt = getExtension(extensions, id); + } + if (parserExt == null) { + parserExt = getExtensionFromAlias(extensions, id); + } - if (parserExt == null) { - throw new ParserNotFound("No Parser Found for parserId: " + parserId - + "!"); - } else { - // instantiate the Parser - try { - Parser p = null; - p = (Parser) parserExt.getExtensionInstance(); - this.conf.setObject(parserId, p); - return p; - } catch (PluginRuntimeException e) { - LOG.warning("ParserFactory:PluginRuntimeException when " - + "initializing parser plugin " - + parserExt.getDescriptor().getPluginId() - + " instance in getParserById"); - throw new ParserNotFound("No Parser Found for parserId: " + parserId - + "!"); - } + if (parserExt == null) { + throw new ParserNotFound("No Parser Found for id [" + id + "]"); + } + + // first check the cache + if (this.conf.getObject(parserExt.getId()) != null) { + return (Parser) this.conf.getObject(parserExt.getId()); + + // if not found in cache, instantiate the Parser + } else { + try { + Parser p = (Parser) parserExt.getExtensionInstance(); + this.conf.setObject(parserExt.getId(), p); + return p; + } catch (PluginRuntimeException e) { + LOG.warning("Canno initialize parser " + + parserExt.getDescriptor().getPluginId() + + " (cause: " + e.toString()); + throw new ParserNotFound("Cannot init parser for id [" + id + "]"); } } } /** - * finds the best-suited parse 
plugin for a given contentType. + * Finds the best-suited parse plugin for a given contentType. * - * @param contentType - * Content-Type for which we seek a parse plugin. - * @return List - List of extensions to be used for this contentType. If none, - * returns null. + * @param contentType Content-Type for which we seek a parse plugin. + * @return a list of extensions to be used for this contentType. + * If none, returns <code>null</code>. */ protected List getExtensions(String contentType) { // First of all, tries to clean the content-type String type = null; try { - type = MimeType.clean(contentType); + type = MimeType.clean(contentType); } catch (MimeTypeException mte) { - LOG.info("Could not clean the content-type [" + contentType + - "], Reason is [" + mte + "]. Using its raw version..."); - type = contentType; + LOG.fine("Could not clean the content-type [" + contentType + + "], Reason is [" + mte + "]. Using its raw version..."); + type = contentType; } List extensions = (List) this.conf.getObject(type); @@ -304,19 +270,16 @@ * If none, returns null. 
*/ private List matchExtensions(List plugins, - Extension[] extensions, - String contentType) { + Extension[] extensions, + String contentType) { - List extList = null; + List extList = new ArrayList(); if (plugins != null) { - extList = new Vector(plugins.size()); for (Iterator i = plugins.iterator(); i.hasNext();) { String parsePluginId = (String) i.next(); - Extension ext = getExtensionByIdAndType(extensions, - parsePluginId, - contentType); + Extension ext = getExtension(extensions, parsePluginId, contentType); // the extension returned may be null // that means that it was not enabled in the plugin.includes // nutch conf property, but it was mapped in the @@ -327,8 +290,9 @@ // in either case, LOG the appropriate error message to WARN level if (ext == null) { - //try to get it just by its pluginId - ext = getExtensionById(extensions, parsePluginId); + //try to get it just by its pluginId + ext = getExtension(extensions, parsePluginId); + if (ext != null) { // plugin was enabled via plugin.includes // its plugin.xml just doesn't claim to support that @@ -338,25 +302,21 @@ " via parse-plugins.xml, but " + "its plugin.xml " + "file does not claim to support contentType: " + contentType); - - //go ahead and load the extension anyways, though - extList.add(ext); - - } else{ + } else { // plugin wasn't enabled via plugin.includes LOG.warning("ParserFactory: Plugin: " + parsePluginId + " mapped to contentType " + contentType + " via parse-plugins.xml, but not enabled via " + "plugin.includes in nutch-default.xml"); } - - } else{ + } + + if (ext != null) { // add it to the list extList.add(ext); } } - return extList; } else { // okay, there were no list of plugins defined for // this mimeType, however, there may be plugins registered @@ -366,19 +326,16 @@ // any extensions where this is the case, throw a // NotMappedParserException - List unmappedPlugins = new Vector(); - - for (int i = 0; i < extensions.length; i++) { + for (int i=0; i<extensions.length; i++) { if 
(extensions[i].getAttribute("contentType") != null && extensions[i].getAttribute("contentType").equals( contentType)) { - unmappedPlugins.add(extensions[i].getDescriptor() - .getPluginId()); + extList.add(extensions[i].getId()); } } - if (unmappedPlugins.size() > 0) { - LOG.info("The parsing plugins: " + unmappedPlugins + + if (extList.size() > 0) { + LOG.info("The parsing plugins: " + extList + " are enabled via the plugin.includes system " + "property, and all claim to support the content type " + contentType + ", but they are not mapped to it in the " + @@ -387,33 +344,38 @@ LOG.fine("ParserFactory:No parse plugins mapped or enabled for " + "contentType " + contentType); } - return null; } + + return (extList.size() > 0) ? extList : null; } private boolean match(Extension extension, String id, String type) { - return (id.equals(extension.getDescriptor().getPluginId())) && - (type.equals(extension.getAttribute("contentType")) || - (type.equals(DEFAULT_PLUGIN))); + return ((id.equals(extension.getId())) && + (type.equals(extension.getAttribute("contentType")) || + type.equals(DEFAULT_PLUGIN))); } - private Extension getExtensionByIdAndType(Extension[] extList, - String plugId, - String contentType) { - for (int i = 0; i < extList.length; i++) { - if (match(extList[i], plugId, contentType)) { - return extList[i]; + /** Get an extension from its id and supported content-type. 
*/ + private Extension getExtension(Extension[] list, String id, String type) { + for (int i=0; i<list.length; i++) { + if (match(list[i], id, type)) { + return list[i]; } } return null; } - - private Extension getExtensionById(Extension[] extList, String plugId) { - for(int i = 0; i < extList.length; i++){ - if(plugId.equals(extList[i].getDescriptor().getPluginId())){ - return extList[i]; + + private Extension getExtension(Extension[] list, String id) { + for (int i=0; i<list.length; i++) { + if (id.equals(list[i].getId())) { + return list[i]; } } return null; } + + private Extension getExtensionFromAlias(Extension[] list, String id) { + return getExtension(list, (String) parsePluginList.getAliases().get(id)); + } + } Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Feb 21 01:54:21 2006 @@ -61,7 +61,7 @@ Content content = new Content(url, url, bytes, contentType, new Metadata(), conf); - Parse parse = new ParseUtil(conf).parseByParserId("parse-html",content); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content); Metadata metadata = parse.getData().getParseMeta(); assertEquals(license, metadata.get("License-Url")); Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original) +++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Tue Feb 21 01:54:21 2006 @@ -37,8 +37,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.hadoop.io.UTF8; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParserNotFound; import org.apache.nutch.protocol.Content; @@ -341,9 +340,7 @@ try { protocol = new ProtocolFactory(conf).getProtocol(url); Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); - String contentType = content.getContentType(); - Parser parser = new ParserFactory(conf).getParser(contentType, url); - Parse parse = parser.getParse(content); + Parse parse = new ParseUtil(conf).parse(content); System.out.println("text:" + parse.getText()); return parse.getText(); Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Tue Feb 21 01:54:21 2006 @@ -23,8 +23,8 @@ // Nutch imports import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.Parser; import org.apache.nutch.parse.ParserFactory; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; @@ -48,16 +48,12 @@ public void testMetaHTMLParsing() { try { - + ParseUtil parser = new ParseUtil(NutchConfiguration.create()); /* loop through the test documents and validate result */ for (int t = 0; t < docs.length; t++) { - Content content = getContent(docs[t]); - Parser parser = new ParserFactory(NutchConfiguration.create()).getParser("text/html", URL); - Parse parse = parser.getParse(content); - + Parse parse = parser.parse(content); assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE)); - } } catch (Exception e) { e.printStackTrace(System.out); Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Tue Feb 21 01:54:21 2006 @@ -111,13 +111,13 @@ // check external parser that does 'cat' contentType = "application/vnd.nutch.example.cat"; content.setContentType(contentType); - parse = new ParseUtil(conf).parseByParserId("parse-ext", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content); assertEquals(expectedText,parse.getText()); 
// check external parser that does 'md5sum' contentType = "application/vnd.nutch.example.md5sum"; content.setContentType(contentType); - parse = new ParseUtil(conf).parseByParserId("parse-ext", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content); assertTrue(parse.getText().startsWith(expectedMD5sum)); } } Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Tue Feb 21 01:54:21 2006 @@ -73,7 +73,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); Metadata metadata = parse.getData().getParseMeta(); assertEquals("postgresql comment id3v2", metadata.get("COMM-Text")); assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text")); @@ -105,7 +105,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) .getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-mp3", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content); Metadata metadata = parse.getData().getParseMeta(); assertEquals("postgresql comment id3v1", metadata.get("COMM-Text")); Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Tue Feb 21 01:54:21 2006 @@ -63,7 +63,7 @@ protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = parser.parseByParserId("parse-msexcel", content); + parse = parser.parseByExtensionId("parse-msexcel", content); assertTrue(parse.getText().equals(expectedText)); } Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Tue Feb 21 01:54:21 2006 @@ -126,7 +126,8 @@ */ public void testContent() throws Exception { - Parse parse = new ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",this.content); + Parse parse = new ParseUtil(NutchConfiguration.create()) + .parseByExtensionId("parse-mspowerpoint", this.content); ParseData data = parse.getData(); String text = parse.getText(); @@ -163,7 +164,8 @@ */ public void testMeta() throws Exception { - Parse parse = new 
ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",content); + Parse parse = new ParseUtil(NutchConfiguration.create()) + .parseByExtensionId("parse-mspowerpoint", content); ParseData data = parse.getData(); Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Tue Feb 21 01:54:21 2006 @@ -69,7 +69,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-msword",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content); assertTrue(parse.getText().startsWith(expectedText)); } Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Tue Feb 21 01:54:21 2006 @@ -69,7 +69,7 @@ Configuration conf = NutchConfiguration.create(); protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new 
UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-pdf",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content); int index = parse.getText().indexOf(expectedText); assertTrue(index > 0); Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Tue Feb 21 01:54:21 2006 @@ -87,7 +87,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-rss",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content); //check that there are 3 outlinks: //http://test.channel.com Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Tue Feb 21 01:54:21 2006 @@ -74,7 +74,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()) .getContent(); - parse = new 
ParseUtil(conf).parseByParserId("parse-rtf", content); + parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content); String text = parse.getText(); assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Tue Feb 21 01:54:21 2006 @@ -69,7 +69,7 @@ protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByParserId("parse-zip",content); + parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content); assertTrue(parse.getText().equals(expectedText)); } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Tue Feb 21 01:54:21 2006 @@ -42,18 +42,10 @@ conf = NutchConfiguration.create(); conf.set("plugin.includes", ".*"); conf.set("parse.plugin.file", - "org/apache/nutch/parse/parse-plugin-test.xml"); + "org/apache/nutch/parse/parse-plugin-test.xml"); parserFactory = new ParserFactory(conf); } - - /** Unit test for <code>getParser(String, 
String)</code> method. */ - public void testGetParser() throws Exception { - Parser parser = parserFactory.getParser("text/html", "http://foo.com/"); - assertNotNull(parser); - parser = parserFactory.getParser("foo/bar", "http://foo.com/"); - assertNotNull(parser); - } - + /** Unit test for <code>getExtensions(String)</code> method. */ public void testGetExtensions() throws Exception { Extension ext = (Extension)parserFactory.getExtensions("text/html").get(0); @@ -70,27 +62,27 @@ assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.html.HtmlParser", - parsers[0].getClass().getName()); + parsers[0].getClass().getName()); - parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", "http://foo.com"); + parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", + "http://foo.com"); assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.html.HtmlParser", - parsers[0].getClass().getName()); - + parsers[0].getClass().getName()); parsers = parserFactory.getParsers("application/x-javascript", - "http://foo.com"); + "http://foo.com"); assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.js.JSParseFilter", - parsers[0].getClass().getName()); + parsers[0].getClass().getName()); parsers = parserFactory.getParsers("text/plain", "http://foo.com"); assertNotNull(parsers); assertEquals(1, parsers.length); assertEquals("org.apache.nutch.parse.text.TextParser", - parsers[0].getClass().getName()); + parsers[0].getClass().getName()); Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0]; Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0]; @@ -102,7 +94,8 @@ parsers = parserFactory.getParsers("text/rss","http://foo.com"); assertNotNull(parsers); assertEquals(1,parsers.length); - assertEquals("org.apache.nutch.parse.rss.RSSParser",parsers[0].getClass().getName()); + 
assertEquals("org.apache.nutch.parse.rss.RSSParser", + parsers[0].getClass().getName()); } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml?rev=379403&r1=379402&r2=379403&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml Tue Feb 21 01:54:21 2006 @@ -1,46 +1,64 @@ <?xml version="1.0" encoding="UTF-8"?> <!-- - Copyright 2005 The Apache Software Foundation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - Author : mattmann - Description: Test parse-plugins.xml file. + Copyright 2005 The Apache Software Foundation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author : mattmann + Description: Test parse-plugins.xml file. 
--> <parse-plugins> - <!-- by default if the mimeType is set to *, or - can't be determined, use parse-text --> - <mimeType name="*"> - <plugin id="parse-text" /> - </mimeType> - - <!-- test these 4 plugins --> - <mimeType name="text/html"> - <plugin id="parse-html"/> - </mimeType> - - <mimeType name="text/plain"> - <plugin id="parse-text"/> - </mimeType> + <!-- by default if the mimeType is set to *, or + can't be determined, use parse-text --> + <mimeType name="*"> + <plugin id="parse-text" /> + </mimeType> + + <!-- test these 4 plugins --> + <mimeType name="text/html"> + <!-- + ! Test that if a parser cannot be instantiated, + ! it should not block the process and then the next one is used + !--> + <plugin id="parse-plugin-that-not-exist"/> + <plugin id="parse-html"/> + </mimeType> + + <mimeType name="text/plain"> + <!-- Test that an extension-id can be directly used here --> + <plugin id="org.apache.nutch.parse.text.TextParser"/> + </mimeType> - <mimeType name="application/x-javascript"> - <plugin id="parse-js"/> - </mimeType> + <mimeType name="application/x-javascript"> + <plugin id="parse-js"/> + </mimeType> - <mimeType name="text/rss"> - <plugin id="parse-rss"/> - </mimeType> + <mimeType name="text/rss"> + <plugin id="parse-rss"/> + </mimeType> + <!-- alias mappings for parse-xxx names to the actual extension implementation + ids described in each plugin's plugin.xml file --> + <aliases> + <alias name="parse-html" + extension-id="org.apache.nutch.parse.html.HtmlParser" /> + <alias name="parse-js" + extension-id="JSParser" /> + <alias name="parse-rss" + extension-id="org.apache.nutch.parse.rss.RSSParser" /> + <alias name="parse-text" + extension-id="org.apache.nutch.parse.text.TextParser" /> + </aliases> </parse-plugins> ------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Do you grep through log files for problems? Stop! 
Download the new AJAX search engine that makes searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs