Author: jerome
Date: Tue Feb 21 01:54:21 2006
New Revision: 379403

URL: http://svn.apache.org/viewcvs?rev=379403&view=rev
Log:
NUTCH-140, parse-plugin.xml can now use extension-id and plugin-id

Modified:
    lucene/nutch/trunk/conf/parse-plugins.dtd
    lucene/nutch/trunk/conf/parse-plugins.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
    lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
    lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
    lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
    lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
    lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
    lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
    lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
    lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
    lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml

Modified: lucene/nutch/trunk/conf/parse-plugins.dtd
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.dtd (original)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Feb 21 01:54:21 2006
@@ -1,7 +1,12 @@
-<!ELEMENT parse-plugins (mimeType+)>
+<!ELEMENT parse-plugins  (mimeType+,aliases)>
 <!ELEMENT mimeType (plugin+)>
 <!ATTLIST mimeType name CDATA #REQUIRED>
 
 <!ELEMENT plugin EMPTY>
 <!ATTLIST plugin id CDATA #REQUIRED>
-<!ATTLIST plugin order CDATA ''>
\ No newline at end of file
+<!ATTLIST plugin order CDATA ''>
+
+<!ELEMENT aliases (alias+)>
+<!ELEMENT alias EMPTY>
+<!ATTLIST alias name CDATA #REQUIRED>
+<!ATTLIST alias extension-id CDATA #REQUIRED>
\ No newline at end of file

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Feb 21 01:54:21 2006
@@ -218,4 +218,33 @@
                <plugin id="parse-ext" />
        </mimeType>
 
+       <!--  alias mappings for parse-xxx names to the actual extension implementation 
+       ids described in each plugin's plugin.xml file -->
+       <aliases>
+               <alias name="parse-ext" extension-id="ExtParser" />
+               <alias name="parse-html"
+                       extension-id="org.apache.nutch.parse.html.HtmlParser" />
+               <alias name="parse-js" extension-id="JSParser" />
+               <alias name="parse-mp3"
+                       extension-id="org.apache.nutch.parse.mp3.MP3Parser" />
+               <alias name="parse-msexcel"
+                       extension-id="org.apache.nutch.parse.msexcel.MSExcelParser" />
+               <alias name="parse-mspowerpoint"
+                       extension-id="org.apache.nutch.parse.mspowerpoint.MSPowerPointParser" />
+               <alias name="parse-msword"
+                       extension-id="org.apache.nutch.parse.msword.MSWordParser" />
+               <alias name="parse-pdf"
+                       extension-id="org.apache.nutch.parse.pdf.PdfParser" />
+               <alias name="parse-rss"
+                       extension-id="org.apache.nutch.parse.rss.RSSParser" />
+               <alias name="parse-rtf"
+                       extension-id="org.apache.nutch.parse.rtf.RTFParseFactory" />
+               <alias name="parse-swf"
+                       extension-id="org.apache.nutch.parse.swf.SWFParser" />
+               <alias name="parse-text"
+                       extension-id="org.apache.nutch.parse.text.TextParser" />
+               <alias name="parse-zip"
+                       extension-id="org.apache.nutch.parse.zip.ZipParser" />
+       </aliases>
+       
 </parse-plugins>

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Tue Feb 21 01:54:21 2006
@@ -19,6 +19,7 @@
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 
 /**
@@ -30,27 +31,40 @@
  * @author mattmann
  * @version 1.0
  */
-public class ParsePluginList {
+class ParsePluginList {
   
   /* a map to link mimeType to an ordered list of parsing plugins */
-  private HashMap fMimeTypeToPluginMap = null;
+  private Map fMimeTypeToPluginMap = null;
+  
+  /* A list of aliases */
+  private Map aliases = null;
+  
   
   /**
    * Constructs a new ParsePluginList
    */
-  public ParsePluginList() {
+  ParsePluginList() {
     fMimeTypeToPluginMap = new HashMap();
+    aliases = new HashMap();
   }
   
-  public List getPluginList(String mimeType) {
+  List getPluginList(String mimeType) {
     return (List) fMimeTypeToPluginMap.get(mimeType);
   }
+
+  void setAliases(Map aliases) {
+    this.aliases = aliases;
+  }
+  
+  Map getAliases() {
+    return aliases;
+  }
   
-  public void setPluginList(String mimeType, List l) {
+  void setPluginList(String mimeType, List l) {
     fMimeTypeToPluginMap.put(mimeType, l);
   }
   
-  public List getSupportedMimeTypes() {
+  List getSupportedMimeTypes() {
     return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray(
             new String[] {}));
   }

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Tue Feb 21 01:54:21 2006
@@ -16,12 +16,14 @@
 package org.apache.nutch.parse;
 
 // JDK imports
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Vector;
+import java.util.Map;
 import java.util.logging.Logger;
-import java.io.InputStream;
-import java.net.URL;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -53,7 +55,7 @@
   /** The property name of the parse-plugins location */
   private static final String PP_FILE_PROP = "parse.plugin.file";
 
-  /* the parse-plugins file */
+  /** the parse-plugins file */
   private String fParsePluginsFile = null;
 
   
@@ -111,8 +113,12 @@
     
     Element parsePlugins = document.getDocumentElement();
     
+    // build up the alias hash map
+    Map aliases = getAliases(parsePlugins);
+    // And store it on the parse plugin list
+    pList.setAliases(aliases);
+     
     // get all the mime type nodes
-    
     NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
     
     // iterate through the mime types
@@ -125,30 +131,29 @@
       
       // iterate through the plugins, add them in order read
       // OR if they have a special order="" attribute, then hold those in
-      // a
-      // separate list, and then insert them into the final list at the
-      // order
-      // specified
-      
+      // a separate list, and then insert them into the final list at the
+      // order specified
       if (pluginList != null && pluginList.getLength() > 0) {
-        List plugList = new Vector(pluginList.getLength());
+        List plugList = new ArrayList(pluginList.getLength());
         
-        for (int j = 0; j < pluginList.getLength(); j++) {
+        for (int j = 0; j<pluginList.getLength(); j++) {
           Element plugin = (Element) pluginList.item(j);
           String pluginId = plugin.getAttribute("id");
-          
+          String extId = (String) aliases.get(pluginId);
+          if (extId == null) {
+            // Assume an extension id is directly specified
+            extId = pluginId;
+          }
           String orderStr = plugin.getAttribute("order");
           int order = -1;
-          
           try {
             order = Integer.parseInt(orderStr);
           } catch (NumberFormatException ignore) {
           }
-          
           if (order != -1) {
-            plugList.add(order - 1, pluginId);
+            plugList.add(order - 1, extId);
           } else {
-            plugList.add(pluginId);
+            plugList.add(extId);
           }
         }
         
@@ -202,7 +207,7 @@
       System.out.println("MIMETYPE: " + mimeType);
       List plugList = prefs.getPluginList(mimeType);
       
-      System.out.println("PLUGINS:");
+      System.out.println("EXTENSION IDs:");
       
       for (Iterator j = plugList.iterator(); j.hasNext();) {
         System.out.println((String) j.next());
@@ -224,6 +229,39 @@
    */
   public void setFParsePluginsFile(String parsePluginsFile) {
     fParsePluginsFile = parsePluginsFile;
+  }
+  
+  private Map getAliases(Element parsePluginsRoot) {
+
+    Map aliases = new HashMap();
+    NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
+         
+    if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) {
+      LOG.warning("No aliases defined in parse-plugins.xml!");
+      return aliases;
+    }
+         
+    if (aliasRoot.getLength() > 1) {
+      // log a warning, but try and continue processing
+      LOG.warning("There should only be one \"aliases\" tag in parse-plugins.xml");
+    }
+         
+    Element aliasRootElem = (Element)aliasRoot.item(0);
+    NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");
+         
+    if (aliasElements != null && aliasElements.getLength() > 0) {
+      for (int i=0; i<aliasElements.getLength(); i++) {
+        Element aliasElem = (Element)aliasElements.item(i);
+       String parsePluginId = aliasElem.getAttribute("name");
+       String extensionId = aliasElem.getAttribute("extension-id");
+        LOG.finest("Found alias: plugin-id: " + parsePluginId +
+                   ", extension-id: " + extensionId);
+        if (parsePluginId != null && extensionId != null) {
+          aliases.put(parsePluginId, extensionId);
+        }
+      }
+    }
+    return aliases;
   }
   
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Tue Feb 21 01:54:21 2006
@@ -36,8 +36,8 @@
 public class ParseUtil {
   
   /* our log stream */
-  public static final Logger LOG = LogFormatter.getLogger(ParseUtil.class
-          .getName());
+  public static final Logger LOG =
+          LogFormatter.getLogger(ParseUtil.class.getName());
   private Configuration conf;
   private ParserFactory parserFactory;
   
@@ -84,33 +84,36 @@
                 " of type " + content.getContentType());
 
     ParseStatus ps = (parse.getData() != null) ? parse.getData().getStatus() : null;
-    return (ps == null) ? new ParseStatus().getEmptyParse(this.conf) : ps.getEmptyParse(this.conf);
+    return (ps == null) ? new ParseStatus().getEmptyParse(this.conf)
+                        : ps.getEmptyParse(this.conf);
   }
-  
+    
   /**
   * Method parses a {@link Content} object using the {@link Parser} specified
-   * by the parameter <code>parserId</code>. If a suitable {@link Parser} is not
-   * found, then a <code>WARNING</code> level message is logged, and a
-   * ParseException is thrown.
-   * If the parse is uncessful for any other reason, then a <code>WARNING</code>
-   * level message is logged, and a <code>ParseStatus.getEmptyParse() is
+   * by the parameter <code>extId</code>, i.e., the Parser's extension ID.
+   * If a suitable {@link Parser} is not found, then a <code>WARNING</code>
+   * level message is logged, and a ParseException is thrown. If the parse is
+   * uncessful for any other reason, then a <code>WARNING</code> level
+   * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
    * returned.
    *
-   * @param parserId The ID of the {@link Parser} to use to parse the specified
-   *                 content.
+   * @param extId The extension implementation ID of the {@link Parser} to use
+   *              to parse the specified content.
    * @param content The content to parse.
+   *
    * @return A {@link Parse} object if the parse is successful, otherwise,
    *         a <code>ParseStatus.getEmptyParse()</code>.
+   *
    * @throws ParseException If there is no suitable {@link Parser} found
    *                        to perform the parse.
    */
-  public Parse parseByParserId(String parserId, Content content)
+  public Parse parseByExtensionId(String extId, Content content)
   throws ParseException {
     Parse parse = null;
     Parser p = null;
     
     try {
-      p = this.parserFactory.getParserById(parserId);
+      p = this.parserFactory.getParserById(extId);
     } catch (ParserNotFound e) {
       LOG.warning("No suitable parser found when trying to parse content " +
                   content);
@@ -126,6 +129,6 @@
                   " of type " + content.getContentType());
       return new ParseStatus().getEmptyParse(this.conf);
     }
-  }
+  }  
   
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Tue Feb 21 01:54:21 2006
@@ -16,9 +16,11 @@
 package org.apache.nutch.parse;
 
 // JDK imports
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.Vector;
 import java.util.logging.Logger;
 
@@ -32,6 +34,7 @@
 import org.apache.nutch.util.mime.MimeType;
 import org.apache.nutch.util.mime.MimeTypeException;
 
+
 /** Creates and caches {@link Parser} plugins.*/
 public final class ParserFactory {
   
@@ -63,43 +66,6 @@
     }
   }                      
   
-
-  /**
-   * Returns the appropriate {@link Parser} implementation given a content type
-   * and url.
-   * 
-   * @deprecated Since the addition of NUTCH-88, this method is replaced by
-   *             taking the highest priority {@link Parser} returned from
-   *             {@link #getParsers(String, String)}.
-   * 
-   * Parser extensions should define the attributes "contentType" and/or
-   * "pathSuffix". Content type has priority: the first plugin found whose
-   * "contentType" attribute matches the beginning of the content's type is
-   * used. If none match, then the first whose "pathSuffix" attribute matches
-   * the end of the url's path is used. If neither of these match, then the
-   * first plugin whose "pathSuffix" is the empty string is used.
-   */
-  public Parser getParser(String contentType, String url)
-  throws ParserNotFound {
-    
-    Parser[] parsers = getParsers(contentType, url);
-    
-    if(parsers != null){
-      //give the user the highest priority parser available
-      for(int i = 0;  i < parsers.length; i++ ){
-        Parser p = parsers[i];
-        if(p != null){
-          return p;
-        }
-      }
-      
-      throw new ParserNotFound(url, contentType);
-      
-    } 
-    else{
-      throw new ParserNotFound(url, contentType);
-    }
-  }
    
   /**
    * Function returns an array of {@link Parser}s for a given content type.
@@ -150,11 +116,11 @@
       Parser p = null;
       try {
         //check to see if we've cached this parser instance yet
-        p = (Parser) this.conf.getObject(ext.getDescriptor().getPluginId());
+        p = (Parser) this.conf.getObject(ext.getId());
         if (p == null) {
           // go ahead and instantiate it and then cache it
           p = (Parser) ext.getExtensionInstance();
-          this.conf.setObject(ext.getDescriptor().getPluginId(),p);
+          this.conf.setObject(ext.getId(),p);
         }
         parsers.add(p);
       } catch (PluginRuntimeException e) {
@@ -168,79 +134,79 @@
     }
     return (Parser[]) parsers.toArray(new Parser[]{});
   }
-  
+    
   /**
-   * <p>
    * Function returns a {@link Parser} instance with the specified
-   * <code>parserId</code>. If the Parser instance isn't found, then the
-   * function throws a <code>ParserNotFound</code> exception. If the function
-   * is able to find the {@link Parser} in the internal
-   * <code>PARSER_CACHE</code> then it will return the already instantiated
-   * Parser. Otherwise, if it has to instantiate the Parser itself , then this
-   * function will cache that Parser in the internal <code>PARSER_CACHE</code>.
+   * <code>extId</code>, representing its extension ID. If the Parser
+   * instance isn't found, then the function throws a
+   * <code>ParserNotFound</code> exception. If the function is able to find
+   * the {@link Parser} in the internal <code>PARSER_CACHE</code> then it
+   * will return the already instantiated Parser. Otherwise, if it has to
+   * instantiate the Parser itself , then this function will cache that Parser
+   * in the internal <code>PARSER_CACHE</code>.
    * 
-   * @param parserId
-   *          The string ID (e.g., "parse-text", "parse-msword") of the
-   *          {@link Parser} implementation to return.
+   * @param extId The string extension ID (e.g.,
+   *        "org.apache.nutch.parse.rss.RSSParser",
+   *        "org.apache.nutch.parse.rtf.RTFParseFactory") of the {@link Parser}
+   *        implementation to return.
    * @return A {@link Parser} implementation specified by the parameter
-   *         <code>parserId</code>.
-   * @throws ParserNotFound
-   *           If the Parser is not found (i.e., registered with the extension
-   *           point), or if the there a {@link PluginRuntimeException}
-   *           instantiating the {@link Parser}.
+   *         <code>extId</code>.
+   * @throws ParserNotFound If the Parser is not found (i.e., registered with
+   *         the extension point), or if the there a
+   *         {@link PluginRuntimeException} instantiating the {@link Parser}.
    */
-  public Parser getParserById(String parserId) throws ParserNotFound {
-    // first check the cache
+  public Parser getParserById(String id) throws ParserNotFound {
 
-    if (this.conf.getObject(parserId) != null) {
-      return (Parser) this.conf.getObject(parserId);
-    } else {
-      // get the list of registered parsing extensions
-      // then find the right one by Id
+    Extension[] extensions = this.extensionPoint.getExtensions();
+    Extension parserExt = null;
 
-      Extension[] extensions = this.extensionPoint.getExtensions();
-      Extension parserExt = getExtensionById(extensions, parserId);
+    if (id != null) {
+      parserExt = getExtension(extensions, id);
+    }
+    if (parserExt == null) {
+      parserExt = getExtensionFromAlias(extensions, id);
+    }
 
-      if (parserExt == null) {
-        throw new ParserNotFound("No Parser Found for parserId: " + parserId
-            + "!");
-      } else {
-        // instantiate the Parser
-        try {
-          Parser p = null;
-          p = (Parser) parserExt.getExtensionInstance();
-          this.conf.setObject(parserId, p);
-          return p;
-        } catch (PluginRuntimeException e) {
-          LOG.warning("ParserFactory:PluginRuntimeException when "
-              + "initializing parser plugin "
-              + parserExt.getDescriptor().getPluginId()
-              + " instance in getParserById");
-          throw new ParserNotFound("No Parser Found for parserId: " + parserId
-              + "!");
-        }
+    if (parserExt == null) {
+      throw new ParserNotFound("No Parser Found for id [" + id + "]");
+    }
+    
+    // first check the cache              
+    if (this.conf.getObject(parserExt.getId()) != null) {
+      return (Parser) this.conf.getObject(parserExt.getId());
+
+    // if not found in cache, instantiate the Parser    
+    } else {
+      try {
+        Parser p = (Parser) parserExt.getExtensionInstance();
+        this.conf.setObject(parserExt.getId(), p);
+        return p;
+      } catch (PluginRuntimeException e) {
+        LOG.warning("Canno initialize parser " +
+                    parserExt.getDescriptor().getPluginId() +
+                   " (cause: " + e.toString());
+        throw new ParserNotFound("Cannot init parser for id [" + id + "]");
       }
     }
   }
   
   /**
-   * finds the best-suited parse plugin for a given contentType.
+   * Finds the best-suited parse plugin for a given contentType.
    * 
-   * @param contentType
-   *          Content-Type for which we seek a parse plugin.
-   * @return List - List of extensions to be used for this contentType. If none,
-   *         returns null.
+   * @param contentType Content-Type for which we seek a parse plugin.
+   * @return a list of extensions to be used for this contentType.
+   *         If none, returns <code>null</code>.
    */
   protected List getExtensions(String contentType) {
     
     // First of all, tries to clean the content-type
     String type = null;
     try {
-        type = MimeType.clean(contentType);
+      type = MimeType.clean(contentType);
     } catch (MimeTypeException mte) {
-        LOG.info("Could not clean the content-type [" + contentType +
-                 "], Reason is [" + mte + "]. Using its raw version...");
-        type = contentType;
+      LOG.fine("Could not clean the content-type [" + contentType +
+               "], Reason is [" + mte + "]. Using its raw version...");
+      type = contentType;
     }
 
     List extensions = (List) this.conf.getObject(type);
@@ -304,19 +270,16 @@
    *                If none, returns null.
    */
   private List matchExtensions(List plugins,
-                                      Extension[] extensions,
-                                      String contentType) {
+                               Extension[] extensions,
+                               String contentType) {
     
-    List extList = null;
+    List extList = new ArrayList();
     if (plugins != null) {
-      extList = new Vector(plugins.size());
       
       for (Iterator i = plugins.iterator(); i.hasNext();) {
         String parsePluginId = (String) i.next();
         
-        Extension ext = getExtensionByIdAndType(extensions,
-                                                parsePluginId,
-                                                contentType);
+        Extension ext = getExtension(extensions, parsePluginId, contentType);
         // the extension returned may be null
         // that means that it was not enabled in the plugin.includes
         // nutch conf property, but it was mapped in the
@@ -327,8 +290,9 @@
         // in either case, LOG the appropriate error message to WARN level
         
         if (ext == null) {
-           //try to get it just by its pluginId
-            ext = getExtensionById(extensions, parsePluginId);
+          //try to get it just by its pluginId
+          ext = getExtension(extensions, parsePluginId);
+        
           if (ext != null) {
             // plugin was enabled via plugin.includes
             // its plugin.xml just doesn't claim to support that
@@ -338,25 +302,21 @@
                         " via parse-plugins.xml, but " + "its plugin.xml " +
                         "file does not claim to support contentType: " +
                         contentType);
-            
-            //go ahead and load the extension anyways, though
-            extList.add(ext);
-          
-          } else{
+          } else {
             // plugin wasn't enabled via plugin.includes
             LOG.warning("ParserFactory: Plugin: " + parsePluginId + 
                         " mapped to contentType " + contentType +
                         " via parse-plugins.xml, but not enabled via " +
                         "plugin.includes in nutch-default.xml");               
      
           }
-          
-        } else{
+        }
+
+        if (ext != null) {
           // add it to the list
           extList.add(ext);
         }
       }
       
-      return extList;
     } else {
       // okay, there were no list of plugins defined for
       // this mimeType, however, there may be plugins registered
@@ -366,19 +326,16 @@
       // any extensions where this is the case, throw a
       // NotMappedParserException
       
-      List unmappedPlugins = new Vector();
-      
-      for (int i = 0; i < extensions.length; i++) {
+      for (int i=0; i<extensions.length; i++) {
         if (extensions[i].getAttribute("contentType") != null
             && extensions[i].getAttribute("contentType").equals(
                 contentType)) {
-          unmappedPlugins.add(extensions[i].getDescriptor()
-              .getPluginId());
+          extList.add(extensions[i].getId());
         }
       }
       
-      if (unmappedPlugins.size() > 0) {
-        LOG.info("The parsing plugins: " + unmappedPlugins +
+      if (extList.size() > 0) {
+        LOG.info("The parsing plugins: " + extList +
                  " are enabled via the plugin.includes system " +
                  "property, and all claim to support the content type " +
                  contentType + ", but they are not mapped to it  in the " +
@@ -387,33 +344,38 @@
         LOG.fine("ParserFactory:No parse plugins mapped or enabled for " +
                  "contentType " + contentType);
       }
-      return null;
     }
+    
+    return (extList.size() > 0) ? extList : null;
   }
 
   private boolean match(Extension extension, String id, String type) {
-    return (id.equals(extension.getDescriptor().getPluginId())) &&
-    (type.equals(extension.getAttribute("contentType")) ||
-        (type.equals(DEFAULT_PLUGIN))); 
+    return ((id.equals(extension.getId())) &&
+            (type.equals(extension.getAttribute("contentType")) ||
+             type.equals(DEFAULT_PLUGIN)));
   }
   
-  private Extension getExtensionByIdAndType(Extension[] extList,
-                                                   String plugId,
-                                                   String contentType) {
-    for (int i = 0; i < extList.length; i++) {
-      if (match(extList[i], plugId, contentType)) {
-        return extList[i];
+  /** Get an extension from its id and supported content-type. */
+  private Extension getExtension(Extension[] list, String id, String type) {
+    for (int i=0; i<list.length; i++) {
+      if (match(list[i], id, type)) {
+        return list[i];
       }
     }
     return null;
   }
-  
-  private Extension getExtensionById(Extension[] extList, String plugId) {
-    for(int i = 0; i < extList.length; i++){
-      if(plugId.equals(extList[i].getDescriptor().getPluginId())){
-        return extList[i];
+    
+  private Extension getExtension(Extension[] list, String id) {
+    for (int i=0; i<list.length; i++) {
+      if (id.equals(list[i].getId())) {
+        return list[i];
       }
     }
     return null;
   }
+  
+  private Extension getExtensionFromAlias(Extension[] list, String id) {
+    return getExtension(list, (String) parsePluginList.getAliases().get(id));
+  }
+
 }

Modified: 
lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Feb 21 01:54:21 2006
@@ -61,7 +61,7 @@
 
     Content content =
       new Content(url, url, bytes, contentType, new Metadata(), conf);
-    Parse parse = new ParseUtil(conf).parseByParserId("parse-html",content);
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content);
 
     Metadata metadata = parse.getData().getParseMeta();
     assertEquals(license, metadata.get("License-Url"));

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Tue Feb 21 01:54:21 2006
@@ -37,8 +37,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.hadoop.io.UTF8;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParserNotFound;
 import org.apache.nutch.protocol.Content;
@@ -341,9 +340,7 @@
     try {
       protocol = new ProtocolFactory(conf).getProtocol(url);
       Content content = protocol.getProtocolOutput(new UTF8(url), new 
CrawlDatum()).getContent();
-      String contentType = content.getContentType();
-      Parser parser = new ParserFactory(conf).getParser(contentType, url);
-      Parse parse = parser.getParse(content);
+      Parse parse = new ParseUtil(conf).parse(content);
       System.out.println("text:" + parse.getText());
       return parse.getText();
 

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 Tue Feb 21 01:54:21 2006
@@ -23,8 +23,8 @@
 // Nutch imports
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 
@@ -48,16 +48,12 @@
   public void testMetaHTMLParsing() {
 
     try {
-
+      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
       /* loop through the test documents and validate result */
       for (int t = 0; t < docs.length; t++) {
-
         Content content = getContent(docs[t]);
-        Parser parser = new 
ParserFactory(NutchConfiguration.create()).getParser("text/html", URL);
-        Parse parse = parser.getParse(content);
-
+        Parse parse = parser.parse(content);
         assertEquals(metalanguages[t], (String) 
parse.getData().getParseMeta().get(Metadata.LANGUAGE));
-
       }
     } catch (Exception e) {
       e.printStackTrace(System.out);

Modified: 
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
 Tue Feb 21 01:54:21 2006
@@ -111,13 +111,13 @@
       // check external parser that does 'cat'
       contentType = "application/vnd.nutch.example.cat";
       content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByParserId("parse-ext", content);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content);
       assertEquals(expectedText,parse.getText());
 
       // check external parser that does 'md5sum'
       contentType = "application/vnd.nutch.example.md5sum";
       content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByParserId("parse-ext", content);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content);
       assertTrue(parse.getText().startsWith(expectedMD5sum));
     }
   }

Modified: 
lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
 Tue Feb 21 01:54:21 2006
@@ -73,7 +73,7 @@
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
     content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
                       .getContent();
-    parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
     Metadata metadata = parse.getData().getParseMeta();
     assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
     assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
@@ -105,7 +105,7 @@
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
     content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
                       .getContent();
-    parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
 
     Metadata metadata = parse.getData().getParseMeta();
     assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
 Tue Feb 21 01:54:21 2006
@@ -63,7 +63,7 @@
       protocol = factory.getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString),
                                            new CrawlDatum()).getContent();
-      parse = parser.parseByParserId("parse-msexcel", content);
+      parse = parser.parseByExtensionId("parse-msexcel", content);
 
       assertTrue(parse.getText().equals(expectedText));
     }

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
 Tue Feb 21 01:54:21 2006
@@ -126,7 +126,8 @@
    */
   public void testContent() throws Exception {
 
-    Parse parse = new 
ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",this.content);
+    Parse parse = new ParseUtil(NutchConfiguration.create())
+                        .parseByExtensionId("parse-mspowerpoint", 
this.content);
 
     ParseData data = parse.getData();
     String text = parse.getText();
@@ -163,7 +164,8 @@
    */
   public void testMeta() throws Exception {
 
-    Parse parse = new 
ParseUtil(NutchConfiguration.create()).parseByParserId("parse-mspowerpoint",content);
+    Parse parse = new ParseUtil(NutchConfiguration.create())
+                        .parseByExtensionId("parse-mspowerpoint", content);
     
     ParseData data = parse.getData();
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
 Tue Feb 21 01:54:21 2006
@@ -69,7 +69,7 @@
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString), new 
CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByParserId("parse-msword",content);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);
 
       assertTrue(parse.getText().startsWith(expectedText));
     }

Modified: 
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
 Tue Feb 21 01:54:21 2006
@@ -69,7 +69,7 @@
       Configuration conf = NutchConfiguration.create();
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString), new 
CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByParserId("parse-pdf",content);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
 
       int index = parse.getText().indexOf(expectedText);
       assertTrue(index > 0);

Modified: 
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
 Tue Feb 21 01:54:21 2006
@@ -87,7 +87,7 @@
 
             protocol = new ProtocolFactory(conf).getProtocol(urlString);
             content = protocol.getProtocolOutput(new UTF8(urlString), new 
CrawlDatum()).getContent();
-            parse = new ParseUtil(conf).parseByParserId("parse-rss",content);
+            parse = new ParseUtil(conf).parseByExtensionId("parse-rss", 
content);
 
             //check that there are 3 outlinks:
             //http://test.channel.com

Modified: 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
 Tue Feb 21 01:54:21 2006
@@ -74,7 +74,7 @@
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
     content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
                       .getContent();
-    parse = new ParseUtil(conf).parseByParserId("parse-rtf", content);
+    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
     String text = parse.getText();
     assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
 Tue Feb 21 01:54:21 2006
@@ -69,7 +69,7 @@
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
       content = protocol.getProtocolOutput(new UTF8(urlString), new 
CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByParserId("parse-zip",content);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
       assertTrue(parse.getText().equals(expectedText));
     }
   }

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java 
Tue Feb 21 01:54:21 2006
@@ -42,18 +42,10 @@
       conf = NutchConfiguration.create();
       conf.set("plugin.includes", ".*");
       conf.set("parse.plugin.file",
-                        "org/apache/nutch/parse/parse-plugin-test.xml");
+               "org/apache/nutch/parse/parse-plugin-test.xml");
       parserFactory = new ParserFactory(conf);
   }
-  
-  /** Unit test for <code>getParser(String, String)</code> method. */
-  public void testGetParser() throws Exception {
-    Parser parser = parserFactory.getParser("text/html", "http://foo.com/");
-    assertNotNull(parser);
-    parser  = parserFactory.getParser("foo/bar", "http://foo.com/");
-    assertNotNull(parser);
-  }
-  
+    
   /** Unit test for <code>getExtensions(String)</code> method. */
   public void testGetExtensions() throws Exception {
     Extension ext = (Extension)parserFactory.getExtensions("text/html").get(0);
@@ -70,27 +62,27 @@
     assertNotNull(parsers);
     assertEquals(1, parsers.length);
     assertEquals("org.apache.nutch.parse.html.HtmlParser",
-        parsers[0].getClass().getName());
+                 parsers[0].getClass().getName());
 
-    parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1", "http://foo.com");
+    parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1",
+                                       "http://foo.com");
     assertNotNull(parsers);
     assertEquals(1, parsers.length);
     assertEquals("org.apache.nutch.parse.html.HtmlParser",
-        parsers[0].getClass().getName());
-
+                 parsers[0].getClass().getName());
     
     parsers = parserFactory.getParsers("application/x-javascript",
-    "http://foo.com");
+                                       "http://foo.com");
     assertNotNull(parsers);
     assertEquals(1, parsers.length);
     assertEquals("org.apache.nutch.parse.js.JSParseFilter",
-        parsers[0].getClass().getName());
+                 parsers[0].getClass().getName());
     
     parsers = parserFactory.getParsers("text/plain", "http://foo.com");
     assertNotNull(parsers);
     assertEquals(1, parsers.length);
     assertEquals("org.apache.nutch.parse.text.TextParser",
-        parsers[0].getClass().getName());
+                 parsers[0].getClass().getName());
     
     Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0];
     Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0];
@@ -102,7 +94,8 @@
     parsers = parserFactory.getParsers("text/rss","http://foo.com");
     assertNotNull(parsers);
     assertEquals(1,parsers.length);
-    assertEquals("org.apache.nutch.parse.rss.RSSParser",parsers[0].getClass().getName());
+    assertEquals("org.apache.nutch.parse.rss.RSSParser",
+                 parsers[0].getClass().getName());
   }
  
 }

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml?rev=379403&r1=379402&r2=379403&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/parse-plugin-test.xml 
Tue Feb 21 01:54:21 2006
@@ -1,46 +1,64 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
-       Copyright 2005 The Apache Software Foundation
-       
-       Licensed under the Apache License, Version 2.0 (the "License");
-       you may not use this file except in compliance with the License.
-       You may obtain a copy of the License at
-       
-       http://www.apache.org/licenses/LICENSE-2.0
-       
-       Unless required by applicable law or agreed to in writing, software
-       distributed under the License is distributed on an "AS IS" BASIS,
-       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-       See the License for the specific language governing permissions and
-       limitations under the License.
-       
-       Author     : mattmann 
-       Description: Test parse-plugins.xml file. 
+    Copyright 2005 The Apache Software Foundation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+    Author     : mattmann 
+    Description: Test parse-plugins.xml file. 
 -->
 
 <parse-plugins>
 
-       <!--  by default if the mimeType is set to *, or 
-               can't be determined, use parse-text -->
-       <mimeType name="*">
-                <plugin id="parse-text" />
-       </mimeType>
-       
-    <!--  test these 4 plugins -->
-     <mimeType name="text/html">
-          <plugin id="parse-html"/>
-     </mimeType>
-       
-        <mimeType name="text/plain">
-          <plugin id="parse-text"/>
-        </mimeType>
+  <!--  by default if the mimeType is set to *, or 
+        can't be determined, use parse-text -->
+  <mimeType name="*">
+    <plugin id="parse-text" />
+  </mimeType>
+       
+  <!--  test these 4 plugins -->
+  <mimeType name="text/html">
+    <!--
+     ! Test that if a parser cannot be instantiated,
+     ! it does not block the process and the next one is used
+     !-->
+    <plugin id="parse-plugin-that-not-exist"/>
+    <plugin id="parse-html"/>
+  </mimeType>
+       
+  <mimeType name="text/plain">
+    <!-- Test that an extension-id can be directly used here -->
+    <plugin id="org.apache.nutch.parse.text.TextParser"/>
+  </mimeType>
         
-        <mimeType name="application/x-javascript">
-             <plugin id="parse-js"/>
-        </mimeType>
+  <mimeType name="application/x-javascript">
+    <plugin id="parse-js"/>
+  </mimeType>
         
-        <mimeType name="text/rss">
-             <plugin id="parse-rss"/>
-        </mimeType>
+  <mimeType name="text/rss">
+    <plugin id="parse-rss"/>
+  </mimeType>
 
+  <!--  alias mappings for parse-xxx names to the actual extension implementation
+  ids described in each plugin's plugin.xml file -->
+  <aliases>
+    <alias name="parse-html"
+           extension-id="org.apache.nutch.parse.html.HtmlParser" />
+    <alias name="parse-js"
+           extension-id="JSParser" />
+    <alias name="parse-rss"
+           extension-id="org.apache.nutch.parse.rss.RSSParser" />
+    <alias name="parse-text"
+           extension-id="org.apache.nutch.parse.text.TextParser" />    
+  </aliases>
 </parse-plugins>


Reply via email to