svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ parse-msword/ parse-msword/src/java/org/...

jerome Mon, 13 Feb 2006 13:28:40 -0800

Author: jerome
Date: Mon Feb 13 13:28:13 2006
New Revision: 377494

URL: http://svn.apache.org/viewcvs?rev=377494&view=rev
Log:
Make use of lib-parsems in word, powerpoint and excel parsers


Removed:
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
Modified:
    lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
    lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
    lucene/nutch/trunk/src/plugin/parse-msword/build.xml
    lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html

Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Mon Feb 13 13:28:13 
2006
@@ -2,19 +2,23 @@
 
 <project name="parse-msexcel" default="jar">
 
-       <import file="../build-plugin.xml" />
+  <import file="../build-plugin.xml" />
 
   <path id="plugin.deps">
     <fileset dir="../lib-jakarta-poi/lib">
       <include name="*.jar" />
     </fileset>
+    <fileset dir="../../../build/lib-parsems">
+      <include name="*.jar" />
+    </fileset>
   </path>
 
-       <!-- for junit test -->
-       <mkdir dir="${build.test}/data" />
-       <copy todir="${build.test}/data">
-               <fileset dir="sample">
-                       <include name="*.xls" />
-               </fileset>
-       </copy>
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.xls" />
+    </fileset>
+  </copy>
+
 </project>

Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Mon Feb 13 13:28:13 
2006
@@ -14,6 +14,7 @@
    <requires>
      <import plugin="nutch-extensionpoints"/>
      <import plugin="lib-jakarta-poi"/>
+     <import plugin="lib-parsems"/>
    </requires>
 
    <extension id="org.apache.nutch.parse.msexcel"

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -16,17 +16,17 @@
 package org.apache.nutch.parse.msexcel;
 
 // JDK imports
-import java.io.IOException;
 import java.io.InputStream;
-import java.util.Date;
-import java.util.Properties;
 
 // Jakarta POI imports
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+// Nutch imports
+import org.apache.nutch.parse.ms.MSExtractor;
+
 
 /**
  * Excel Text and Properties extractor.
@@ -34,10 +34,10 @@
  * @author Rohit Kulkarni & Ashish Vaidya
  * @author J&eacute;r&ocirc;me Charron
  */
-public class ExcelExtractor {
+class ExcelExtractor extends MSExtractor {
 
   
-  public String extractText(InputStream input) throws IOException {
+  protected String extractText(InputStream input) throws Exception {
     
     String resultText = "";
     HSSFWorkbook wb = new HSSFWorkbook(input);
@@ -88,45 +88,5 @@
     return resultText;
   }
   
-  
-  public Properties extractProperties(InputStream input) throws IOException {
-    
-    PropertiesBroker propertiesBroker = new PropertiesBroker();
-    POIFSReader reader = new POIFSReader();
-    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
-                            "\005SummaryInformation");
-    reader.read(input);
-    return propertiesBroker.getProperties();
-  }
-  
-  
-  class PropertiesBroker {
-    
-    private Properties properties;
-    private int timeoutMillis = 2 * 1000;
-    
-    
-    public synchronized Properties getProperties() {
-      
-      long start = new Date().getTime();
-      long now = start;
-      
-      while ((properties == null) && (now-start < timeoutMillis)) {
-        try {
-          wait(timeoutMillis / 10);
-        } catch (InterruptedException e) {}
-        now = new Date().getTime();
-      }
-      
-      notifyAll();
-      return properties;
-    }
-    
-    public synchronized void setProperties(Properties properties) {
-      this.properties = properties;
-      notifyAll();
-    }
-  }
-
 }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
 Mon Feb 13 13:28:13 2006
@@ -15,111 +15,36 @@
  */
 package org.apache.nutch.parse.msexcel;
 
-// JDK imports
-import java.io.ByteArrayInputStream;
-import java.util.Properties;
-import java.util.logging.Logger;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.LogFormatter;
-
 // Nutch imports
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ms.MSBaseParser;
 import org.apache.nutch.protocol.Content;
 
+
 /**
  * An Excel document parser.
  *
  * @author Rohit Kulkarni & Ashish Vaidya
  * @author J&eacute;r&ocirc;me Charron
  */
-public class MSExcelParser implements Parser {
-  
-  private Configuration conf;
-  
-  private static final Logger LOG = LogFormatter.getLogger(MSExcelParser.class.getName());
-
-  /** Creates a new instance of MSExcelParser */
-  public MSExcelParser() { }
-  
-  public Parse getParse(Content content) {
-    
-    String text = null;
-    String title = null;
-    Properties properties = null;
-    
-    try {
-      byte[] raw = content.getContent();
-      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
-      if ((contentLength != null) &&
-          (raw.length != Integer.parseInt(contentLength))) {
-        return new ParseStatus(ParseStatus.FAILED,
-                               ParseStatus.FAILED_TRUNCATED,
-                               "Content truncated at " + raw.length +" bytes. " +
-                               "Parser can't handle incomplete msexcelfile.")
-                               .getEmptyParse(this.conf);
-      }
-
-      ExcelExtractor extractor = new ExcelExtractor();      
-      // Extract text
-      text = extractor.extractText(new ByteArrayInputStream(raw));
-      // Extract properties
-      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
-      
-      //currently returning empty outlinks array
-      //outlinks = this.fetchOutlinks(resultText);
-      
-    } catch (Exception e) {
-      return new ParseStatus(ParseStatus.FAILED,
-                             "Can't be handled as msexcel document. " + e)
-                             .getEmptyParse(this.conf);
-    } finally {
-      // nothing so far
-    }
+public class MSExcelParser extends MSBaseParser {
     
-    // collect meta data
-    Metadata metadata = new Metadata();
-    title = properties.getProperty(DublinCore.TITLE);
-    properties.remove(DublinCore.TITLE);
-    metadata.setAll(properties);
-
-    if (text == null) { text = ""; }
-    if (title == null) { title = ""; }
-
-    // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
-
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-                                        outlinks, content.getMetadata(),
-                                        metadata);
-    parseData.setConf(this.conf);
-    return new ParseImpl(text, parseData);
-  }
+  /**
+   * Associated Mime type for Excel files
+   * (<code>application/vnd.ms-excel</code>).
+   */
+  public static final String MIME_TYPE = "application/vnd.ms-excel";
 
-
-  /* ---------------------------- *
-   * <implemenation:Configurable> *
-   * ---------------------------- */
   
-  public void setConf(Configuration conf) {
-    this.conf = conf;
+  public Parse getParse(Content content) {
+    return getParse(new ExcelExtractor(), content);
   }
 
-  public Configuration getConf() {
-    return this.conf;
+  /**
+   * Main for testing. Pass an excel document as argument
+   */
+  public static void main(String args[]) {
+    main(MIME_TYPE, new MSExcelParser(), args);
   }
-
-  /* ----------------------------- *
-   * </implemenation:Configurable> *
-   * ----------------------------- */
-
+  
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
 Mon Feb 13 13:28:13 2006
@@ -1,6 +1,6 @@
 <html>
 <body>
-<p>An Excel document parsing plugin.</p>
+<p>A Microsoft &copy; Excel document parsing plugin.</p>
 <p>This package relies on Jakarta <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
 </body>
 </html>

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml Mon Feb 13 
13:28:13 2006
@@ -8,6 +8,9 @@
              <fileset dir="../lib-jakarta-poi/lib">
                  <include name="*.jar" />
              </fileset>
+             <fileset dir="../../../build/lib-parsems">
+                 <include name="*.jar" />
+             </fileset>
         </path>
 
        <!-- for junit test -->

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml Mon Feb 13 
13:28:13 2006
@@ -14,6 +14,7 @@
    <requires>
       <import plugin="lib-jakarta-poi"/>
       <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-parsems"/>
    </requires>
 
    <extension id="net.nutch.parse.mspowerpoint"

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
 Mon Feb 13 13:28:13 2006
@@ -13,29 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.mspowerpoint;
 
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.util.Properties;
-import java.util.logging.Logger;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
+// Nutch imports
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ms.MSBaseParser;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-
-import org.apache.hadoop.util.LogFormatter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
 
 
 /**
@@ -45,133 +28,27 @@
  * It is based on org.apache.poi.*.
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
+ * @author J&eacute;r&ocirc;me Charron
  * @see <a href="http://jakarta.apache.org/poi">Jakarta POI</a>
- * @version 1.0
  */
-public class MSPowerPointParser implements Parser {
-
-  /** associated Mime type for PowerPoint files (application/vnd.ms-powerpoint) */
-  public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
-
-  private static final Logger LOG = LogFormatter
-      .getLogger(MSPowerPointParser.class.getName());
-
-  private Configuration conf;
+public class MSPowerPointParser extends MSBaseParser {
 
   /**
-   * 
+   * Associated Mime type for PowerPoint files
+   * (<code>application/vnd.ms-powerpoint</code>).
    */
-  public MSPowerPointParser() {
-  }
+  public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
 
-  /**
-   * Main for testing. Pass a ppt-file as argument
-   * 
-   * @param args
-   */
-  public static void main(String args[]) {
-    if (args.length < 1) {
-      System.err.println("Useage:");
-      System.err.println("\tMSPowerPointParser <file>");
-      System.exit(1);
-    }
-
-    String file = args[0];
-    MSPowerPointParser ppe = new MSPowerPointParser();
-
-    byte[] raw = getRawBytes(new File(file));
-
-    Metadata meta = new Metadata();
-    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
-    Content content = new Content(file, file, raw, MIME_TYPE, meta, NutchConfiguration.create());
 
-    System.out.println(ppe.getParse(content).getText());
-  }
-
-  /**
-   * Parses the MS PowerPoint file.
-   * 
-   * @see org.apache.nutch.parse.Parser#getParse(Content)
-   */
   public Parse getParse(final Content content) {
-
-    String plainText = null;
-    String title = null;
-    Outlink[] outlinks = null;
-    Properties properties = null;
-
-    try {
-      final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH);
-      final byte[] raw = content.getContent();
-
-      if (contentLen != null && raw.length != Integer.parseInt(contentLen)) {
-        return new ParseStatus(
-            ParseStatus.FAILED,
-            ParseStatus.FAILED_TRUNCATED,
-            "Content truncated at "
-                + raw.length
-                + " bytes. Please increase <protocol>.content.limit at nutch-default.xml. "
-                + "Parser can't handle incomplete PowerPoint files.")
-            .getEmptyParse(getConf());
-      }
-
-      final PPTExtractor extractor = new PPTExtractor(new ByteArrayInputStream(
-          raw));
-
-      plainText = extractor.getText();
-      properties = extractor.getProperties();
-      outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl(), getConf());
-
-    } catch (Exception e) {
-      LOG.throwing(this.getClass().getName(), "getParse", e);
-      return new ParseStatus(e).getEmptyParse(getConf());
-    }
-
-    Metadata metadata = new Metadata();
-
-    if (properties != null) {
-      title = properties.getProperty(Metadata.TITLE);
-      properties.remove(Metadata.TITLE);
-      metadata.setAll(properties);
-    }
-
-    if (plainText == null) {
-      plainText = "";
-    }
-
-    if (title == null) {
-      title = "";
-    }
-
-    final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
-    final ParseData parseData = new ParseData(status, title, outlinks, metadata);
-    parseData.setConf(this.conf);
-
-    LOG.finest("PowerPoint file parsed sucessful.");
-    return new ParseImpl(plainText, parseData);
+    return getParse(new PPTExtractor(), content);
   }
   
-  private final static byte[] getRawBytes(File f) {
-    try {
-      if (!f.exists())
-        return null;
-      FileInputStream fin = new FileInputStream(f);
-      byte[] buffer = new byte[(int) f.length()];
-      fin.read(buffer);
-      fin.close();
-      return buffer;
-    } catch (Exception err) {
-      err.printStackTrace();
-      return null;
-    }
-
+  /**
+   * Main for testing. Pass a powerpoint document as argument
+   */
+  public static void main(String args[]) {
+    main(MIME_TYPE, new MSPowerPointParser(), args);
   }
   
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -13,141 +13,44 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.mspowerpoint;
 
+// JDK imports
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Date;
-import java.util.Properties;
-import java.util.logging.Logger;
 
-import org.apache.hadoop.util.LogFormatter;
-import org.apache.poi.hpsf.SummaryInformation;
+// Nutch imports
+import org.apache.nutch.parse.ms.MSExtractor;
+
+// Jakarta POI imports
 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
 
+
 /**
  * Converts the Powerpoint document content to plain text.
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
- * 
- * @version 1.0
+ * @author J&eacute;r&ocirc;me Charron
  */
+class PPTExtractor extends MSExtractor {
 
-public class PPTExtractor {
-
-  private static final Logger LOG = LogFormatter.getLogger(PPTExtractor.class
-      .getName());
-
-  /** Parsed plain Powerpoint Text */
-  private final transient StringBuffer contentBuf;
-
-  private final PropertiesBroker propertiesBroker;
-
-  private final POIFSReader poireader;
-
-  /**
-   * Constructor that takes a PowerPoint file as <code>InputStream</code> to
-   * parse it.
-   * 
-   * @param in
-   *          <code>InputStream</code> containing the PowerPoint file
-   * @throws PowerPointDocumentException
-   *           thrown if parsing failed
-   */
-  public PPTExtractor(final InputStream in) throws PowerPointDocumentException {
-    this.poireader = new POIFSReader();
-    this.propertiesBroker = new PropertiesBroker();
-    this.contentBuf = new StringBuffer();
-
-    this.init(in);
-  }
-
-  /**
-   * Get the PowerPoint content text as plain text
-   * 
-   * @return String the content text
-   */
-  public String getText() {
-    return this.contentBuf.toString();
-  }
-
-  /**
-   * Get the <code>Properties</code> of the PowerPoint document.
-   * 
-   * @return the properties of the document
-   */
-  public Properties getProperties() {
-    return this.propertiesBroker.getProperties();
-  }
-
-  /**
-   * @param input
-   * @throws PowerPointDocumentException
-   */
-  private void init(final InputStream input) throws PowerPointDocumentException {
-    // register listener for SummaryInformation
-    this.poireader.registerListener(new PropertiesReaderListener(
-        this.propertiesBroker), SummaryInformation.DEFAULT_STREAM_NAME);
-
-    // register listener for PPT-document content
-    this.poireader.registerListener(new ContentReaderListener(this.contentBuf),
-        PPTConstants.POWERPOINT_DOCUMENT);
-
-    try {
-      input.reset();
-      if (input.available() > 0) {
-        this.poireader.read(input);
-      } else {
-        LOG.warning("Input <=0 :" + input.available());
-      }
-    } catch (IOException e) {
-      throw new PowerPointDocumentException(e);
+  private StringBuffer text = null;
+  private POIFSReader reader = null;
+  
+  
+  protected String extractText(InputStream input) throws Exception {
+    this.reader = new POIFSReader();
+    this.text = new StringBuffer();
+    reader.registerListener(
+            new ContentReaderListener(this.text),
+            PPTConstants.POWERPOINT_DOCUMENT);
+    input.reset();
+    if (input.available() > 0) {
+      this.reader.read(input);
+    } else {
+      LOG.warning("Input <=0 :" + input.available());
     }
+    return (this.text != null) ? text.toString() : null;
   }
 
-  /**
-   * The PropertiesBroker
-   * 
-   * @author Stephan Strittmatter
-   * @version 1.0
-   */
-  static class PropertiesBroker {
-
-    private final static int TIMEOUT = 2 * 1000;
-
-    private Properties properties = null;
-
-    /**
-     * Get the collected properties.
-     * 
-     * @return properties of the PowerPoint file
-     */
-    public synchronized Properties getProperties() {
-
-      final long start = new Date().getTime();
-      long now = start;
-
-      while (this.properties == null && now - start < TIMEOUT) {
-        try {
-          wait(TIMEOUT / 10);
-        } catch (InterruptedException e) {
-        }
-        now = new Date().getTime();
-      }
-
-      notifyAll();
-
-      return this.properties;
-    }
-
-    /**
-     * 
-     * @param properties
-     */
-    public synchronized void setProperties(Properties properties) {
-      this.properties = properties;
-      notifyAll();
-    }
-  }
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
 Mon Feb 13 13:28:13 2006
@@ -21,8 +21,9 @@
        </head>
        <body>
                <p>A Microsoft &copy; PowerPoint document parsing plugin.</p>
-               <p>This package relies on <a 
-                       href="http://www.apache.org/poi/index.html">POI</a>.</p>
+                <p>This package relies on Jakarta
+                   <a href="http://jakarta.apache.org/poi/index.html">POI</a>.
+                </p>
                <p> Implementation based on sources found at <a 
                        href="http://groups.google.com/groups?selm=a4f8800541bc694d5af7dabb35e83b72%40localhost.talkaboutsoftware.com">Google
                        Groups </a>. It can also be found at <a 

Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Mon Feb 13 13:28:13 
2006
@@ -8,6 +8,9 @@
     <fileset dir="../lib-jakarta-poi/lib">
       <include name="*.jar" />
     </fileset>
+    <fileset dir="../../../build/lib-parsems">
+      <include name="*.jar" />
+    </fileset>
   </path>
 
   <!-- for junit test -->

Modified: lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml Mon Feb 13 13:28:13 
2006
@@ -14,6 +14,7 @@
    <requires>
       <import plugin="nutch-extensionpoints"/>
       <import plugin="lib-jakarta-poi"/>
+      <import plugin="lib-parsems"/>
    </requires>
 
    <extension id="org.apache.nutch.parse.msword"

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
 Mon Feb 13 13:28:13 2006
@@ -12,22 +12,12 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-
 package org.apache.nutch.parse.msword;
 
-/**
- * <p>Title: </p>
- * <p>Description: </p>
- * <p>Copyright: Copyright (c) 2003</p>
- * <p>Company: </p>
- * @author not attributable
- * @version 1.0
- */
 
-public class FastSavedException extends Exception
-{
-  public FastSavedException(String msg)
-  {
+public class FastSavedException extends Exception {
+  
+  public FastSavedException(String msg) {
     super(msg);
   }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
 Mon Feb 13 13:28:13 2006
@@ -13,118 +13,41 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.msword;
 
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
+// Nutch imports
 import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ms.MSBaseParser;
 
-import java.util.Properties;
-import java.io.ByteArrayInputStream;
 
 /**
- * parser for mime type application/msword.
+ * Parser for mime type application/msword.
  * It is based on org.apache.poi.*. We have to see how well it performs.
  *
  * @author John Xing
- *
- * Note on 20040614 by Xing:
- * Some codes are stacked here for convenience (see inline comments).
- * They may be moved to more appropriate places when new codebase
- * stabilizes, especially after code for indexing is written.
- *
  * @author Andy Hedges
- * code to extract all msword properties.
- *
+ * @author J&eacute;r&ocirc;me Charron
  */
 
-public class MSWordParser implements Parser {
-  private Configuration conf;
+public class MSWordParser extends MSBaseParser {
 
-//  public static final Logger LOG =
-//    LogFormatter.getLogger("org.apache.nutch.parse.msword");
-
-  public MSWordParser () {}
+  /**
+   * Associated Mime type for Word files
+   * (<code>application/msword</code>).
+   */
+  public static final String MIME_TYPE = "application/msword";
 
+  
   public Parse getParse(Content content) {
-
-    String text = null;
-    String title = null;
-    Properties properties = null;
-
-    try {
-
-      byte[] raw = content.getContent();
-
-      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
-      if (contentLength != null
-            && raw.length != Integer.parseInt(contentLength)) {
-          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
-                  "Content truncated at " + raw.length
-            +" bytes. Parser can't handle incomplete msword file.").getEmptyParse(this.conf);
-      }
-
-      WordExtractor extractor = new WordExtractor();
-
-      // collect text
-      text = extractor.extractText(new ByteArrayInputStream(raw));
-
-      // collect meta info
-      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
-
-      extractor = null;
-
-    } catch (ParseException e) {
-      return new ParseStatus(e).getEmptyParse(this.conf);
-    } catch (FastSavedException e) {
-      return new ParseStatus(e).getEmptyParse(this.conf);
-    } catch (PasswordProtectedException e) {
-      return new ParseStatus(e).getEmptyParse(this.conf);
-    } catch (Exception e) { // run time exception
-      return new ParseStatus(ParseStatus.FAILED,
-              "Can't be handled as msword document. " + e).getEmptyParse(this.conf);
-    } finally {
-      // nothing so far
-    }
-
-    // collect meta data
-    Metadata metadata = new Metadata();
-    title = properties.getProperty(DublinCore.TITLE);
-    properties.remove(DublinCore.TITLE);
-    metadata.setAll(properties);
-
-    if (text == null) { text = ""; }
-    if (title == null) { title = ""; }
-
-    // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
-
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-                                        outlinks, content.getMetadata(),
-                                        metadata);
-    parseData.setConf(this.conf);
-    return new ParseImpl(text, parseData);
-    // any filter?
-    //return HtmlParseFilters.filter(content, parse, root);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
+    return getParse(new WordExtractor(), content);
   }
 
-  public Configuration getConf() {
-    return this.conf;
+  /**
+   * Main for testing. Pass an word document as argument
+   */
+  public static void main(String args[]) {
+    main(MIME_TYPE, new MSWordParser(), args);
   }
 
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
 Mon Feb 13 13:28:13 2006
@@ -14,11 +14,10 @@
  */
 package org.apache.nutch.parse.msword;
 
-public class PasswordProtectedException
-  extends Exception
-{
-  public PasswordProtectedException(String msg)
-  {
+
+public class PasswordProtectedException extends Exception {
+
+  public PasswordProtectedException(String msg) {
     super(msg);
   }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -14,42 +14,47 @@
  */
 package org.apache.nutch.parse.msword;
 
-import org.apache.poi.hpsf.*;
-import org.apache.poi.hwpf.model.*;
-import org.apache.poi.hwpf.sprm.*;
-import org.apache.poi.poifs.eventfilesystem.*;
-import org.apache.poi.poifs.filesystem.*;
+// JDK imports
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+// Jakarta POI imports
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.sprm.SprmIterator;
+import org.apache.poi.hwpf.sprm.SprmOperation;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.LittleEndian;
-import org.apache.nutch.metadata.Metadata;
 
-import java.util.*;
-import java.io.*;
+// Nutch imports
+import org.apache.nutch.parse.ms.MSExtractor;
+
 
 /**
  * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
  *
  * @author Ryan Ackley
- *
  * @author Andy Hedges
- * code to extract all msword properties.
+ * @author J&eacute;r&ocirc;me Charron
  *
  */
-public class WordExtractor {
+class WordExtractor extends MSExtractor {
 
-  /**
-   * Constructor
-   */
-  public WordExtractor()
-  {
-  }
 
   /**
    * Gets the text from a Word document.
    *
    * @param in The InputStream representing the Word file.
    */
-  public String extractText(InputStream in) throws Exception
-  {
+  protected String extractText(InputStream in) throws Exception {
+
     ArrayList text = new ArrayList();
     POIFSFileSystem fsys = new POIFSFileSystem(in);
 
@@ -221,128 +226,5 @@
     return false;
   }
 
-  public Properties extractProperties(InputStream in)
-                      throws IOException {
-
-    PropertiesBroker propertiesBroker = new PropertiesBroker();
-    POIFSReader reader = new POIFSReader();
-    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
-                            "\005SummaryInformation");
-    reader.read(in);
-    return propertiesBroker.getProperties();
-  }
-
-  class PropertiesReaderListener
-    implements POIFSReaderListener {
-
-    private PropertiesBroker propertiesBroker;
-    private Properties metaData = new Properties();
-
-    public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
-      this.propertiesBroker = propertiesBroker;
-    }
-
-    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
-
-      SummaryInformation si = null;
-      Properties properties = new Properties();
-
-      try {
-        si = (SummaryInformation)PropertySetFactory.create(event.getStream());
-      } catch (Exception ex) {
-        properties = null;
-      }
-
-      Date tmp = null;
-
-      String title = si.getTitle();
-      String applicationName = si.getApplicationName();
-      String author = si.getAuthor();
-      int charCount = si.getCharCount();
-      String comments = si.getComments();
-      Date createDateTime = si.getCreateDateTime();
-      long editTime = si.getEditTime();
-      String keywords = si.getKeywords();
-      String lastAuthor = si.getLastAuthor();
-      Date lastPrinted = si.getLastPrinted();
-      Date lastSaveDateTime = si.getLastSaveDateTime();
-      int pageCount = si.getPageCount();
-      String revNumber = si.getRevNumber();
-      int security = si.getSecurity();
-      String subject = si.getSubject();
-      String template = si.getTemplate();
-      int wordCount = si.getWordCount();
-
-      /*Dates are being stored in millis since the epoch to aid
-      localization*/
-      if(title != null)
-        properties.setProperty(Metadata.TITLE, title);
-      if(applicationName != null)
-        properties.setProperty(Metadata.APPLICATION_NAME, applicationName);
-      if(author != null)
-        properties.setProperty(Metadata.AUTHOR, author);
-      if(charCount != 0)
-        properties.setProperty(Metadata.CHARACTER_COUNT, charCount + "");
-      if(comments != null)
-        properties.setProperty(Metadata.COMMENTS, comments);
-      if(createDateTime != null)
-        properties.setProperty(Metadata.DATE,
-                               Metadata.DATE_FORMAT.format(createDateTime));
-      if(editTime != 0)
-        properties.setProperty(Metadata.LAST_MODIFIED, editTime + "");
-      if(keywords != null)
-        properties.setProperty(Metadata.KEYWORDS, keywords);
-      if(lastAuthor != null)
-        properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor);
-      if(lastPrinted != null)
-        properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + "");
-      if(lastSaveDateTime != null)
-        properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() + "");
-      if(pageCount != 0)
-        properties.setProperty(Metadata.PAGE_COUNT, pageCount + "");
-      if(revNumber != null)
-        properties.setProperty(Metadata.REVISION_NUMBER, revNumber);
-      if(security != 0)
-        properties.setProperty(Metadata.RIGHTS, security + "");
-      if(subject != null)
-        properties.setProperty(Metadata.SUBJECT, subject);
-      if(template != null)
-        properties.setProperty(Metadata.TEMPLATE, template);
-      if(wordCount != 0)
-        properties.setProperty(Metadata.WORD_COUNT, wordCount + "");
-      propertiesBroker.setProperties(properties);
-
-      //si.getThumbnail(); // can't think of a sensible way of turning this into a string.
-    }
-  }
-
-  class PropertiesBroker {
-
-    private Properties properties;
-    private int timeoutMillis = 2 * 1000;
-
-
-    public synchronized Properties getProperties() {
-
-      long start = new Date().getTime();
-      long now = start;
-
-      while (properties == null && now - start < timeoutMillis) {
-        try {
-          wait(timeoutMillis / 10);
-        } catch (InterruptedException e) {}
-        now = new Date().getTime();
-      }
-
-      notifyAll();
-
-      return properties;
-    }
-
-    public synchronized void setProperties(Properties properties) {
-      this.properties = properties;
-      notifyAll();
-    }
-  }
 }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html
 Mon Feb 13 13:28:13 2006
@@ -1,5 +1,6 @@
 <html>
 <body>
-<p>A Word document parsing plugin.</p><p>This package relies on <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
+<p>A Microsoft &copy; Word document parsing plugin.</p>
+<p>This package relies on <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
 </body>
 </html>

svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ parse-msword/ parse-msword/src/java/org/...

Reply via email to