[Nutch-cvs] svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ parse-msword/ parse-msword/src/java/org/...

jerome Mon, 13 Feb 2006 13:29:04 -0800

Author: jerome
Date: Mon Feb 13 13:28:13 2006
New Revision: 377494

URL: http://svn.apache.org/viewcvs?rev=377494&view=rev
Log:
Make use of lib-parsems in word, powerpoint and excel parsers


Removed:
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/PropertiesReaderListener.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PowerPointDocumentException.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
Modified:
    lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
    lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
    
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
    
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
    lucene/nutch/trunk/src/plugin/parse-msword/build.xml
    lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
    
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html

Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/build.xml Mon Feb 13 13:28:13 
2006
@@ -2,19 +2,23 @@
 
 <project name="parse-msexcel" default="jar">
 
-       <import file="../build-plugin.xml" />
+  <import file="../build-plugin.xml" />
 
   <path id="plugin.deps">
     <fileset dir="../lib-jakarta-poi/lib">
       <include name="*.jar" />
     </fileset>
+    <fileset dir="../../../build/lib-parsems">
+      <include name="*.jar" />
+    </fileset>
   </path>
 
-       <!-- for junit test -->
-       <mkdir dir="${build.test}/data" />
-       <copy todir="${build.test}/data">
-               <fileset dir="sample">
-                       <include name="*.xls" />
-               </fileset>
-       </copy>
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.xls" />
+    </fileset>
+  </copy>
+
 </project>

Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/plugin.xml Mon Feb 13 13:28:13 
2006
@@ -14,6 +14,7 @@
    <requires>
      <import plugin="nutch-extensionpoints"/>
      <import plugin="lib-jakarta-poi"/>
+     <import plugin="lib-parsems"/>
    </requires>
 
    <extension id="org.apache.nutch.parse.msexcel"

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ExcelExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -16,17 +16,17 @@
 package org.apache.nutch.parse.msexcel;
 
 // JDK imports
-import java.io.IOException;
 import java.io.InputStream;
-import java.util.Date;
-import java.util.Properties;
 
 // Jakarta POI imports
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+// Nutch imports
+import org.apache.nutch.parse.ms.MSExtractor;
+
 
 /**
  * Excel Text and Properties extractor.
@@ -34,10 +34,10 @@
  * @author Rohit Kulkarni & Ashish Vaidya
  * @author J&eacute;r&ocirc;me Charron
  */
-public class ExcelExtractor {
+class ExcelExtractor extends MSExtractor {
 
   
-  public String extractText(InputStream input) throws IOException {
+  protected String extractText(InputStream input) throws Exception {
     
     String resultText = "";
     HSSFWorkbook wb = new HSSFWorkbook(input);
@@ -88,45 +88,5 @@
     return resultText;
   }
   
-  
-  public Properties extractProperties(InputStream input) throws IOException {
-    
-    PropertiesBroker propertiesBroker = new PropertiesBroker();
-    POIFSReader reader = new POIFSReader();
-    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
-                            "\005SummaryInformation");
-    reader.read(input);
-    return propertiesBroker.getProperties();
-  }
-  
-  
-  class PropertiesBroker {
-    
-    private Properties properties;
-    private int timeoutMillis = 2 * 1000;
-    
-    
-    public synchronized Properties getProperties() {
-      
-      long start = new Date().getTime();
-      long now = start;
-      
-      while ((properties == null) && (now-start < timeoutMillis)) {
-        try {
-          wait(timeoutMillis / 10);
-        } catch (InterruptedException e) {}
-        now = new Date().getTime();
-      }
-      
-      notifyAll();
-      return properties;
-    }
-    
-    public synchronized void setProperties(Properties properties) {
-      this.properties = properties;
-      notifyAll();
-    }
-  }
-
 }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
 Mon Feb 13 13:28:13 2006
@@ -15,111 +15,36 @@
  */
 package org.apache.nutch.parse.msexcel;
 
-// JDK imports
-import java.io.ByteArrayInputStream;
-import java.util.Properties;
-import java.util.logging.Logger;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.LogFormatter;
-
 // Nutch imports
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ms.MSBaseParser;
 import org.apache.nutch.protocol.Content;
 
+
 /**
  * An Excel document parser.
  *
  * @author Rohit Kulkarni & Ashish Vaidya
  * @author J&eacute;r&ocirc;me Charron
  */
-public class MSExcelParser implements Parser {
-  
-  private Configuration conf;
-  
-  private static final Logger LOG = 
LogFormatter.getLogger(MSExcelParser.class.getName());
-
-  /** Creates a new instance of MSExcelParser */
-  public MSExcelParser() { }
-  
-  public Parse getParse(Content content) {
-    
-    String text = null;
-    String title = null;
-    Properties properties = null;
-    
-    try {
-      byte[] raw = content.getContent();
-      String contentLength = 
content.getMetadata().get(Metadata.CONTENT_LENGTH);
-      if ((contentLength != null) &&
-          (raw.length != Integer.parseInt(contentLength))) {
-        return new ParseStatus(ParseStatus.FAILED,
-                               ParseStatus.FAILED_TRUNCATED,
-                               "Content truncated at " + raw.length +" bytes. 
" +
-                               "Parser can't handle incomplete msexcelfile.")
-                               .getEmptyParse(this.conf);
-      }
-
-      ExcelExtractor extractor = new ExcelExtractor();      
-      // Extract text
-      text = extractor.extractText(new ByteArrayInputStream(raw));
-      // Extract properties
-      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
-      
-      //currently returning empty outlinks array
-      //outlinks = this.fetchOutlinks(resultText);
-      
-    } catch (Exception e) {
-      return new ParseStatus(ParseStatus.FAILED,
-                             "Can't be handled as msexcel document. " + e)
-                             .getEmptyParse(this.conf);
-    } finally {
-      // nothing so far
-    }
+public class MSExcelParser extends MSBaseParser {
     
-    // collect meta data
-    Metadata metadata = new Metadata();
-    title = properties.getProperty(DublinCore.TITLE);
-    properties.remove(DublinCore.TITLE);
-    metadata.setAll(properties);
-
-    if (text == null) { text = ""; }
-    if (title == null) { title = ""; }
-
-    // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
-
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-                                        outlinks, content.getMetadata(),
-                                        metadata);
-    parseData.setConf(this.conf);
-    return new ParseImpl(text, parseData);
-  }
+  /**
+   * Associated Mime type for Excel files
+   * (<code>application/vnd.ms-excel</code>).
+   */
+  public static final String MIME_TYPE = "application/vnd.ms-excel";
 
-
-  /* ---------------------------- *
-   * <implemenation:Configurable> *
-   * ---------------------------- */
   
-  public void setConf(Configuration conf) {
-    this.conf = conf;
+  public Parse getParse(Content content) {
+    return getParse(new ExcelExtractor(), content);
   }
 
-  public Configuration getConf() {
-    return this.conf;
+  /**
+   * Main for testing. Pass an excel document as argument
+   */
+  public static void main(String args[]) {
+    main(MIME_TYPE, new MSExcelParser(), args);
   }
-
-  /* ----------------------------- *
-   * </implemenation:Configurable> *
-   * ----------------------------- */
-
+  
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/package.html
 Mon Feb 13 13:28:13 2006
@@ -1,6 +1,6 @@
 <html>
 <body>
-<p>An Excel document parsing plugin.</p>
+<p>A Microsoft &copy; Excel document parsing plugin.</p>
 <p>This package relies on Jakarta <a 
href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
 </body>
 </html>

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/build.xml Mon Feb 13 
13:28:13 2006
@@ -8,6 +8,9 @@
              <fileset dir="../lib-jakarta-poi/lib">
                  <include name="*.jar" />
              </fileset>
+             <fileset dir="../../../build/lib-parsems">
+                 <include name="*.jar" />
+             </fileset>
         </path>
 
        <!-- for junit test -->

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/plugin.xml Mon Feb 13 
13:28:13 2006
@@ -14,6 +14,7 @@
    <requires>
       <import plugin="lib-jakarta-poi"/>
       <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-parsems"/>
    </requires>
 
    <extension id="net.nutch.parse.mspowerpoint"

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
 Mon Feb 13 13:28:13 2006
@@ -13,29 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.mspowerpoint;
 
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.util.Properties;
-import java.util.logging.Logger;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
+// Nutch imports
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ms.MSBaseParser;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-
-import org.apache.hadoop.util.LogFormatter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
 
 
 /**
@@ -45,133 +28,27 @@
  * It is based on org.apache.poi.*.
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
+ * @author J&eacute;r&ocirc;me Charron
  * @see <a href="http://jakarta.apache.org/poi">Jakarta POI</a>
- * @version 1.0
  */
-public class MSPowerPointParser implements Parser {
-
-  /** associated Mime type for PowerPoint files 
(application/vnd.ms-powerpoint) */
-  public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
-
-  private static final Logger LOG = LogFormatter
-      .getLogger(MSPowerPointParser.class.getName());
-
-  private Configuration conf;
+public class MSPowerPointParser extends MSBaseParser {
 
   /**
-   * 
+   * Associated Mime type for PowerPoint files
+   * (<code>application/vnd.ms-powerpoint</code>).
    */
-  public MSPowerPointParser() {
-  }
+  public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
 
-  /**
-   * Main for testing. Pass a ppt-file as argument
-   * 
-   * @param args
-   */
-  public static void main(String args[]) {
-    if (args.length < 1) {
-      System.err.println("Useage:");
-      System.err.println("\tMSPowerPointParser <file>");
-      System.exit(1);
-    }
-
-    String file = args[0];
-    MSPowerPointParser ppe = new MSPowerPointParser();
-
-    byte[] raw = getRawBytes(new File(file));
-
-    Metadata meta = new Metadata();
-    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
-    Content content = new Content(file, file, raw, MIME_TYPE, meta, 
NutchConfiguration.create());
 
-    System.out.println(ppe.getParse(content).getText());
-  }
-
-  /**
-   * Parses the MS PowerPoint file.
-   * 
-   * @see org.apache.nutch.parse.Parser#getParse(Content)
-   */
   public Parse getParse(final Content content) {
-
-    String plainText = null;
-    String title = null;
-    Outlink[] outlinks = null;
-    Properties properties = null;
-
-    try {
-      final String contentLen = 
content.getMetadata().get(Response.CONTENT_LENGTH);
-      final byte[] raw = content.getContent();
-
-      if (contentLen != null && raw.length != Integer.parseInt(contentLen)) {
-        return new ParseStatus(
-            ParseStatus.FAILED,
-            ParseStatus.FAILED_TRUNCATED,
-            "Content truncated at "
-                + raw.length
-                + " bytes. Please increase <protocol>.content.limit at 
nutch-default.xml. "
-                + "Parser can't handle incomplete PowerPoint files.")
-            .getEmptyParse(getConf());
-      }
-
-      final PPTExtractor extractor = new PPTExtractor(new ByteArrayInputStream(
-          raw));
-
-      plainText = extractor.getText();
-      properties = extractor.getProperties();
-      outlinks = OutlinkExtractor.getOutlinks(plainText, content.getUrl(), 
getConf());
-
-    } catch (Exception e) {
-      LOG.throwing(this.getClass().getName(), "getParse", e);
-      return new ParseStatus(e).getEmptyParse(getConf());
-    }
-
-    Metadata metadata = new Metadata();
-
-    if (properties != null) {
-      title = properties.getProperty(Metadata.TITLE);
-      properties.remove(Metadata.TITLE);
-      metadata.setAll(properties);
-    }
-
-    if (plainText == null) {
-      plainText = "";
-    }
-
-    if (title == null) {
-      title = "";
-    }
-
-    final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
-    final ParseData parseData = new ParseData(status, title, outlinks, 
metadata);
-    parseData.setConf(this.conf);
-
-    LOG.finest("PowerPoint file parsed sucessful.");
-    return new ParseImpl(plainText, parseData);
+    return getParse(new PPTExtractor(), content);
   }
   
-  private final static byte[] getRawBytes(File f) {
-    try {
-      if (!f.exists())
-        return null;
-      FileInputStream fin = new FileInputStream(f);
-      byte[] buffer = new byte[(int) f.length()];
-      fin.read(buffer);
-      fin.close();
-      return buffer;
-    } catch (Exception err) {
-      err.printStackTrace();
-      return null;
-    }
-
+  /**
+   * Main for testing. Pass a powerpoint document as argument
+   */
+  public static void main(String args[]) {
+    main(MIME_TYPE, new MSPowerPointParser(), args);
   }
   
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -13,141 +13,44 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.mspowerpoint;
 
+// JDK imports
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Date;
-import java.util.Properties;
-import java.util.logging.Logger;
 
-import org.apache.hadoop.util.LogFormatter;
-import org.apache.poi.hpsf.SummaryInformation;
+// Nutch imports
+import org.apache.nutch.parse.ms.MSExtractor;
+
+// Jakarta POI imports
 import org.apache.poi.poifs.eventfilesystem.POIFSReader;
 
+
 /**
  * Converts the Powerpoint document content to plain text.
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
- * 
- * @version 1.0
+ * @author J&eacute;r&ocirc;me Charron
  */
+class PPTExtractor extends MSExtractor {
 
-public class PPTExtractor {
-
-  private static final Logger LOG = LogFormatter.getLogger(PPTExtractor.class
-      .getName());
-
-  /** Parsed plain Powerpoint Text */
-  private final transient StringBuffer contentBuf;
-
-  private final PropertiesBroker propertiesBroker;
-
-  private final POIFSReader poireader;
-
-  /**
-   * Constructor that takes a PowerPoint file as <code>InputStream</code> to
-   * parse it.
-   * 
-   * @param in
-   *          <code>InputStream</code> containing the PowerPoint file
-   * @throws PowerPointDocumentException
-   *           thrown if parsing failed
-   */
-  public PPTExtractor(final InputStream in) throws PowerPointDocumentException 
{
-    this.poireader = new POIFSReader();
-    this.propertiesBroker = new PropertiesBroker();
-    this.contentBuf = new StringBuffer();
-
-    this.init(in);
-  }
-
-  /**
-   * Get the PowerPoint content text as plain text
-   * 
-   * @return String the content text
-   */
-  public String getText() {
-    return this.contentBuf.toString();
-  }
-
-  /**
-   * Get the <code>Properties</code> of the PowerPoint document.
-   * 
-   * @return the properties of the document
-   */
-  public Properties getProperties() {
-    return this.propertiesBroker.getProperties();
-  }
-
-  /**
-   * @param input
-   * @throws PowerPointDocumentException
-   */
-  private void init(final InputStream input) throws 
PowerPointDocumentException {
-    // register listener for SummaryInformation
-    this.poireader.registerListener(new PropertiesReaderListener(
-        this.propertiesBroker), SummaryInformation.DEFAULT_STREAM_NAME);
-
-    // register listener for PPT-document content
-    this.poireader.registerListener(new ContentReaderListener(this.contentBuf),
-        PPTConstants.POWERPOINT_DOCUMENT);
-
-    try {
-      input.reset();
-      if (input.available() > 0) {
-        this.poireader.read(input);
-      } else {
-        LOG.warning("Input <=0 :" + input.available());
-      }
-    } catch (IOException e) {
-      throw new PowerPointDocumentException(e);
+  private StringBuffer text = null;
+  private POIFSReader reader = null;
+  
+  
+  protected String extractText(InputStream input) throws Exception {
+    this.reader = new POIFSReader();
+    this.text = new StringBuffer();
+    reader.registerListener(
+            new ContentReaderListener(this.text),
+            PPTConstants.POWERPOINT_DOCUMENT);
+    input.reset();
+    if (input.available() > 0) {
+      this.reader.read(input);
+    } else {
+      LOG.warning("Input <=0 :" + input.available());
     }
+    return (this.text != null) ? text.toString() : null;
   }
 
-  /**
-   * The PropertiesBroker
-   * 
-   * @author Stephan Strittmatter
-   * @version 1.0
-   */
-  static class PropertiesBroker {
-
-    private final static int TIMEOUT = 2 * 1000;
-
-    private Properties properties = null;
-
-    /**
-     * Get the collected properties.
-     * 
-     * @return properties of the PowerPoint file
-     */
-    public synchronized Properties getProperties() {
-
-      final long start = new Date().getTime();
-      long now = start;
-
-      while (this.properties == null && now - start < TIMEOUT) {
-        try {
-          wait(TIMEOUT / 10);
-        } catch (InterruptedException e) {
-        }
-        now = new Date().getTime();
-      }
-
-      notifyAll();
-
-      return this.properties;
-    }
-
-    /**
-     * 
-     * @param properties
-     */
-    public synchronized void setProperties(Properties properties) {
-      this.properties = properties;
-      notifyAll();
-    }
-  }
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/package.html
 Mon Feb 13 13:28:13 2006
@@ -21,8 +21,9 @@
        </head>
        <body>
                <p>A Microsoft &copy; PowerPoint document parsing plugin.</p>
-               <p>This package relies on <a 
-                       href="http://www.apache.org/poi/index.html">POI</a>.</p>
+                <p>This package relies on Jakarta
+                   <a href="http://jakarta.apache.org/poi/index.html">POI</a>.
+                </p>
                <p> Implementation based on sources found at <a 
                        
href="http://groups.google.com/groups?selm=a4f8800541bc694d5af7dabb35e83b72%40localhost.talkaboutsoftware.com">Google
 
                        Groups </a>. It can also be found at <a 

Modified: lucene/nutch/trunk/src/plugin/parse-msword/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/build.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/build.xml Mon Feb 13 13:28:13 
2006
@@ -8,6 +8,9 @@
     <fileset dir="../lib-jakarta-poi/lib">
       <include name="*.jar" />
     </fileset>
+    <fileset dir="../../../build/lib-parsems">
+      <include name="*.jar" />
+    </fileset>
   </path>
 
   <!-- for junit test -->

Modified: lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/plugin.xml Mon Feb 13 13:28:13 
2006
@@ -14,6 +14,7 @@
    <requires>
       <import plugin="nutch-extensionpoints"/>
       <import plugin="lib-jakarta-poi"/>
+      <import plugin="lib-parsems"/>
    </requires>
 
    <extension id="org.apache.nutch.parse.msword"

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/FastSavedException.java
 Mon Feb 13 13:28:13 2006
@@ -12,22 +12,12 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-
 package org.apache.nutch.parse.msword;
 
-/**
- * <p>Title: </p>
- * <p>Description: </p>
- * <p>Copyright: Copyright (c) 2003</p>
- * <p>Company: </p>
- * @author not attributable
- * @version 1.0
- */
 
-public class FastSavedException extends Exception
-{
-  public FastSavedException(String msg)
-  {
+public class FastSavedException extends Exception {
+  
+  public FastSavedException(String msg) {
     super(msg);
   }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
 Mon Feb 13 13:28:13 2006
@@ -13,118 +13,41 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.msword;
 
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
+// Nutch imports
 import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ms.MSBaseParser;
 
-import java.util.Properties;
-import java.io.ByteArrayInputStream;
 
 /**
- * parser for mime type application/msword.
+ * Parser for mime type application/msword.
  * It is based on org.apache.poi.*. We have to see how well it performs.
  *
  * @author John Xing
- *
- * Note on 20040614 by Xing:
- * Some codes are stacked here for convenience (see inline comments).
- * They may be moved to more appropriate places when new codebase
- * stabilizes, especially after code for indexing is written.
- *
  * @author Andy Hedges
- * code to extract all msword properties.
- *
+ * @author J&eacute;r&ocirc;me Charron
  */
 
-public class MSWordParser implements Parser {
-  private Configuration conf;
+public class MSWordParser extends MSBaseParser {
 
-//  public static final Logger LOG =
-//    LogFormatter.getLogger("org.apache.nutch.parse.msword");
-
-  public MSWordParser () {}
+  /**
+   * Associated Mime type for Word files
+   * (<code>application/msword</code>).
+   */
+  public static final String MIME_TYPE = "application/msword";
 
+  
   public Parse getParse(Content content) {
-
-    String text = null;
-    String title = null;
-    Properties properties = null;
-
-    try {
-
-      byte[] raw = content.getContent();
-
-      String contentLength = 
content.getMetadata().get(Response.CONTENT_LENGTH);
-      if (contentLength != null
-            && raw.length != Integer.parseInt(contentLength)) {
-          return new ParseStatus(ParseStatus.FAILED, 
ParseStatus.FAILED_TRUNCATED,
-                  "Content truncated at " + raw.length
-            +" bytes. Parser can't handle incomplete msword 
file.").getEmptyParse(this.conf);
-      }
-
-      WordExtractor extractor = new WordExtractor();
-
-      // collect text
-      text = extractor.extractText(new ByteArrayInputStream(raw));
-
-      // collect meta info
-      properties = extractor.extractProperties(new ByteArrayInputStream(raw));
-
-      extractor = null;
-
-    } catch (ParseException e) {
-      return new ParseStatus(e).getEmptyParse(this.conf);
-    } catch (FastSavedException e) {
-      return new ParseStatus(e).getEmptyParse(this.conf);
-    } catch (PasswordProtectedException e) {
-      return new ParseStatus(e).getEmptyParse(this.conf);
-    } catch (Exception e) { // run time exception
-      return new ParseStatus(ParseStatus.FAILED,
-              "Can't be handled as msword document. " + 
e).getEmptyParse(this.conf);
-    } finally {
-      // nothing so far
-    }
-
-    // collect meta data
-    Metadata metadata = new Metadata();
-    title = properties.getProperty(DublinCore.TITLE);
-    properties.remove(DublinCore.TITLE);
-    metadata.setAll(properties);
-
-    if (text == null) { text = ""; }
-    if (title == null) { title = ""; }
-
-    // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, this.conf);
-
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-                                        outlinks, content.getMetadata(),
-                                        metadata);
-    parseData.setConf(this.conf);
-    return new ParseImpl(text, parseData);
-    // any filter?
-    //return HtmlParseFilters.filter(content, parse, root);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
+    return getParse(new WordExtractor(), content);
   }
 
-  public Configuration getConf() {
-    return this.conf;
+  /**
+   * Main for testing. Pass a Word document as argument
+   */
+  public static void main(String args[]) {
+    main(MIME_TYPE, new MSWordParser(), args);
   }
 
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/PasswordProtectedException.java
 Mon Feb 13 13:28:13 2006
@@ -14,11 +14,10 @@
  */
 package org.apache.nutch.parse.msword;
 
-public class PasswordProtectedException
-  extends Exception
-{
-  public PasswordProtectedException(String msg)
-  {
+
+public class PasswordProtectedException extends Exception {
+
+  public PasswordProtectedException(String msg) {
     super(msg);
   }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
 Mon Feb 13 13:28:13 2006
@@ -14,42 +14,47 @@
  */
 package org.apache.nutch.parse.msword;
 
-import org.apache.poi.hpsf.*;
-import org.apache.poi.hwpf.model.*;
-import org.apache.poi.hwpf.sprm.*;
-import org.apache.poi.poifs.eventfilesystem.*;
-import org.apache.poi.poifs.filesystem.*;
+// JDK imports
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+// Jakarta POI imports
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.sprm.SprmIterator;
+import org.apache.poi.hwpf.sprm.SprmOperation;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.LittleEndian;
-import org.apache.nutch.metadata.Metadata;
 
-import java.util.*;
-import java.io.*;
+// Nutch imports
+import org.apache.nutch.parse.ms.MSExtractor;
+
 
 /**
  * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
  *
  * @author Ryan Ackley
- *
  * @author Andy Hedges
- * code to extract all msword properties.
+ * @author J&eacute;r&ocirc;me Charron
  *
  */
-public class WordExtractor {
+class WordExtractor extends MSExtractor {
 
-  /**
-   * Constructor
-   */
-  public WordExtractor()
-  {
-  }
 
   /**
    * Gets the text from a Word document.
    *
    * @param in The InputStream representing the Word file.
    */
-  public String extractText(InputStream in) throws Exception
-  {
+  protected String extractText(InputStream in) throws Exception {
+
     ArrayList text = new ArrayList();
     POIFSFileSystem fsys = new POIFSFileSystem(in);
 
@@ -221,128 +226,5 @@
     return false;
   }
 
-  public Properties extractProperties(InputStream in)
-                      throws IOException {
-
-    PropertiesBroker propertiesBroker = new PropertiesBroker();
-    POIFSReader reader = new POIFSReader();
-    reader.registerListener(new PropertiesReaderListener(propertiesBroker),
-                            "\005SummaryInformation");
-    reader.read(in);
-    return propertiesBroker.getProperties();
-  }
-
-  class PropertiesReaderListener
-    implements POIFSReaderListener {
-
-    private PropertiesBroker propertiesBroker;
-    private Properties metaData = new Properties();
-
-    public PropertiesReaderListener(PropertiesBroker propertiesBroker) {
-      this.propertiesBroker = propertiesBroker;
-    }
-
-    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
-
-      SummaryInformation si = null;
-      Properties properties = new Properties();
-
-      try {
-        si = (SummaryInformation)PropertySetFactory.create(event.getStream());
-      } catch (Exception ex) {
-        properties = null;
-      }
-
-      Date tmp = null;
-
-      String title = si.getTitle();
-      String applicationName = si.getApplicationName();
-      String author = si.getAuthor();
-      int charCount = si.getCharCount();
-      String comments = si.getComments();
-      Date createDateTime = si.getCreateDateTime();
-      long editTime = si.getEditTime();
-      String keywords = si.getKeywords();
-      String lastAuthor = si.getLastAuthor();
-      Date lastPrinted = si.getLastPrinted();
-      Date lastSaveDateTime = si.getLastSaveDateTime();
-      int pageCount = si.getPageCount();
-      String revNumber = si.getRevNumber();
-      int security = si.getSecurity();
-      String subject = si.getSubject();
-      String template = si.getTemplate();
-      int wordCount = si.getWordCount();
-
-      /*Dates are being stored in millis since the epoch to aid
-      localization*/
-      if(title != null)
-        properties.setProperty(Metadata.TITLE, title);
-      if(applicationName != null)
-        properties.setProperty(Metadata.APPLICATION_NAME, applicationName);
-      if(author != null)
-        properties.setProperty(Metadata.AUTHOR, author);
-      if(charCount != 0)
-        properties.setProperty(Metadata.CHARACTER_COUNT, charCount + "");
-      if(comments != null)
-        properties.setProperty(Metadata.COMMENTS, comments);
-      if(createDateTime != null)
-        properties.setProperty(Metadata.DATE,
-                               Metadata.DATE_FORMAT.format(createDateTime));
-      if(editTime != 0)
-        properties.setProperty(Metadata.LAST_MODIFIED, editTime + "");
-      if(keywords != null)
-        properties.setProperty(Metadata.KEYWORDS, keywords);
-      if(lastAuthor != null)
-        properties.setProperty(Metadata.LAST_AUTHOR, lastAuthor);
-      if(lastPrinted != null)
-        properties.setProperty(Metadata.LAST_PRINTED, lastPrinted.getTime() + 
"");
-      if(lastSaveDateTime != null)
-        properties.setProperty(Metadata.LAST_SAVED, lastSaveDateTime.getTime() 
+ "");
-      if(pageCount != 0)
-        properties.setProperty(Metadata.PAGE_COUNT, pageCount + "");
-      if(revNumber != null)
-        properties.setProperty(Metadata.REVISION_NUMBER, revNumber);
-      if(security != 0)
-        properties.setProperty(Metadata.RIGHTS, security + "");
-      if(subject != null)
-        properties.setProperty(Metadata.SUBJECT, subject);
-      if(template != null)
-        properties.setProperty(Metadata.TEMPLATE, template);
-      if(wordCount != 0)
-        properties.setProperty(Metadata.WORD_COUNT, wordCount + "");
-      propertiesBroker.setProperties(properties);
-
-      //si.getThumbnail(); // can't think of a sensible way of turning this 
into a string.
-    }
-  }
-
-  class PropertiesBroker {
-
-    private Properties properties;
-    private int timeoutMillis = 2 * 1000;
-
-
-    public synchronized Properties getProperties() {
-
-      long start = new Date().getTime();
-      long now = start;
-
-      while (properties == null && now - start < timeoutMillis) {
-        try {
-          wait(timeoutMillis / 10);
-        } catch (InterruptedException e) {}
-        now = new Date().getTime();
-      }
-
-      notifyAll();
-
-      return properties;
-    }
-
-    public synchronized void setProperties(Properties properties) {
-      this.properties = properties;
-      notifyAll();
-    }
-  }
 }
 

Modified: 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html?rev=377494&r1=377493&r2=377494&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/package.html
 Mon Feb 13 13:28:13 2006
@@ -1,5 +1,6 @@
 <html>
 <body>
-<p>A Word document parsing plugin.</p><p>This package relies on <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
+<p>A Microsoft &copy; Word document parsing plugin.</p>
+<p>This package relies on <a href="http://jakarta.apache.org/poi/index.html">POI</a>.</p>
 </body>
 </html>




-------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc. Do you grep through log files
for problems?  Stop!  Download the new AJAX search engine that makes
searching your log files as easy as surfing the  web.  DOWNLOAD SPLUNK!
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=103432&bid=230486&dat=121642
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r377494 - in /lucene/nutch/trunk/src/plugin: parse-msexcel/ parse-msexcel/src/java/org/apache/nutch/parse/msexcel/ parse-mspowerpoint/ parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ parse-msword/ parse-msword/src/java/org/...

Reply via email to