Author: jukka
Date: Sun Mar  9 04:47:54 2008
New Revision: 635224

URL: http://svn.apache.org/viewvc?rev=635224&view=rev
Log:
TIKA-123: Structured MS Office parsing
    - Consolidated all MS Office parsing to a single class
    - Reliable MIME magic for pseudo type application/x-tika-msoffice
    - Added MIME magic for RTF

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
      - copied, changed from r633304, incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 (from r633304, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java&r1=633304&r2=635224&rev=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sun Mar  9 04:47:54 2008
@@ -43,9 +43,7 @@
 import org.apache.poi.hssf.record.SSTRecord;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
@@ -65,10 +63,10 @@
  * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
  * POI Event API How To</a>
  */
-public class ExcelParser extends OfficeParser implements Serializable {
+public class ExcelExtractor {
 
     /** Logging instance */
-    private static Log log = LogFactory.getLog(ExcelParser.class);
+    private static final Log log = LogFactory.getLog(ExcelExtractor.class);
 
     /**
      * <code>true</code> if the HSSFListener should be registered
@@ -103,15 +101,6 @@
     }
 
     /**
-     * Return the content type handled by this parser.
-     *
-     * @return The content type handled
-     */
-    protected String getContentType() {
-        return "application/vnd.ms-excel";
-    }
-
-    /**
      * Extracts text from an Excel Workbook writing the extracted content
      * to the specified {@link Appendable}.
      *
@@ -119,13 +108,10 @@
      * @throws IOException if an error occurs processing the workbook
      * or writing the extracted content
      */
-    protected void parse(
-            POIFSFileSystem filesystem, ContentHandler handler, Metadata metadata)
+    protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
         log.debug("Starting listenForAllRecords=" + listenForAllRecords);
 
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
         // Set up listener and register the records we want to process
         TikaHSSFListener listener = new TikaHSSFListener(xhtml);
         HSSFRequest hssfRequest = new HSSFRequest();
@@ -151,10 +137,8 @@
         DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
         HSSFEventFactory eventFactory = new HSSFEventFactory();
 
-        xhtml.startDocument();
         eventFactory.processEvents(hssfRequest, documentInputStream);
         listener.throwStoredException();
-        xhtml.endDocument();
     }
 
     // ======================================================================
@@ -163,9 +147,6 @@
      * HSSF Listener implementation which processes the HSSF records.
      */
     private static class TikaHSSFListener implements HSSFListener, Serializable {
-
-        /** Logging instance */
-        private static Log log = LogFactory.getLog(ExcelParser.class);
 
         private final XHTMLContentHandler handler;
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar  9 04:47:54 2008
@@ -16,24 +16,40 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Date;
+import java.util.Iterator;
 
 import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
 import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * Defines a Microsoft document content extractor.
  */
-public abstract class OfficeParser implements Parser {
+public class OfficeParser implements Parser {
+
+    private static final String SUMMARY_INFORMATION =
+        SummaryInformation.DEFAULT_STREAM_NAME;
+
+    private static final String DOCUMENT_SUMMARY_INFORMATION =
+        DocumentSummaryInformation.DEFAULT_STREAM_NAME;
 
     /**
      * Extracts properties and text from an MS Document input stream
@@ -41,44 +57,103 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
         POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+        Iterator<?> entries = filesystem.getRoot().getEntries();
+        while (entries.hasNext()) {
+            Entry entry = (Entry) entries.next();
+            String name = entry.getName();
+            if (!(entry instanceof DocumentEntry)) {
+                // Skip directory entries
+            } else if (SUMMARY_INFORMATION.equals(name)
+                    || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
+                parse((DocumentEntry) entry, metadata);
+            } else if ("WordDocument".equals(name)) {
+                setType(metadata, "application/msword");
+                WordExtractor extractor = new WordExtractor(filesystem);
+                for (String paragraph : extractor.getParagraphText()) {
+                    xhtml.element("p", paragraph);
+                }
+            } else if ("PowerPoint Document".equals(name)) {
+                setType(metadata, "application/vnd.ms-powerpoint");
+                PowerPointExtractor extractor =
+                    new PowerPointExtractor(filesystem);
+                xhtml.element("p", extractor.getText(true, true));
+            } else if ("Workbook".equals(name)) {
+                setType(metadata, "application/vnd.ms-excel");
+                new ExcelExtractor().parse(filesystem, xhtml);
+            }
+        }
 
-        metadata.set(Metadata.CONTENT_TYPE, getContentType());
-        getMetadata(
-                filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
-        getMetadata(
-                filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
-                metadata);
+        xhtml.endDocument();
+    }
 
-        parse(filesystem, handler, metadata);
+    public void parse(DocumentEntry entry, Metadata metadata)
+            throws IOException, TikaException {
+        try {
+            PropertySet properties =
+                new PropertySet(new DocumentInputStream(entry));
+            if (properties.isSummaryInformation()) {
+                parse(new SummaryInformation(properties), metadata);
+            }
+            if (properties.isDocumentSummaryInformation()) {
+                parse(new DocumentSummaryInformation(properties), metadata);
+            }
+        } catch (NoPropertySetStreamException e) {
+            throw new TikaException("Not a HPSF document", e);
+        } catch (UnexpectedPropertySetTypeException e) {
+            throw new TikaException("Unexpected HPSF document", e);
+        } catch (MarkUnsupportedException e) {
+            throw new TikaException("Invalid DocumentInputStream", e);
+        }
     }
 
-    /**
-     * The content type of the document being parsed.
-     *
-     * @return MIME content type
-     */
-    protected abstract String getContentType();
+    private void parse(SummaryInformation summary, Metadata metadata) {
+        set(metadata, Metadata.TITLE, summary.getTitle());
+        set(metadata, Metadata.AUTHOR, summary.getAuthor());
+        set(metadata, Metadata.KEYWORDS, summary.getKeywords());
+        set(metadata, Metadata.SUBJECT, summary.getSubject());
+        set(metadata, Metadata.LAST_AUTHOR, summary.getLastAuthor());
+        set(metadata, Metadata.COMMENTS, summary.getComments());
+        set(metadata, Metadata.TEMPLATE, summary.getTemplate());
+        set(metadata, Metadata.APPLICATION_NAME, summary.getApplicationName());
+        set(metadata, Metadata.REVISION_NUMBER, summary.getRevNumber());
+        set(metadata, "creationdate", summary.getCreateDateTime());
+        set(metadata, Metadata.CHARACTER_COUNT, summary.getCharCount());
+        set(metadata, "edittime", summary.getEditTime());
+        set(metadata, Metadata.LAST_SAVED, summary.getLastSaveDateTime());
+        set(metadata, Metadata.PAGE_COUNT, summary.getPageCount());
+        set(metadata, "security", summary.getSecurity());
+        set(metadata, Metadata.WORD_COUNT, summary.getWordCount());
+        set(metadata, Metadata.LAST_PRINTED, summary.getLastPrinted());
+    }
 
-    /**
-     * Extracts the text content from a Microsoft document input stream.
-     */
-    protected abstract void parse(
-            POIFSFileSystem filesystem, ContentHandler handler, Metadata metadata)
-            throws IOException, SAXException, TikaException;
+    private void parse(DocumentSummaryInformation summary, Metadata metadata) {
+        set(metadata, "company", summary.getCompany());
+        set(metadata, "manager", summary.getManager());
+    }
 
-    private void getMetadata(
-            POIFSFileSystem filesystem, String name, Metadata metadata)
-            throws IOException, SAXException, TikaException {
-        try {
-            InputStream stream = filesystem.createDocumentInputStream(name);
-            try {
-                new PropertyParser().parse(stream, new DefaultHandler(), metadata);
-            } finally {
-                stream.close();
-            }
-        } catch (FileNotFoundException e) {
-            // summary information not available, ignore
+    private void setType(Metadata metadata, String type) {
+        metadata.set(Metadata.CONTENT_TYPE, type);
+    }
+
+    private void set(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    private void set(Metadata metadata, String name, Date value) {
+        if (value != null) {
+            metadata.set(name, value.toString());
+        }
+    }
+
+    private void set(Metadata metadata, String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
         }
     }
 

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Mar  9 04:47:54 2008
@@ -153,21 +153,21 @@
                        localName='html' />
        </mime-type>
 
+
+        <mime-type type="application/x-tika-msoffice">
+                <magic>
+                        <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8"/>
+                </magic>
+        </mime-type>
+
        <mime-type type="application/vnd.ms-powerpoint">
                <glob pattern="*.ppz" />
                <glob pattern="*.ppt" />
                <glob pattern="*.pps" />
                <glob pattern="*.pot" />
-               <magic priority="50">
-                       <match value="0xcfd0e011" type="little32" offset="0" />
-               </magic>
        </mime-type>
 
        <mime-type type="application/vnd.ms-excel">
-               <magic priority="50">
-                       <match value="Microsoft Excel 5.0 Worksheet" type="string"
-                               offset="2080" />
-               </magic>
                <glob pattern="*.xls" />
                <glob pattern="*.xlc" />
                <glob pattern="*.xll" />
@@ -385,16 +385,6 @@
        </mime-type>
 
        <mime-type type="application/msword">
-               <magic priority="50">
-                       <match value="\x31\xbe\x00\x00" type="string" offset="0" />
-                       <match value="PO^Q`" type="string" offset="0" />
-                       <match value="\376\067\0\043" type="string" offset="0" />
-                       <match value="\333\245-\0\0\0" type="string" offset="0" />
-                       <match value="Microsoft Word 6.0 Document" type="string"
-                               offset="2080" />
-                       <match value="Microsoft Word document data" type="string"
-                               offset="2112" />
-               </magic>
                <glob pattern="*.doc" />
                <alias type="application/vnd.ms-word" />
        </mime-type>
@@ -432,6 +422,9 @@
        </mime-type>
 
        <mime-type type="application/rtf">
+                <magic priority="50">
+                        <match value="{\rtf" type="string" offset="0" />
+                </magic>
            <glob pattern="*.rtf"/>
                <alias type="text/rtf" />
        </mime-type>

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Mar  9 04:47:54 2008
@@ -27,15 +27,10 @@
                 <mime>application/xml</mime>
         </parser>
 
-        <parser name="parse-msword" class="org.apache.tika.parser.microsoft.WordParser">
+        <parser name="parse-office" class="org.apache.tika.parser.microsoft.OfficeParser">
+                <mime>application/x-tika-msoffice</mime>
                 <mime>application/msword</mime>
-        </parser>
-
-        <parser name="parse-msexcel" class="org.apache.tika.parser.microsoft.ExcelParser">
                 <mime>application/vnd.ms-excel</mime>
-        </parser>
-
-        <parser name="parse-mspowerpoint" class="org.apache.tika.parser.microsoft.PowerPointParser">
                 <mime>application/vnd.ms-powerpoint</mime>
         </parser>
 

Modified: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Sun Mar  9 04:47:54 2008
@@ -34,7 +34,7 @@
             Metadata metadata = new Metadata();
             StringWriter writer = new StringWriter();
             ContentHandler handler = new WriteOutContentHandler(writer);
-            new ExcelParser().parse(input, handler, metadata);
+            new OfficeParser().parse(input, handler, metadata);
 
             assertEquals(
                     "application/vnd.ms-excel",

Modified: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sun Mar  9 04:47:54 2008
@@ -34,7 +34,7 @@
             Metadata metadata = new Metadata();
             StringWriter writer = new StringWriter();
             ContentHandler handler = new WriteOutContentHandler(writer);
-            new PowerPointParser().parse(input, handler, metadata);
+            new OfficeParser().parse(input, handler, metadata);
 
             assertEquals(
                     "application/vnd.ms-powerpoint",

Modified: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Mar  9 04:47:54 2008
@@ -34,7 +34,7 @@
             Metadata metadata = new Metadata();
             StringWriter writer = new StringWriter();
             ContentHandler handler = new WriteOutContentHandler(writer);
-            new WordParser().parse(input, handler, metadata);
+            new OfficeParser().parse(input, handler, metadata);
 
             assertEquals(
                     "application/msword",


Reply via email to