Author: jukka
Date: Sun Mar 9 04:47:54 2008
New Revision: 635224
URL: http://svn.apache.org/viewvc?rev=635224&view=rev
Log:
TIKA-123: Structured MS Office parsing
- Consolidated all MS Office parsing to a single class
- Reliable MIME magic for pseudo type application/x-tika-msoffice
- Added MIME magic for RTF
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
- copied, changed from r633304,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
(from r633304,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java&r1=633304&r2=635224&rev=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sun Mar 9 04:47:54 2008
@@ -43,9 +43,7 @@
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
@@ -65,10 +63,10 @@
* @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
* POI Event API How To</a>
*/
-public class ExcelParser extends OfficeParser implements Serializable {
+public class ExcelExtractor {
/** Logging instance */
- private static Log log = LogFactory.getLog(ExcelParser.class);
+ private static final Log log = LogFactory.getLog(ExcelExtractor.class);
/**
* <code>true</code> if the HSSFListener should be registered
@@ -103,15 +101,6 @@
}
/**
- * Return the content type handled by this parser.
- *
- * @return The content type handled
- */
- protected String getContentType() {
- return "application/vnd.ms-excel";
- }
-
- /**
* Extracts text from an Excel Workbook writing the extracted content
* to the specified {@link Appendable}.
*
@@ -119,13 +108,10 @@
* @throws IOException if an error occurs processing the workbook
* or writing the extracted content
*/
- protected void parse(
- POIFSFileSystem filesystem, ContentHandler handler, Metadata metadata)
+ protected void parse(POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException {
log.debug("Starting listenForAllRecords=" + listenForAllRecords);
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
// Set up listener and register the records we want to process
TikaHSSFListener listener = new TikaHSSFListener(xhtml);
HSSFRequest hssfRequest = new HSSFRequest();
@@ -151,10 +137,8 @@
DocumentInputStream documentInputStream =
filesystem.createDocumentInputStream("Workbook");
HSSFEventFactory eventFactory = new HSSFEventFactory();
- xhtml.startDocument();
eventFactory.processEvents(hssfRequest, documentInputStream);
listener.throwStoredException();
- xhtml.endDocument();
}
// ======================================================================
@@ -163,9 +147,6 @@
* HSSF Listener implementation which processes the HSSF records.
*/
private static class TikaHSSFListener implements HSSFListener,
Serializable {
-
- /** Logging instance */
- private static Log log = LogFactory.getLog(ExcelParser.class);
private final XHTMLContentHandler handler;
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar 9 04:47:54 2008
@@ -16,24 +16,40 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Date;
+import java.util.Iterator;
import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
/**
* Defines a Microsoft document content extractor.
*/
-public abstract class OfficeParser implements Parser {
+public class OfficeParser implements Parser {
+
+ private static final String SUMMARY_INFORMATION =
+ SummaryInformation.DEFAULT_STREAM_NAME;
+
+ private static final String DOCUMENT_SUMMARY_INFORMATION =
+ DocumentSummaryInformation.DEFAULT_STREAM_NAME;
/**
* Extracts properties and text from an MS Document input stream
@@ -41,44 +57,103 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+ Iterator<?> entries = filesystem.getRoot().getEntries();
+ while (entries.hasNext()) {
+ Entry entry = (Entry) entries.next();
+ String name = entry.getName();
+ if (!(entry instanceof DocumentEntry)) {
+ // Skip directory entries
+ } else if (SUMMARY_INFORMATION.equals(name)
+ || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
+ parse((DocumentEntry) entry, metadata);
+ } else if ("WordDocument".equals(name)) {
+ setType(metadata, "application/msword");
+ WordExtractor extractor = new WordExtractor(filesystem);
+ for (String paragraph : extractor.getParagraphText()) {
+ xhtml.element("p", paragraph);
+ }
+ } else if ("PowerPoint Document".equals(name)) {
+ setType(metadata, "application/vnd.ms-powerpoint");
+ PowerPointExtractor extractor =
+ new PowerPointExtractor(filesystem);
+ xhtml.element("p", extractor.getText(true, true));
+ } else if ("Workbook".equals(name)) {
+ setType(metadata, "application/vnd.ms-excel");
+ new ExcelExtractor().parse(filesystem, xhtml);
+ }
+ }
- metadata.set(Metadata.CONTENT_TYPE, getContentType());
- getMetadata(
- filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
- getMetadata(
- filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
- metadata);
+ xhtml.endDocument();
+ }
- parse(filesystem, handler, metadata);
+ public void parse(DocumentEntry entry, Metadata metadata)
+ throws IOException, TikaException {
+ try {
+ PropertySet properties =
+ new PropertySet(new DocumentInputStream(entry));
+ if (properties.isSummaryInformation()) {
+ parse(new SummaryInformation(properties), metadata);
+ }
+ if (properties.isDocumentSummaryInformation()) {
+ parse(new DocumentSummaryInformation(properties), metadata);
+ }
+ } catch (NoPropertySetStreamException e) {
+ throw new TikaException("Not a HPSF document", e);
+ } catch (UnexpectedPropertySetTypeException e) {
+ throw new TikaException("Unexpected HPSF document", e);
+ } catch (MarkUnsupportedException e) {
+ throw new TikaException("Invalid DocumentInputStream", e);
+ }
}
- /**
- * The content type of the document being parsed.
- *
- * @return MIME content type
- */
- protected abstract String getContentType();
+ private void parse(SummaryInformation summary, Metadata metadata) {
+ set(metadata, Metadata.TITLE, summary.getTitle());
+ set(metadata, Metadata.AUTHOR, summary.getAuthor());
+ set(metadata, Metadata.KEYWORDS, summary.getKeywords());
+ set(metadata, Metadata.SUBJECT, summary.getSubject());
+ set(metadata, Metadata.LAST_AUTHOR, summary.getLastAuthor());
+ set(metadata, Metadata.COMMENTS, summary.getComments());
+ set(metadata, Metadata.TEMPLATE, summary.getTemplate());
+ set(metadata, Metadata.APPLICATION_NAME, summary.getApplicationName());
+ set(metadata, Metadata.REVISION_NUMBER, summary.getRevNumber());
+ set(metadata, "creationdate", summary.getCreateDateTime());
+ set(metadata, Metadata.CHARACTER_COUNT, summary.getCharCount());
+ set(metadata, "edittime", summary.getEditTime());
+ set(metadata, Metadata.LAST_SAVED, summary.getLastSaveDateTime());
+ set(metadata, Metadata.PAGE_COUNT, summary.getPageCount());
+ set(metadata, "security", summary.getSecurity());
+ set(metadata, Metadata.WORD_COUNT, summary.getWordCount());
+ set(metadata, Metadata.LAST_PRINTED, summary.getLastPrinted());
+ }
- /**
- * Extracts the text content from a Microsoft document input stream.
- */
- protected abstract void parse(
- POIFSFileSystem filesystem, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException;
+ private void parse(DocumentSummaryInformation summary, Metadata metadata) {
+ set(metadata, "company", summary.getCompany());
+ set(metadata, "manager", summary.getManager());
+ }
- private void getMetadata(
- POIFSFileSystem filesystem, String name, Metadata metadata)
- throws IOException, SAXException, TikaException {
- try {
- InputStream stream = filesystem.createDocumentInputStream(name);
- try {
- new PropertyParser().parse(stream, new DefaultHandler(), metadata);
- } finally {
- stream.close();
- }
- } catch (FileNotFoundException e) {
- // summary information not available, ignore
+ private void setType(Metadata metadata, String type) {
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ }
+
+ private void set(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.set(name, value);
+ }
+ }
+
+ private void set(Metadata metadata, String name, Date value) {
+ if (value != null) {
+ metadata.set(name, value.toString());
+ }
+ }
+
+ private void set(Metadata metadata, String name, long value) {
+ if (value > 0) {
+ metadata.set(name, Long.toString(value));
}
}
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Mar 9 04:47:54 2008
@@ -153,21 +153,21 @@
localName='html' />
</mime-type>
+
+ <mime-type type="application/x-tika-msoffice">
+ <magic>
+ <match value="0xd0cf11e0a1b11ae1" type="string" offset="0:8"/>
+ </magic>
+ </mime-type>
+
<mime-type type="application/vnd.ms-powerpoint">
<glob pattern="*.ppz" />
<glob pattern="*.ppt" />
<glob pattern="*.pps" />
<glob pattern="*.pot" />
- <magic priority="50">
- <match value="0xcfd0e011" type="little32" offset="0" />
- </magic>
</mime-type>
<mime-type type="application/vnd.ms-excel">
- <magic priority="50">
- <match value="Microsoft Excel 5.0 Worksheet" type="string"
- offset="2080" />
- </magic>
<glob pattern="*.xls" />
<glob pattern="*.xlc" />
<glob pattern="*.xll" />
@@ -385,16 +385,6 @@
</mime-type>
<mime-type type="application/msword">
- <magic priority="50">
- <match value="\x31\xbe\x00\x00" type="string" offset="0" />
- <match value="PO^Q`" type="string" offset="0" />
- <match value="\376\067\0\043" type="string" offset="0" />
- <match value="\333\245-\0\0\0" type="string" offset="0" />
- <match value="Microsoft Word 6.0 Document" type="string"
- offset="2080" />
- <match value="Microsoft Word document data" type="string"
- offset="2112" />
- </magic>
<glob pattern="*.doc" />
<alias type="application/vnd.ms-word" />
</mime-type>
@@ -432,6 +422,9 @@
</mime-type>
<mime-type type="application/rtf">
+ <magic priority="50">
+ <match value="{\rtf" type="string" offset="0" />
+ </magic>
<glob pattern="*.rtf"/>
<alias type="text/rtf" />
</mime-type>
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Mar 9 04:47:54 2008
@@ -27,15 +27,10 @@
<mime>application/xml</mime>
</parser>
- <parser name="parse-msword" class="org.apache.tika.parser.microsoft.WordParser">
+ <parser name="parse-office" class="org.apache.tika.parser.microsoft.OfficeParser">
+ <mime>application/x-tika-msoffice</mime>
<mime>application/msword</mime>
- </parser>
-
- <parser name="parse-msexcel" class="org.apache.tika.parser.microsoft.ExcelParser">
<mime>application/vnd.ms-excel</mime>
- </parser>
-
- <parser name="parse-mspowerpoint" class="org.apache.tika.parser.microsoft.PowerPointParser">
<mime>application/vnd.ms-powerpoint</mime>
</parser>
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Sun Mar 9 04:47:54 2008
@@ -34,7 +34,7 @@
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
ContentHandler handler = new WriteOutContentHandler(writer);
- new ExcelParser().parse(input, handler, metadata);
+ new OfficeParser().parse(input, handler, metadata);
assertEquals(
"application/vnd.ms-excel",
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sun Mar 9 04:47:54 2008
@@ -34,7 +34,7 @@
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
ContentHandler handler = new WriteOutContentHandler(writer);
- new PowerPointParser().parse(input, handler, metadata);
+ new OfficeParser().parse(input, handler, metadata);
assertEquals(
"application/vnd.ms-powerpoint",
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=635224&r1=635223&r2=635224&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Mar 9 04:47:54 2008
@@ -34,7 +34,7 @@
Metadata metadata = new Metadata();
StringWriter writer = new StringWriter();
ContentHandler handler = new WriteOutContentHandler(writer);
- new WordParser().parse(input, handler, metadata);
+ new OfficeParser().parse(input, handler, metadata);
assertEquals(
"application/msword",