Author: jukka
Date: Mon Feb 18 15:13:55 2008
New Revision: 628916
URL: http://svn.apache.org/viewvc?rev=628916&view=rev
Log:
TIKA-123: Structured MS Office parsing
- Replaced custom Word parsing code with WordExtractor from POI HWPF
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=628916&r1=628915&r2=628916&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java Mon Feb 18 15:13:55 2008
@@ -16,23 +16,16 @@
*/
package org.apache.tika.parser.microsoft;
-import org.apache.log4j.xml.SAXErrorHandler;
-import org.apache.poi.hwpf.HWPFDocument;
-import org.apache.poi.hwpf.usermodel.CharacterRun;
-import org.apache.poi.hwpf.usermodel.Range;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import java.io.IOException;
+
+import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.AppendableAdaptor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-
/**
* Word parser
*/
@@ -42,68 +35,14 @@
return "application/msword";
}
- /**
- * Gets the text from a Word document.
- *
- * @param fsys the <code>POIFSFileSystem</code> to read the word document from.
- * @param appendable the <code>Appendable</code> to add the text content to.
- */
public void parse(
- POIFSFileSystem fsys, ContentHandler handler, Metadata metadata)
+ POIFSFileSystem poifs, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.startElement("p");
- Appendable appendable = new AppendableAdaptor(xhtml);
-
- // load our POIFS document streams.
- DocumentEntry headerProps =
- (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
- DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
- byte[] header = new byte[headerProps.getSize()];
-
- din.read(header);
- din.close();
-
- int info = LittleEndian.getShort(header, 0xa);
- if ((info & 0x4) != 0) {
- throw new TikaException("Fast-saved files are unsupported");
- }
- if ((info & 0x100) != 0) {
- throw new TikaException("This document is password protected");
+ for (String paragraph : new WordExtractor(poifs).getParagraphText()) {
+ xhtml.element("p", paragraph);
}
-
- // determine the version of Word this document came from.
- int nFib = LittleEndian.getShort(header, 0x2);
- switch (nFib) {
- case 101:
- case 102:
- case 103:
- case 104:
- // this is a Word 6.0 doc send it to the extractor for that version.
- Word6Extractor oldExtractor = new Word6Extractor(appendable);
- oldExtractor.extractText(header);
-
- // Set POI values to null
- headerProps = null;
- header = null;
- din = null;
- fsys = null;
- return;
- }
-
- WordTextBuffer finalTextBuf = new WordTextBuffer(appendable);
-
- HWPFDocument doc = new HWPFDocument(fsys);
- Range range = doc.getRange();
- for (int i = 0; i < range.numCharacterRuns(); i++) {
- CharacterRun cr = range.getCharacterRun(i);
- if (!cr.isMarkedDeleted()) {
- finalTextBuf.append(cr.text());
- }
- }
-
- xhtml.endElement("p");
xhtml.endDocument();
}
}