Author: jukka
Date: Sun Jan 20 06:25:20 2008
New Revision: 613563
URL: http://svn.apache.org/viewvc?rev=613563&view=rev
Log:
TIKA-109: WordParser fails on some Word files
- Applied WordParser patch from Mats Norén
- Removed the now unused WordTextPiece class
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=613563&r1=613562&r2=613563&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Jan 20 06:25:20 2008
@@ -2,7 +2,7 @@
Unreleased changes (0.2-incubating)
-
+1. TIKA-109 - WordParser fails on some Word files (Mats Norén)
Release 0.1-incubating - 12/27/2007
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=613563&r1=613562&r2=613563&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Sun Jan 20 06:25:20 2008
@@ -16,23 +16,17 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.poi.hwpf.model.CHPBinTable;
-import org.apache.poi.hwpf.model.CHPX;
-import org.apache.poi.hwpf.model.ComplexFileTable;
-import org.apache.poi.hwpf.model.TextPiece;
-import org.apache.poi.hwpf.model.TextPieceTable;
-import org.apache.poi.hwpf.sprm.SprmIterator;
-import org.apache.poi.hwpf.sprm.SprmOperation;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.TikaException;
+import java.io.IOException;
+
/**
* Word parser
*/
@@ -45,7 +39,8 @@
/**
* Gets the text from a Word document.
*
- * @param in The InputStream representing the Word file.
+ * @param fsys the <code>POIFSFileSystem</code> to read the word document
from.
+ * @param appendable the <code>Appendable</code> to add the text content
to.
*/
public void extractText(POIFSFileSystem fsys, Appendable appendable)
throws IOException, TikaException {
@@ -76,120 +71,31 @@
// this is a Word 6.0 doc send it to the extractor for that
version.
Word6Extractor oldExtractor = new Word6Extractor(appendable);
oldExtractor.extractText(header);
- }
-
- //get the location of the piece table
- int complexOffset = LittleEndian.getInt(header, 0x1a2);
- // determine which table stream we must use.
- //Get the information we need from the header
- String tableName = null;
- boolean useTable1 = (info & 0x200) != 0;
- if (useTable1) {
- tableName = "1Table";
- } else {
- tableName = "0Table";
+ // Set POI values to null
+ headerProps = null;
+ header = null;
+ din = null;
+ fsys = null;
+ return;
}
- DocumentEntry table =
(DocumentEntry)fsys.getRoot().getEntry(tableName);
- byte[] tableStream = new byte[table.getSize()];
-
- din = fsys.createDocumentInputStream(tableName);
-
- din.read(tableStream);
- din.close();
-
- int chpOffset = LittleEndian.getInt(header, 0xfa);
- int chpSize = LittleEndian.getInt(header, 0xfe);
- int fcMin = LittleEndian.getInt(header, 0x18);
- CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset,
chpSize, fcMin);
-
- // load our text pieces and our character runs
- ComplexFileTable cft = new ComplexFileTable(header, tableStream,
complexOffset, fcMin);
- TextPieceTable tpt = cft.getTextPieceTable();
- List textPieces = tpt.getTextPieces();
-
- // make the POIFS objects available for garbage collection
- din = null;
- fsys = null;
- table = null;
- headerProps = null;
-
- List textRuns = cbt.getTextRuns();
- Iterator runIt = textRuns.iterator();
- Iterator textIt = textPieces.iterator();
-
- TextPiece currentPiece = (TextPiece)textIt.next();
- int currentTextStart = currentPiece.getStart();
- int currentTextEnd = currentPiece.getEnd();
-
WordTextBuffer finalTextBuf = new WordTextBuffer(appendable);
- // iterate through all text runs extract the text only if they haven't
been
- // deleted
- while (runIt.hasNext()) {
- CHPX chpx = (CHPX)runIt.next();
- boolean deleted = isDeleted(chpx.getGrpprl());
- if (deleted) {
- continue;
- }
-
- int runStart = chpx.getStart();
- int runEnd = chpx.getEnd();
-
- while (runStart >= currentTextEnd) {
- currentPiece = (TextPiece) textIt.next ();
- currentTextStart = currentPiece.getStart ();
- currentTextEnd = currentPiece.getEnd ();
- }
-
- if (runEnd < currentTextEnd) {
- String str = currentPiece.substring(runStart -
currentTextStart, runEnd - currentTextStart);
- finalTextBuf.append(str);
- } else if (runEnd > currentTextEnd) {
- while (runEnd > currentTextEnd) {
- String str = currentPiece.substring(runStart -
currentTextStart,
- currentTextEnd - currentTextStart);
- finalTextBuf.append(str);
- if (textIt.hasNext()) {
- currentPiece = (TextPiece) textIt.next ();
- currentTextStart = currentPiece.getStart ();
- runStart = currentTextStart;
- currentTextEnd = currentPiece.getEnd ();
- } else {
- return;
- }
- }
- String str = currentPiece.substring(0, runEnd -
currentTextStart);
- finalTextBuf.append(str);
- } else {
- String str = currentPiece.substring(runStart -
currentTextStart, runEnd - currentTextStart);
- if (textIt.hasNext()) {
- currentPiece = (TextPiece) textIt.next();
- currentTextStart = currentPiece.getStart();
- currentTextEnd = currentPiece.getEnd();
- }
- finalTextBuf.append(str);
+ HWPFDocument doc = new HWPFDocument(fsys);
+ Range range = doc.getRange();
+ for (int i = 0; i < range.numCharacterRuns(); i++) {
+ CharacterRun cr = range.getCharacterRun(i);
+ if (!cr.isMarkedDeleted()) {
+ finalTextBuf.append(cr.text());
}
}
- }
- /**
- * Used to determine if a run of text has been deleted.
- *
- * @param grpprl The list of sprms for a particular run of text.
- * @return true if this run of text has been deleted.
- */
- private boolean isDeleted(byte[] grpprl) {
- SprmIterator iterator = new SprmIterator(grpprl,0);
- while (iterator.hasNext()) {
- SprmOperation op = iterator.next();
- // 0 is the operation that signals a FDelRMark operation
- if (op.getOperation() == 0 && op.getOperand() != 0) {
- return true;
- }
- }
- return false;
+ // Set POI values to null
+ headerProps = null;
+ header = null;
+ din = null;
+ doc = null;
+ fsys = null;
}
-
}