Author: mikemccand
Date: Sun Sep 2 13:02:01 2012
New Revision: 1379962
URL: http://svn.apache.org/viewvc?rev=1379962&view=rev
Log:
TIKA-982: handle Wordpad/RTF docs embedded in Word doc
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Sep 2 13:02:01 2012
@@ -3,7 +3,8 @@ Release 1.3 - Current Development
* MS Word: When a Word (.doc) document contains embedded files, Tika
now places a <div class="embedded" id="_XXX"/> into the XHTML so
you can see where in the main text the embedded document
- occurred. (TIKA-956)
+ occurred (TIKA-956). Embedded Wordpad/RTF documents are now
+ recognized (TIKA-982).
* PDF: Text from pop-up annotations is now extracted (TIKA-981)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sun Sep 2 13:02:01 2012
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
@@ -150,7 +151,12 @@ abstract class AbstractPOIFSExtractor {
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
// Grab the contents and process
- DocumentEntry contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
+ DocumentEntry contentsEntry;
+ try {
+ contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
+ } catch (FileNotFoundException ioe) {
+ contentsEntry = (DocumentEntry)dir.getEntry("Contents");
+ }
DocumentInputStream inp = new DocumentInputStream(contentsEntry);
byte[] contents = new byte[contentsEntry.getSize()];
inp.readFully(contents);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Sun Sep 2 13:02:01 2012
@@ -250,8 +250,10 @@ public class POIFSContainerDetector impl
// this occurs on older Works Word Processor files (versions 3.0 and 4.0)
return WPS;
} else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
- // Newer Works files
- return WPS;
+ // Newer Works files
+ return WPS;
+ } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
+ return COMP_OBJ;
} else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
// CompObj is a general kind of OLE2 embedding, but this may be an old Works file
// If we have the Directory, check
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Sep 2 13:02:01 2012
@@ -187,6 +187,13 @@ public class WordParserTest extends Tika
assertTrue(j < k);
}
+ // TIKA-982
+ public void testEmbeddedRTF() throws Exception {
+ String result = getXML("/test-documents/testWORD_embedded_rtf.doc").xml;
+ assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\"/>") != -1);
+ assertTrue(result.indexOf("_1404039792.rtf") != -1);
+ }
+
public void testWord6Parser() throws Exception {
InputStream input = WordParserTest.class.getResourceAsStream(
"/test-documents/testWORD6.doc");
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc?rev=1379962&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword