Author: mikemccand
Date: Sun Sep  2 13:02:01 2012
New Revision: 1379962

URL: http://svn.apache.org/viewvc?rev=1379962&view=rev
Log:
TIKA-982: handle Wordpad/RTF docs embedded in Word doc

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Sep  2 13:02:01 2012
@@ -3,7 +3,8 @@ Release 1.3 - Current Development
   * MS Word: When a Word (.doc) document contains embedded files, Tika
     now places a <div class="embedded" id="_XXX"/> into the XHTML so
     you can see where in the main text the embedded document
-    occurred. (TIKA-956)
+    occurred (TIKA-956).  Embedded Wordpad/RTF documents are now
+    recognized (TIKA-982).
 
   * PDF: Text from pop-up annotations is now extracted (TIKA-981)
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sun Sep  2 13:02:01 2012
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.FileNotFoundException;
 import java.io.IOException;
 
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
@@ -150,7 +151,12 @@ abstract class AbstractPOIFSExtractor {
             } else if (type == POIFSDocumentType.COMP_OBJ) {
                 try {
                    // Grab the contents and process
-                   DocumentEntry contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
+                   DocumentEntry contentsEntry;
+                   try {
+                     contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
+                   } catch (FileNotFoundException ioe) {
+                     contentsEntry = (DocumentEntry)dir.getEntry("Contents");
+                   }
                    DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                    byte[] contents = new byte[contentsEntry.getSize()];
                    inp.readFully(contents);

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Sun Sep  2 13:02:01 2012
@@ -250,8 +250,10 @@ public class POIFSContainerDetector impl
                 // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
                 return WPS;
             } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
-               // Newer Works files
-               return WPS;
+                // Newer Works files
+                return WPS;
+            } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
+                return COMP_OBJ;
             } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
                // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
                // If we have the Directory, check

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1379962&r1=1379961&r2=1379962&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Sep  2 13:02:01 2012
@@ -187,6 +187,13 @@ public class WordParserTest extends Tika
         assertTrue(j < k);
     }
 
+    // TIKA-982
+    public void testEmbeddedRTF() throws Exception {
+        String result = getXML("/test-documents/testWORD_embedded_rtf.doc").xml;
+        assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1404039792\"/>") != -1);
+        assertTrue(result.indexOf("_1404039792.rtf") != -1);
+    }
+
     public void testWord6Parser() throws Exception {
         InputStream input = WordParserTest.class.getResourceAsStream(
                 "/test-documents/testWORD6.doc");

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc?rev=1379962&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_embedded_rtf.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword


Reply via email to