Author: mikemccand
Date: Tue Oct 16 11:17:36 2012
New Revision: 1398734

URL: http://svn.apache.org/viewvc?rev=1398734&view=rev
Log:
TIKA-1005: also extract text from text boxes in .docx documents

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1398734&r1=1398733&r2=1398734&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Oct 16 11:17:36 2012
@@ -24,7 +24,8 @@ Release 1.3 - Current Development
     embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
     key, and TikaCLI prepends the rId (if present) onto the filename
     it extracts (TIKA-989).  Fixed NullPointerException when style is
-    null (TIKA-1006).
+    null (TIKA-1006).  Text inside text boxes is now extracted
+    (TIKA-1005).
 
   * RTF: Page, word, character count and creation date metadata are
     now extracted for RTF documents (TIKA-999).

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1398734&r1=1398733&r2=1398734&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Tue Oct 16 11:17:36 2012
@@ -51,6 +51,7 @@ import org.apache.xmlbeans.XmlObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
@@ -268,6 +269,11 @@ public class XWPFWordExtractorDecorator 
           xhtml.characters(footnameText + "\n");
        }
 
+       // Also extract any paragraphs embedded in text boxes:
+       for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+           extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), xhtml);
+       }
+
        // Finish this paragraph
        xhtml.endElement(tag);
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1398734&r1=1398733&r2=1398734&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Tue Oct 16 11:17:36 2012
@@ -903,4 +903,13 @@ public class OOXMLParserTest extends Tik
       String xml = getXML("testWORD_null_style.docx").xml;        
       assertContains("Test av styrt dokument", xml);
     }
+
+    // TIKA-1005:
+    public void testTextInsideTextBox() throws Exception {
+        String xml = getXML("testWORD_text_box.docx").xml;
+        assertContains("This text is directly in the body of the document.", xml);
+        assertContains("This text is inside of a text box in the body of the document.", xml);
+        assertContains("This text is inside of a text box in the header of the document.", xml);
+        assertContains("This text is inside of a text box in the footer of the document.", xml);
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx?rev=1398734&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to