Author: mikemccand
Date: Tue Oct 16 11:17:36 2012
New Revision: 1398734
URL: http://svn.apache.org/viewvc?rev=1398734&view=rev
Log:
TIKA-1005: also extract text from text boxes in .docx documents
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1398734&r1=1398733&r2=1398734&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Oct 16 11:17:36 2012
@@ -24,7 +24,8 @@ Release 1.3 - Current Development
embedded document as the new Metadata.EMBEDDED_RELATIONSHIP_ID
key, and TikaCLI prepends the rId (if present) onto the filename
it extracts (TIKA-989). Fixed NullPointerException when style is
- null (TIKA-1006).
+ null (TIKA-1006). Text inside text boxes is now extracted
+ (TIKA-1005).
* RTF: Page, word, character count and creation date metadata are
now extracted for RTF documents (TIKA-999).
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1398734&r1=1398733&r2=1398734&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
Tue Oct 16 11:17:36 2012
@@ -51,6 +51,7 @@ import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -268,6 +269,11 @@ public class XWPFWordExtractorDecorator
xhtml.characters(footnameText + "\n");
}
+ // Also extract any paragraphs embedded in text boxes:
+ for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+ extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), xhtml);
+ }
+
// Finish this paragraph
xhtml.endElement(tag);
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1398734&r1=1398733&r2=1398734&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Tue Oct 16 11:17:36 2012
@@ -903,4 +903,13 @@ public class OOXMLParserTest extends Tik
String xml = getXML("testWORD_null_style.docx").xml;
assertContains("Test av styrt dokument", xml);
}
+
+ // TIKA-1005:
+ public void testTextInsideTextBox() throws Exception {
+ String xml = getXML("testWORD_text_box.docx").xml;
+ assertContains("This text is directly in the body of the document.", xml);
+ assertContains("This text is inside of a text box in the body of the document.", xml);
+ assertContains("This text is inside of a text box in the header of the document.", xml);
+ assertContains("This text is inside of a text box in the footer of the document.", xml);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx?rev=1398734&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_text_box.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream