Author: rgauss
Date: Fri Jun 14 00:09:51 2013
New Revision: 1492909
URL: http://svn.apache.org/r1492909
Log:
TIKA-1130: .docx text extract leaves out some portions of text
- Added test file
- Added disabled unit test
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx
(with props)
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1492909&r1=1492908&r2=1492909&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Jun 14 00:09:51 2013
@@ -36,7 +36,6 @@ import org.apache.tika.metadata.TikaMeta
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -911,4 +910,28 @@ public class OOXMLParserTest extends Tik
assertContains("<div class=\"embedded\" id=\"slide1_rId7\"/>" , xml);
assertContains("<div class=\"embedded\" id=\"slide2_rId7\"/>" , xml);
}
+
+ /**
+ * Test for missing text described in
+ * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
+ *
+ * @throws Exception
+ */
+ public void disabledTestMissingText() throws Exception { // TODO: Enable test once POI has been updated.
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+
+ InputStream input = getTestDocument("testWORD_missing_text.docx");
+ try {
+ parser.parse(input, handler, metadata, context);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertTrue(handler.toString().contains("BigCompany"));
+ assertTrue(handler.toString().contains("Seasoned"));
+ } finally {
+ input.close();
+ }
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx?rev=1492909&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream