Author: rgauss
Date: Fri Jun 14 00:09:51 2013
New Revision: 1492909

URL: http://svn.apache.org/r1492909
Log:
TIKA-1130: .docx text extract leaves out some portions of text
   - Added test file
   - Added disabled unit test

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx
   (with props)
Modified:
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1492909&r1=1492908&r2=1492909&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Fri Jun 14 00:09:51 2013
@@ -36,7 +36,6 @@ import org.apache.tika.metadata.TikaMeta
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.WordParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
@@ -911,4 +910,28 @@ public class OOXMLParserTest extends Tik
         assertContains("<div class=\"embedded\" id=\"slide1_rId7\"/>" , xml);
         assertContains("<div class=\"embedded\" id=\"slide2_rId7\"/>" , xml);
     }
+    
+    /**
+     * Test for missing text described in 
+     * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
+     * 
+     * @throws Exception
+     */
+    public void disabledTestMissingText() throws Exception { // TODO: Enable test once POI has been updated.
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+
+        InputStream input = getTestDocument("testWORD_missing_text.docx");
+        try {
+            parser.parse(input, handler, metadata, context);
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertTrue(handler.toString().contains("BigCompany"));
+            assertTrue(handler.toString().contains("Seasoned"));
+        } finally {
+            input.close();
+        }
+    }
   }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx?rev=1492909&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_missing_text.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to