Repository: tika Updated Branches: refs/heads/master 99b592437 -> 5425d02a1
TIKA-2191 -- step1 -- add other docx tests and comment/ignore where appropriate Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/89430130 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/89430130 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/89430130 Branch: refs/heads/master Commit: 894301307da5167c95585688f9448d3050f53aaa Parents: 99b5924 Author: tballison <talli...@mitre.org> Authored: Mon Dec 5 10:10:37 2016 -0500 Committer: tballison <talli...@mitre.org> Committed: Tue Dec 6 09:00:37 2016 -0500 ---------------------------------------------------------------------- .../parser/microsoft/ooxml/OOXMLParserTest.java | 16 + .../microsoft/ooxml/SXWPFExtractorTest.java | 694 +++++++++++++++++++ .../ooxml/xwpf/SXWPFExtractorTest.java | 187 ----- .../parser/microsoft/tika-config-sax-docx.xml | 27 + 4 files changed, 737 insertions(+), 187 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index bfbd8ce..0059d09 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft.ooxml; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import javax.xml.transform.OutputKeys; @@ -37,6 +38,7 @@ import java.util.Locale; import java.util.Map; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -1330,6 +1332,20 @@ public class OOXMLParserTest extends TikaTest { System.out.println("elapsed: "+(new Date().getTime()-started) + " with " + ex + " exceptions"); } + @Test + public void testInitializationViaConfig() throws Exception { + //NOTE: this test relies on a bug in the DOM extractor that + //is passing over the title information. + //once we fix that, this test will no longer be meaningful! + InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml"); + assertNotNull(is); + TikaConfig tikaConfig = new TikaConfig(is); + AutoDetectParser p = new AutoDetectParser(tikaConfig); + XMLResult xml = getXML("testWORD_2006ml.docx", p, new Metadata()); + assertContains("engaging title", xml.xml); + + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java new file mode 100644 index 0000000..fb7a977 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java @@ -0,0 +1,694 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser.microsoft.ooxml; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.microsoft.OfficeParserConfig; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.ContentHandler; + + +public class SXWPFExtractorTest extends TikaTest { + + private ParseContext parseContext; + + @Before + public void setUp() { + parseContext = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setUseSAXDocxExtractor(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + + } + + @Test + public void basicTest() throws Exception { + + List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.docx", parseContext); + + assertEquals(8, metadataList.size()); + Metadata m = metadataList.get(0); + + assertEquals("2016-11-29T00:58:00Z", m.get(TikaCoreProperties.CREATED)); + assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED)); + assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE)); + assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR)); + assertEquals("3", m.get(OfficeOpenXMLCore.REVISION)); + assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER)); + //assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY)); + assertEquals("260", m.get(Office.WORD_COUNT)); + assertEquals("3", m.get(Office.PARAGRAPH_COUNT)); + assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES)); + assertEquals("12", m.get(Office.LINE_COUNT)); + assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION)); + + + String content = m.get(RecursiveParserWrapper.TIKA_CONTENT); + + assertContainsCount("engaging title page", content, 1); + //need \n to differentiate from metadata values + assertContainsCount("This is the Author\n", content, 1); + assertContainsCount("This is an engaging title page", content, 1); + + assertContains("My Document Title", content); + assertContains("My Document Subtitle", content); + + assertContains("<p>\tHeading1\t3</p>", content); + + + //TODO: integrate numbering + assertContains("Really basic 2.", content); + + assertContainsCount("This is a text box", content, 1); + + assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content); + + assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", content); + + assertContains("<p>This is 10 spaces</p>", content); + + //caption + assertContains("<p>Table 1: Table1 Caption</p>", content); + + //embedded table + //TODO: figure out how to handle embedded tables in html + assertContains("<p>Embedded table r1c1</p>", content); + + //shape + assertContainsCount("<p>This is text within a shape", content, 1); + + //sdt rich text + assertContains("<p>Rich text content control", content); + + //sdt simple text + assertContains("<p>Simple text content control", content); + + //sdt repeating + assertContains("Repeating content", content); + + //sdt dropdown + //TODO: get options for dropdown + assertContains("Drop down1", content); + + //sdt date + assertContains("<p>11/16/2016</p>", content); + + //test that <tab/> works + assertContains("tab\ttab", content); + + assertContainsCount("serious word art", content, 1); + assertContainsCount("Wordartr1c1", content, 1); + + //glossary document contents + assertContains("Click or tap to enter a date", content); + + //basic formatting + assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over", + content); + + //TODO: add chart parsing +// assertContains("This is the chart", content); + + assertContains("This is a comment", content); + + assertContains("This is an endnote", content); + + assertContains("this is the footnote", content); + + assertContains("First page header", content); + + assertContains("Even page header", content); + + assertContains("Odd page header", content); + + assertContains("First page footer", content); + + assertContains("Even page footer", content); + + assertContains("Odd page footer", content); + + //test default does not include deleted + assertNotContained("frog", content); + + assertContains("Mattmann", content); + + //TODO: extract chart text +// assertContains("This is the chart title", content); + + } + + /** + * Test the plain text output of the Word converter + * + * @throws Exception + */ + @Test + public void testWord() throws Exception { + + XMLResult xmlResult = getXML("testWORD.docx", parseContext); + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + xmlResult.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Sample Word Document", xmlResult.metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", xmlResult.metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", xmlResult.metadata.get(Metadata.AUTHOR)); + assertTrue(xmlResult.xml.contains("Sample Word Document")); + + } + + /** + * Test the plain text output of the Word converter + * + * @throws Exception + */ + @Test + public void testWordFootnote() throws Exception { + XMLResult xmlResult = getXML("footnotes.docx", parseContext); + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + xmlResult.metadata.get(Metadata.CONTENT_TYPE)); + assertTrue(xmlResult.xml.contains("snoska")); + + } + + /** + * Test that the word converter is able to generate the + * correct HTML for the document + */ + @Test + public void testWordHTML() throws Exception { + XMLResult result = getXML("testWORD.docx", parseContext); + String xml = result.xml; + Metadata metadata = result.metadata; + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR)); + assertTrue(xml.contains("Sample Word Document")); + + // Check that custom headings came through +//TODO: assertTrue(xml.contains("<h1 class=\"title\">")); + + // Regular headings +//TODO: assertTrue(xml.contains("<h1>Heading Level 1</h1>")); +//TODO: assertTrue(xml.contains("<h2>Heading Level 2</h2>")); + // Headings with anchor tags in them +//TODO: assertTrue(xml.contains("<h3><a name=\"OnLevel3\" />Heading Level 3</h3>")); + // Bold and italic + assertTrue(xml.contains("<b>BOLD</b>")); + assertTrue(xml.contains("<i>ITALIC</i>")); + // Table + assertTrue(xml.contains("<table>")); + assertTrue(xml.contains("<td>")); + // Links + assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>")); + // Anchor links +//TODO: assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml); + // Paragraphs with other styles +//TODO: assertTrue(xml.contains("<p class=\"signature\">This one")); + + result = getXML("testWORD_3imgs.docx", parseContext); + xml = result.xml; + + // Images 2-4 (there is no 1!) +//TODO: assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image2.png\" alt=\"A description...\" />")); +//TODO: assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image3.jpeg\" alt=\"A description...\" />")); +//TODO: assertTrue("Image not found in:\n" + xml, xml.contains("<img src=\"embedded:image4.png\" alt=\"A description...\" />")); + + // Text too + assertTrue(xml.contains("<p>The end!</p>")); + + // TIKA-692: test document containing multiple + // character runs within a bold tag: + xml = getXML("testWORD_bold_character_runs.docx", parseContext).xml; + + // Make sure bold text arrived as single + // contiguous string even though Word parser + // handled this as 3 character runs +//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); + + // TIKA-692: test document containing multiple + // character runs within a bold tag: + xml = getXML("testWORD_bold_character_runs2.docx", parseContext).xml; + + // Make sure bold text arrived as single + // contiguous string even though Word parser + // handled this as 3 character runs +//TODO: assertTrue("Bold text wasn't contiguous: " + xml, xml.contains("F<b>oob</b>a<b>r</b>")); + } + + /** + * Test that we can extract image from docx header + */ + @Test + @Ignore("TODO") + public void testWordPicturesInHeader() throws Exception { + assertEquals(2, getRecursiveMetadata("headerPic.docx").size()); + XMLResult xmlResult = getXML("headerPic.docx", parseContext); + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + xmlResult.metadata.get(Metadata.CONTENT_TYPE)); + // Check that custom headings came through + assertTrue(xmlResult.xml.contains("<img")); + + } + + /** + * Test docx without headers + * TIKA-633 + */ + @Test + public void testNullHeaders() throws Exception { + XMLResult xmlResult = getXML("NullHeader.docx", parseContext); + assertEquals("Should have found some text", false, + xmlResult.xml.isEmpty()); + + } + + @Test + public void testVarious() throws Exception { + XMLResult xmlResult = getXML("testWORD_various.docx", parseContext); + String content = xmlResult.xml; + Metadata metadata = xmlResult.metadata; + //content = content.replaceAll("\\s+"," "); + assertContains("Footnote appears here", content); + assertContains("This is a footnote.", content); + assertContains("This is the header text.", content); + assertContains("This is the footer text.", content); + assertContains("Here is a text box", content); + assertContains("Bold", content); + assertContains("italic", content); + assertContains("underline", content); + assertContains("superscript", content); + assertContains("subscript", content); + assertContains("Here is a citation:", content); + assertContains("Figure 1 This is a caption for Figure 1", content); + assertContains("(Kramer)", content); +//TODO: assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " ")); +//TODO: assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " ")); + assertContains("This is a hyperlink", content); + assertContains("Here is a list:", content); + for (int row = 1; row <= 3; row++) { + //assertContains("·\tBullet " + row, content); + //assertContains("\u00b7\tBullet " + row, content); + assertContains("Bullet " + row, content); + } + assertContains("Here is a numbered list:", content); + for (int row = 1; row <= 3; row++) { + //assertContains(row + ")\tNumber bullet " + row, content); + //assertContains(row + ") Number bullet " + row, content); + // TODO: OOXMLExtractor fails to number the bullets: + assertContains("Number bullet " + row, content); + } + + for (int row = 1; row <= 2; row++) { + for (int col = 1; col <= 3; col++) { + assertContains("Row " + row + " Col " + col, content); + } + } + + assertContains("Keyword1 Keyword2", content); + assertEquals("Keyword1 Keyword2", + metadata.get(Metadata.KEYWORDS)); + + assertContains("Subject is here", content); + // TODO: Remove subject in Tika 2.0 + assertEquals("Subject is here", + metadata.get(Metadata.SUBJECT)); + assertEquals("Subject is here", + metadata.get(OfficeOpenXMLCore.SUBJECT)); + + assertContains("Suddenly some Japanese text:", content); + // Special version of (GHQ) + assertContains("\uff08\uff27\uff28\uff31\uff09", content); + // 6 other characters + assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content); + + assertContains("And then some Gothic text:", content); + assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); + } + + @Test + public void testWordCustomProperties() throws Exception { + Metadata metadata = new Metadata(); + + try (InputStream input = OOXMLParserTest.class.getResourceAsStream( + "/test-documents/testWORD_custom_props.docx")) { + ContentHandler handler = new BodyContentHandler(-1); + ParseContext context = new ParseContext(); + context.set(Locale.class, Locale.US); + new OOXMLParser().parse(input, handler, metadata, context); + } + + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("EJ04325S", metadata.get(TikaCoreProperties.CREATOR)); + assertEquals("Etienne Jouvin", metadata.get(TikaCoreProperties.MODIFIER)); + assertEquals("Etienne Jouvin", metadata.get(Metadata.LAST_AUTHOR)); + assertEquals("2011-07-29T16:52:00Z", metadata.get(TikaCoreProperties.CREATED)); + assertEquals("2011-07-29T16:52:00Z", metadata.get(Metadata.CREATION_DATE)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(TikaCoreProperties.MODIFIED)); + assertEquals("2012-01-03T22:14:00Z", metadata.get(Metadata.DATE)); + assertEquals("Microsoft Office Word", metadata.get(Metadata.APPLICATION_NAME)); + assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION)); + assertEquals("1", metadata.get(Office.PAGE_COUNT)); + assertEquals("2", metadata.get(Office.WORD_COUNT)); + assertEquals("My Title", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("My Keyword", metadata.get(TikaCoreProperties.KEYWORDS)); + assertEquals("Normal.dotm", metadata.get(Metadata.TEMPLATE)); + assertEquals("Normal.dotm", metadata.get(OfficeOpenXMLExtended.TEMPLATE)); + // TODO: Remove subject in Tika 2.0 + assertEquals("My subject", metadata.get(Metadata.SUBJECT)); + assertEquals("My subject", metadata.get(OfficeOpenXMLCore.SUBJECT)); + assertEquals("EDF-DIT", metadata.get(TikaCoreProperties.PUBLISHER)); + assertEquals("true", metadata.get("custom:myCustomBoolean")); + assertEquals("3", metadata.get("custom:myCustomNumber")); + assertEquals("MyStringValue", metadata.get("custom:MyCustomString")); + assertEquals("2010-12-30T23:00:00Z", metadata.get("custom:MyCustomDate")); + assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate")); + } + + // TIKA-989: + @Test + @Ignore("TODO") + public void testEmbeddedPDF() throws Exception { + String xml = getXML("testWORD_embedded_pdf.docx", parseContext).xml; + int i = xml.indexOf("Here is the pdf file:"); + int j = xml.indexOf("<div class=\"embedded\" id=\"rId5\"/>"); + int k = xml.indexOf("Bye Bye"); + int l = xml.indexOf("<div class=\"embedded\" id=\"rId6\"/>"); + int m = xml.indexOf("Bye for real."); + assertTrue(i != -1); + assertTrue(j != -1); + assertTrue(k != -1); + assertTrue(l != -1); + assertTrue(m != -1); + assertTrue(i < j); + assertTrue(j < k); + assertTrue(k < l); + assertTrue(l < m); + } + + // TIKA-1006 + @Test + public void testWordNullStyle() throws Exception { + String xml = getXML("testWORD_null_style.docx").xml; + assertContains("Test av styrt dokument", xml); + } + + /** + * TIKA-1044 - Handle word documents where parts of the + * text have no formatting or styles applied to them + */ + @Test + public void testNoFormat() throws Exception { + assertContains("This is a piece of text that causes an exception", + getXML("testWORD_no_format.docx", parseContext).xml); + } + + @Test + public void testSkipDeleted() throws Exception { + ParseContext pc = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setIncludeDeletedContent(true); + officeParserConfig.setUseSAXDocxExtractor(true); + officeParserConfig.setIncludeMoveFromContent(true); + pc.set(OfficeParserConfig.class, officeParserConfig); + + XMLResult r = getXML("testWORD_2006ml.docx", pc); + assertContains("frog", r.xml); + assertContainsCount("Second paragraph", r.xml, 2); + + } + + // TIKA-1005: + @Test + public void testTextInsideTextBox() throws Exception { + String xml = getXML("testWORD_text_box.docx", parseContext).xml; + assertContains("This text is directly in the body of the document.", xml); + assertContains("This text is inside of a text box in the body of the document.", xml); + assertContains("This text is inside of a text box in the header of the document.", xml); + assertContains("This text is inside of a text box in the footer of the document.", xml); + } + + /** + * Test for missing text described in + * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>. + * and TIKA-1317 + */ + @Test + public void testMissingText() throws Exception { + + XMLResult xmlResult = getXML("testWORD_missing_text.docx", parseContext); + assertEquals( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + xmlResult.metadata.get(Metadata.CONTENT_TYPE)); + assertContains("BigCompany", xmlResult.xml); + assertContains("Seasoned", xmlResult.xml); + assertContains("Rich_text_in_cell", xmlResult.xml); + + } + + //TIKA-792; with room for future missing bean tests + @Test + public void testWordMissingOOXMLBeans() throws Exception { + //If a bean is missing, POI prints stack trace to stderr + String[] fileNames = new String[]{ + "testWORD_missing_ooxml_bean1.docx",//TIKA-792 + }; + PrintStream origErr = System.err; + for (String fileName : fileNames) { + + //grab stderr + ByteArrayOutputStream errContent = new ByteArrayOutputStream(); + System.setErr(new PrintStream(errContent, true, UTF_8.name())); + getXML(fileName, parseContext); + + //return stderr + System.setErr(origErr); + + String err = errContent.toString(UTF_8.name()); + assertTrue(err.length() == 0); + } + } + + @Test + public void testDOCXThumbnail() throws Exception { + String xml = getXML("testDOCX_Thumbnail.docx", parseContext).xml; + int a = xml.indexOf("This file contains a thumbnail"); + int b = xml.indexOf("<div class=\"embedded\" id=\"/docProps/thumbnail.emf\" />"); + + assertTrue(a != -1); + assertTrue(b != -1); + assertTrue(a < b); + } + + @Test + public void testEncrypted() throws Exception { + Map<String, String> tests = new HashMap<String, String>(); + tests.put("testWORD_protected_passtika.docx", + "This is an encrypted Word 2007 File"); + + Parser parser = new AutoDetectParser(); + Metadata m = new Metadata(); + PasswordProvider passwordProvider = new PasswordProvider() { + @Override + public String getPassword(Metadata metadata) { + return "tika"; + } + }; + + OfficeParserConfig opc = new OfficeParserConfig(); + opc.setUseSAXDocxExtractor(true); + ParseContext passwordContext = new ParseContext(); + passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider); + passwordContext.set(OfficeParserConfig.class, opc); + for (Map.Entry<String, String> e : tests.entrySet()) { + assertContains(e.getValue(), getXML(e.getKey(), passwordContext).xml); + } + + //now try with no password + for (Map.Entry<String, String> e : tests.entrySet()) { + boolean exc = false; + try { + getXML(e.getKey(), parseContext); + } catch (EncryptedDocumentException ex) { + exc = true; + } + assertTrue(exc); + } + + } + + @Test + public void testDOCXParagraphNumbering() throws Exception { + String xml = getXML("testWORD_numbered_list.docx", parseContext).xml; + //SAX parser is getting this. DOM parser is not + assertContains("add a list here", xml); +/*TODO: + assertContains("1) This", xml); + assertContains("a) Is", xml); + assertContains("i) A multi", xml); + assertContains("ii) Level", xml); + assertContains("1. Within cell 1", xml); + assertContains("b. Cell b", xml); + assertContains("iii) List", xml); + assertContains("2) foo", xml); + assertContains("ii) baz", xml); + assertContains("ii) foo", xml); + assertContains("II. bar", xml); + assertContains("6. six", xml); + assertContains("7. seven", xml); + assertContains("a. seven a", xml); + assertContains("e. seven e", xml); + assertContains("2. A ii 2", xml); + assertContains("3. page break list 3", xml); + assertContains("Some-1-CrazyFormat Greek numbering with crazy format - alpha", xml); + assertContains("1.1.1. 1.1.1", xml); + assertContains("1.1. 1.2->1.1 //set the value", xml); +*/ + } + + @Test + @Ignore("TODO") + public void testDOCXOverrideParagraphNumbering() throws Exception { + String xml = getXML("testWORD_override_list_numbering.docx").xml; + + //Test 1 + assertContains("<p>1.1.1.1...1 1.1.1.1...1</p>", xml); + assertContains("1st.2.3someText 1st.2.3someText", xml); + assertContains("1st.2.2someOtherText.1 1st.2.2someOtherText.1", xml); + assertContains("5th 5th", xml); + + + //Test 2 + assertContains("1.a.I 1.a.I", xml); + //test no reset because level 2 is not sufficient to reset + assertContains("<p>1.b.III 1.b.III</p>", xml); + //test restarted because of level 0's increment to 2 + assertContains("2.a.I 2.a.I", xml); + //test handling of skipped level + assertContains("<p>2.b 2.b</p>", xml); + + //Test 3 + assertContains("(1)) (1))", xml); + //tests start level 1 at 17 and + assertContains("2.17 2.17", xml); + //tests that isLegal turns everything into decimal + assertContains("2.18.2.1 2.18.2.1", xml); + assertContains("<p>2 2</p>", xml); + + //Test4 + assertContains("<p>1 1</p>", xml); + assertContains("<p>A A</p>", xml); + assertContains("<p>B B</p>", xml); + //this tests overrides + assertContains("<p>C C</p>", xml); + assertContains("<p>4 4</p>", xml); + + //Test5 + assertContains(">00 00", xml); + assertContains(">01 01", xml); + assertContains(">01. 01.", xml); + assertContains(">01..1 01..1", xml); + assertContains(">02 02", xml); + } + + @Test + public void testMultiAuthorsManagers() throws Exception { + XMLResult r = getXML("testWORD_multi_authors.docx", parseContext); + String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR); + assertEquals(3, authors.length); + assertEquals("author2", authors[1]); + + String[] managers = r.metadata.getValues(OfficeOpenXMLExtended.MANAGER); + assertEquals(2, managers.length); + assertEquals("manager1", managers[0]); + assertEquals("manager2", managers[1]); + } + + @Test + public void testOrigSourcePath() throws Exception { + Metadata embed1_zip_metadata = getRecursiveMetadata("test_recursive_embedded.docx", parseContext).get(11); + assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip", + Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))); + assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip", + Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME))); + } + + @Test + @Ignore("TODO") + public void testBoldHyperlink() throws Exception { + //TIKA-1255 + String xml = getXML("testWORD_boldHyperlink.docx", parseContext).xml; + xml = xml.replaceAll("\\s+", " "); + assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml); + assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold", xml); + } + + @Test + public void testLongForIntExceptionInSummaryDetails() throws Exception { + //TIKA-2055 + assertContains("bold", getXML("testWORD_totalTimeOutOfRange.docx", parseContext).xml); + } + + @Test + @Ignore("TODO") + public void testMacrosInDocm() throws Exception { + Metadata minExpected = new Metadata(); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()"); + minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()"); + minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic"); + minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parseContext)); + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java deleted file mode 100644 index 06f0eed..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/SXWPFExtractorTest.java +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.parser.microsoft.ooxml.xwpf; - -import static org.junit.Assert.assertEquals; - -import java.util.List; - -import org.apache.tika.TikaTest; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Office; -import org.apache.tika.metadata.OfficeOpenXMLCore; -import org.apache.tika.metadata.OfficeOpenXMLExtended; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.parser.microsoft.OfficeParserConfig; -import org.junit.Test; - - -public class SXWPFExtractorTest extends TikaTest { - - @Test - public void basicTest() throws Exception { - ParseContext pc = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setUseSAXDocxExtractor(true); - - pc.set(OfficeParserConfig.class, officeParserConfig); - List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.docx", pc); - - assertEquals(8, metadataList.size()); - Metadata m = metadataList.get(0); - - assertEquals("2016-11-29T00:58:00Z", m.get(TikaCoreProperties.CREATED)); - assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED)); - assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE)); - assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR)); - assertEquals("3", m.get(OfficeOpenXMLCore.REVISION)); - assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER)); - //assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY)); - assertEquals("260", m.get(Office.WORD_COUNT)); - assertEquals("3", m.get(Office.PARAGRAPH_COUNT)); - assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES)); - assertEquals("12", m.get(Office.LINE_COUNT)); - assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION)); - - - String content = m.get(RecursiveParserWrapper.TIKA_CONTENT); - - - assertContainsCountTimes("engaging title page", content, 1); - assertContainsCountTimes("This is the Author", content, 1); - assertContainsCountTimes("This is an engaging title page", content, 1); - - assertContains("My Document Title", content); - assertContains("My Document Subtitle", content); - - assertContains("<p>\tHeading1\t3</p>", content); - - - //TODO: integrate numbering - assertContains("Really basic 2.", content); - - assertContainsCountTimes("This is a text box", content, 1); - - assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content); - - assertContains("<p>This is a link to a local file: <a href=\"file:///C:/data/test.png\">test.png</a></p>", content); - - assertContains("<p>This is 10 spaces</p>", content); - - //caption - assertContains("<p>Table 1: Table1 Caption</p>", content); - - //embedded table - //TODO: figure out how to handle embedded tables in html - assertContains("<p>Embedded table r1c1</p>", content); - - //shape - assertContainsCountTimes("<p>This is text within a shape", content, 1); - - //sdt rich text - assertContains("<p>Rich text content control", content); - - //sdt simple text - assertContains("<p>Simple text content control", content); - - //sdt repeating - assertContains("Repeating content", content); - - //sdt dropdown - //TODO: get options for dropdown - assertContains("Drop down1", content); - - //sdt date - assertContains("<p>11/16/2016</p>", content); - - //test that <tab/> works - assertContains("tab\ttab", content); - - assertContainsCountTimes("serious word art", content, 1); - assertContainsCountTimes("Wordartr1c1", content, 1); - - //glossary document contents - assertContains("Click or tap to enter a date", content); - - //basic formatting - assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over", - content); - - //TODO: add chart parsing -// assertContains("This is the chart", content); - - assertContains("This is a comment", content); - - assertContains("This is an endnote", content); - - assertContains("this is the footnote", content); - - assertContains("First page header", content); - - assertContains("Even page header", content); - - assertContains("Odd page header", content); - - assertContains("First page footer", content); - - assertContains("Even page footer", content); - - assertContains("Odd page footer", content); - - //test default does not include deleted - assertNotContained("frog", content); - - assertContains("Mattmann", content); - - //TODO: extract chart text -// assertContains("This is the chart title", content); - - } - - @Test - public void testSkipDeleted() throws Exception { - ParseContext pc = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setIncludeDeletedContent(true); - officeParserConfig.setUseSAXDocxExtractor(true); - officeParserConfig.setIncludeMoveFromContent(true); - pc.set(OfficeParserConfig.class, officeParserConfig); - - XMLResult r = getXML("testWORD_2006ml.docx", pc); - assertContains("frog", r.xml); - assertContainsCount("Second paragraph", r.xml, 2); - - } - - private void assertContainsCountTimes(String needle, String haystack, int expectedCount) { - int i = haystack.indexOf("engaging title page"); - int cnt = 0; - while (i > -1) { - cnt++; - i = haystack.indexOf("engaging title page", i+1); - } - assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount, - expectedCount, cnt); - - } - - - -} http://git-wip-us.apache.org/repos/asf/tika/blob/89430130/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml new file mode 100644 index 0000000..cad9c5a --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> + <params> + <param name="useSAXDocxExtractor" type="bool">true</param> + </params> + </parser> + </parsers> +</properties>