This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 4d94d20ff TIKA-4124 -- add unit tests (#1324) 4d94d20ff is described below commit 4d94d20ffe599cc4a7746733d94151ea347bf250 Author: Tim Allison <talli...@apache.org> AuthorDate: Mon Sep 11 16:04:39 2023 -0400 TIKA-4124 -- add unit tests (#1324) * TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx --- .../ooxml/OOXMLContainerExtractionTest.java | 11 ----------- .../resources/test-documents/testAltChunkHTML.docx | Bin 0 -> 2631 bytes .../resources/test-documents/testAltChunkMHT.docx | Bin 0 -> 3070 bytes .../parser/microsoft/ooxml/OOXMLParserTest.java | 20 ++++++++++++++++++++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java index 83641751f..dfe86f204 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java @@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.List; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.Tika; @@ -322,14 +321,4 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction assertTrue(found, "didn't find chart in " + suffix); } } - - @Test - @Disabled("until we can add test file to repo") - public void testAltFileChunk() throws Exception { - //not included test file from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx - //Tika is not correctly identifying rfc822, but rather, treating it as html. :( - List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx"); - assertEquals(2, metadataList.size()); - assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx new file mode 100644 index 000000000..0a37c1dff Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx new file mode 100644 index 000000000..f58134ecf Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 65f14f169..36038a8ca 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -122,4 +122,24 @@ public class OOXMLParserTest extends TikaTest { //TIKA_2446 getRecursiveMetadata("testZIP_corrupted_oom.zip"); } + + @Test + public void testAltFileMHTChunk() throws Exception { + //test file with permission from: + // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx + List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx"); + assertEquals(3, metadataList.size()); + assertContains("Example of a table", + metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); + } + + @Test + public void testAltFileHTMLChunk() throws Exception { + //test file with permission from: + // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_HTML_container.docx + List<Metadata> metadataList = getRecursiveMetadata("testAltChunkHTML.docx"); + assertEquals(2, metadataList.size()); + assertContains("Example of a table", + metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + } }