This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new 5789b57 microsoft package unit tests work
5789b57 is described below
commit 5789b5761caa39c06ee4d444e4e7d1c49326b333
Author: tallison <[email protected]>
AuthorDate: Tue Aug 18 07:34:09 2020 -0400
microsoft package unit tests work
---
tika-parser-integration-tests/pom.xml | 13 ++
.../parser/tests}/microsoft/EMFParserTest.java | 40 +---
.../parser/tests/microsoft/ExcelParserTest.java | 16 ++
.../microsoft/POIContainerExtractionTest.java | 170 +---------------
.../tests/microsoft/PowerPointParserTest.java | 22 +++
.../parser/tests/microsoft/XML2003ParserTest.java | 66 +++++++
.../tests/microsoft/ooxml/OOXMLParserTest.java | 49 +++++
.../tests/microsoft/ooxml/TruncatedOOXMLTest.java | 33 ++++
.../parser/tests/microsoft/rtf/RTFParserTest.java | 111 +++++++++++
tika-parser-modules/pom.xml | 5 +-
.../tika-parser-microsoft-module/pom.xml | 39 +++-
.../services/org.apache.tika.detect.Detector | 16 ++
.../services/org.apache.tika.parser.Parser | 27 +++
.../tika/parser/microsoft/EMFParserTest.java | 22 +--
.../tika/parser/microsoft/ExcelParserTest.java | 6 +-
.../microsoft/POIContainerExtractionTest.java | 219 ---------------------
.../parser/microsoft/PowerPointParserTest.java | 9 -
.../tika/parser/microsoft/WordParserTest.java | 2 +-
.../microsoft/onenote/OneNoteParserTest.java | 2 -
.../parser/microsoft/ooxml/OOXMLParserTest.java | 28 +--
.../parser/microsoft/ooxml/SXSLFExtractorTest.java | 8 -
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 2 +-
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 19 --
.../tika/parser/microsoft/rtf/RTFParserTest.java | 85 +-------
.../parser/microsoft/xml/XML2003ParserTest.java | 49 -----
.../ooxml}/tika-config-custom-date-override.xml | 0
.../microsoft/ooxml/tika-config-dom-macros.xml | 0
.../microsoft/ooxml/tika-config-sax-macros.xml | 0
.../rtf/ignoreListMarkup-tika-config.xml | 0
.../tika/parser/microsoft}/rtf/tika-config.xml | 0
.../tika-config-custom-date-override.xml | 0
.../microsoft/tika-config-exclude-phonetic.xml | 0
.../tika-config-extract-all-alternatives-msg.xml | 0
.../tika/parser/microsoft/tika-config-macros.xml | 0
.../tika/parser/microsoft/tika-config-sax-docx.xml | 0
.../src/test/resources/test-documents/Doc1_ole.doc | Bin
.../resources/test-documents/EmbeddedDocument.docx | Bin
.../resources/test-documents/EmbeddedOutlook.docx | Bin
.../test/resources/test-documents/EmbeddedPDF.docx | Bin
.../test/resources/test-documents/NullHeader.docx | Bin
.../test/resources/test-documents/footnotes.docx | Bin
.../test/resources/test-documents/headerPic.docx | Bin
.../src/test/resources/test-documents/jxl.xls | Bin
.../src/test/resources/test-documents/pictures.ppt | Bin
.../src/test/resources/test-documents/protect.xlsx | Bin
.../resources/test-documents/protectedFile.xlsx | Bin
.../resources/test-documents/protectedSheets.xlsx | Bin
.../resources/test-documents/test-columnar.xls | Bin
.../resources/test-documents/test-columnar.xlsb | Bin
.../resources/test-documents/test-columnar.xlsx | Bin
.../test/resources/test-documents/test-outlook.msg | Bin
.../resources/test-documents/test-outlook2003.msg | Bin
.../src/test/resources/test-documents/test.doc | Bin
.../test/resources/test-documents/testACCESS.mdb | Bin
.../resources/test-documents/testAccess2.accdb | Bin
.../resources/test-documents/testAccess2_2000.mdb | Bin
.../test-documents/testAccess2_2002-2003.mdb | Bin
.../test-documents/testAccess2_encrypted.accdb | Bin
.../resources/test-documents/testAccess_V1997.mdb | Bin
.../test-documents/testBinControlWord.rtf | 0
.../test/resources/test-documents/testComment.doc | Bin
.../test/resources/test-documents/testComment.docx | Bin
.../test/resources/test-documents/testComment.ppt | Bin
.../test/resources/test-documents/testComment.pptx | Bin
.../test/resources/test-documents/testComment.rtf | 0
.../test/resources/test-documents/testComment.xls | Bin
.../test/resources/test-documents/testComment.xlsx | Bin
.../test-documents/testControlCharacters.doc | Bin
.../test-documents/testDOCX_Thumbnail.docx | Bin
.../test/resources/test-documents/testDOTM.dotm | Bin
.../resources/test-documents/testDocumentLink.doc | Bin
.../src/test/resources/test-documents/testEMF.emf | Bin
.../test/resources/test-documents/testEMLX.emlx | 0
.../testEML_embedded_xhtml_and_img.eml | 0
.../test-documents/testFontAfterBufferedText.rtf | 0
.../test/resources/test-documents/testOneNote.one | Bin
.../test/resources/test-documents/testOneNote1.one | Bin
.../test/resources/test-documents/testOneNote2.one | Bin
.../test-documents/testOneNote2007OrEarlier1.one | Bin
.../test-documents/testOneNote2007OrEarlier2.one | Bin
.../resources/test-documents/testOneNote2016.one | Bin
.../test/resources/test-documents/testOneNote3.one | Bin
.../test/resources/test-documents/testOneNote4.one | Bin
.../test-documents/testOneNoteEmbeddedWordDoc.one | Bin
.../resources/test-documents/testPROJECT2003.mpp | Bin
.../resources/test-documents/testPROJECT2007.mpp | Bin
.../resources/test-documents/testPUBLISHER.pub | Bin
.../test/resources/test-documents/testWINMAIL.dat | Bin
.../src/test/resources/test-documents/testWMF.wmf | Bin
.../resources/test-documents/testWMF_charset.wmf | Bin
.../test-documents/testWORKSSpreadsheet7.0.xlr | Bin
.../test/resources/test-documents/testWordArt.pptx | Bin
.../test-documents/testXLSX_Thumbnail.xlsx | Bin
.../resources/test-documents/testXPS_various.xps | Bin
.../test-documents/testZIP_corrupted_oom.zip | Bin
.../resources/test-documents/test_TIKA-1251.doc | Bin
.../test-documents/test_embedded_zip.pptx | Bin
.../test-documents/test_list_override.rtf | 0
.../test-documents/test_recursive_embedded.doc | Bin
.../test-documents/test_recursive_embedded.docx | Bin
.../test_recursive_embedded_npe.docx | Bin
tika-parsers/pom.xml | 10 -
.../services/org.apache.tika.parser.Parser | 12 --
103 files changed, 413 insertions(+), 667 deletions(-)
diff --git a/tika-parser-integration-tests/pom.xml
b/tika-parser-integration-tests/pom.xml
index 8341715..fde1038 100644
--- a/tika-parser-integration-tests/pom.xml
+++ b/tika-parser-integration-tests/pom.xml
@@ -46,6 +46,12 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-microsoft-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
<artifactId>tika-parser-pkg-module</artifactId>
<version>${project.version}</version>
<scope>test</scope>
@@ -80,6 +86,13 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-microsoft-module</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
<artifactId>tika-parser-pkg-module</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/EMFParserTest.java
similarity index 51%
copy from
tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
copy to
tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/EMFParserTest.java
index e6d2db3..7ebde82 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/EMFParserTest.java
@@ -1,30 +1,14 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
+package org.apache.tika.parser.tests.microsoft;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Test;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
public class EMFParserTest extends TikaTest {
@Test
@@ -38,17 +22,6 @@ public class EMFParserTest extends TikaTest {
Metadata pdfMetadata = metadataList.get(2);
assertEquals("application/pdf",
pdfMetadata.get(Metadata.CONTENT_TYPE));
assertContains("is a toolkit for detecting",
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-
- }
-
- @Test
- public void testTextExtractionMac() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_embeddedPDF_mac.xls");
- Metadata emfMetadata = metadataList.get(2);
- assertEquals("image/emf", emfMetadata.get(Metadata.CONTENT_TYPE));
- assertContains("is a toolkit for detecting",
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- //test that a space was inserted before url
- assertContains("Tika http://incubator.apache.org/tika/",
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
}
@Test
@@ -60,7 +33,4 @@ public class EMFParserTest extends TikaTest {
assertEquals("application/pdf",
pdfMetadata.get(Metadata.CONTENT_TYPE));
assertContains("is a toolkit for detecting",
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
}
-
-
}
-
diff --git
a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ExcelParserTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ExcelParserTest.java
new file mode 100644
index 0000000..53bcd39
--- /dev/null
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ExcelParserTest.java
@@ -0,0 +1,16 @@
+package org.apache.tika.parser.tests.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+import java.util.List;
+
+public class ExcelParserTest extends TikaTest {
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testExcel_embeddedPDF.xls");
+ assertContains("Hello World!",
metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+}
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/POIContainerExtractionTest.java
similarity index 66%
copy from
tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
copy to
tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/POIContainerExtractionTest.java
index 148efec..c01b78a 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/POIContainerExtractionTest.java
@@ -1,110 +1,15 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.util.List;
+package org.apache.tika.parser.tests.microsoft;
+import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.utils.ParserUtils;
+import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
import org.junit.Test;
-/**
- * Tests that the various POI powered parsers are
- * able to extract their embedded contents.
- */
-public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTest {
-
- /**
- * For office files which don't have anything embedded in them
- */
- @Test
- public void testWithoutEmbedded() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
-
- String[] files = new String[]{
- "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
- "testVISIO.vsd", "test-outlook.msg"
- };
- for (String file : files) {
- // Process it without recursing
- TrackingHandler handler = process(file, extractor, false);
-
- // Won't have fired
- assertEquals(0, handler.filenames.size());
- assertEquals(0, handler.mediaTypes.size());
-
- // Ditto with recursing
- handler = process(file, extractor, true);
- assertEquals(0, handler.filenames.size());
- assertEquals(0, handler.mediaTypes.size());
- }
- }
-
- /**
- * Office files with embedded images, but no other
- * office files in them
- */
- @Test
- public void testEmbeddedImages() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
- // Excel with 1 image
- handler = process("testEXCEL_1img.xls", extractor, false);
- assertEquals(1, handler.filenames.size());
- assertEquals(1, handler.mediaTypes.size());
-
- assertEquals(null, handler.filenames.get(0));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
- // PowerPoint with 2 images + sound
- // TODO
-
-
- // Word with 1 image
- handler = process("testWORD_1img.doc", extractor, false);
- assertEquals(1, handler.filenames.size());
- assertEquals(1, handler.mediaTypes.size());
-
- assertEquals("image1.png", handler.filenames.get(0));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
- // Word with 3 images
- handler = process("testWORD_3imgs.doc", extractor, false);
- assertEquals(3, handler.filenames.size());
- assertEquals(3, handler.mediaTypes.size());
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
- assertEquals("image1.png", handler.filenames.get(0));
- assertEquals("image2.jpg", handler.filenames.get(1));
- assertEquals("image3.png", handler.filenames.get(2));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
- assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
- }
+public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTest {
/**
* Office files which have other office files
@@ -122,7 +27,7 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
@Test
public void testEmbeddedOfficeFiles() throws Exception {
ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
+ TikaTest.TrackingHandler handler;
// Excel with a word doc and a powerpoint doc, both of which have
images in them
@@ -326,65 +231,4 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
}
-
- @Test
- public void testEmbeddedOfficeFilesXML() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
- handler = process("EmbeddedDocument.docx", extractor, false);
-
assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
- assertEquals(2, handler.filenames.size());
- }
-
- @Test
- public void testPowerpointImages() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
- handler = process("pictures.ppt", extractor, false);
- assertTrue(handler.mediaTypes.contains(new MediaType("image",
"jpeg")));
- assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
- }
-
- @Test
- public void testEmbeddedStorageId() throws Exception {
-
- List<Metadata> list = getRecursiveMetadata("testWORD_embeded.doc");
- //.docx
- assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
-
list.get(10).get(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID));
- //_1345471035.ppt
- assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
-
list.get(14).get(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID));
- //_1345470949.xls
- assertEquals("{00020820-0000-0000-C000-000000000046}",
-
list.get(16).get(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID));
-
- }
-
- @Test
- public void testEmbeddedGraphChart() throws Exception {
- //doc converts a chart to a actual xls file
- //so we only need to look in ppt and xls
- for (String suffix : new String[]{"ppt", "xls"}) {
- List<Metadata> list =
getRecursiveMetadata("testMSChart-govdocs-428996."+suffix);
- boolean found = false;
- for (Metadata m : list) {
- if
(m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString()))
{
- found = true;
- }
- assertNull(m.get(ParserUtils.EMBEDDED_EXCEPTION));
- }
- assertTrue("didn't find chart in "+suffix, found);
- }
- }
-
- @Test
- public void testEmbeddedEquation() throws Exception {
- //file derives from govdocs1 863534.doc
- List<Metadata> metadataList =
getRecursiveMetadata("testMSEquation-govdocs-863534.doc");
- assertEquals(3, metadataList.size());
- assertEquals("application/vnd.ms-equation",
metadataList.get(2).get(Metadata.CONTENT_TYPE));
- }
}
diff --git
a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/PowerPointParserTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/PowerPointParserTest.java
new file mode 100644
index 0000000..71da57e
--- /dev/null
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/PowerPointParserTest.java
@@ -0,0 +1,22 @@
+package org.apache.tika.parser.tests.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class PowerPointParserTest extends TikaTest {
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.ppt");
+ assertContains("Apache Tika project",
metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("3.pdf",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertContains("Hello World",
metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("4.pdf",
metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ }
+}
diff --git
a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/XML2003ParserTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/XML2003ParserTest.java
new file mode 100644
index 0000000..8ebe759
--- /dev/null
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/XML2003ParserTest.java
@@ -0,0 +1,66 @@
+package org.apache.tika.parser.tests.microsoft;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class XML2003ParserTest extends TikaTest {
+ @Test
+ public void testBasicWord() throws Exception {
+ List<Metadata> list = getRecursiveMetadata("testWORD2003.xml");
+ assertEquals(6, list.size());
+ Metadata m = list.get(0);//container doc
+ String xml = m.get(RecursiveParserWrapper.TIKA_CONTENT);
+ xml = xml.replaceAll("\\s+", " ");
+ //make sure that metadata gets dumped to xml
+ assertContains("<meta name=\"meta:character-count-with-spaces\"
content=\"256\"", xml);
+ //do not allow nested <p> elements
+ assertContains("<p /> <img href=\"02000003.jpg\" /><p /> <p><img
href=\"02000004.jpg\" /></p>", xml);
+ assertContains("<table><tbody>", xml);
+ assertContains("</tbody></table>", xml);
+ assertContains("<td><p>R1 c1</p> </td>", xml);
+ assertContains("<a href=\"https://tika.apache.org/\">tika</a>", xml);
+ assertContains("footnote", xml);
+ assertContains("Mycomment", xml);
+ assertContains("Figure 1: My Figure", xml);
+ assertContains("myEndNote", xml);
+ assertContains("We have always been at war with OceaniaEurasia", xml);
+ assertContains("Text box", xml);
+ assertNotContained("Text boxText box", xml);
+ assertContains("MyHeader", xml);
+ assertContains("MyFooter", xml);
+ assertContains("<img href=\"02000003.jpg\" />", xml);
+ assertEquals("219", m.get(Office.CHARACTER_COUNT));
+ assertEquals("256", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
+
+ assertEquals("38", m.get(Office.WORD_COUNT));
+ assertEquals("1", m.get(Office.PARAGRAPH_COUNT));
+ assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.CREATOR));
+ assertEquals("2016-04-27T17:49:00Z",
m.get(TikaCoreProperties.CREATED));
+ assertEquals("application/vnd.ms-wordml",
m.get(Metadata.CONTENT_TYPE));
+
+ //make sure embedded docs were properly processed
+ assertContains("moscow-birds",
+
Arrays.asList(list.get(5).getValues(TikaCoreProperties.SUBJECT)));
+
+ assertEquals("testJPEG_EXIF.jpg",
list.get(5).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+
+ //check that text is extracted with breaks between elements
+ String txt =
getText(getResourceAsStream("/test-documents/testWORD2003.xml"),AUTO_DETECT_PARSER);
+ txt = txt.replaceAll("\\s+", " ");
+ assertNotContained("beforeR1", txt);
+ assertContains("R1 c1 R1 c2", txt);
+ assertNotContained("footnoteFigure", txt);
+ assertContains("footnote Figure", txt);
+ assertContains("test space", txt);
+
+ }
+}
diff --git
a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ooxml/OOXMLParserTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ooxml/OOXMLParserTest.java
new file mode 100644
index 0000000..0edd3c3
--- /dev/null
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ooxml/OOXMLParserTest.java
@@ -0,0 +1,49 @@
+package org.apache.tika.parser.tests.microsoft.ooxml;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.junit.Test;
+
+import java.util.List;
+
+public class OOXMLParserTest extends TikaTest {
+
+ @Test
+ public void testEmbeddedPDFInPPTX() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx");
+ Metadata pdfMetadata1 = metadataList.get(4);
+ assertContains("Apache Tika",
pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
+ Metadata pdfMetadata2 = metadataList.get(5);
+ assertContains("Hello World",
pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testEmbeddedPDFInXLSX() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testExcel_embeddedPDF.xlsx");
+ Metadata pdfMetadata = metadataList.get(1);
+ assertContains("Hello World",
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testEmbeddedPDFInStreamingPPTX() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setUseSAXPptxExtractor(true);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", parseContext);
+ Metadata pdfMetadata1 = metadataList.get(4);
+ assertContains("Apache Tika",
pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
+ Metadata pdfMetadata2 = metadataList.get(5);
+ assertContains("Hello World",
pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
+ @Test(expected = org.apache.tika.exception.TikaException.class)
+ public void testCorruptedZip() throws Exception {
+ //TIKA_2446
+ getRecursiveMetadata("testZIP_corrupted_oom.zip");
+ }
+}
diff --git
a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ooxml/TruncatedOOXMLTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ooxml/TruncatedOOXMLTest.java
new file mode 100644
index 0000000..f4a26ad
--- /dev/null
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -0,0 +1,33 @@
+package org.apache.tika.parser.tests.microsoft.ooxml;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class TruncatedOOXMLTest extends TikaTest {
+
+ @Test
+ public void testWordTrunc13138() throws Exception {
+ //this truncates the content_types.xml
+ //this tests that there's a backoff to the pkg parser
+ List<Metadata> metadataList = getRecursiveMetadata(truncate(
+ "testWORD_various.docx", 13138), true);
+ assertEquals(19, metadataList.size());
+ Metadata m = metadataList.get(0);
+ assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testWordTrunc774() throws Exception {
+ //this is really truncated
+ List<Metadata> metadataList = getRecursiveMetadata(truncate(
+ "testWORD_various.docx", 774), true);
+ assertEquals(4, metadataList.size());
+ Metadata m = metadataList.get(0);
+ assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
+ }
+}
diff --git
a/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/rtf/RTFParserTest.java
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/rtf/RTFParserTest.java
new file mode 100644
index 0000000..a8f6c96
--- /dev/null
+++
b/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/tests/microsoft/rtf/RTFParserTest.java
@@ -0,0 +1,111 @@
+package org.apache.tika.parser.tests.microsoft.rtf;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+public class RTFParserTest extends TikaTest {
+ // TIKA-1010
+ @Test
+ public void testEmbeddedMonster() throws Exception {
+
+ Map<Integer, Pair> expected = new HashMap<>();
+ expected.put(3, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+ expected.put(4, new Pair("file_0.doc", "application/msword"));
+ expected.put(7, new Pair("file_1.xlsx",
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+ expected.put(10, new Pair("text.html", "text/html;
charset=windows-1252"));
+ expected.put(11, new Pair("html-within-zip.zip", "application/zip"));
+ expected.put(12, new
Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+ expected.put(15, new
Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html;
charset=UTF-8"));
+ expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg",
"image/jpeg"));
+ expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel"));
+ expected.put(24, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg",
"application/vnd.ms-outlook"));
+ expected.put(27, new Pair("file_3.pdf", "application/pdf"));
+ expected.put(30, new Pair("file_4.ppt",
"application/vnd.ms-powerpoint"));
+ expected.put(34, new Pair("file_5.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+ expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg"));
+ expected.put(37, new Pair("file_6.doc", "application/msword"));
+ expected.put(40, new Pair("file_7.doc", "application/msword"));
+ expected.put(43, new Pair("file_8.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg",
"image/jpeg"));
+
+
+ List<Metadata> metadataList =
getRecursiveMetadata("testRTFEmbeddedFiles.rtf");
+ assertEquals(49, metadataList.size());
+ for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+ Metadata metadata = metadataList.get(e.getKey());
+ Pair p = e.getValue();
+ assertNotNull(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ //necessary to getName() because MSOffice extractor includes
+ //directory: _1457338524/HW.txt
+ Assert.assertEquals("filename equals ",
+ p.fileName, FilenameUtils.getName(
+
metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
+
+ assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
+ }
+
assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg",
+
metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ }
+
+ //TIKA-1010 test regular (not "embedded") images/picts
+ @Test
+ public void testRegularImages() throws Exception {
+ ParseContext ctx = new ParseContext();
+ RecursiveParserWrapper parser = new
RecursiveParserWrapper(AUTO_DETECT_PARSER);
+ RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
+ new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE,
-1),-1);
+ Metadata rootMetadata = new Metadata();
+ rootMetadata.add(TikaCoreProperties.RESOURCE_NAME_KEY,
"testRTFRegularImages.rtf");
+ try (TikaInputStream tis =
TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf")))
{
+ parser.parse(tis, handler, rootMetadata, ctx);
+ }
+ List<Metadata> metadatas = handler.getMetadataList();
+
+ Metadata meta_jpg_exif =
metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
+ Metadata meta_jpg =
metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+
+ assertTrue(meta_jpg_exif != null);
+ assertTrue(meta_jpg != null);
+
assertTrue(Arrays.asList(meta_jpg_exif.getValues(TikaCoreProperties.SUBJECT)).contains("serbor"));
+
assertTrue(meta_jpg.get(TikaCoreProperties.COMMENTS).contains("Licensed to the
Apache"));
+ //make sure old metadata doesn't linger between objects
+
assertFalse(Arrays.asList(meta_jpg.getValues(TikaCoreProperties.SUBJECT)).contains("serbor"));
+ assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
+ assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
+
+ assertEquals(51, meta_jpg.names().length);
+ assertEquals(112, meta_jpg_exif.names().length);
+ }
+
+ private static class Pair {
+ final String fileName;
+ final String mimeType;
+ Pair(String fileName, String mimeType) {
+ this.fileName = fileName;
+ this.mimeType = mimeType;
+ }
+ }
+
+}
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index c07171b..42880cb 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -39,6 +39,10 @@
<jempbox.version>1.8.16</jempbox.version>
<mime4j.version>0.8.3</mime4j.version>
<pdfbox.version>2.0.20</pdfbox.version>
+ <commons.logging.version>1.2</commons.logging.version>
+ <!-- used by POI, PDFBox and Jackcess ...try to sync -->
+ <bouncycastle.version>1.65</bouncycastle.version>
+
</properties>
<dependencies>
@@ -71,7 +75,6 @@
<module>tika-parser-font-module</module>
<module>tika-parser-microsoft-module</module>
<module>tika-parser-pkg-module</module>
- <module>tika-parser-rtf-module</module>
<module>tika-parser-mail-commons</module>
<module>tika-parser-xml-module</module>
</modules>
diff --git a/tika-parser-modules/tika-parser-microsoft-module/pom.xml
b/tika-parser-modules/tika-parser-microsoft-module/pom.xml
index 36a17b5..3e4c1f4 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/pom.xml
+++ b/tika-parser-modules/tika-parser-microsoft-module/pom.xml
@@ -46,7 +46,6 @@
<artifactId>commons-codec</artifactId>
<version>${codec.version}</version>
</dependency>
- <!-- we're only using Pair from this -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
@@ -87,6 +86,12 @@
</exclusion>
</exclusions>
</dependency>
+ <!-- needed by jackcess -->
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess</artifactId>
@@ -119,7 +124,37 @@
</exclusion>
</exclusions>
</dependency>
-
+ <!-- for java 10
+ See TIKA-2778 for why we need to do this now.
+ May the gods of API design fix this in the future.
+ only required for jackcess-encrypt
+ -->
+ <dependency>
+ <groupId>org.glassfish.jaxb</groupId>
+ <artifactId>jaxb-runtime</artifactId>
+ <version>${jaxb.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>jakarta.activation</groupId>
+ <artifactId>jakarta.activation-api</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.activation</groupId>
+ <artifactId>jakarta.activation</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcmail-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ <version>${bouncycastle.version}</version>
+ </dependency>
<!-- https://mvnrepository.com/artifact/log4j/log4j -->
<dependency>
<groupId>log4j</groupId>
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
b/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 0000000..f674e79
--- /dev/null
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.microsoft.POIFSContainerDetector
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..0a9fa02
--- /dev/null
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+org.apache.tika.parser.microsoft.rtf.RTFParser
+org.apache.tika.parser.microsoft.EMFParser
+org.apache.tika.parser.microsoft.WMFParser
+org.apache.tika.parser.microsoft.JackcessParser
+org.apache.tika.parser.microsoft.MSOwnerFileParser
+org.apache.tika.parser.microsoft.OfficeParser
+org.apache.tika.parser.microsoft.OldExcelParser
+org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.onenote.OneNoteParser
+org.apache.tika.parser.microsoft.ooxml.OOXMLParser
+org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
+org.apache.tika.parser.microsoft.xml.WordMLParser
+org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
index e6d2db3..e3508d7 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/EMFParserTest.java
@@ -27,19 +27,7 @@ import org.junit.Test;
public class EMFParserTest extends TikaTest {
- @Test
- public void testTextExtractionWindows() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_embeddedPDF_windows.xls");
- Metadata emfMetadata = metadataList.get(1);
- assertEquals("image/emf", emfMetadata.get(Metadata.CONTENT_TYPE));
- assertContains("<p>testPDF.pdf</p>",
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- //this is just the usual embedded pdf
- Metadata pdfMetadata = metadataList.get(2);
- assertEquals("application/pdf",
pdfMetadata.get(Metadata.CONTENT_TYPE));
- assertContains("is a toolkit for detecting",
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
-
- }
@Test
public void testTextExtractionMac() throws Exception {
@@ -51,15 +39,7 @@ public class EMFParserTest extends TikaTest {
assertContains("Tika http://incubator.apache.org/tika/",
emfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
}
- @Test
- public void testPDFExtraction() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testEXCEL_embeddedPDF_mac.xls");
- //this pdf has to be extracted from within the EMF
- //it does not exist as a standalone pdf file inside the _mac.xls file.
- Metadata pdfMetadata = metadataList.get(1);
- assertEquals("application/pdf",
pdfMetadata.get(Metadata.CONTENT_TYPE));
- assertContains("is a toolkit for detecting",
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- }
+
}
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 7fb8fb8..4e1528a 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -474,11 +474,7 @@ public class ExcelParserTest extends TikaTest {
// assertContains("<a
href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
- @Test
- public void testEmbeddedPDF() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testExcel_embeddedPDF.xls");
- assertContains("Hello World!",
metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
- }
+
@Test
public void testBigIntegersWGeneralFormat() throws Exception {
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 148efec..9b9e3d2 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -106,227 +106,8 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
}
- /**
- * Office files which have other office files
- * embedded into them. The embedded office files
- * will sometimes have images in them.
- * <p/>
- * eg xls
- * -> word
- * -> image
- * -> image
- * -> powerpoint
- * -> excel
- * -> image
- */
- @Test
- public void testEmbeddedOfficeFiles() throws Exception {
- ContainerExtractor extractor = new ParserContainerExtractor();
- TrackingHandler handler;
-
-
- // Excel with a word doc and a powerpoint doc, both of which have
images in them
- // Without recursion, should see both documents + the images
- handler = process("testEXCEL_embeded.xls", extractor, false);
- assertEquals(5, handler.filenames.size());
- assertEquals(5, handler.mediaTypes.size());
-
- // We don't know their filenames
- assertEquals(null, handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
- assertEquals("MBD00032A24.doc", handler.filenames.get(4));
- // But we do know their types
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded
office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
- assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office
doc
- assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office
doc
-
-
- // With recursion, should get the images embedded in the office files
too
- handler = process("testEXCEL_embeded.xls", extractor, true);
- assertEquals(17, handler.filenames.size());
- assertEquals(17, handler.mediaTypes.size());
-
- assertEquals(null, handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
- assertEquals("1", handler.filenames.get(4));
- assertEquals(null, handler.filenames.get(5));
- assertEquals("2", handler.filenames.get(6));
- assertEquals("image1.png", handler.filenames.get(7));
- assertEquals("image2.jpg", handler.filenames.get(8));
- assertEquals("image3.png", handler.filenames.get(9));
- assertEquals("image1.png", handler.filenames.get(16));
-
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded
office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
- assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded
presentation
- assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
- assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office
doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
- assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
- assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office
doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
-
- // Word with .docx, powerpoint and excel
- handler = process("testWORD_embeded.doc", extractor, false);
- assertEquals(9, handler.filenames.size());
- assertEquals(9, handler.mediaTypes.size());
-
- // Filenames are a bit iffy...
- // Should really be 3*embedded pictures then 3*icons then embedded docs
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("image4.png", handler.filenames.get(1));
- assertEquals("image5.jpg", handler.filenames.get(2));
- assertEquals("image6.png", handler.filenames.get(3));
- assertEquals("image2.emf", handler.filenames.get(4));
- assertEquals("image3.emf", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals("_1345471035.ppt", handler.filenames.get(7));
- assertEquals("_1345470949.xls", handler.filenames.get(8));
-
- // But we do know their types
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
office doc?
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image -
logo
- assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image -
safe
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image -
try
- assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded
office doc?
- assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded
office doc?
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office
doc
- assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office
doc
- assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office
doc
-
-
- // With recursion, should get their images too
- handler = process("testWORD_embeded.doc", extractor, true);
- assertEquals(16, handler.filenames.size());
- assertEquals(16, handler.mediaTypes.size());
-
- // We don't know their filenames, except for doc images + docx
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("image4.png", handler.filenames.get(1));
- assertEquals("image5.jpg", handler.filenames.get(2));
- assertEquals("image6.png", handler.filenames.get(3));
- assertEquals("image2.emf", handler.filenames.get(4));
- assertEquals("image3.emf", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals("image2.png", handler.filenames.get(7));
- assertEquals("image3.jpeg", handler.filenames.get(8));
- assertEquals("image4.png", handler.filenames.get(9));
- for (int i = 11; i < 14; i++) {
- assertNull(handler.filenames.get(i));
- }
- // But we do know their types
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image -
logo
- assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image -
safe
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image -
try
- assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded
office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded
office doc
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office
doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // PNG inside
.docx
- assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside
.docx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside
.docx
- assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office
doc
- assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office
doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside
.xls
- // PowerPoint with excel and word
- handler = process("testPPT_embeded.ppt", extractor, false);
- assertEquals(7, handler.filenames.size());
- assertEquals(7, handler.mediaTypes.size());
-
- // We don't get all that helpful filenames
- assertEquals("1", handler.filenames.get(0));
- assertEquals("2", handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals(null, handler.filenames.get(3));
- assertEquals(null, handler.filenames.get(4));
- assertEquals(null, handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- // But we do know their types
- assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office
doc
- assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office
doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded
office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded
office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
-
- // Run again on PowerPoint but with recursion
- handler = process("testPPT_embeded.ppt", extractor, true);
- assertEquals(11, handler.filenames.size());
- assertEquals(11, handler.mediaTypes.size());
-
- assertEquals("1", handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals("2", handler.filenames.get(2));
- assertEquals("image1.png", handler.filenames.get(3));
- assertEquals("image2.jpg", handler.filenames.get(4));
- assertEquals("image3.png", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals(null, handler.filenames.get(7));
- assertEquals(null, handler.filenames.get(8));
- assertEquals(null, handler.filenames.get(9));
- assertEquals(null, handler.filenames.get(10));
-
- assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office
doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside
.xls
- assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office
doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // PNG inside
.docx
- assertEquals(TYPE_JPG, handler.mediaTypes.get(4)); // JPG inside
.docx
- assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside
.docx
- assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded
office doc
- assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded
office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
- assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
-
-
- // Word, with a non-office file (PDF)
- handler = process("testWORD_embedded_pdf.doc", extractor, true);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
-
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("_1402837031.pdf", handler.filenames.get(1));
-
- assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
pdf
- assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF
itself
-
-
- // Outlook with a text file and a word document
- handler = process("testMSG_att_doc.msg", extractor, true);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
-
- assertEquals("test-unicode.doc", handler.filenames.get(0));
- assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
-
- assertEquals("pj1.txt", handler.filenames.get(1));
- assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
-
-
- // Outlook with a pdf and another outlook message
- handler = process("testMSG_att_msg.msg", extractor, true);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
-
- assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
- assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
-
- assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
- assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
- }
-
@Test
public void testEmbeddedOfficeFilesXML() throws Exception {
ContainerExtractor extractor = new ParserContainerExtractor();
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 45a1a39..4742aae 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -292,15 +292,6 @@ public class PowerPointParserTest extends TikaTest {
}
@Test
- public void testEmbeddedPDF() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.ppt");
- assertContains("Apache Tika project",
metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
- assertEquals("3.pdf",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertContains("Hello World",
metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
- assertEquals("4.pdf",
metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- }
-
- @Test
public void testMacros() throws Exception {
Metadata minExpected = new Metadata();
minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub
Embolden()");
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index edc9c94..1c0f3b5 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -544,7 +544,7 @@ public class WordParserTest extends TikaTest {
@Test
public void testOrigSourcePath() throws Exception {
- Metadata embed1_zip_metadata =
getRecursiveMetadata("test_recursive_embedded.doc").get(11);
+ Metadata embed1_zip_metadata =
getRecursiveMetadata("test_recursive_embedded.doc").get(2);
assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder
(2)\\embed1.zip",
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
index d5d1639..33e9a45 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
@@ -34,8 +34,6 @@ public class OneNoteParserTest extends TikaTest {
*/
@Test
public void testOneNote2013Doc1() throws Exception {
-// List<Metadata> metadataList =
getRecursiveMetadata("testOneNote1.one");
- // debug(metadataList);
Metadata metadata = new Metadata();
String txt = getText("testOneNote1.one", metadata);
assertNoJunk(txt);
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index a38e4d7..fdefbc4 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,7 +31,6 @@ import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
-import java.nio.file.Path;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.HashMap;
@@ -1263,25 +1262,10 @@ public class OOXMLParserTest extends TikaTest {
assertContains("<a
href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
}
- @Test
- public void testEmbeddedPDFInPPTX() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx");
- Metadata pdfMetadata1 = metadataList.get(4);
- assertContains("Apache Tika",
pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
- Metadata pdfMetadata2 = metadataList.get(5);
- assertContains("Hello World",
pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
- }
-
- @Test
- public void testEmbeddedPDFInXLSX() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testExcel_embeddedPDF.xlsx");
- Metadata pdfMetadata = metadataList.get(1);
- assertContains("Hello World",
pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- }
@Test
public void testOrigSourcePath() throws Exception {
- Metadata embed1_zip_metadata =
getRecursiveMetadata("test_recursive_embedded.docx").get(11);
+ Metadata embed1_zip_metadata =
getRecursiveMetadata("test_recursive_embedded.docx").get(2);
assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder
(2)\\embed1.zip",
@@ -1741,11 +1725,6 @@ public class OOXMLParserTest extends TikaTest {
xlsx.get(Metadata.CONTENT_TYPE));
}
- @Test(expected = org.apache.tika.exception.TikaException.class)
- public void testCorruptedZip() throws Exception {
- //TIKA_2446
- getRecursiveMetadata("testZIP_corrupted_oom.zip");
- }
@Test
public void testSigned() throws Exception {
@@ -1788,11 +1767,6 @@ public class OOXMLParserTest extends TikaTest {
getRecursiveMetadata("testWORD_docSecurity.docx")
.get(0).get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
}
-
- @Test
- public void oneOff() throws Exception {
-
debug(getRecursiveMetadata("CVLKRA-KYC_Download_File_Structure_V3.1.xlsx"));
- }
}
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index f8c0ff2..d87b9ae 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -530,14 +530,6 @@ public class SXSLFExtractorTest extends TikaTest {
}
- @Test
- public void testEmbeddedPDFInPPTX() throws Exception {
- List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", parseContext);
- Metadata pdfMetadata1 = metadataList.get(4);
- assertContains("Apache Tika",
pdfMetadata1.get(RecursiveParserWrapper.TIKA_CONTENT));
- Metadata pdfMetadata2 = metadataList.get(5);
- assertContains("Hello World",
pdfMetadata2.get(RecursiveParserWrapper.TIKA_CONTENT));
- }
@Test
public void testMacrosInPptm() throws Exception {
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 4eca4b1..0c02056 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -674,7 +674,7 @@ public class SXWPFExtractorTest extends TikaTest {
@Test
public void testOrigSourcePath() throws Exception {
- Metadata embed1_zip_metadata =
getRecursiveMetadata("test_recursive_embedded.docx", parseContext).get(11);
+ Metadata embed1_zip_metadata =
getRecursiveMetadata("test_recursive_embedded.docx", parseContext).get(2);
assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder
(2)\\embed1.zip",
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 1247cc1..68a3528 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -59,26 +59,7 @@ public class TruncatedOOXMLTest extends TikaTest {
assertContains("Suddenly some Japanese", content);
}
- @Test
- public void testWordTrunc13138() throws Exception {
- //this truncates the content_types.xml
- //this tests that there's a backoff to the pkg parser
- List<Metadata> metadataList = getRecursiveMetadata(truncate(
- "testWORD_various.docx", 13138), true);
- assertEquals(19, metadataList.size());
- Metadata m = metadataList.get(0);
- assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
- }
- @Test
- public void testWordTrunc774() throws Exception {
- //this is really truncated
- List<Metadata> metadataList = getRecursiveMetadata(truncate(
- "testWORD_various.docx", 774), true);
- assertEquals(4, metadataList.size());
- Metadata m = metadataList.get(0);
- assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
- }
@Test
public void testTruncation() throws Exception {
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 250fd82..011895c 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -335,7 +335,7 @@ public class RTFParserTest extends TikaTest {
@Test
public void testTurningOffList() throws Exception {
InputStream is = getClass().getResourceAsStream(
-
"/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml");
+
"/org/apache/tika/parser/microsoft/rtf/ignoreListMarkup-tika-config.xml");
assertNotNull(is);
TikaConfig tikaConfig = new TikaConfig(is);
Parser p = new AutoDetectParser(tikaConfig);
@@ -396,79 +396,7 @@ public class RTFParserTest extends TikaTest {
getText("testRTFCorruptListOverride.rtf"));
}
- // TIKA-1010
- @Test
- public void testEmbeddedMonster() throws Exception {
-
- Map<Integer, Pair> expected = new HashMap<>();
- expected.put(3, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
- expected.put(4, new Pair("file_0.doc", "application/msword"));
- expected.put(7, new Pair("file_1.xlsx",
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
- expected.put(10, new Pair("text.html", "text/html;
charset=windows-1252"));
- expected.put(11, new Pair("html-within-zip.zip", "application/zip"));
- expected.put(12, new
Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
- expected.put(15, new
Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html;
charset=UTF-8"));
- expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg",
"image/jpeg"));
- expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel"));
- expected.put(24, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg",
"application/vnd.ms-outlook"));
- expected.put(27, new Pair("file_3.pdf", "application/pdf"));
- expected.put(30, new Pair("file_4.ppt",
"application/vnd.ms-powerpoint"));
- expected.put(34, new Pair("file_5.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation"));
- expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg"));
- expected.put(37, new Pair("file_6.doc", "application/msword"));
- expected.put(40, new Pair("file_7.doc", "application/msword"));
- expected.put(43, new Pair("file_8.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
- expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg",
"image/jpeg"));
-
-
- List<Metadata> metadataList =
getRecursiveMetadata("testRTFEmbeddedFiles.rtf");
- assertEquals(49, metadataList.size());
- for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
- Metadata metadata = metadataList.get(e.getKey());
- Pair p = e.getValue();
- assertNotNull(metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
- //necessary to getName() because MSOffice extractor includes
- //directory: _1457338524/HW.txt
- Assert.assertEquals("filename equals ",
- p.fileName, FilenameUtils.getName(
-
metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
-
- assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
- }
-
assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg",
-
metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
- }
-
- //TIKA-1010 test regular (not "embedded") images/picts
- @Test
- public void testRegularImages() throws Exception {
- ParseContext ctx = new ParseContext();
- RecursiveParserWrapper parser = new
RecursiveParserWrapper(AUTO_DETECT_PARSER);
- RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
- new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE,
-1),-1);
- Metadata rootMetadata = new Metadata();
- rootMetadata.add(TikaCoreProperties.RESOURCE_NAME_KEY,
"testRTFRegularImages.rtf");
- try (TikaInputStream tis =
TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf")))
{
- parser.parse(tis, handler, rootMetadata, ctx);
- }
- List<Metadata> metadatas = handler.getMetadataList();
-
- Metadata meta_jpg_exif =
metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
- Metadata meta_jpg =
metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
- assertTrue(meta_jpg_exif != null);
- assertTrue(meta_jpg != null);
-
assertTrue(Arrays.asList(meta_jpg_exif.getValues(TikaCoreProperties.SUBJECT)).contains("serbor"));
-
assertTrue(meta_jpg.get(TikaCoreProperties.COMMENTS).contains("Licensed to the
Apache"));
- //make sure old metadata doesn't linger between objects
-
assertFalse(Arrays.asList(meta_jpg.getValues(TikaCoreProperties.SUBJECT)).contains("serbor"));
- assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
- assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
- assertEquals(51, meta_jpg.names().length);
- assertEquals(112, meta_jpg_exif.names().length);
- }
@Test
public void testMultipleNewlines() throws Exception {
@@ -517,7 +445,7 @@ public class RTFParserTest extends TikaTest {
//test that memory allocation of the bin element is limited
//via the config file. Unfortunately, this test file's bin embedding
contains 10 bytes
//so we had to set the config to 0.
- InputStream is =
getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml");
+ InputStream is =
getClass().getResourceAsStream("/org/apache/tika/parser/microsoft/rtf/tika-config.xml");
assertNotNull(is);
TikaConfig tikaConfig = new TikaConfig(is);
Parser p = new AutoDetectParser(tikaConfig);
@@ -567,13 +495,4 @@ public class RTFParserTest extends TikaTest {
getXML("testRTFTIKA_2899.rtf").xml);
}
-
- private static class Pair {
- final String fileName;
- final String mimeType;
- Pair(String fileName, String mimeType) {
- this.fileName = fileName;
- this.mimeType = mimeType;
- }
- }
}
diff --git
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index ff716b0..0c1509b 100644
---
a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++
b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -43,56 +43,7 @@ public class XML2003ParserTest extends MultiThreadedTikaTest
{
XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
}
- @Test
- public void testBasicWord() throws Exception {
- List<Metadata> list = getRecursiveMetadata("testWORD2003.xml");
- assertEquals(6, list.size());
- Metadata m = list.get(0);//container doc
- String xml = m.get(RecursiveParserWrapper.TIKA_CONTENT);
- xml = xml.replaceAll("\\s+", " ");
- //make sure that metadata gets dumped to xml
- assertContains("<meta name=\"meta:character-count-with-spaces\"
content=\"256\"", xml);
- //do not allow nested <p> elements
- assertContains("<p /> <img href=\"02000003.jpg\" /><p /> <p><img
href=\"02000004.jpg\" /></p>", xml);
- assertContains("<table><tbody>", xml);
- assertContains("</tbody></table>", xml);
- assertContains("<td><p>R1 c1</p> </td>", xml);
- assertContains("<a href=\"https://tika.apache.org/\">tika</a>", xml);
- assertContains("footnote", xml);
- assertContains("Mycomment", xml);
- assertContains("Figure 1: My Figure", xml);
- assertContains("myEndNote", xml);
- assertContains("We have always been at war with OceaniaEurasia", xml);
- assertContains("Text box", xml);
- assertNotContained("Text boxText box", xml);
- assertContains("MyHeader", xml);
- assertContains("MyFooter", xml);
- assertContains("<img href=\"02000003.jpg\" />", xml);
- assertEquals("219", m.get(Office.CHARACTER_COUNT));
- assertEquals("256", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
-
- assertEquals("38", m.get(Office.WORD_COUNT));
- assertEquals("1", m.get(Office.PARAGRAPH_COUNT));
- assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.CREATOR));
- assertEquals("2016-04-27T17:49:00Z",
m.get(TikaCoreProperties.CREATED));
- assertEquals("application/vnd.ms-wordml",
m.get(Metadata.CONTENT_TYPE));
-
- //make sure embedded docs were properly processed
- assertContains("moscow-birds",
-
Arrays.asList(list.get(5).getValues(TikaCoreProperties.SUBJECT)));
- assertEquals("testJPEG_EXIF.jpg",
list.get(5).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
-
- //check that text is extracted with breaks between elements
- String txt =
getText(getResourceAsStream("/test-documents/testWORD2003.xml"),AUTO_DETECT_PARSER);
- txt = txt.replaceAll("\\s+", " ");
- assertNotContained("beforeR1", txt);
- assertContains("R1 c1 R1 c2", txt);
- assertNotContained("footnoteFigure", txt);
- assertContains("footnote Figure", txt);
- assertContains("test space", txt);
-
- }
@Test
public void testBasicExcel() throws Exception {
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-dom-macros.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-sax-macros.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/rtf/ignoreListMarkup-tika-config.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/rtf/ignoreListMarkup-tika-config.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/rtf/tika-config.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/rtf/tika-config.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/ooxml/tika-config-custom-date-override.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-custom-date-override.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-exclude-phonetic.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-exclude-phonetic.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-exclude-phonetic.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-exclude-phonetic.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-extract-all-alternatives-msg.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-extract-all-alternatives-msg.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-extract-all-alternatives-msg.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-extract-all-alternatives-msg.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-macros.xml
diff --git
a/tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
similarity index 100%
rename from
tika-parsers/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml
diff --git a/tika-parsers/src/test/resources/test-documents/Doc1_ole.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/Doc1_ole.doc
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/Doc1_ole.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/Doc1_ole.doc
diff --git
a/tika-parsers/src/test/resources/test-documents/EmbeddedDocument.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/EmbeddedDocument.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/EmbeddedDocument.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/EmbeddedDocument.docx
diff --git
a/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/EmbeddedOutlook.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/EmbeddedOutlook.docx
diff --git a/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/EmbeddedPDF.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/EmbeddedPDF.docx
diff --git a/tika-parsers/src/test/resources/test-documents/NullHeader.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/NullHeader.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/NullHeader.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/NullHeader.docx
diff --git a/tika-parsers/src/test/resources/test-documents/footnotes.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/footnotes.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/footnotes.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/footnotes.docx
diff --git a/tika-parsers/src/test/resources/test-documents/headerPic.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/headerPic.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/headerPic.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/headerPic.docx
diff --git a/tika-parsers/src/test/resources/test-documents/jxl.xls
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/jxl.xls
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/jxl.xls
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/jxl.xls
diff --git a/tika-parsers/src/test/resources/test-documents/pictures.ppt
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/pictures.ppt
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/pictures.ppt
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/pictures.ppt
diff --git a/tika-parsers/src/test/resources/test-documents/protect.xlsx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/protect.xlsx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/protect.xlsx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/protect.xlsx
diff --git a/tika-parsers/src/test/resources/test-documents/protectedFile.xlsx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/protectedFile.xlsx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/protectedFile.xlsx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/protectedFile.xlsx
diff --git
a/tika-parsers/src/test/resources/test-documents/protectedSheets.xlsx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/protectedSheets.xlsx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/protectedSheets.xlsx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/protectedSheets.xlsx
diff --git a/tika-parsers/src/test/resources/test-documents/test-columnar.xls
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-columnar.xls
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-columnar.xls
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-columnar.xls
diff --git a/tika-parsers/src/test/resources/test-documents/test-columnar.xlsb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-columnar.xlsb
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-columnar.xlsb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-columnar.xlsb
diff --git a/tika-parsers/src/test/resources/test-documents/test-columnar.xlsx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-columnar.xlsx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-columnar.xlsx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-columnar.xlsx
diff --git a/tika-parsers/src/test/resources/test-documents/test-outlook.msg
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-outlook.msg
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-outlook.msg
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-outlook.msg
diff --git
a/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-outlook2003.msg
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-outlook2003.msg
diff --git a/tika-parsers/src/test/resources/test-documents/test.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test.doc
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test.doc
diff --git a/tika-parsers/src/test/resources/test-documents/testACCESS.mdb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testACCESS.mdb
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testACCESS.mdb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testACCESS.mdb
diff --git a/tika-parsers/src/test/resources/test-documents/testAccess2.accdb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2.accdb
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testAccess2.accdb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2.accdb
diff --git
a/tika-parsers/src/test/resources/test-documents/testAccess2_2000.mdb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2_2000.mdb
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testAccess2_2000.mdb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2_2000.mdb
diff --git
a/tika-parsers/src/test/resources/test-documents/testAccess2_2002-2003.mdb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2_2002-2003.mdb
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testAccess2_2002-2003.mdb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2_2002-2003.mdb
diff --git
a/tika-parsers/src/test/resources/test-documents/testAccess2_encrypted.accdb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2_encrypted.accdb
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testAccess2_encrypted.accdb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess2_encrypted.accdb
diff --git
a/tika-parsers/src/test/resources/test-documents/testAccess_V1997.mdb
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess_V1997.mdb
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testAccess_V1997.mdb
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAccess_V1997.mdb
diff --git
a/tika-parsers/src/test/resources/test-documents/testBinControlWord.rtf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testBinControlWord.rtf
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testBinControlWord.rtf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testBinControlWord.rtf
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.doc
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.doc
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.docx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.docx
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.ppt
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.ppt
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.ppt
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.ppt
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.pptx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.pptx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.pptx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.pptx
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.rtf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.rtf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.rtf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.rtf
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.xls
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.xls
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.xls
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.xls
diff --git a/tika-parsers/src/test/resources/test-documents/testComment.xlsx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.xlsx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testComment.xlsx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testComment.xlsx
diff --git
a/tika-parsers/src/test/resources/test-documents/testControlCharacters.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testControlCharacters.doc
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testControlCharacters.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testControlCharacters.doc
diff --git
a/tika-parsers/src/test/resources/test-documents/testDOCX_Thumbnail.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDOCX_Thumbnail.docx
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testDOCX_Thumbnail.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDOCX_Thumbnail.docx
diff --git a/tika-parsers/src/test/resources/test-documents/testDOTM.dotm
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDOTM.dotm
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testDOTM.dotm
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDOTM.dotm
diff --git
a/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDocumentLink.doc
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDocumentLink.doc
diff --git a/tika-parsers/src/test/resources/test-documents/testEMF.emf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMF.emf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testEMF.emf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMF.emf
diff --git a/tika-parsers/src/test/resources/test-documents/testEMLX.emlx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMLX.emlx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testEMLX.emlx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEMLX.emlx
diff --git
a/tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEML_embedded_xhtml_and_img.eml
diff --git
a/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFontAfterBufferedText.rtf
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testFontAfterBufferedText.rtf
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote.one
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOneNote.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote.one
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote1.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote1.one
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOneNote1.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote1.one
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote2.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2.one
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOneNote2.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2.one
diff --git
a/tika-parsers/src/test/resources/test-documents/testOneNote2007OrEarlier1.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2007OrEarlier1.one
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testOneNote2007OrEarlier1.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2007OrEarlier1.one
diff --git
a/tika-parsers/src/test/resources/test-documents/testOneNote2007OrEarlier2.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2007OrEarlier2.one
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testOneNote2007OrEarlier2.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2007OrEarlier2.one
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote2016.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2016.one
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOneNote2016.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote2016.one
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote3.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote3.one
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOneNote3.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote3.one
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote4.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote4.one
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testOneNote4.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNote4.one
diff --git
a/tika-parsers/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one
diff --git a/tika-parsers/src/test/resources/test-documents/testPROJECT2003.mpp
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPROJECT2003.mpp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPROJECT2003.mpp
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPROJECT2003.mpp
diff --git a/tika-parsers/src/test/resources/test-documents/testPROJECT2007.mpp
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPROJECT2007.mpp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPROJECT2007.mpp
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPROJECT2007.mpp
diff --git a/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPUBLISHER.pub
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPUBLISHER.pub
diff --git a/tika-parsers/src/test/resources/test-documents/testWINMAIL.dat
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWINMAIL.dat
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testWINMAIL.dat
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWINMAIL.dat
diff --git a/tika-parsers/src/test/resources/test-documents/testWMF.wmf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWMF.wmf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testWMF.wmf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWMF.wmf
diff --git a/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWMF_charset.wmf
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWMF_charset.wmf
diff --git
a/tika-parsers/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORKSSpreadsheet7.0.xlr
diff --git a/tika-parsers/src/test/resources/test-documents/testWordArt.pptx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWordArt.pptx
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testWordArt.pptx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWordArt.pptx
diff --git
a/tika-parsers/src/test/resources/test-documents/testXLSX_Thumbnail.xlsx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXLSX_Thumbnail.xlsx
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/testXLSX_Thumbnail.xlsx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXLSX_Thumbnail.xlsx
diff --git a/tika-parsers/src/test/resources/test-documents/testXPS_various.xps
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPS_various.xps
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/testXPS_various.xps
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPS_various.xps
diff --git
a/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZIP_corrupted_oom.zip
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testZIP_corrupted_oom.zip
similarity index 100%
rename from
tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZIP_corrupted_oom.zip
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testZIP_corrupted_oom.zip
diff --git a/tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_TIKA-1251.doc
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test_TIKA-1251.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_TIKA-1251.doc
diff --git
a/tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_embedded_zip.pptx
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/test_embedded_zip.pptx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_embedded_zip.pptx
diff --git
a/tika-parsers/src/test/resources/test-documents/test_list_override.rtf
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_list_override.rtf
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/test_list_override.rtf
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_list_override.rtf
diff --git
a/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_recursive_embedded.doc
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_recursive_embedded.doc
diff --git
a/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_recursive_embedded.docx
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_recursive_embedded.docx
diff --git
a/tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_recursive_embedded_npe.docx
similarity index 100%
rename from
tika-parsers/src/test/resources/test-documents/test_recursive_embedded_npe.docx
rename to
tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_recursive_embedded_npe.docx
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e8d300c..dd6e25b 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -146,16 +146,6 @@
<version>1.5</version>
</dependency>
<dependency>
- <groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j-core</artifactId>
- <version>${mime4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j-dom</artifactId>
- <version>${mime4j.version}</version>
- </dependency>
- <dependency>
<groupId>com.googlecode.plist</groupId>
<artifactId>dd-plist</artifactId>
<version>1.23</version>
diff --git
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 64a8f8f..ebeb5d2 100644
---
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -42,18 +42,6 @@ org.apache.tika.parser.image.JpegParser
org.apache.tika.parser.mail.RFC822Parser
org.apache.tika.parser.mbox.MboxParser
org.apache.tika.parser.mbox.OutlookPSTParser
-org.apache.tika.parser.microsoft.EMFParser
-org.apache.tika.parser.microsoft.WMFParser
-org.apache.tika.parser.microsoft.JackcessParser
-org.apache.tika.parser.microsoft.MSOwnerFileParser
-org.apache.tika.parser.microsoft.OfficeParser
-org.apache.tika.parser.microsoft.OldExcelParser
-org.apache.tika.parser.microsoft.TNEFParser
-org.apache.tika.parser.microsoft.onenote.OneNoteParser
-org.apache.tika.parser.microsoft.ooxml.OOXMLParser
-org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
-org.apache.tika.parser.microsoft.xml.WordMLParser
-org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
org.apache.tika.parser.mp3.Mp3Parser
org.apache.tika.parser.mp4.MP4Parser
org.apache.tika.parser.hdf.HDFParser