TIKA-1855 -- first pass. Need to turn back on the forbidden-apis testCheck. More clean up remains.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/aa5f60d7 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/aa5f60d7 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/aa5f60d7 Branch: refs/heads/2.x Commit: aa5f60d7a0ac0a6a9d739344c76b10940132503f Parents: 41915dc Author: tballison <[email protected]> Authored: Mon Mar 21 21:18:00 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Mar 21 21:18:05 2016 -0400 ---------------------------------------------------------------------- pom.xml | 7 +- tika-app/pom.xml | 15 + .../batch/builders/AppParserFactoryBuilder.java | 2 +- .../main/java/org/apache/tika/cli/TikaCLI.java | 2 +- .../main/java/org/apache/tika/gui/TikaGUI.java | 2 +- .../tika/config/TikaDetectorConfigTest.java | 143 +++ .../tika/config/TikaParserConfigTest.java | 155 +++ .../tika/config/TikaTranslatorConfigTest.java | 73 ++ .../tika/detect/TestContainerAwareDetector.java | 410 +++++++ .../tika/embedder/ExternalEmbedderTest.java | 285 +++++ .../java/org/apache/tika/mime/MimeTypeTest.java | 108 ++ .../org/apache/tika/mime/MimeTypesTest.java | 122 ++ .../org/apache/tika/mime/TestMimeTypes.java | 1044 +++++++++++++++++ .../tika/parser/AutoDetectParserTest.java | 459 ++++++++ .../apache/tika/parser/DigestingParserTest.java | 139 +++ .../apache/tika/parser/ParsingReaderTest.java | 104 ++ .../tika/parser/RecursiveParserWrapperTest.java | 312 ++++++ .../org/apache/tika/parser/TestParsers.java | 133 +++ .../parser/fork/ForkParserIntegrationTest.java | 268 +++++ .../apache/tika/parser/mock/MockParserTest.java | 251 +++++ .../org/apache/tika/parser/pkg/PackageTest.java | 335 ++++++ .../sax/PhoneExtractingContentHandlerTest.java | 58 + .../tika/utils/ServiceLoaderUtilsTest.java | 57 + tika-core/pom.xml | 19 + .../tika/parser/digesting/CommonsDigester.java | 295 +++++ .../src/test/java/org/apache/tika/TikaTest.java | 74 +- .../tika/detect/MimeDetectionWithNNTest.java | 8 +- .../org/apache/tika/mime/MimeDetectionTest.java | 7 +- .../mime/ProbabilisticMimeDetectionTest.java | 7 +- .../ProbabilisticMimeDetectionTestWithTika.java | 7 +- .../java/org/apache/tika/osgi/BundleIT.java | 11 - .../GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb | Bin 1362900 -> 0 bytes .../org/apache/tika/mime/brwNIMS_2014.dif | 56 - .../apache/tika/mime/circles-with-prefix.svg | 8 - .../resources/org/apache/tika/mime/circles.svg | 8 - .../org/apache/tika/mime/datamatrix.png | Bin 204 -> 0 bytes .../tika/mime/gdas1.forecmwf.2014062612.grib2 | Bin 2489194 -> 0 bytes .../resources/org/apache/tika/mime/htmlfragment | 18 - .../apache/tika/mime/plotutils-bin-cgm-v3.cgm | Bin 1744 -> 0 bytes .../org/apache/tika/mime/stylesheet.xsl | 9 - .../apache/tika/mime/test-difficult-rdf1.xml | 39 - .../apache/tika/mime/test-difficult-rdf2.xml | 44 - .../org/apache/tika/mime/test-iso-8859-1.xml | 2 - .../org/apache/tika/mime/test-long-comment.xml | 21 - .../tika/mime/test-malformed-header.html.bin | Bin 305 -> 0 bytes .../org/apache/tika/mime/test-tika-327.html | 50 - .../org/apache/tika/mime/test-utf16be.xml | Bin 126 -> 0 bytes .../org/apache/tika/mime/test-utf16le.xml | Bin 126 -> 0 bytes .../org/apache/tika/mime/test-utf8-bom.xml | 2 - .../org/apache/tika/mime/test-utf8.xml | 2 - .../resources/org/apache/tika/mime/test.html | 10 - .../resources/org/apache/tika/mime/test.xls | Bin 13824 -> 0 bytes .../org/apache/tika/mime/testlargerbuffer.html | 827 -------------- tika-parent/pom.xml | 3 +- tika-parser-modules/pom.xml | 26 - .../tika/parser/ner/NamedEntityParserTest.java | 16 +- .../parser/ner/regex/RegexNERecogniserTest.java | 15 +- .../apache/tika/parser/ner/regex/ner-regex.txt | 17 + .../tika/parser/ner/tika-config-for-ner.xml | 27 + .../tika/parser/jdbc/SQLite3ParserTest.java | 50 +- .../tika/parser/chm/TestChmExtraction.java | 25 +- .../tika/parser/microsoft/ExcelParserTest.java | 387 +++---- .../apache/tika/parser/odf/ODFParserTest.java | 460 ++++---- .../apache/tika/parser/rtf/RTFParserTest.java | 163 +-- .../apache/tika/parser/pdf/PDFParserTest.java | 133 +-- .../tika/parser/isatab/ISArchiveParser.java | 3 +- .../apache/tika/parser/netcdf/NetCDFParser.java | 17 +- .../apache/tika/parser/dif/DIFParserTest.java | 31 +- .../tika/parser/envi/EnviHeaderParserTest.java | 36 +- .../apache/tika/parser/gdal/TestGDALParser.java | 34 +- .../tika/parser/geo/topic/GeoParserTest.java | 23 +- .../GeographicInformationParserTest.java | 50 +- .../apache/tika/parser/grib/GribParserTest.java | 30 +- .../apache/tika/parser/hdf/HDFParserTest.java | 44 +- .../tika/parser/isatab/ISArchiveParserTest.java | 80 +- .../apache/tika/parser/mat/MatParserTest.java | 60 +- .../tika/parser/netcdf/NetCDFParserTest.java | 48 +- .../tika/parser/strings/StringsParserTest.java | 23 +- .../tika/parser/txt/CharsetDetectorTest.java | 7 +- .../apache/tika/parser/txt/TXTParserTest.java | 51 +- .../apache/tika/parser/xml/DcXMLParserTest.java | 28 +- .../EmptyAndDuplicateElementsXMLParserTest.java | 60 +- .../tika/parser/xml/FictionBookParserTest.java | 19 +- tika-parsers/pom.xml | 333 ------ .../main/appended-resources/META-INF/LICENSE | 94 -- .../apache/tika/parser/internal/Activator.java | 54 - .../tika/parser/utils/CommonsDigester.java | 299 ----- .../test/java/org/apache/tika/TestParsers.java | 109 -- .../tika/config/TikaDetectorConfigTest.java | 143 --- .../tika/config/TikaParserConfigTest.java | 157 --- .../tika/config/TikaTranslatorConfigTest.java | 72 -- .../tika/detect/TestContainerAwareDetector.java | 410 ------- .../tika/embedder/ExternalEmbedderTest.java | 292 ----- .../java/org/apache/tika/mime/MimeTypeTest.java | 105 -- .../org/apache/tika/mime/MimeTypesTest.java | 122 -- .../org/apache/tika/mime/TestMimeTypes.java | 1047 ------------------ .../tika/parser/AutoDetectParserTest.java | 459 -------- .../apache/tika/parser/DigestingParserTest.java | 136 --- .../apache/tika/parser/ParsingReaderTest.java | 104 -- .../tika/parser/RecursiveParserWrapperTest.java | 312 ------ .../parser/fork/ForkParserIntegrationTest.java | 268 ----- .../apache/tika/parser/mock/MockParserTest.java | 251 ----- .../org/apache/tika/parser/pkg/PackageTest.java | 335 ------ .../sax/PhoneExtractingContentHandlerTest.java | 58 - .../tika/utils/ServiceLoaderUtilsTest.java | 57 - tika-server/pom.xml | 8 +- .../org/apache/tika/server/TikaServerCli.java | 2 +- .../org/apache/tika/server/CXFTestBase.java | 14 +- .../tika/server/DetectorResourceTest.java | 6 +- .../tika/server/LanguageResourceTest.java | 4 +- .../tika/server/MetadataResourceTest.java | 26 +- .../server/RecursiveMetadataResourceTest.java | 36 +- .../apache/tika/server/StackTraceOffTest.java | 8 +- .../org/apache/tika/server/StackTraceTest.java | 8 +- .../org/apache/tika/server/TikaParsersTest.java | 12 +- .../apache/tika/server/TikaResourceTest.java | 23 +- .../tika/server/UnpackerResourceTest.java | 20 +- tika-server/src/test/resources/2exe.docx | Bin 715333 -> 0 bytes tika-server/src/test/resources/2pic.doc | Bin 4339712 -> 0 bytes tika-server/src/test/resources/2pic.docx | Bin 883427 -> 0 bytes .../src/test/resources/CDEC_WEATHER_2010_03_02 | 98 -- tika-server/src/test/resources/Doc1_ole.doc | Bin 89600 -> 0 bytes tika-server/src/test/resources/english.txt | 1 - tika-server/src/test/resources/foo.csv | 4 - tika-server/src/test/resources/french.txt | 1 - .../test/resources/mime/custom-mimetypes.xml | 24 - .../src/test/resources/mock/null_pointer.xml | 25 - .../org/apache/tika/mime/custom-mimetypes.xml | 24 + tika-server/src/test/resources/password.xls | Bin 22528 -> 0 bytes tika-server/src/test/resources/pic.xls | Bin 593920 -> 0 bytes tika-server/src/test/resources/pic.xlsx | Bin 580188 -> 0 bytes tika-server/src/test/resources/test.doc | Bin 9216 -> 0 bytes .../testRTF_npeFromWMFInTikaServer.rtf | 235 ---- .../test/resources/test_recursive_embedded.docx | Bin 27082 -> 0 bytes tika-test-resources/pom.xml | 7 - .../apache/tika/parser/ner/regex/ner-regex.txt | 17 - .../org/apache/tika/parser/ner/tika-config.xml | 27 - .../src/test/resources/test-documents/2exe.docx | Bin 0 -> 715333 bytes .../src/test/resources/test-documents/2pic.doc | Bin 0 -> 4339712 bytes .../src/test/resources/test-documents/2pic.docx | Bin 0 -> 883427 bytes .../test-documents/CDEC_WEATHER_2010_03_02 | 98 ++ .../resources/test-documents/brwNIMS_2014.dif | 56 + .../test-documents/circles-with-prefix.svg | 8 + .../test/resources/test-documents/circles.svg | 8 + .../resources/test-documents/datamatrix.png | Bin 0 -> 204 bytes .../test/resources/test-documents/english.txt | 1 + .../src/test/resources/test-documents/foo.csv | 4 + .../test/resources/test-documents/french.txt | 1 + .../test/resources/test-documents/htmlfragment | 18 + .../test-documents/mock/null_pointer.xml | 4 +- .../test/resources/test-documents/password.xls | Bin 0 -> 22528 bytes .../src/test/resources/test-documents/pic.xls | Bin 0 -> 593920 bytes .../src/test/resources/test-documents/pic.xlsx | Bin 0 -> 580188 bytes .../test-documents/plotutils-bin-cgm-v3.cgm | Bin 0 -> 1744 bytes .../resources/test-documents/stylesheet.xsl | 9 + .../test-documents/test-difficult-rdf1.xml | 39 + .../test-documents/test-difficult-rdf2.xml | 44 + .../test-documents/test-iso-8859-1.xml | 2 + .../test-documents/test-long-comment.xml | 21 + .../resources/test-documents/test-tika-327.html | 50 + .../resources/test-documents/test-utf16be.xml | Bin 0 -> 126 bytes .../resources/test-documents/test-utf16le.xml | Bin 0 -> 126 bytes .../resources/test-documents/test-utf8-bom.xml | 2 + .../test/resources/test-documents/test-utf8.xml | 2 + .../src/test/resources/test-documents/test.html | 10 + .../src/test/resources/test-documents/test.xls | Bin 0 -> 13824 bytes .../testRTF_npeFromWMFInTikaServer.rtf | 235 ++++ .../test-documents/testlargerbuffer.html | 827 ++++++++++++++ 168 files changed, 7231 insertions(+), 8029 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index c790244..ea4f114 100644 --- a/pom.xml +++ b/pom.xml @@ -46,9 +46,10 @@ <modules> <module>tika-parent</module> - <module>tika-core</module> <module>tika-test-resources</module> - <module>tika-parsers</module> + <module>tika-core</module> + <module>tika-parser-modules</module> + <module>tika-parser-bundles</module> <module>tika-xmp</module> <module>tika-serialization</module> <module>tika-batch</module> @@ -59,8 +60,6 @@ <module>tika-langdetect</module> <module>tika-example</module> <module>tika-java7</module> - <module>tika-parser-modules</module> - <module>tika-parser-bundles</module> </modules> <profiles> http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/pom.xml ---------------------------------------------------------------------- diff --git a/tika-app/pom.xml b/tika-app/pom.xml index e362391..9177afb 100644 --- a/tika-app/pom.xml +++ b/tika-app/pom.xml @@ -101,6 +101,21 @@ <groupId>commons-io</groupId> <version>${commons.io.version}</version> </dependency> + <!-- test dependencies --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-test-resources</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java index 998f649..98f4343 100644 --- a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java +++ b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java @@ -23,7 +23,7 @@ import java.util.Map; import org.apache.tika.batch.DigestingAutoDetectParserFactory; import org.apache.tika.batch.ParserFactory; import org.apache.tika.parser.DigestingParser; -import org.apache.tika.parser.utils.CommonsDigester; +import org.apache.tika.parser.digesting.CommonsDigester; import org.apache.tika.util.ClassLoaderUtil; import org.apache.tika.util.XMLDOMUtil; import org.w3c.dom.Node; http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 314599e..a2b91c9 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -101,7 +101,7 @@ import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.html.BoilerpipeContentHandler; -import org.apache.tika.parser.utils.CommonsDigester; +import org.apache.tika.parser.digesting.CommonsDigester; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 5ecc763..1bc9405 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -76,7 +76,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.html.BoilerpipeContentHandler; -import org.apache.tika.parser.utils.CommonsDigester; +import org.apache.tika.parser.digesting.CommonsDigester; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java new file mode 100644 index 0000000..132475a --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.apache.tika.detect.CompositeDetector; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.detect.EmptyDetector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.mbox.OutlookPSTParser; +import org.apache.tika.parser.microsoft.POIFSContainerDetector; +import org.apache.tika.parser.pkg.ZipContainerDetector; +import org.junit.Test; + +/** + * Junit test class for {@link TikaConfig}, which cover things + * that {@link TikaConfigTest} can't do due to a need for the + * full set of detectors + */ +public class TikaDetectorConfigTest extends AbstractTikaConfigTest { + @Test + public void testDetectorExcludeFromDefault() throws Exception { + TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml"); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + CompositeDetector detector = (CompositeDetector)config.getDetector(); + + // Should be wrapping two detectors + assertEquals(2, detector.getDetectors().size()); + + + // First should be DefaultDetector, second Empty, that order + assertEquals(DefaultDetector.class, detector.getDetectors().get(0).getClass()); + assertEquals(EmptyDetector.class, detector.getDetectors().get(1).getClass()); + + + // Get the DefaultDetector from the config + DefaultDetector confDetector = (DefaultDetector)detector.getDetectors().get(0); + + // Get a fresh "default" DefaultParser + DefaultDetector normDetector = new DefaultDetector(config.getMimeRepository()); + + + // The default one will offer the Zip and POIFS detectors + assertDetectors(normDetector, true, true); + + + // The one from the config won't, as we excluded those + assertDetectors(confDetector, false, false); + } + + /** + * TIKA-1708 - If the Zip detector is disabled, either explicitly, + * or via giving a list of detectors that it isn't part of, ensure + * that detection of PST files still works + */ + @Test + public void testPSTDetectionWithoutZipDetector() throws Exception { + // Check the one with an exclude + TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml"); + assertNotNull(configWX.getParser()); + assertNotNull(configWX.getDetector()); + CompositeDetector detectorWX = (CompositeDetector)configWX.getDetector(); + + // Check it has the POIFS one, but not the zip one + assertDetectors(detectorWX, true, false); + + + // Check the one with an explicit list + TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml"); + assertNotNull(configCL.getParser()); + assertNotNull(configCL.getDetector()); + CompositeDetector detectorCL = (CompositeDetector)configCL.getDetector(); + assertEquals(2, detectorCL.getDetectors().size()); + + // Check it also has the POIFS one, but not the zip one + assertDetectors(detectorCL, true, false); + + + // Check that both detectors have a mimetypes with entries + assertTrue("Not enough mime types: " + configWX.getMediaTypeRegistry().getTypes().size(), + configWX.getMediaTypeRegistry().getTypes().size() > 100); + assertTrue("Not enough mime types: " + configCL.getMediaTypeRegistry().getTypes().size(), + configCL.getMediaTypeRegistry().getTypes().size() > 100); + + + // Now check they detect PST files correctly + TikaInputStream stream = TikaInputStream.cast( + getTestDocumentAsStream("testPST.pst")); + assertEquals( + OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, + detectorWX.detect(stream, new Metadata()) + ); + assertEquals( + OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, + detectorCL.detect(stream, new Metadata()) + ); + } + + private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS, + boolean shouldHaveZip) { + boolean hasZip = false; + boolean hasPOIFS = false; + for (Detector d : detector.getDetectors()) { + if (d instanceof ZipContainerDetector) { + if (shouldHaveZip) { + hasZip = true; + } else { + fail("Shouldn't have the ZipContainerDetector from config"); + } + } + if (d instanceof POIFSContainerDetector) { + if (shouldHavePOIFS) { + hasPOIFS = true; + } else { + fail("Shouldn't have the POIFSContainerDetector from config"); + } + } + } + if (shouldHavePOIFS) assertTrue("Should have the POIFSContainerDetector", hasPOIFS); + if (shouldHaveZip) assertTrue("Should have the ZipContainerDetector", hasZip); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java b/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java new file mode 100644 index 0000000..817beb4 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.List; + +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.apache.tika.parser.executable.ExecutableParser; +import org.apache.tika.parser.xml.XMLParser; +import org.junit.Test; + +/** + * Junit test class for {@link TikaConfig}, which cover things + * that {@link TikaConfigTest} can't do due to a need for the + * full set of parsers + */ +public class TikaParserConfigTest extends AbstractTikaConfigTest { + @Test + public void testMimeExcludeInclude() throws Exception { + TikaConfig config = getConfig("TIKA-1558-blacklist.xml"); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + Parser parser = config.getParser(); + + MediaType PDF = MediaType.application("pdf"); + MediaType JPEG = MediaType.image("jpeg"); + + + // Has two parsers + assertEquals(CompositeParser.class, parser.getClass()); + CompositeParser cParser = (CompositeParser)parser; + assertEquals(2, cParser.getAllComponentParsers().size()); + + // Both are decorated + assertTrue(cParser.getAllComponentParsers().get(0) instanceof ParserDecorator); + assertTrue(cParser.getAllComponentParsers().get(1) instanceof ParserDecorator); + ParserDecorator p0 = (ParserDecorator)cParser.getAllComponentParsers().get(0); + ParserDecorator p1 = (ParserDecorator)cParser.getAllComponentParsers().get(1); + + + // DefaultParser will be wrapped with excludes + assertEquals(DefaultParser.class, p0.getWrappedParser().getClass()); + + assertNotContained(PDF, p0.getSupportedTypes(context)); + assertContains(PDF, p0.getWrappedParser().getSupportedTypes(context)); + assertNotContained(JPEG, p0.getSupportedTypes(context)); + assertContains(JPEG, p0.getWrappedParser().getSupportedTypes(context)); + + + // Will have an empty parser for PDF + assertEquals(EmptyParser.class, p1.getWrappedParser().getClass()); + assertEquals(1, p1.getSupportedTypes(context).size()); + assertContains(PDF, p1.getSupportedTypes(context)); + assertNotContained(PDF, p1.getWrappedParser().getSupportedTypes(context)); + } + + @Test + public void testParserExcludeFromDefault() throws Exception { + TikaConfig config = getConfig("TIKA-1558-blacklist.xml"); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + CompositeParser parser = (CompositeParser)config.getParser(); + + MediaType PE_EXE = MediaType.application("x-msdownload"); + MediaType ELF = MediaType.application("x-elf"); + + + // Get the DefaultParser from the config + ParserDecorator confWrappedParser = (ParserDecorator)parser.getParsers().get(MediaType.APPLICATION_XML); + assertNotNull(confWrappedParser); + DefaultParser confParser = (DefaultParser)confWrappedParser.getWrappedParser(); + + // Get a fresh "default" DefaultParser + DefaultParser normParser = new DefaultParser(config.getMediaTypeRegistry()); + + + // The default one will offer the Executable Parser + assertContains(PE_EXE, normParser.getSupportedTypes(context)); + assertContains(ELF, normParser.getSupportedTypes(context)); + + boolean hasExec = false; + for (Parser p : normParser.getParsers().values()) { + if (p instanceof ExecutableParser) { + hasExec = true; + break; + } + } + assertTrue(hasExec); + + + // The one from the config won't + assertNotContained(PE_EXE, confParser.getSupportedTypes(context)); + assertNotContained(ELF, confParser.getSupportedTypes(context)); + + for (Parser p : confParser.getParsers().values()) { + if (p instanceof ExecutableParser) + fail("Shouldn't have the Executable Parser from config"); + } + } + /** + * TIKA-1558 It should be possible to exclude Parsers from being picked up by + * DefaultParser. + */ + @Test + public void defaultParserBlacklist() throws Exception { + TikaConfig config = new TikaConfig(); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + CompositeParser cp = (CompositeParser) config.getParser(); + List<Parser> parsers = cp.getAllComponentParsers(); + + boolean hasXML = false; + for (Parser p : parsers) { + if (p instanceof XMLParser) { + hasXML = true; + break; + } + } + assertTrue("Default config should include an XMLParser.", hasXML); + + // This custom TikaConfig should exclude XMLParser and all of its subclasses. + config = getConfig("TIKA-1558-blacklistsub.xml"); + cp = (CompositeParser) config.getParser(); + parsers = cp.getAllComponentParsers(); + + for (Parser p : parsers) { + if (p instanceof XMLParser) + fail("Custom config should not include an XMLParser (" + p.getClass() + ")."); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java b/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java new file mode 100644 index 0000000..764bbe4 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.language.translate.DefaultTranslator; +import org.apache.tika.language.translate.EmptyTranslator; +import org.junit.Test; + +/** + * Junit test class for {@link TikaConfig}, which cover things + * that {@link TikaConfigTest} can't do due to a need for the + * full set of translators + */ +public class TikaTranslatorConfigTest extends AbstractTikaConfigTest { + @Test + public void testDefaultBehaviour() throws Exception { + TikaConfig config = TikaConfig.getDefaultConfig(); + assertNotNull(config.getTranslator()); + assertEquals(DefaultTranslator.class, config.getTranslator().getClass()); + } + + @Test + public void testRequestsDefault() throws Exception { + TikaConfig config = getConfig("TIKA-1702-translator-default.xml"); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + assertNotNull(config.getTranslator()); + + assertEquals(DefaultTranslator.class, config.getTranslator().getClass()); + } + + @Test + public void testRequestsEmpty() throws Exception { + TikaConfig config = getConfig("TIKA-1702-translator-empty.xml"); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + assertNotNull(config.getTranslator()); + + assertEquals(EmptyTranslator.class, config.getTranslator().getClass()); + } + + /** + * Currently, Translators don't support Composites, so + * if multiple translators are given, only the first wins + */ + @Test + public void testRequestsMultiple() throws Exception { + TikaConfig config = getConfig("TIKA-1702-translator-empty-default.xml"); + assertNotNull(config.getParser()); + assertNotNull(config.getDetector()); + assertNotNull(config.getTranslator()); + + assertEquals(EmptyTranslator.class, config.getTranslator().getClass()); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java new file mode 100644 index 0000000..5787408 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -0,0 +1,410 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypes; +import org.junit.Test; + +/** + * Junit test class for {@link ContainerAwareDetector} + */ +public class TestContainerAwareDetector { + private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + private final MimeTypes mimeTypes = tikaConfig.getMimeRepository(); + private final Detector detector = new DefaultDetector(mimeTypes); + + private void assertTypeByData(String file, String type) throws Exception { + assertTypeByNameAndData(file, null, type); + } + private void assertTypeByNameAndData(String file, String type) throws Exception { + assertTypeByNameAndData(file, file, type); + } + private void assertType(String file, String byData, String byNameAndData) throws Exception { + assertTypeByData(file, byData); + assertTypeByNameAndData(file, byNameAndData); + } + private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception { + assertTypeByNameAndData(dataFile, name, type, null); + } + private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector, String typeFromMagic) throws Exception { + try (TikaInputStream stream = TikaInputStream.get( + TestContainerAwareDetector.class.getResource("/test-documents/" + dataFile))) { + Metadata m = new Metadata(); + if (name != null) + m.add(Metadata.RESOURCE_NAME_KEY, name); + + // Mime Magic version is likely to be less precise + if (typeFromMagic != null) { + assertEquals( + MediaType.parse(typeFromMagic), + mimeTypes.detect(stream, m)); + } + + // All being well, the detector should get it perfect + assertEquals( + MediaType.parse(typeFromDetector), + detector.detect(stream, m)); + } + } + + @Test + public void testDetectOLE2() throws Exception { + // Microsoft office types known by POI + assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel"); + assertTypeByData("testWORD.doc", "application/msword"); + assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint"); + + assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook"); + assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook"); + assertTypeByData("testVISIO.vsd", "application/vnd.visio"); + assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher"); + assertTypeByData("testWORKS.wps", "application/vnd.ms-works"); + assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works"); + + // older Works Word Processor files can't be recognized + // they were created with Works Word Processor 7.0 (hence the text inside) + // and exported to the older formats with the "Save As" feature + assertTypeByData("testWORKSWordProcessor3.0.wps","application/vnd.ms-works"); + assertTypeByData("testWORKSWordProcessor4.0.wps","application/vnd.ms-works"); + assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet"); + assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project"); + assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project"); + + // Excel95 can be detected by not parsed + assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel"); + + // Try some ones that POI doesn't handle, that are still OLE2 based + assertTypeByData("testCOREL.shw", "application/x-corelpresentations"); + assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro"); + assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro"); + + assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5"); + + + // With the filename and data + assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel"); + assertTypeByNameAndData("testWORD.doc", "application/msword"); + assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint"); + + // With the wrong filename supplied, data will trump filename + assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel"); + assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword"); + assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint"); + + // With a filename of a totally different type, data will trump filename + assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel"); + assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel"); + } + + /** + * There is no way to distinguish "proper" StarOffice files from templates. + * All templates have the same extension but their actual type depends on + * the magic. Our current MimeTypes class doesn't allow us to use the same + * glob pattern in more than one mimetype. + * + * @throws Exception + */ + @Test + public void testDetectStarOfficeFiles() throws Exception { + assertType("testStarOffice-5.2-calc.sdc", + "application/vnd.stardivision.calc", + "application/vnd.stardivision.calc"); + assertType("testVORCalcTemplate.vor", + "application/vnd.stardivision.calc", + "application/vnd.stardivision.calc"); + assertType("testStarOffice-5.2-draw.sda", + "application/vnd.stardivision.draw", + "application/vnd.stardivision.draw"); + assertType("testVORDrawTemplate.vor", + "application/vnd.stardivision.draw", + "application/vnd.stardivision.draw"); + assertType("testStarOffice-5.2-impress.sdd", + "application/vnd.stardivision.impress", + "application/vnd.stardivision.impress"); + assertType("testVORImpressTemplate.vor", + "application/vnd.stardivision.impress", + "application/vnd.stardivision.impress"); + assertType("testStarOffice-5.2-writer.sdw", + "application/vnd.stardivision.writer", + "application/vnd.stardivision.writer"); + assertType("testVORWriterTemplate.vor", + "application/vnd.stardivision.writer", + "application/vnd.stardivision.writer"); + + } + + @Test + public void testOpenContainer() throws Exception { + try (TikaInputStream stream = TikaInputStream.get( + TestContainerAwareDetector.class.getResource("/test-documents/testPPT.ppt"))) { + assertNull(stream.getOpenContainer()); + assertEquals( + MediaType.parse("application/vnd.ms-powerpoint"), + detector.detect(stream, new Metadata())); + assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem); + } + } + + /** + * EPub uses a similar mimetype entry to OpenDocument for storing + * the mimetype within the parent zip file + */ + @Test + public void testDetectEPub() throws Exception { + assertTypeByData("testEPUB.epub", "application/epub+zip"); + assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip"); + } + + @Test + public void testDetectLotusNotesEml() throws Exception { + // Lotus .eml files aren't guaranteed to have any of the magic + // matches as the first line, but should have X-Notes-Item and Message-ID + assertTypeByData("testLotusEml.eml", "message/rfc822"); + } + + @Test + public void testDetectODF() throws Exception { + assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text"); + assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula"); + } + + @Test + public void testDetectOOXML() throws Exception { + assertTypeByData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + assertTypeByData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + + // Check some of the less common OOXML types + assertTypeByData("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12"); + assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow"); + assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"); + assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12"); + assertTypeByData("testEXCEL.strict.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument"); + + assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12"); + assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing"); + assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12"); + assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil"); + assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12"); + assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template"); + + // .xlsb is an OOXML file containing the binary parts, and not + // an OLE2 file as you might initially expect! + assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"); + + // With the filename and data + assertTypeByNameAndData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByNameAndData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + + // With the wrong filename supplied, data will trump filename + assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + + // With an incorrect filename of a different container type, data trumps filename + assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + } + + /** + * Password Protected OLE2 files are fairly straightforward to detect, as they + * have the same structure as regular OLE2 files. (Core streams may be encrypted + * however) + */ + @Test + public void testDetectProtectedOLE2() throws Exception { + assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel"); + assertTypeByData("testWORD_protected_passtika.doc", "application/msword"); + assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint"); + assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel"); + assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword"); + assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint"); + } + + /** + * Password Protected OOXML files are much more tricky beasts to work with. + * They have a very different structure to regular OOXML files, and instead + * of being ZIP based they are actually an OLE2 file which contains the + * OOXML structure within an encrypted stream. + * This makes detecting them much harder... + */ + @Test + public void testDetectProtectedOOXML() throws Exception { + // Encrypted Microsoft Office OOXML files have OLE magic but + // special streams, so we can tell they're Protected OOXML + assertTypeByData("testEXCEL_protected_passtika.xlsx", + "application/x-tika-ooxml-protected"); + assertTypeByData("testWORD_protected_passtika.docx", + "application/x-tika-ooxml-protected"); + assertTypeByData("testPPT_protected_passtika.pptx", + "application/x-tika-ooxml-protected"); + + // At the moment, we can't use the name to specialise + // See discussions on TIKA-790 for details + assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx", + "application/x-tika-ooxml-protected"); + assertTypeByNameAndData("testWORD_protected_passtika.docx", + "application/x-tika-ooxml-protected"); + assertTypeByNameAndData("testPPT_protected_passtika.pptx", + "application/x-tika-ooxml-protected"); + } + + /** + * Check that temporary files created by Tika are removed after + * closing TikaInputStream. + */ + @Test + public void testRemovalTempfiles() throws Exception { + assertRemovalTempfiles("testWORD.docx"); + assertRemovalTempfiles("test-documents.zip"); + } + + private int countTemporaryFiles() { + return new File(System.getProperty("java.io.tmpdir")).listFiles( + new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.startsWith("apache-tika-"); + } + }).length; + } + + private void assertRemovalTempfiles(String fileName) throws Exception { + int numberOfTempFiles = countTemporaryFiles(); + + try (TikaInputStream stream = TikaInputStream.get( + TestContainerAwareDetector.class.getResource("/test-documents/" + fileName))) { + detector.detect(stream, new Metadata()); + } + + assertEquals(numberOfTempFiles, countTemporaryFiles()); + } + + @Test + public void testDetectIWork() throws Exception { + assertTypeByData("testKeynote.key", "application/vnd.apple.keynote"); + assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers"); + assertTypeByData("testPages.pages", "application/vnd.apple.pages"); + } + + @Test + public void testDetectKMZ() throws Exception { + assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz"); + } + + @Test + public void testDetectIPA() throws Exception { + assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa"); + assertTypeByData("testIPA.ipa", "application/x-itunes-ipa"); + } + + @Test + public void testASiC() throws Exception { + assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip"); + assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip"); + assertTypeByNameAndData("testASiCE.asice", "application/vnd.etsi.asic-e+zip"); + assertTypeByNameAndData("testASiCS.asics", "application/vnd.etsi.asic-s+zip"); + } + + @Test + public void testDetectZip() throws Exception { + assertTypeByData("test-documents.zip", "application/zip"); + assertTypeByData("test-zip-of-zip.zip", "application/zip"); + + // JAR based formats + assertTypeByData("testJAR.jar", "application/java-archive"); + assertTypeByData("testWAR.war", "application/x-tika-java-web-archive"); + assertTypeByData("testEAR.ear", "application/x-tika-java-enterprise-archive"); + assertTypeByData("testAPK.apk", "application/vnd.android.package-archive"); + + // JAR with HTML files in it + assertTypeByNameAndData("testJAR_with_HTML.jar", "testJAR_with_HTML.jar", + "application/java-archive", "application/java-archive"); + } + + private TikaInputStream getTruncatedFile(String name, int n) + throws IOException { + try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream( + "/test-documents/" + name)) { + byte[] bytes = new byte[n]; + int m = 0; + while (m < bytes.length) { + int i = input.read(bytes, m, bytes.length - m); + if (i != -1) { + m += i; + } else { + throw new IOException("Unexpected end of stream"); + } + } + return TikaInputStream.get(bytes); + } + } + + @Test + public void testTruncatedFiles() throws Exception { + // First up a truncated OOXML (zip) file + + // With only the data supplied, the best we can do is the container + Metadata m = new Metadata(); + try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) { + assertEquals( + MediaType.application("x-tika-ooxml"), + detector.detect(xlsx, m)); + } + + // With truncated data + filename, we can use the filename to specialise + m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx"); + try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) { + assertEquals( + MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), + detector.detect(xlsx, m)); + } + + // Now a truncated OLE2 file + m = new Metadata(); + try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) { + assertEquals( + MediaType.application("x-tika-msoffice"), + detector.detect(xls, m)); + } + + // Finally a truncated OLE2 file, with a filename available + m = new Metadata(); + m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls"); + try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) { + assertEquals( + MediaType.application("vnd.ms-excel"), + detector.detect(xls, m)); + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java b/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java new file mode 100644 index 0000000..45f68cc --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.embedder; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.txt.TXTParser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Unit test for {@link ExternalEmbedder}s. + */ +public class ExternalEmbedderTest extends TikaTest { + + static Path TMP_TEST_TXT; + protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER = + new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT); + protected static final String DEFAULT_CHARSET = UTF_8.name(); + private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description"; + private static final String TEST_TXT_PATH = "test-documents/testTXT.txt"; + + private TemporaryResources tmp = new TemporaryResources(); + + @BeforeClass + public static void copyTestFile() throws Exception { + TMP_TEST_TXT = Files.createTempFile("tika-test", ""); + Files.copy(TikaTest.class.getClassLoader().getResourceAsStream(TEST_TXT_PATH), + TMP_TEST_TXT, StandardCopyOption.REPLACE_EXISTING); + } + + @AfterClass + public static void rmTestFile() throws Exception { + Files.delete(TMP_TEST_TXT); + } + + /** + * Gets the expected returned metadata value for the given field + * + * @param fieldName + * @return a prefix added to the field name + */ + protected String getExpectedMetadataValueString(String fieldName, Date timestamp) { + return this.getClass().getSimpleName() + " embedded " + fieldName + + " on " + EXPECTED_METADATA_DATE_FORMATTER.format(timestamp); + } + + /** + * Gets the tika <code>Metadata</code> object containing data to be + * embedded. + * + * @return the populated tika metadata object + */ + protected Metadata getMetadataToEmbed(Date timestamp) { + Metadata metadata = new Metadata(); + metadata.add(TikaCoreProperties.DESCRIPTION, + getExpectedMetadataValueString(TikaCoreProperties.DESCRIPTION.toString(), timestamp)); + return metadata; + } + + /** + * Gets the <code>Embedder</code> to test. + * + * @return the embedder under test + */ + protected Embedder getEmbedder() { + ExternalEmbedder embedder = new ExternalEmbedder(); + Map<Property, String[]> metadataCommandArguments = new HashMap<Property, String[]>(1); + metadataCommandArguments.put(TikaCoreProperties.DESCRIPTION, + new String[] { COMMAND_METADATA_ARGUMENT_DESCRIPTION }); + embedder.setMetadataCommandArguments(metadataCommandArguments); + return embedder; + } + + /** + * Gets the source input stream through standard Java resource loaders + * before metadata has been embedded. + * + * @return a fresh input stream + */ + protected InputStream getSourceStandardInputStream() { + return this.getClass().getResourceAsStream(TEST_TXT_PATH); + } + + /** + * Gets the source input stream via {@link TikaInputStream} + * before metadata has been embedded. + * + * @return a fresh input stream + * @throws FileNotFoundException + */ + protected InputStream getSourceTikaInputStream() throws IOException { + return TikaInputStream.get(TMP_TEST_TXT); + } + + /** + * Gets the parser to use to verify the result of the embed operation. + * + * @return the parser to read embedded metadata + */ + protected Parser getParser() { + return new TXTParser(); + } + + /** + * Whether or not the final result of reading the now embedded metadata is + * expected in the output of the external tool + * + * @return whether or not results are expected in command line output + */ + protected boolean getIsMetadataExpectedInOutput() { + return true; + } + + /** + * Tests embedding metadata then reading metadata to verify the results. + * + * @param isResultExpectedInOutput whether or not results are expected in command line output + */ + protected void embedInTempFile(InputStream sourceInputStream, boolean isResultExpectedInOutput) { + Embedder embedder = getEmbedder(); + + // TODO Move this check to ExternalEmbedder + String os = System.getProperty("os.name", ""); + if (os.contains("Windows")) { + // Skip test on Windows + return; + } + + Date timestamp = new Date(); + Metadata metadataToEmbed = getMetadataToEmbed(timestamp); + + try { + File tempOutputFile = tmp.createTemporaryFile(); + FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile); + + // Embed the metadata into a copy of the original output stream + embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null); + + ParseContext context = new ParseContext(); + Parser parser = getParser(); + context.set(Parser.class, parser); + + // Setup the extracting content handler + ByteArrayOutputStream result = new ByteArrayOutputStream(); + OutputStreamWriter outputWriter = new OutputStreamWriter(result,DEFAULT_CHARSET); + ContentHandler handler = new BodyContentHandler(outputWriter); + + // Create a new metadata object to read the new metadata into + Metadata embeddedMetadata = new Metadata(); + + // Setup a re-read of the now embeded temp file + FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile); + + parser.parse(embeddedFileInputStream, handler, embeddedMetadata, + context); + + tmp.dispose(); + + String outputString = null; + if (isResultExpectedInOutput) { + outputString = result.toString(DEFAULT_CHARSET); + } else { + assertTrue("no metadata found", embeddedMetadata.size() > 0); + } + + // Check each metadata property for the expected value + for (String metadataName : metadataToEmbed.names()) { + if (metadataToEmbed.get(metadataName) != null) { + String expectedValue = metadataToEmbed.get(metadataName); + boolean foundExpectedValue = false; + if (isResultExpectedInOutput) { + // just check that the entire output contains the expected string + foundExpectedValue = outputString.contains(expectedValue); + } else { + if (embeddedMetadata.isMultiValued(metadataName)) { + for (String embeddedValue : embeddedMetadata.getValues(metadataName)) { + if (embeddedValue != null) { + if (embeddedValue.contains(expectedValue)) { + foundExpectedValue = true; + break; + } + } + } + } else { + String embeddedValue = embeddedMetadata.get(metadataName); + assertNotNull("expected metadata for " + + metadataName + " not found", + embeddedValue); + foundExpectedValue = embeddedValue.contains(expectedValue); + } + } + assertTrue( + "result did not contain expected appended metadata " + + metadataName + "=" + + expectedValue, + foundExpectedValue); + } + } + } catch (IOException e) { + fail(e.getMessage()); + } catch (TikaException e) { + fail(e.getMessage()); + } catch (SAXException e) { + fail(e.getMessage()); + } + } + + protected void checkSourceFileExists() { + String message = "the original input file was deleted"; + assertNotNull(message, TMP_TEST_TXT); + assertTrue(message, Files.isRegularFile(TMP_TEST_TXT)); + } + + /** + * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceStandardInputStream()} + * + * @throws IOException + */ + @Test + public void testEmbedStandardInputStream() throws IOException { + embedInTempFile(getSourceStandardInputStream(), getIsMetadataExpectedInOutput()); + checkSourceFileExists(); + } + + /** + * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceTikaInputStream()} + * + * @throws IOException + */ + @Test + public void testEmbedTikaInputStream() throws IOException { + embedInTempFile(getSourceTikaInputStream(), getIsMetadataExpectedInOutput()); + checkSourceFileExists(); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java b/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java new file mode 100644 index 0000000..447042b --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.mime; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.junit.Before; +import org.junit.Test; + +public class MimeTypeTest { + + private MimeTypes types; + private MimeType text; + + @Before + public void setUp() throws MimeTypeException { + types = new MimeTypes(); + text = types.forName("text/plain"); + } + + /** Test MimeType constructor */ + @Test + public void testConstrctor() { + // Missing name + try { + new MimeType(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + } + + @Test + public void testIsValidName() { + assertTrue(MimeType.isValid("application/octet-stream")); + assertTrue(MimeType.isValid("text/plain")); + assertTrue(MimeType.isValid("foo/bar")); + assertTrue(MimeType.isValid("a/b")); + + assertFalse(MimeType.isValid("application")); + assertFalse(MimeType.isValid("application/")); + assertFalse(MimeType.isValid("/")); + assertFalse(MimeType.isValid("/octet-stream")); + assertFalse(MimeType.isValid("application//octet-stream")); + assertFalse(MimeType.isValid("application/octet=stream")); + assertFalse(MimeType.isValid("application/\u00f6ctet-stream")); + assertFalse(MimeType.isValid("text/plain;")); + assertFalse(MimeType.isValid("text/plain; charset=UTF-8")); + try { + MimeType.isValid(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + } + + /** Test MimeType setDescription() */ + @Test + public void testSetEmptyValues() { + try { + text.setDescription(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + + try { + text.setAcronym(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + + try { + text.addLink(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + + try { + text.setUniformTypeIdentifier(null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expected result + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java b/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java new file mode 100644 index 0000000..be8a575 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.mime; + +import static org.apache.tika.mime.MediaType.OCTET_STREAM; +import static org.apache.tika.mime.MediaType.TEXT_PLAIN; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Before; +import org.junit.Test; + +public class MimeTypesTest { + + private MimeTypes types; + + private MediaTypeRegistry registry; + + private MimeType binary; + + private MimeType text; + + private MimeType html; + + @Before + public void setUp() throws MimeTypeException { + types = new MimeTypes(); + registry = types.getMediaTypeRegistry(); + binary = types.forName("application/octet-stream"); + text = types.forName("text/plain"); + types.addAlias(text, MediaType.parse("text/x-plain")); + html = types.forName("text/html"); + types.setSuperType(html, TEXT_PLAIN); + } + + @Test + public void testForName() throws MimeTypeException { + assertEquals(text, types.forName("text/plain")); + assertEquals(text, types.forName("TEXT/PLAIN")); + + try { + types.forName("invalid"); + fail("MimeTypeException not thrown on invalid type name"); + } catch (MimeTypeException e) { + // expected + } + } + + @Test + public void testRegisteredMimes() throws MimeTypeException { + String dummy = "text/xxxxx"; + assertEquals(text, types.getRegisteredMimeType("text/plain")); + assertNull(types.getRegisteredMimeType(dummy)); + assertNotNull(types.forName(dummy)); + assertEquals(dummy, types.forName("text/xxxxx").getType().toString()); + assertEquals(dummy, types.getRegisteredMimeType("text/xxxxx").getType().toString()); + + try { + types.forName("invalid"); + fail("MimeTypeException not thrown on invalid type name"); + } catch (MimeTypeException e) { + // expected + } + } + + @Test + public void testSuperType() throws MimeTypeException { + assertNull(registry.getSupertype(OCTET_STREAM)); + assertEquals(OCTET_STREAM, registry.getSupertype(TEXT_PLAIN)); + assertEquals(TEXT_PLAIN, registry.getSupertype(html.getType())); + } + + @Test + public void testIsDescendantOf() { + assertFalse(registry.isSpecializationOf(OCTET_STREAM, OCTET_STREAM)); + assertFalse(registry.isSpecializationOf(TEXT_PLAIN, TEXT_PLAIN)); + assertFalse(registry.isSpecializationOf(html.getType(), html.getType())); + + assertTrue(registry.isSpecializationOf(html.getType(), OCTET_STREAM)); + assertFalse(registry.isSpecializationOf(OCTET_STREAM, html.getType())); + + assertTrue(registry.isSpecializationOf(html.getType(), TEXT_PLAIN)); + assertFalse(registry.isSpecializationOf(TEXT_PLAIN, html.getType())); + + assertTrue(registry.isSpecializationOf(TEXT_PLAIN, OCTET_STREAM)); + assertFalse(registry.isSpecializationOf(OCTET_STREAM, TEXT_PLAIN)); + } + + @Test + public void testCompareTo() { + assertTrue(binary.compareTo(binary) == 0); + assertTrue(binary.compareTo(text) != 0); + assertTrue(binary.compareTo(html) != 0); + + assertTrue(text.compareTo(binary) != 0); + assertTrue(text.compareTo(text) == 0); + assertTrue(text.compareTo(html) != 0); + + assertTrue(html.compareTo(binary) != 0); + assertTrue(html.compareTo(text) != 0); + assertTrue(html.compareTo(html) == 0); + } + +}
