http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/main/appended-resources/META-INF/LICENSE ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/appended-resources/META-INF/LICENSE b/tika-parsers/src/main/appended-resources/META-INF/LICENSE deleted file mode 100644 index bd54624..0000000 --- a/tika-parsers/src/main/appended-resources/META-INF/LICENSE +++ /dev/null @@ -1,94 +0,0 @@ -APACHE TIKA SUBCOMPONENTS - -Apache Tika includes a number of subcomponents with separate copyright notices -and license terms. Your use of these subcomponents is subject to the terms and -conditions of the following licenses. - -Charset detection code from ICU4J (http://site.icu-project.org/) - - Copyright (c) 1995-2009 International Business Machines Corporation - and others - - All rights reserved. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, and/or sell copies of the Software, and to permit persons - to whom the Software is furnished to do so, provided that the above - copyright notice(s) and this permission notice appear in all copies - of the Software and that both the above copyright notice(s) and this - permission notice appear in supporting documentation. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. - IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE - BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, - OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, - ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - SOFTWARE. - - Except as contained in this notice, the name of a copyright holder shall - not be used in advertising or otherwise to promote the sale, use or other - dealings in this Software without prior written authorization of the - copyright holder. - - -JUnRAR (https://github.com/edmund-wagner/junrar/) - - JUnRAR is based on the UnRAR tool, and covered by the same license - It was formerly available from http://java-unrar.svn.sourceforge.net/ - - ****** ***** ****** UnRAR - free utility for RAR archives - ** ** ** ** ** ** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ****** ******* ****** License for use and distribution of - ** ** ** ** ** ** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ** ** ** ** ** ** FREE portable version - ~~~~~~~~~~~~~~~~~~~~~ - - The source code of UnRAR utility is freeware. This means: - - 1. All copyrights to RAR and the utility UnRAR are exclusively - owned by the author - Alexander Roshal. - - 2. The UnRAR sources may be used in any software to handle RAR - archives without limitations free of charge, but cannot be used - to re-create the RAR compression algorithm, which is proprietary. - Distribution of modified UnRAR sources in separate form or as a - part of other software is permitted, provided that it is clearly - stated in the documentation and source comments that the code may - not be used to develop a RAR (WinRAR) compatible archiver. - - 3. The UnRAR utility may be freely distributed. It is allowed - to distribute UnRAR inside of other software packages. - - 4. THE RAR ARCHIVER AND THE UnRAR UTILITY ARE DISTRIBUTED "AS IS". - NO WARRANTY OF ANY KIND IS EXPRESSED OR IMPLIED. YOU USE AT - YOUR OWN RISK. THE AUTHOR WILL NOT BE LIABLE FOR DATA LOSS, - DAMAGES, LOSS OF PROFITS OR ANY OTHER KIND OF LOSS WHILE USING - OR MISUSING THIS SOFTWARE. - - 5. Installing and using the UnRAR utility signifies acceptance of - these terms and conditions of the license. - - 6. If you don't agree with terms of the license you must remove - UnRAR files from your storage devices and cease to use the - utility. - - Thank you for your interest in RAR and UnRAR. Alexander L. Roshal - -Sqlite (included in the "provided" org.xerial's sqlite-jdbc) - Sqlite is in the Public Domain. For details - see: https://www.sqlite.org/copyright.html - -Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp) - are in the public domain. These files were retrieved from: - https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp - These photos are also available here: - https://developers.google.com/speed/webp/gallery2#webp_links - Credits for the photo: - "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers" - Image Author: Jon Sullivan
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java b/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java deleted file mode 100644 index a884d3a..0000000 --- a/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.internal; - -import java.util.Properties; - -import org.apache.tika.detect.DefaultDetector; -import org.apache.tika.detect.Detector; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.Parser; -import org.osgi.framework.BundleActivator; -import org.osgi.framework.BundleContext; -import org.osgi.framework.ServiceRegistration; - -public class Activator implements BundleActivator { - - private ServiceRegistration detectorService; - - private ServiceRegistration parserService; - - @Override - public void start(BundleContext context) throws Exception { - detectorService = context.registerService( - Detector.class.getName(), - new DefaultDetector(Activator.class.getClassLoader()), - new Properties()); - Parser parser = new DefaultParser(Activator.class.getClassLoader()); - parserService = context.registerService( - Parser.class.getName(), - parser, - new Properties()); - } - - @Override - public void stop(BundleContext context) throws Exception { - parserService.unregister(); - detectorService.unregister(); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java deleted file mode 100644 index a064156..0000000 --- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java +++ /dev/null @@ -1,299 +0,0 @@ -package org.apache.tika.parser.utils; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Locale; - -import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.io.IOUtils; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.DigestingParser; -import org.apache.tika.parser.ParseContext; - -/** - * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester} - * that relies on commons.codec.digest.DigestUtils to calculate digest hashes. - * <p> - * This digester tries to use the regular mark/reset protocol on the InputStream. - * However, this wraps an internal BoundedInputStream, and if the InputStream - * is not fully read, then this will reset the stream and - * spool the InputStream to disk (via TikaInputStream) and then digest the file. - * <p> - * If a TikaInputStream is passed in and it has an underlying file that is longer - * than the {@link #markLimit}, then this digester digests the file directly. - * - */ -public class CommonsDigester implements DigestingParser.Digester { - - public enum DigestAlgorithm { - //those currently available in commons.digest - MD2, - MD5, - SHA1, - SHA256, - SHA384, - SHA512; - - String getMetadataKey() { - return TikaCoreProperties.TIKA_META_PREFIX+ - "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString(); - } - } - - private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>(); - private final int markLimit; - - public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) { - Collections.addAll(this.algorithms, algorithms); - if (markLimit < 0) { - throw new IllegalArgumentException("markLimit must be >= 0"); - } - this.markLimit = markLimit; - } - - @Override - public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException { - InputStream tis = TikaInputStream.get(is); - long sz = -1; - if (((TikaInputStream)tis).hasFile()) { - sz = ((TikaInputStream)tis).getLength(); - } - //if the file is definitely a file, - //and its size is greater than its mark limit, - //just digest the underlying file. - if (sz > markLimit) { - digestFile(((TikaInputStream)tis).getFile(), m); - return; - } - - //try the usual mark/reset stuff. - //however, if you actually hit the bound, - //then stop and spool to file via TikaInputStream - SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis); - boolean finishedStream = false; - for (DigestAlgorithm algorithm : algorithms) { - bis.mark(markLimit + 1); - finishedStream = digestEach(algorithm, bis, m); - bis.reset(); - if (!finishedStream) { - break; - } - } - if (!finishedStream) { - digestFile(((TikaInputStream)tis).getFile(), m); - } - } - - private void digestFile(File f, Metadata m) throws IOException { - for (DigestAlgorithm algorithm : algorithms) { - InputStream is = new FileInputStream(f); - try { - digestEach(algorithm, is, m); - } finally { - IOUtils.closeQuietly(is); - } - } - } - - /** - * - * @param algorithm algo to use - * @param is input stream to read from - * @param metadata metadata for reporting the digest - * @return whether or not this finished the input stream - * @throws IOException - */ - private boolean digestEach(DigestAlgorithm algorithm, - InputStream is, Metadata metadata) throws IOException { - String digest = null; - try { - switch (algorithm) { - case MD2: - digest = DigestUtils.md2Hex(is); - break; - case MD5: - digest = DigestUtils.md5Hex(is); - break; - case SHA1: - digest = DigestUtils.sha1Hex(is); - break; - case SHA256: - digest = DigestUtils.sha256Hex(is); - break; - case SHA384: - digest = DigestUtils.sha384Hex(is); - break; - case SHA512: - digest = DigestUtils.sha512Hex(is); - break; - default: - throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString()); - } - } catch (IOException e) { - e.printStackTrace(); - //swallow, or should we throw this? - } - if (is instanceof SimpleBoundedInputStream) { - if (((SimpleBoundedInputStream)is).hasHitBound()) { - return false; - } - } - metadata.set(algorithm.getMetadataKey(), digest); - return true; - } - - /** - * - * @param s comma-delimited (no space) list of algorithms to use: md5,sha256 - * @return - */ - public static DigestAlgorithm[] parse(String s) { - assert(s != null); - - List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>(); - for (String algoString : s.split(",")) { - String uc = algoString.toUpperCase(Locale.ROOT); - if (uc.equals(DigestAlgorithm.MD2.toString())) { - ret.add(DigestAlgorithm.MD2); - } else if (uc.equals(DigestAlgorithm.MD5.toString())) { - ret.add(DigestAlgorithm.MD5); - } else if (uc.equals(DigestAlgorithm.SHA1.toString())) { - ret.add(DigestAlgorithm.SHA1); - } else if (uc.equals(DigestAlgorithm.SHA256.toString())) { - ret.add(DigestAlgorithm.SHA256); - } else if (uc.equals(DigestAlgorithm.SHA384.toString())) { - ret.add(DigestAlgorithm.SHA384); - } else if (uc.equals(DigestAlgorithm.SHA512.toString())) { - ret.add(DigestAlgorithm.SHA512); - } else { - StringBuilder sb = new StringBuilder(); - int i = 0; - for (DigestAlgorithm algo : DigestAlgorithm.values()) { - if (i++ > 0) { - sb.append(", "); - } - sb.append(algo.toString()); - } - throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString()); - } - } - return ret.toArray(new DigestAlgorithm[ret.size()]); - } - - /** - * Very slight modification of Commons' BoundedInputStream - * so that we can figure out if this hit the bound or not. - */ - private class SimpleBoundedInputStream extends InputStream { - private final static int EOF = -1; - private final long max; - private final InputStream in; - private long pos; - boolean hitBound = false; - - private SimpleBoundedInputStream(long max, InputStream in) { - this.max = max; - this.in = in; - } - - @Override - public int read() throws IOException { - if (max >= 0 && pos >= max) { - hitBound = true; - return EOF; - } - final int result = in.read(); - pos++; - return result; - } - - /** - * Invokes the delegate's <code>read(byte[])</code> method. - * @param b the buffer to read the bytes into - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. - * @throws IOException if an I/O error occurs - */ - @Override - public int read(final byte[] b) throws IOException { - return this.read(b, 0, b.length); - } - - /** - * Invokes the delegate's <code>read(byte[], int, int)</code> method. - * @param b the buffer to read the bytes into - * @param off The start offset - * @param len The number of bytes to read - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. - * @throws IOException if an I/O error occurs - */ - @Override - public int read(final byte[] b, final int off, final int len) throws IOException { - if (max>=0 && pos>=max) { - return EOF; - } - final long maxRead = max>=0 ? Math.min(len, max-pos) : len; - final int bytesRead = in.read(b, off, (int)maxRead); - - if (bytesRead==EOF) { - return EOF; - } - - pos+=bytesRead; - return bytesRead; - } - - /** - * Invokes the delegate's <code>skip(long)</code> method. - * @param n the number of bytes to skip - * @return the actual number of bytes skipped - * @throws IOException if an I/O error occurs - */ - @Override - public long skip(final long n) throws IOException { - final long toSkip = max>=0 ? Math.min(n, max-pos) : n; - final long skippedBytes = in.skip(toSkip); - pos+=skippedBytes; - return skippedBytes; - } - - @Override - public void reset() throws IOException { - in.reset(); - } - - @Override - public void mark(int readLimit) { - in.mark(readLimit); - } - - public boolean hasHitBound() { - return hitBound; - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/TestParsers.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/TestParsers.java b/tika-parsers/src/test/java/org/apache/tika/TestParsers.java deleted file mode 100644 index ddd671d..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/TestParsers.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; - -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.junit.Before; -import org.junit.Test; -import org.xml.sax.helpers.DefaultHandler; - -/** - * Junit test class for Tika {@link Parser}s. - */ -public class TestParsers extends TikaTest { - - private TikaConfig tc; - - private Tika tika; - - @Before - public void setUp() throws Exception { - tc = TikaConfig.getDefaultConfig(); - tika = new Tika(tc); - } - - @Test - public void testWORDxtraction() throws Exception { - File file = getResourceAsFile("/test-documents/testWORD.doc"); - Parser parser = tika.getParser(); - Metadata metadata = new Metadata(); - try (InputStream stream = new FileInputStream(file)) { - parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - } - assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); - } - - @Test - public void testEXCELExtraction() throws Exception { - final String expected = "Numbers and their Squares"; - File file = getResourceAsFile("/test-documents/testEXCEL.xls"); - String s1 = tika.parseToString(file); - assertTrue("Text does not contain '" + expected + "'", s1 - .contains(expected)); - Parser parser = tika.getParser(); - Metadata metadata = new Metadata(); - try (InputStream stream = new FileInputStream(file)) { - parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); - } - assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE)); - } - - @Test - public void testOptionalHyphen() throws Exception { - String[] extensions = - new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"}; - for (String extension : extensions) { - File file = getResourceAsFile("/test-documents/testOptionalHyphen." + extension); - String content = tika.parseToString(file); - assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content, - content.contains("optionalhyphen") || - content.contains("optional\u00adhyphen") || // soft hyphen - content.contains("optional\u200bhyphen") || // zero width space - content.contains("optional\u2027")); // hyphenation point - - } - } - - private void verifyComment(String extension, String fileName) throws Exception { - File file = getResourceAsFile("/test-documents/" + fileName + "." + extension); - String content = tika.parseToString(file); - assertTrue(extension + ": content=" + content + " did not extract text", - content.contains("Here is some text")); - assertTrue(extension + ": content=" + content + " did not extract comment", - content.contains("Here is a comment")); - } - - @Test - public void testComment() throws Exception { - final String[] extensions = new String[] {"ppt", "pptx", "doc", - "docx", "xls", "xlsx", "pdf", "rtf"}; - for(String extension : extensions) { - verifyComment(extension, "testComment"); - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java deleted file mode 100644 index 2125888..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.config; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import org.apache.tika.detect.CompositeDetector; -import org.apache.tika.detect.DefaultDetector; -import org.apache.tika.detect.Detector; -import org.apache.tika.detect.EmptyDetector; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.mbox.OutlookPSTParser; -import org.apache.tika.parser.microsoft.POIFSContainerDetector; -import org.apache.tika.parser.pkg.ZipContainerDetector; -import org.junit.Test; - -/** - * Junit test class for {@link TikaConfig}, which cover things - * that {@link TikaConfigTest} can't do due to a need for the - * full set of detectors - */ -public class TikaDetectorConfigTest extends AbstractTikaConfigTest { - @Test - public void testDetectorExcludeFromDefault() throws Exception { - TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml"); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - CompositeDetector detector = (CompositeDetector)config.getDetector(); - - // Should be wrapping two detectors - assertEquals(2, detector.getDetectors().size()); - - - // First should be DefaultDetector, second Empty, that order - assertEquals(DefaultDetector.class, detector.getDetectors().get(0).getClass()); - assertEquals(EmptyDetector.class, detector.getDetectors().get(1).getClass()); - - - // Get the DefaultDetector from the config - DefaultDetector confDetector = (DefaultDetector)detector.getDetectors().get(0); - - // Get a fresh "default" DefaultParser - DefaultDetector normDetector = new DefaultDetector(config.getMimeRepository()); - - - // The default one will offer the Zip and POIFS detectors - assertDetectors(normDetector, true, true); - - - // The one from the config won't, as we excluded those - assertDetectors(confDetector, false, false); - } - - /** - * TIKA-1708 - If the Zip detector is disabled, either explicitly, - * or via giving a list of detectors that it isn't part of, ensure - * that detection of PST files still works - */ - @Test - public void testPSTDetectionWithoutZipDetector() throws Exception { - // Check the one with an exclude - TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml"); - assertNotNull(configWX.getParser()); - assertNotNull(configWX.getDetector()); - CompositeDetector detectorWX = (CompositeDetector)configWX.getDetector(); - - // Check it has the POIFS one, but not the zip one - assertDetectors(detectorWX, true, false); - - - // Check the one with an explicit list - TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml"); - assertNotNull(configCL.getParser()); - assertNotNull(configCL.getDetector()); - CompositeDetector detectorCL = (CompositeDetector)configCL.getDetector(); - assertEquals(2, detectorCL.getDetectors().size()); - - // Check it also has the POIFS one, but not the zip one - assertDetectors(detectorCL, true, false); - - - // Check that both detectors have a mimetypes with entries - assertTrue("Not enough mime types: " + configWX.getMediaTypeRegistry().getTypes().size(), - configWX.getMediaTypeRegistry().getTypes().size() > 100); - assertTrue("Not enough mime types: " + configCL.getMediaTypeRegistry().getTypes().size(), - configCL.getMediaTypeRegistry().getTypes().size() > 100); - - - // Now check they detect PST files correctly - TikaInputStream stream = TikaInputStream.get( - getResourceAsFile("/test-documents/testPST.pst")); - assertEquals( - OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, - detectorWX.detect(stream, new Metadata()) - ); - assertEquals( - OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, - detectorCL.detect(stream, new Metadata()) - ); - } - - private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS, - boolean shouldHaveZip) { - boolean hasZip = false; - boolean hasPOIFS = false; - for (Detector d : detector.getDetectors()) { - if (d instanceof ZipContainerDetector) { - if (shouldHaveZip) { - hasZip = true; - } else { - fail("Shouldn't have the ZipContainerDetector from config"); - } - } - if (d instanceof POIFSContainerDetector) { - if (shouldHavePOIFS) { - hasPOIFS = true; - } else { - fail("Shouldn't have the POIFSContainerDetector from config"); - } - } - } - if (shouldHavePOIFS) assertTrue("Should have the POIFSContainerDetector", hasPOIFS); - if (shouldHaveZip) assertTrue("Should have the ZipContainerDetector", hasZip); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java deleted file mode 100644 index 2acd358..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.config; - -import static org.apache.tika.TikaTest.assertContains; -import static org.apache.tika.TikaTest.assertNotContained; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.util.List; - -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.CompositeParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.EmptyParser; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParserDecorator; -import org.apache.tika.parser.executable.ExecutableParser; -import org.apache.tika.parser.xml.XMLParser; -import org.junit.Test; - -/** - * Junit test class for {@link TikaConfig}, which cover things - * that {@link TikaConfigTest} can't do due to a need for the - * full set of parsers - */ -public class TikaParserConfigTest extends AbstractTikaConfigTest { - @Test - public void testMimeExcludeInclude() throws Exception { - TikaConfig config = getConfig("TIKA-1558-blacklist.xml"); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - Parser parser = config.getParser(); - - MediaType PDF = MediaType.application("pdf"); - MediaType JPEG = MediaType.image("jpeg"); - - - // Has two parsers - assertEquals(CompositeParser.class, parser.getClass()); - CompositeParser cParser = (CompositeParser)parser; - assertEquals(2, cParser.getAllComponentParsers().size()); - - // Both are decorated - assertTrue(cParser.getAllComponentParsers().get(0) instanceof ParserDecorator); - assertTrue(cParser.getAllComponentParsers().get(1) instanceof ParserDecorator); - ParserDecorator p0 = (ParserDecorator)cParser.getAllComponentParsers().get(0); - ParserDecorator p1 = (ParserDecorator)cParser.getAllComponentParsers().get(1); - - - // DefaultParser will be wrapped with excludes - assertEquals(DefaultParser.class, p0.getWrappedParser().getClass()); - - assertNotContained(PDF, p0.getSupportedTypes(context)); - assertContains(PDF, p0.getWrappedParser().getSupportedTypes(context)); - assertNotContained(JPEG, p0.getSupportedTypes(context)); - assertContains(JPEG, p0.getWrappedParser().getSupportedTypes(context)); - - - // Will have an empty parser for PDF - assertEquals(EmptyParser.class, p1.getWrappedParser().getClass()); - assertEquals(1, p1.getSupportedTypes(context).size()); - assertContains(PDF, p1.getSupportedTypes(context)); - assertNotContained(PDF, p1.getWrappedParser().getSupportedTypes(context)); - } - - @Test - public void testParserExcludeFromDefault() throws Exception { - TikaConfig config = getConfig("TIKA-1558-blacklist.xml"); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - CompositeParser parser = (CompositeParser)config.getParser(); - - MediaType PE_EXE = MediaType.application("x-msdownload"); - MediaType ELF = MediaType.application("x-elf"); - - - // Get the DefaultParser from the config - ParserDecorator confWrappedParser = (ParserDecorator)parser.getParsers().get(MediaType.APPLICATION_XML); - assertNotNull(confWrappedParser); - DefaultParser confParser = (DefaultParser)confWrappedParser.getWrappedParser(); - - // Get a fresh "default" DefaultParser - DefaultParser normParser = new DefaultParser(config.getMediaTypeRegistry()); - - - // The default one will offer the Executable Parser - assertContains(PE_EXE, normParser.getSupportedTypes(context)); - assertContains(ELF, normParser.getSupportedTypes(context)); - - boolean hasExec = false; - for (Parser p : normParser.getParsers().values()) { - if (p instanceof ExecutableParser) { - hasExec = true; - break; - } - } - assertTrue(hasExec); - - - // The one from the config won't - assertNotContained(PE_EXE, confParser.getSupportedTypes(context)); - assertNotContained(ELF, confParser.getSupportedTypes(context)); - - for (Parser p : confParser.getParsers().values()) { - if (p instanceof ExecutableParser) - fail("Shouldn't have the Executable Parser from config"); - } - } - /** - * TIKA-1558 It should be possible to exclude Parsers from being picked up by - * DefaultParser. - */ - @Test - public void defaultParserBlacklist() throws Exception { - TikaConfig config = new TikaConfig(); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - CompositeParser cp = (CompositeParser) config.getParser(); - List<Parser> parsers = cp.getAllComponentParsers(); - - boolean hasXML = false; - for (Parser p : parsers) { - if (p instanceof XMLParser) { - hasXML = true; - break; - } - } - assertTrue("Default config should include an XMLParser.", hasXML); - - // This custom TikaConfig should exclude XMLParser and all of its subclasses. - config = getConfig("TIKA-1558-blacklistsub.xml"); - cp = (CompositeParser) config.getParser(); - parsers = cp.getAllComponentParsers(); - - for (Parser p : parsers) { - if (p instanceof XMLParser) - fail("Custom config should not include an XMLParser (" + p.getClass() + ")."); - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java deleted file mode 100644 index 71af206..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.config; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; - -import org.apache.tika.language.translate.DefaultTranslator; -import org.apache.tika.language.translate.EmptyTranslator; -import org.junit.Test; - -/** - * Junit test class for {@link TikaConfig}, which cover things - * that {@link TikaConfigTest} can't do due to a need for the - * full set of translators - */ -public class TikaTranslatorConfigTest extends AbstractTikaConfigTest { - @Test - public void testDefaultBehaviour() throws Exception { - TikaConfig config = TikaConfig.getDefaultConfig(); - assertNotNull(config.getTranslator()); - assertEquals(DefaultTranslator.class, config.getTranslator().getClass()); - } - - @Test - public void testRequestsDefault() throws Exception { - TikaConfig config = getConfig("TIKA-1702-translator-default.xml"); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - assertNotNull(config.getTranslator()); - - assertEquals(DefaultTranslator.class, config.getTranslator().getClass()); - } - - @Test - public void testRequestsEmpty() throws Exception { - TikaConfig config = getConfig("TIKA-1702-translator-empty.xml"); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - assertNotNull(config.getTranslator()); - - assertEquals(EmptyTranslator.class, config.getTranslator().getClass()); - } - - /** - * Currently, Translators don't support Composites, so - * if multiple translators are given, only the first wins - */ - @Test - public void testRequestsMultiple() throws Exception { - TikaConfig config = getConfig("TIKA-1702-translator-empty-default.xml"); - assertNotNull(config.getParser()); - assertNotNull(config.getDetector()); - assertNotNull(config.getTranslator()); - - assertEquals(EmptyTranslator.class, config.getTranslator().getClass()); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java deleted file mode 100644 index 5787408..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.detect; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; - -import java.io.File; -import java.io.FilenameFilter; -import java.io.IOException; -import java.io.InputStream; - -import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeTypes; -import org.junit.Test; - -/** - * Junit test class for {@link ContainerAwareDetector} - */ -public class TestContainerAwareDetector { - private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); - private final MimeTypes mimeTypes = tikaConfig.getMimeRepository(); - private final Detector detector = new DefaultDetector(mimeTypes); - - private void assertTypeByData(String file, String type) throws Exception { - assertTypeByNameAndData(file, null, type); - } - private void assertTypeByNameAndData(String file, String type) throws Exception { - assertTypeByNameAndData(file, file, type); - } - private void assertType(String file, String byData, String byNameAndData) throws Exception { - assertTypeByData(file, byData); - assertTypeByNameAndData(file, byNameAndData); - } - private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception { - assertTypeByNameAndData(dataFile, name, type, null); - } - private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector, String typeFromMagic) throws Exception { - try (TikaInputStream stream = TikaInputStream.get( - TestContainerAwareDetector.class.getResource("/test-documents/" + dataFile))) { - Metadata m = new Metadata(); - if (name != null) - m.add(Metadata.RESOURCE_NAME_KEY, name); - - // Mime Magic version is likely to be less precise - if (typeFromMagic != null) { - assertEquals( - MediaType.parse(typeFromMagic), - mimeTypes.detect(stream, m)); - } - - // All being well, the detector should get it perfect - assertEquals( - MediaType.parse(typeFromDetector), - detector.detect(stream, m)); - } - } - - @Test - public void testDetectOLE2() throws Exception { - // Microsoft office types known by POI - assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel"); - assertTypeByData("testWORD.doc", "application/msword"); - assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint"); - - assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook"); - assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook"); - assertTypeByData("testVISIO.vsd", "application/vnd.visio"); - assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher"); - assertTypeByData("testWORKS.wps", "application/vnd.ms-works"); - assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works"); - - // older Works Word Processor files can't be recognized - // they were created with Works Word Processor 7.0 (hence the text inside) - // and exported to the older formats with the "Save As" feature - assertTypeByData("testWORKSWordProcessor3.0.wps","application/vnd.ms-works"); - assertTypeByData("testWORKSWordProcessor4.0.wps","application/vnd.ms-works"); - assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet"); - assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project"); - assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project"); - - // Excel95 can be detected by not parsed - assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel"); - - // Try some ones that POI doesn't handle, that are still OLE2 based - assertTypeByData("testCOREL.shw", "application/x-corelpresentations"); - assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro"); - assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro"); - - assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5"); - - - // With the filename and data - assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel"); - assertTypeByNameAndData("testWORD.doc", "application/msword"); - assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint"); - - // With the wrong filename supplied, data will trump filename - assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel"); - assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword"); - assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint"); - - // With a filename of a totally different type, data will trump filename - assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel"); - assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel"); - } - - /** - * There is no way to distinguish "proper" StarOffice files from templates. - * All templates have the same extension but their actual type depends on - * the magic. Our current MimeTypes class doesn't allow us to use the same - * glob pattern in more than one mimetype. - * - * @throws Exception - */ - @Test - public void testDetectStarOfficeFiles() throws Exception { - assertType("testStarOffice-5.2-calc.sdc", - "application/vnd.stardivision.calc", - "application/vnd.stardivision.calc"); - assertType("testVORCalcTemplate.vor", - "application/vnd.stardivision.calc", - "application/vnd.stardivision.calc"); - assertType("testStarOffice-5.2-draw.sda", - "application/vnd.stardivision.draw", - "application/vnd.stardivision.draw"); - assertType("testVORDrawTemplate.vor", - "application/vnd.stardivision.draw", - "application/vnd.stardivision.draw"); - assertType("testStarOffice-5.2-impress.sdd", - "application/vnd.stardivision.impress", - "application/vnd.stardivision.impress"); - assertType("testVORImpressTemplate.vor", - "application/vnd.stardivision.impress", - "application/vnd.stardivision.impress"); - assertType("testStarOffice-5.2-writer.sdw", - "application/vnd.stardivision.writer", - "application/vnd.stardivision.writer"); - assertType("testVORWriterTemplate.vor", - "application/vnd.stardivision.writer", - "application/vnd.stardivision.writer"); - - } - - @Test - public void testOpenContainer() throws Exception { - try (TikaInputStream stream = TikaInputStream.get( - TestContainerAwareDetector.class.getResource("/test-documents/testPPT.ppt"))) { - assertNull(stream.getOpenContainer()); - assertEquals( - MediaType.parse("application/vnd.ms-powerpoint"), - detector.detect(stream, new Metadata())); - assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem); - } - } - - /** - * EPub uses a similar mimetype entry to OpenDocument for storing - * the mimetype within the parent zip file - */ - @Test - public void testDetectEPub() throws Exception { - assertTypeByData("testEPUB.epub", "application/epub+zip"); - assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip"); - } - - @Test - public void testDetectLotusNotesEml() throws Exception { - // Lotus .eml files aren't guaranteed to have any of the magic - // matches as the first line, but should have X-Notes-Item and Message-ID - assertTypeByData("testLotusEml.eml", "message/rfc822"); - } - - @Test - public void testDetectODF() throws Exception { - assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text"); - assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula"); - } - - @Test - public void testDetectOOXML() throws Exception { - assertTypeByData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - assertTypeByData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); - assertTypeByData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); - - // Check some of the less common OOXML types - assertTypeByData("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12"); - assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow"); - assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"); - assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12"); - assertTypeByData("testEXCEL.strict.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument"); - - assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12"); - assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing"); - assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12"); - assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil"); - assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12"); - assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template"); - - // .xlsb is an OOXML file containing the binary parts, and not - // an OLE2 file as you might initially expect! - assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"); - - // With the filename and data - assertTypeByNameAndData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - assertTypeByNameAndData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); - assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); - - // With the wrong filename supplied, data will trump filename - assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); - assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); - - // With an incorrect filename of a different container type, data trumps filename - assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - } - - /** - * Password Protected OLE2 files are fairly straightforward to detect, as they - * have the same structure as regular OLE2 files. (Core streams may be encrypted - * however) - */ - @Test - public void testDetectProtectedOLE2() throws Exception { - assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel"); - assertTypeByData("testWORD_protected_passtika.doc", "application/msword"); - assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint"); - assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel"); - assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword"); - assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint"); - } - - /** - * Password Protected OOXML files are much more tricky beasts to work with. - * They have a very different structure to regular OOXML files, and instead - * of being ZIP based they are actually an OLE2 file which contains the - * OOXML structure within an encrypted stream. - * This makes detecting them much harder... - */ - @Test - public void testDetectProtectedOOXML() throws Exception { - // Encrypted Microsoft Office OOXML files have OLE magic but - // special streams, so we can tell they're Protected OOXML - assertTypeByData("testEXCEL_protected_passtika.xlsx", - "application/x-tika-ooxml-protected"); - assertTypeByData("testWORD_protected_passtika.docx", - "application/x-tika-ooxml-protected"); - assertTypeByData("testPPT_protected_passtika.pptx", - "application/x-tika-ooxml-protected"); - - // At the moment, we can't use the name to specialise - // See discussions on TIKA-790 for details - assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx", - "application/x-tika-ooxml-protected"); - assertTypeByNameAndData("testWORD_protected_passtika.docx", - "application/x-tika-ooxml-protected"); - assertTypeByNameAndData("testPPT_protected_passtika.pptx", - "application/x-tika-ooxml-protected"); - } - - /** - * Check that temporary files created by Tika are removed after - * closing TikaInputStream. - */ - @Test - public void testRemovalTempfiles() throws Exception { - assertRemovalTempfiles("testWORD.docx"); - assertRemovalTempfiles("test-documents.zip"); - } - - private int countTemporaryFiles() { - return new File(System.getProperty("java.io.tmpdir")).listFiles( - new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.startsWith("apache-tika-"); - } - }).length; - } - - private void assertRemovalTempfiles(String fileName) throws Exception { - int numberOfTempFiles = countTemporaryFiles(); - - try (TikaInputStream stream = TikaInputStream.get( - TestContainerAwareDetector.class.getResource("/test-documents/" + fileName))) { - detector.detect(stream, new Metadata()); - } - - assertEquals(numberOfTempFiles, countTemporaryFiles()); - } - - @Test - public void testDetectIWork() throws Exception { - assertTypeByData("testKeynote.key", "application/vnd.apple.keynote"); - assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers"); - assertTypeByData("testPages.pages", "application/vnd.apple.pages"); - } - - @Test - public void testDetectKMZ() throws Exception { - assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz"); - } - - @Test - public void testDetectIPA() throws Exception { - assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa"); - assertTypeByData("testIPA.ipa", "application/x-itunes-ipa"); - } - - @Test - public void testASiC() throws Exception { - assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip"); - assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip"); - assertTypeByNameAndData("testASiCE.asice", "application/vnd.etsi.asic-e+zip"); - assertTypeByNameAndData("testASiCS.asics", "application/vnd.etsi.asic-s+zip"); - } - - @Test - public void testDetectZip() throws Exception { - assertTypeByData("test-documents.zip", "application/zip"); - assertTypeByData("test-zip-of-zip.zip", "application/zip"); - - // JAR based formats - assertTypeByData("testJAR.jar", "application/java-archive"); - assertTypeByData("testWAR.war", "application/x-tika-java-web-archive"); - assertTypeByData("testEAR.ear", "application/x-tika-java-enterprise-archive"); - assertTypeByData("testAPK.apk", "application/vnd.android.package-archive"); - - // JAR with HTML files in it - assertTypeByNameAndData("testJAR_with_HTML.jar", "testJAR_with_HTML.jar", - "application/java-archive", "application/java-archive"); - } - - private TikaInputStream getTruncatedFile(String name, int n) - throws IOException { - try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream( - "/test-documents/" + name)) { - byte[] bytes = new byte[n]; - int m = 0; - while (m < bytes.length) { - int i = input.read(bytes, m, bytes.length - m); - if (i != -1) { - m += i; - } else { - throw new IOException("Unexpected end of stream"); - } - } - return TikaInputStream.get(bytes); - } - } - - @Test - public void testTruncatedFiles() throws Exception { - // First up a truncated OOXML (zip) file - - // With only the data supplied, the best we can do is the container - Metadata m = new Metadata(); - try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) { - assertEquals( - MediaType.application("x-tika-ooxml"), - detector.detect(xlsx, m)); - } - - // With truncated data + filename, we can use the filename to specialise - m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx"); - try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) { - assertEquals( - MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), - detector.detect(xlsx, m)); - } - - // Now a truncated OLE2 file - m = new Metadata(); - try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) { - assertEquals( - MediaType.application("x-tika-msoffice"), - detector.detect(xls, m)); - } - - // Finally a truncated OLE2 file, with a filename available - m = new Metadata(); - m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls"); - try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) { - assertEquals( - MediaType.application("vnd.ms-excel"), - detector.detect(xls, m)); - } - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java b/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java deleted file mode 100644 index e988aff..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.embedder; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStreamWriter; -import java.net.URISyntaxException; -import java.net.URL; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.txt.TXTParser; -import org.apache.tika.sax.BodyContentHandler; -import org.junit.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -/** - * Unit test for {@link ExternalEmbedder}s. - */ -public class ExternalEmbedderTest { - - protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER = - new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT); - protected static final String DEFAULT_CHARSET = UTF_8.name(); - private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description"; - private static final String TEST_TXT_PATH = "/test-documents/testTXT.txt"; - - private TemporaryResources tmp = new TemporaryResources(); - - /** - * Gets the expected returned metadata value for the given field - * - * @param fieldName - * @return a prefix added to the field name - */ - protected String getExpectedMetadataValueString(String fieldName, Date timestamp) { - return this.getClass().getSimpleName() + " embedded " + fieldName + - " on " + EXPECTED_METADATA_DATE_FORMATTER.format(timestamp); - } - - /** - * Gets the tika <code>Metadata</code> object containing data to be - * embedded. - * - * @return the populated tika metadata object - */ - protected Metadata getMetadataToEmbed(Date timestamp) { - Metadata metadata = new Metadata(); - metadata.add(TikaCoreProperties.DESCRIPTION, - getExpectedMetadataValueString(TikaCoreProperties.DESCRIPTION.toString(), timestamp)); - return metadata; - } - - /** - * Gets the <code>Embedder</code> to test. - * - * @return the embedder under test - */ - protected Embedder getEmbedder() { - ExternalEmbedder embedder = new ExternalEmbedder(); - Map<Property, String[]> metadataCommandArguments = new HashMap<Property, String[]>(1); - metadataCommandArguments.put(TikaCoreProperties.DESCRIPTION, - new String[] { COMMAND_METADATA_ARGUMENT_DESCRIPTION }); - embedder.setMetadataCommandArguments(metadataCommandArguments); - return embedder; - } - - /** - * Gets the source input stream through standard Java resource loaders - * before metadata has been embedded. - * - * @return a fresh input stream - */ - protected InputStream getSourceStandardInputStream() { - return this.getClass().getResourceAsStream(TEST_TXT_PATH); - } - - /** - * Gets the source input stream via {@link TikaInputStream} - * before metadata has been embedded. - * - * @return a fresh input stream - * @throws FileNotFoundException - */ - protected InputStream getSourceTikaInputStream() throws FileNotFoundException { - return TikaInputStream.get(getSourceInputFile()); - } - - /** - * Gets the source input file through standard Java resource loaders - * before metadata has been embedded. - * - * @return a fresh input stream - * @throws FileNotFoundException - */ - protected File getSourceInputFile() throws FileNotFoundException { - URL origUrl = this.getClass().getResource(TEST_TXT_PATH); - if (origUrl == null) { - throw new FileNotFoundException("could not load " + TEST_TXT_PATH); - } - try { - return new File(origUrl.toURI()); - } catch (URISyntaxException e) { - throw new FileNotFoundException(e.getMessage()); - } - } - - /** - * Gets the parser to use to verify the result of the embed operation. - * - * @return the parser to read embedded metadata - */ - protected Parser getParser() { - return new TXTParser(); - } - - /** - * Whether or not the final result of reading the now embedded metadata is - * expected in the output of the external tool - * - * @return whether or not results are expected in command line output - */ - protected boolean getIsMetadataExpectedInOutput() { - return true; - } - - /** - * Tests embedding metadata then reading metadata to verify the results. - * - * @param isResultExpectedInOutput whether or not results are expected in command line output - */ - protected void embedInTempFile(InputStream sourceInputStream, boolean isResultExpectedInOutput) { - Embedder embedder = getEmbedder(); - - // TODO Move this check to ExternalEmbedder - String os = System.getProperty("os.name", ""); - if (os.contains("Windows")) { - // Skip test on Windows - return; - } - - Date timestamp = new Date(); - Metadata metadataToEmbed = getMetadataToEmbed(timestamp); - - try { - File tempOutputFile = tmp.createTemporaryFile(); - FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile); - - // Embed the metadata into a copy of the original output stream - embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null); - - ParseContext context = new ParseContext(); - Parser parser = getParser(); - context.set(Parser.class, parser); - - // Setup the extracting content handler - ByteArrayOutputStream result = new ByteArrayOutputStream(); - OutputStreamWriter outputWriter = new OutputStreamWriter(result,DEFAULT_CHARSET); - ContentHandler handler = new BodyContentHandler(outputWriter); - - // Create a new metadata object to read the new metadata into - Metadata embeddedMetadata = new Metadata(); - - // Setup a re-read of the now embeded temp file - FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile); - - parser.parse(embeddedFileInputStream, handler, embeddedMetadata, - context); - - tmp.dispose(); - - String outputString = null; - if (isResultExpectedInOutput) { - outputString = result.toString(DEFAULT_CHARSET); - } else { - assertTrue("no metadata found", embeddedMetadata.size() > 0); - } - - // Check each metadata property for the expected value - for (String metadataName : metadataToEmbed.names()) { - if (metadataToEmbed.get(metadataName) != null) { - String expectedValue = metadataToEmbed.get(metadataName); - boolean foundExpectedValue = false; - if (isResultExpectedInOutput) { - // just check that the entire output contains the expected string - foundExpectedValue = outputString.contains(expectedValue); - } else { - if (embeddedMetadata.isMultiValued(metadataName)) { - for (String embeddedValue : embeddedMetadata.getValues(metadataName)) { - if (embeddedValue != null) { - if (embeddedValue.contains(expectedValue)) { - foundExpectedValue = true; - break; - } - } - } - } else { - String embeddedValue = embeddedMetadata.get(metadataName); - assertNotNull("expected metadata for " - + metadataName + " not found", - embeddedValue); - foundExpectedValue = embeddedValue.contains(expectedValue); - } - } - assertTrue( - "result did not contain expected appended metadata " - + metadataName + "=" - + expectedValue, - foundExpectedValue); - } - } - } catch (IOException e) { - fail(e.getMessage()); - } catch (TikaException e) { - fail(e.getMessage()); - } catch (SAXException e) { - fail(e.getMessage()); - } - } - - protected void checkSourceFileExists() { - String message = "the original input file was deleted"; - try { - File origInputFile = getSourceInputFile(); - assertNotNull(message, origInputFile); - assertTrue(message, origInputFile.exists()); - } catch (FileNotFoundException e) { - fail(message + ": " + e.getMessage()); - } - } - - /** - * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceStandardInputStream()} - * - * @throws IOException - */ - @Test - public void testEmbedStandardInputStream() throws IOException { - embedInTempFile(getSourceStandardInputStream(), getIsMetadataExpectedInOutput()); - checkSourceFileExists(); - } - - /** - * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceTikaInputStream()} - * - * @throws IOException - */ - @Test - public void testEmbedTikaInputStream() throws IOException { - embedInTempFile(getSourceTikaInputStream(), getIsMetadataExpectedInOutput()); - checkSourceFileExists(); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java deleted file mode 100644 index 7987630..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.mime; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import org.junit.Before; -import org.junit.Test; - -public class MimeTypeTest { - - private MimeTypes types; - private MimeType text; - - @Before - public void setUp() throws MimeTypeException { - types = new MimeTypes(); - text = types.forName("text/plain"); - } - - /** Test MimeType constructor */ - @Test - public void testConstrctor() { - // Missing name - try { - new MimeType(null); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expected result - } - } - - @Test - public void testIsValidName() { - assertTrue(MimeType.isValid("application/octet-stream")); - assertTrue(MimeType.isValid("text/plain")); - assertTrue(MimeType.isValid("foo/bar")); - assertTrue(MimeType.isValid("a/b")); - - assertFalse(MimeType.isValid("application")); - assertFalse(MimeType.isValid("application/")); - assertFalse(MimeType.isValid("/")); - assertFalse(MimeType.isValid("/octet-stream")); - assertFalse(MimeType.isValid("application//octet-stream")); - assertFalse(MimeType.isValid("application/octet=stream")); - assertFalse(MimeType.isValid("application/\u00f6ctet-stream")); - assertFalse(MimeType.isValid("text/plain;")); - assertFalse(MimeType.isValid("text/plain; charset=UTF-8")); - try { - MimeType.isValid(null); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expected result - } - } - - /** Test MimeType setDescription() */ - @Test - public void testSetEmptyValues() { - try { - text.setDescription(null); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expected result - } - - try { - text.setAcronym(null); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expected result - } - - try { - text.addLink(null); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expected result - } - - try { - text.setUniformTypeIdentifier(null); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expected result - } - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java deleted file mode 100644 index be8a575..0000000 --- a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.mime; - -import static org.apache.tika.mime.MediaType.OCTET_STREAM; -import static org.apache.tika.mime.MediaType.TEXT_PLAIN; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import org.junit.Before; -import org.junit.Test; - -public class MimeTypesTest { - - private MimeTypes types; - - private MediaTypeRegistry registry; - - private MimeType binary; - - private MimeType text; - - private MimeType html; - - @Before - public void setUp() throws MimeTypeException { - types = new MimeTypes(); - registry = types.getMediaTypeRegistry(); - binary = types.forName("application/octet-stream"); - text = types.forName("text/plain"); - types.addAlias(text, MediaType.parse("text/x-plain")); - html = types.forName("text/html"); - types.setSuperType(html, TEXT_PLAIN); - } - - @Test - public void testForName() throws MimeTypeException { - assertEquals(text, types.forName("text/plain")); - assertEquals(text, types.forName("TEXT/PLAIN")); - - try { - types.forName("invalid"); - fail("MimeTypeException not thrown on invalid type name"); - } catch (MimeTypeException e) { - // expected - } - } - - @Test - public void testRegisteredMimes() throws MimeTypeException { - String dummy = "text/xxxxx"; - assertEquals(text, types.getRegisteredMimeType("text/plain")); - assertNull(types.getRegisteredMimeType(dummy)); - assertNotNull(types.forName(dummy)); - assertEquals(dummy, types.forName("text/xxxxx").getType().toString()); - assertEquals(dummy, types.getRegisteredMimeType("text/xxxxx").getType().toString()); - - try { - types.forName("invalid"); - fail("MimeTypeException not thrown on invalid type name"); - } catch (MimeTypeException e) { - // expected - } - } - - @Test - public void testSuperType() throws MimeTypeException { - assertNull(registry.getSupertype(OCTET_STREAM)); - assertEquals(OCTET_STREAM, registry.getSupertype(TEXT_PLAIN)); - assertEquals(TEXT_PLAIN, registry.getSupertype(html.getType())); - } - - @Test - public void testIsDescendantOf() { - assertFalse(registry.isSpecializationOf(OCTET_STREAM, OCTET_STREAM)); - assertFalse(registry.isSpecializationOf(TEXT_PLAIN, TEXT_PLAIN)); - assertFalse(registry.isSpecializationOf(html.getType(), html.getType())); - - assertTrue(registry.isSpecializationOf(html.getType(), OCTET_STREAM)); - assertFalse(registry.isSpecializationOf(OCTET_STREAM, html.getType())); - - assertTrue(registry.isSpecializationOf(html.getType(), TEXT_PLAIN)); - assertFalse(registry.isSpecializationOf(TEXT_PLAIN, html.getType())); - - assertTrue(registry.isSpecializationOf(TEXT_PLAIN, OCTET_STREAM)); - assertFalse(registry.isSpecializationOf(OCTET_STREAM, TEXT_PLAIN)); - } - - @Test - public void testCompareTo() { - assertTrue(binary.compareTo(binary) == 0); - assertTrue(binary.compareTo(text) != 0); - assertTrue(binary.compareTo(html) != 0); - - assertTrue(text.compareTo(binary) != 0); - assertTrue(text.compareTo(text) == 0); - assertTrue(text.compareTo(html) != 0); - - assertTrue(html.compareTo(binary) != 0); - assertTrue(html.compareTo(text) != 0); - assertTrue(html.compareTo(html) == 0); - } - -}
