// Source: http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
package org.apache.tika.parser;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.InputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.digesting.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Tests for {@link RecursiveParserWrapper}: handler types (xml/html/text/ignore),
 * write limits, embedded-resource limits and paths, embedded exception handling,
 * and digesting of the container plus embedded documents.
 */
public class RecursiveParserWrapperTest {

    @Test
    public void testBasicXML() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //not much differentiates html from xml in this test file;
        //the self-closing <p/> is the xml-specific marker
        assertTrue(content.indexOf("<p class=\"header\" />") > -1);
    }

    @Test
    public void testBasicHTML() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //not much differentiates html from xml in this test file;
        //html serializes an explicit closing tag
        assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
    }

    @Test
    public void testBasicText() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //text output must contain no markup but must still include embedded content
        assertTrue(content.indexOf("<p ") < 0);
        assertTrue(content.indexOf("embed_0") > -1);
    }

    @Test
    public void testIgnoreContent() throws Exception {
        List<Metadata> list = getMetadata(new Metadata(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        //IGNORE handler stores no content at all
        assertNull(content);
    }


    @Test
    public void testCharLimit() throws Exception {
        ParseContext context = new ParseContext();
        Metadata metadata = new Metadata();

        Parser wrapped = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
        InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
                "/test-documents/test_recursive_embedded.docx");
        try {
            wrapper.parse(stream, new DefaultHandler(), metadata, context);
        } finally {
            //previously the stream was leaked on a parse failure
            IOUtils.closeQuietly(stream);
        }
        List<Metadata> list = wrapper.getMetadata();

        assertEquals(5, list.size());

        //exactly one of the documents should have hit the 60-char write limit
        int wlr = 0;
        for (Metadata m : list) {
            String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
            if (limitReached != null && limitReached.equals("true")) {
                wlr++;
            }
        }
        assertEquals(1, wlr);

    }

    @Test
    public void testMaxEmbedded() throws Exception {
        int maxEmbedded = 4;
        int totalNoLimit = 12;//including outer container file
        ParseContext context = new ParseContext();
        Metadata metadata = new Metadata();
        String limitReached = null;

        Parser wrapped = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

        InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
                "/test-documents/test_recursive_embedded.docx");
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
        List<Metadata> list = wrapper.getMetadata();
        //test default: no limit on embedded resources
        assertEquals(totalNoLimit, list.size());

        limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
        assertNull(limitReached);


        wrapper.reset();
        stream.close();

        //test setting a positive limit
        metadata = new Metadata();
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(
                "/test-documents/test_recursive_embedded.docx");
        wrapper.setMaxEmbeddedResources(maxEmbedded);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
        list = wrapper.getMetadata();

        //add 1 for outer container file
        assertEquals(maxEmbedded + 1, list.size());

        limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
        assertEquals("true", limitReached);

        wrapper.reset();
        stream.close();

        //test setting value < 0: treated as "no limit" again
        metadata = new Metadata();
        stream = RecursiveParserWrapperTest.class.getResourceAsStream(
                "/test-documents/test_recursive_embedded.docx");

        wrapper.setMaxEmbeddedResources(-2);
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
        //BUGFIX: refresh the metadata list after re-parsing; previously the
        //stale list from the prior (limited) parse was asserted against
        list = wrapper.getMetadata();
        assertEquals(totalNoLimit, list.size());
        limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
        assertNull(limitReached);
        //previously the final stream was never closed
        stream.close();
    }

    @Test
    public void testEmbeddedResourcePath() throws Exception {

        //expected full embedded-resource paths for every attachment in the docx
        Set<String> targets = new HashSet<String>();
        targets.add("/embed1.zip");
        targets.add("/embed1.zip/embed2.zip");
        targets.add("/embed1.zip/embed2.zip/embed3.zip");
        targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
        targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
        targets.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
        targets.add("/embed1.zip/embed2.zip/embed2a.txt");
        targets.add("/embed1.zip/embed2.zip/embed2b.txt");
        targets.add("/embed1.zip/embed1b.txt");
        targets.add("/embed1.zip/embed1a.txt");
        targets.add("/image1.emf");

        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");

        List<Metadata> list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
        Metadata container = list.get(0);
        String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
        assertTrue(content.indexOf("<p class=\"header\" />") > -1);

        Set<String> seen = new HashSet<String>();
        for (Metadata m : list) {
            String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
            if (path != null) {
                seen.add(path);
            }
        }
        assertEquals(targets, seen);
    }

    @Test
    public void testEmbeddedNPE() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
        List<Metadata> list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        //default behavior (user doesn't specify whether or not to catch embedded exceptions)
        //is to catch the exception
        assertEquals(13, list.size());
        Metadata mockNPEMetadata = list.get(10);
        assertContains("java.lang.NullPointerException", mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));

        metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
        list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
                false, null);

        //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
        //and just doesn't bother to report that there was an exception.
        assertEquals(12, list.size());
    }

    @Test
    public void testPrimaryExcWEmbedded() throws Exception {
        //if embedded content is handled and then
        //the parser hits an exception in the container document,
        //the first element of the returned list is the container document
        //and the second is the embedded content
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

        ParseContext context = new ParseContext();
        Parser wrapped = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
        String path = "/test-documents/mock/embedded_then_npe.xml";

        InputStream stream = null;
        boolean npe = false;
        try {
            stream = RecursiveParserWrapperTest.class.getResourceAsStream(
                    path);
            wrapper.parse(stream, new DefaultHandler(), metadata, context);
        } catch (TikaException e) {
            //the container NPE should surface wrapped in a TikaException
            if (e.getCause().getClass().equals(NullPointerException.class)) {
                npe = true;
            }
        } finally {
            IOUtils.closeQuietly(stream);
        }
        assertTrue("npe", npe);

        List<Metadata> metadataList = wrapper.getMetadata();
        assertEquals(2, metadataList.size());
        Metadata outerMetadata = metadataList.get(0);
        Metadata embeddedMetadata = metadataList.get(1);
        assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
        assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));

        assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
        assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
    }

    @Test
    public void testDigesters() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
        List<Metadata> list = getMetadata(metadata,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
                true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
        //removed unused locals (i, m0, m6) that previously cluttered this test
        String md5Key = "X-TIKA:digest:MD5";
        assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
        assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
        assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
    }

    /**
     * Parses the test document named in {@code metadata} (or the default
     * recursive docx when none is set) and returns the per-document metadata list.
     *
     * @param metadata container metadata; RESOURCE_NAME_KEY selects the test file
     * @param contentHandlerFactory handler factory controlling content capture
     * @param catchEmbeddedExceptions whether embedded parser exceptions are caught
     * @param digester optional digester to wrap around the parser; may be null
     */
    private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
                                       boolean catchEmbeddedExceptions,
                                       DigestingParser.Digester digester) throws Exception {
        ParseContext context = new ParseContext();
        Parser wrapped = new AutoDetectParser();
        if (digester != null) {
            wrapped = new DigestingParser(wrapped, digester);
        }
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
                contentHandlerFactory, catchEmbeddedExceptions);
        String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (path == null) {
            path = "/test-documents/test_recursive_embedded.docx";
        } else {
            path = "/test-documents/" + path;
        }
        InputStream stream = null;
        try {
            stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
            wrapper.parse(stream, new DefaultHandler(), metadata, context);
        } finally {
            IOUtils.closeQuietly(stream);
        }
        return wrapper.getMetadata();

    }

    //convenience overload: catch embedded exceptions, no digester
    private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory)
            throws Exception {
        return getMetadata(metadata, contentHandlerFactory, true, null);
    }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java b/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java new file mode 100644 index 0000000..cde3e78 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.tika.Tika; +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Before; +import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; + +/** + * Junit test class for Tika {@link Parser}s. 
+ */ +public class TestParsers extends TikaTest { + + private TikaConfig tc; + + private Tika tika; + + @Before + public void setUp() throws Exception { + tc = TikaConfig.getDefaultConfig(); + tika = new Tika(tc); + } + + @Test + public void testWORDExtraction() throws Exception { + + Path tmpFile = getTestDocumentAsTempFile("testWORD.doc"); + Parser parser = tika.getParser(); + Metadata metadata = new Metadata(); + try (InputStream stream = Files.newInputStream(tmpFile)) { + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + } finally { + Files.delete(tmpFile); + } + assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE)); + } + + @Test + public void testEXCELExtraction() throws Exception { + final String expected = "Numbers and their Squares"; + Path tmpFile = getTestDocumentAsTempFile("testEXCEL.xls"); + try { + String s1 = tika.parseToString(tmpFile); + assertTrue("Text does not contain '" + expected + "'", s1 + .contains(expected)); + Parser parser = tika.getParser(); + Metadata metadata = new Metadata(); + try (InputStream stream = Files.newInputStream(tmpFile)) { + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + } + assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE)); + } finally { + Files.delete(tmpFile); + } + } + + @Test + public void testOptionalHyphen() throws Exception { + String[] extensions = + new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"}; + for (String extension : extensions) { + Path tmpFile = getTestDocumentAsTempFile("testOptionalHyphen." 
+ extension); + String content = null; + try { + content = tika.parseToString(tmpFile); + } finally { + Files.delete(tmpFile); + } + assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content, + content.contains("optionalhyphen") || + content.contains("optional\u00adhyphen") || // soft hyphen + content.contains("optional\u200bhyphen") || // zero width space + content.contains("optional\u2027")); // hyphenation point + + } + } + + @Test + public void testComment() throws Exception { + final String[] extensions = new String[] {"ppt", "pptx", "doc", + "docx", "xls", "xlsx", "pdf", "rtf"}; + for(String extension : extensions) { + verifyComment(extension, "testComment"); + } + } + + private void verifyComment(String extension, String fileName) throws Exception { + TemporaryResources tmp = new TemporaryResources(); + + String content = null; + Path tmpFile = null; + try { + tmpFile = getTestDocumentAsTempFile(fileName + "." + extension); + content = tika.parseToString(tmpFile); + } finally { + if (tmpFile != null) { + Files.delete(tmpFile); + } + } + assertTrue(extension + ": content=" + content + " did not extract text", + content.contains("Here is some text")); + assertTrue(extension + ": content=" + content + " did not extract comment", + content.contains("Here is a comment")); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java new file mode 100644 index 0000000..54c1427 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.fork; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; + +import java.io.IOException; +import java.io.InputStream; +import java.io.NotSerializableException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.apache.tika.Tika; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.fork.ForkParser; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Test that the ForkParser correctly behaves when + * wired in to the regular Parsers and their test data + */ +public class ForkParserIntegrationTest { + + private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works + + /** + * Simple text parsing + */ + @Test + public void testForkedTextParsing() throws Exception { + ForkParser parser = new ForkParser( + 
ForkParserIntegrationTest.class.getClassLoader(), + tika.getParser()); + + try { + ContentHandler output = new BodyContentHandler(); + InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream( + "/test-documents/testTXT.txt"); + ParseContext context = new ParseContext(); + parser.parse(stream, output, new Metadata(), context); + + String content = output.toString(); + assertContains("Test d'indexation", content); + assertContains("http://www.apache.org", content); + } finally { + parser.close(); + } + } + + /** + * This error has a message and an equals() implementation as to be able + * to match it against the serialized version of itself. + */ + static class AnError extends Error { + private static final long serialVersionUID = -6197267350768803348L; + private String message; + AnError(String message) { + super(message); + this.message = message; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + AnError anError = (AnError) o; + + if (!message.equals(anError.message)) return false; + + return true; + } + + @Override + public int hashCode() { + return message.hashCode(); + } + } + + /** + * This error isn't serializable on the server, so can't be sent back + * to the Fork Client once it has occured + */ + static class WontBeSerializedError extends RuntimeException { + private static final long serialVersionUID = 1L; + + WontBeSerializedError(String message) { + super(message); + } + + private void writeObject(java.io.ObjectOutputStream out) { + RuntimeException e = new RuntimeException("Bang!"); + boolean found = false; + for (StackTraceElement ste : e.getStackTrace()) { + if (ste.getClassName().equals(ForkParser.class.getName())) { + found = true; + break; + } + } + if (!found) { + throw e; + } + } + } + + static class BrokenParser implements Parser { + private static final long serialVersionUID = 995871497930817839L; + public Error err = new 
AnError("Simulated fail"); + public RuntimeException re = null; + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN)); + } + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + if (re != null) throw re; + throw err; + } + } + + /** + * TIKA-831 Parsers throwing errors should be caught and + * properly reported + */ + @Test + public void testParsingErrorInForkedParserShouldBeReported() throws Exception { + BrokenParser brokenParser = new BrokenParser(); + Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser); + InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt"); + + // With a serializable error, we'll get that back + try { + ContentHandler output = new BodyContentHandler(); + ParseContext context = new ParseContext(); + parser.parse(stream, output, new Metadata(), context); + fail("Expected TikaException caused by Error"); + } catch (TikaException e) { + assertEquals(brokenParser.err, e.getCause()); + } + + // With a non serializable one, we'll get something else + // TODO Fix this test + brokenParser = new BrokenParser(); + brokenParser.re= new WontBeSerializedError("Can't Serialize"); + parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser); +// try { +// ContentHandler output = new BodyContentHandler(); +// ParseContext context = new ParseContext(); +// parser.parse(stream, output, new Metadata(), context); +// fail("Expected TikaException caused by Error"); +// } catch (TikaException e) { +// assertEquals(TikaException.class, e.getCause().getClass()); +// assertEquals("Bang!", e.getCause().getMessage()); +// } + } + + /** + * If we supply a non serializable object on the ParseContext, + * check we get a helpful exception back + */ + @Test + public void testParserHandlingOfNonSerializable() throws 
Exception { + ForkParser parser = new ForkParser( + ForkParserIntegrationTest.class.getClassLoader(), + tika.getParser()); + + ParseContext context = new ParseContext(); + context.set(Detector.class, new Detector() { + public MediaType detect(InputStream input, Metadata metadata) { + return MediaType.OCTET_STREAM; + } + }); + + try { + ContentHandler output = new BodyContentHandler(); + InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream( + "/test-documents/testTXT.txt"); + parser.parse(stream, output, new Metadata(), context); + fail("Should have blown up with a non serializable ParseContext"); + } catch(TikaException e) { + // Check the right details + assertNotNull(e.getCause()); + assertEquals(NotSerializableException.class, e.getCause().getClass()); + assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage()); + } finally { + parser.close(); + } + } + + /** + * TIKA-832 + */ + @Test + public void testAttachingADebuggerOnTheForkedParserShouldWork() + throws Exception { + ParseContext context = new ParseContext(); + context.set(Parser.class, tika.getParser()); + + ForkParser parser = new ForkParser( + ForkParserIntegrationTest.class.getClassLoader(), + tika.getParser()); + parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", + "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n")); + try { + ContentHandler body = new BodyContentHandler(); + InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream( + "/test-documents/testTXT.txt"); + parser.parse(stream, body, new Metadata(), context); + String content = body.toString(); + assertContains("Test d'indexation", content); + assertContains("http://www.apache.org", content); + } finally { + parser.close(); + } + } + + /** + * TIKA-808 - Ensure that parsing of our test PDFs work under + * the Fork Parser, to ensure that complex parsing behaves + */ + @Test + public void testForkedPDFParsing() throws Exception { + ForkParser 
parser = new ForkParser( + ForkParserIntegrationTest.class.getClassLoader(), + tika.getParser()); + try { + ContentHandler output = new BodyContentHandler(); + InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream( + "/test-documents/testPDF.pdf"); + ParseContext context = new ParseContext(); + parser.parse(stream, output, new Metadata(), context); + + String content = output.toString(); + assertContains("Apache Tika", content); + assertContains("Tika - Content Analysis Toolkit", content); + assertContains("incubator", content); + assertContains("Apache Software Foundation", content); + } finally { + parser.close(); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java new file mode 100644 index 0000000..52af12b --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java @@ -0,0 +1,251 @@ +package org.apache.tika.parser.mock; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Date; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +/** + * Somewhat bizarrely, we can't put the test of this test resource in tika-test-resources + * or else it will be called by every module that uses it. Um, Yossarian!!! + */ +public class MockParserTest extends TikaTest { + private final static String M = "/test-documents/mock/"; + private final static Parser PARSER = new AutoDetectParser(); + + @Override + public XMLResult getXML(String path, Metadata m) throws Exception { + //note that this is specific to MockParserTest with addition of M to the path! 
+ InputStream is = getResourceAsStream(M+path); + try { + return super.getXML(is, PARSER, m); + } finally { + IOUtils.closeQuietly(is); + } + } + + @Test + public void testExample() throws Exception { + Metadata m = new Metadata(); + PrintStream out = System.out; + PrintStream err = System.err; + ByteArrayOutputStream outBos = new ByteArrayOutputStream(); + ByteArrayOutputStream errBos = new ByteArrayOutputStream(); + PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString()); + PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString()); + System.setOut(tmpOut); + System.setErr(tmpErr); + try { + assertThrowable("example.xml", m, IOException.class, "not another IOException"); + assertMockParser(m); + } finally { + System.setOut(out); + System.setErr(err); + } + String outString = new String(outBos.toByteArray(), UTF_8); + assertContains("writing to System.out", outString); + + String errString = new String(errBos.toByteArray(), UTF_8); + assertContains("writing to System.err", errString); + + } + + @Test + public void testNothingBad() throws Exception { + Metadata m = new Metadata(); + String content = getXML("nothing_bad.xml", m).xml; + assertEquals("Geoffrey Chaucer", m.get("author")); + assertContains("<p>And bathed every veyne in swich licour,</p>", content); + assertMockParser(m); + } + + @Test + public void testNullPointer() throws Exception { + Metadata m = new Metadata(); + assertThrowable("null_pointer.xml", m, NullPointerException.class, "null pointer message"); + assertMockParser(m); + } + + @Test + public void testNullPointerNoMsg() throws Exception { + Metadata m = new Metadata(); + assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null); + assertMockParser(m); + } + + + @Test + public void testSleep() throws Exception { + long start = new Date().getTime(); + Metadata m = new Metadata(); + String content = getXML("sleep.xml", m).xml; + assertMockParser(m); + long elapsed = new Date().getTime()-start; + 
//should sleep for at least 3000 + boolean enoughTimeHasElapsed = elapsed > 2000; + assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed); + assertMockParser(m); + } + + @Test + public void testHeavyHang() throws Exception { + long start = new Date().getTime(); + Metadata m = new Metadata(); + + String content = getXML("heavy_hang.xml", m).xml; + assertMockParser(m); + long elapsed = new Date().getTime()-start; + //should sleep for at least 3000 + boolean enoughTimeHasElapsed = elapsed > 2000; + assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed); + assertMockParser(m); + } + + @Test + public void testFakeOOM() throws Exception { + Metadata m = new Metadata(); + assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom"); + assertMockParser(m); + } + + @Test + public void testRealOOM() throws Exception { + //Note: we're not actually testing the diff between fake and real oom + //i.e. by creating child process and setting different -Xmx or + //memory profiling. + Metadata m = new Metadata(); + assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space"); + assertMockParser(m); + } + + @Test + public void testInterruptibleSleep() { + //Without static initialization of the parser, it can take ~1 second after t.start() + //before the parser actually calls parse. This is + //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc. + //This is not thread creation overhead. 
+ ParserRunnable r = new ParserRunnable("sleep_interruptible.xml"); + Thread t = new Thread(r); + t.start(); + long start = new Date().getTime(); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + //swallow + } + + t.interrupt(); + + try { + t.join(10000); + } catch (InterruptedException e) { + //swallow + } + long elapsed = new Date().getTime()-start; + boolean shortEnough = elapsed < 2000;//the xml file specifies 3000 + assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough); + } + + @Test + public void testNonInterruptibleSleep() { + ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml"); + Thread t = new Thread(r); + t.start(); + long start = new Date().getTime(); + try { + //make sure that the thread has actually started + Thread.sleep(1000); + } catch (InterruptedException e) { + //swallow + } + t.interrupt(); + try { + t.join(20000); + } catch (InterruptedException e) { + //swallow + } + long elapsed = new Date().getTime()-start; + boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000 + assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough); + } + + private class ParserRunnable implements Runnable { + private final String path; + ParserRunnable(String path) { + this.path = path; + } + @Override + public void run() { + Metadata m = new Metadata(); + try { + getXML(path, m); + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + assertMockParser(m); + } + } + } + + private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) { + + try { + getXML(path, m); + } catch (Throwable t) { + //if this is a throwable wrapped in a TikaException, use the cause + if (t instanceof TikaException && t.getCause() != null) { + t = t.getCause(); + } + if (! 
(t.getClass().isAssignableFrom(expected))){ + fail(t.getClass() +" is not assignable from "+expected); + } + if (message != null) { + assertEquals(message, t.getMessage()); + } + } + } + + private void assertMockParser(Metadata m) { + String[] parsers = m.getValues("X-Parsed-By"); + //make sure that it was actually parsed by mock. + boolean parsedByMock = false; + for (String parser : parsers) { + if (parser.equals("org.apache.tika.parser.mock.MockParser")) { + parsedByMock = true; + break; + } + } + assertTrue("mock parser should have been called", parsedByMock); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java new file mode 100644 index 0000000..c47a348 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java @@ -0,0 +1,335 @@ +package org.apache.tika.parser.pkg; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.InputStream; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Before; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +public class PackageTest extends TikaTest { + + private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed"); + + private ParseContext recursingContext; + private Parser autoDetectParser; + + @Before + public void setUp() throws Exception { + + autoDetectParser = new AutoDetectParser(); + recursingContext = new ParseContext(); + 
recursingContext.set(Parser.class, autoDetectParser); + } + + @Test + public void testZlibParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/testTXT.zlib")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("Test d'indexation de Txt", content); + assertContains("http://www.apache.org", content); + } + + + @Test + public void testArParsing() throws Exception { + Parser parser = new AutoDetectParser(); + + ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/testARofText.ar")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-archive", + metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("http://www.apache.org", content); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/testARofSND.ar")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-archive", + metadata.get(Metadata.CONTENT_TYPE)); + content = handler.toString(); + assertContains("testAU.au", content); + } + + @Test + public void testBzip2Parsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.tbz2")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("test-documents/testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("test-documents/testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + + @Test + public void testCompressParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.tar.Z"); + try { + parser.parse(stream, handler, metadata, recursingContext); + } finally { + stream.close(); + } + + assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("test-documents/testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("test-documents/testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + + @Test + public void testGzipParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.tgz")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("test-documents/testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("test-documents/testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + + @Test + public void testRarParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.rar")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("test-documents/testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("test-documents/testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + + @Test + public void test7ZParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + // Ensure 7zip is a parsable format + assertTrue("No 7zip parser found", + parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP)); + + // Parse + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.7z")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("test-documents/testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("test-documents/testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + @Test + public void testTarParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.tar")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("test-documents/testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("test-documents/testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("test-documents/testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("test-documents/testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("test-documents/testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("test-documents/testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("test-documents/testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("test-documents/testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("test-documents/testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + + @Test + public void testZipParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/test-documents.zip")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("testEXCEL.xls", content); + assertContains("Sample Excel Worksheet", content); + assertContains("testHTML.html", content); + assertContains("Test Indexation Html", content); + assertContains("testOpenOffice2.odt", content); + assertContains("This is a sample Open Office document", content); + assertContains("testPDF.pdf", content); + assertContains("Apache Tika", content); + assertContains("testPPT.ppt", content); + assertContains("Sample Powerpoint Slide", content); + assertContains("testRTF.rtf", content); + assertContains("indexation Word", content); + assertContains("testTXT.txt", content); + assertContains("Test d'indexation de Txt", content); + assertContains("testWORD.doc", content); + assertContains("This is a sample Microsoft Word Document", content); + assertContains("testXML.xml", content); + assertContains("Rida Benjelloun", content); + } + + @Test + public void testSvgzParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + try (InputStream stream = PackageTest.class.getResourceAsStream( + "/test-documents/testSVG.svgz")) { + parser.parse(stream, handler, metadata, recursingContext); + } + + assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + assertContains("Test SVG image", content); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java b/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java new file mode 100644 index 0000000..eff076b --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.sax; + +import static org.apache.tika.TikaTest.assertContains; + +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +/** + * Test class for the {@link PhoneExtractingContentHandler} + * class. This demonstrates how to parse a document and retrieve any phone numbers + * found within. + * + * The phone numbers are added to a multivalued Metadata object under the key, "phonenumbers". + * You can get an array of phone numbers by calling metadata.getValues("phonenumber"). + */ +public class PhoneExtractingContentHandlerTest { + @Test + public void testExtractPhoneNumbers() throws Exception { + Parser parser = new AutoDetectParser(); + Metadata metadata = new Metadata(); + // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them + // to the underlying Handler. 
+ PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); + try (InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt")) { + parser.parse(stream, handler, metadata, new ParseContext()); + } + String[] phoneNumbers = metadata.getValues("phonenumbers"); + assertContains("9498888888", phoneNumbers[0]); + assertContains("9497777777", phoneNumbers[1]); + assertContains("9496666666", phoneNumbers[2]); + assertContains("9495555555", phoneNumbers[3]); + assertContains("4193404645", phoneNumbers[4]); + assertContains("9044687081", phoneNumbers[5]); + assertContains("2604094811", phoneNumbers[6]); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java new file mode 100644 index 0000000..62660c8 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.utils; + +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; + +import org.apache.tika.TikaTest; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +public class ServiceLoaderUtilsTest extends TikaTest { + @Test + public void testOrdering() throws Exception { + //make sure that non Tika parsers come last + //which means that they'll overwrite Tika parsers and + //be preferred. + DefaultParser defaultParser = new DefaultParser(); + int vorbisIndex = -1; + int fictIndex = -1; + int dcxmlIndex = -1; + int i = 0; + for (Parser p : defaultParser.getAllComponentParsers()) { + if ("class org.gagravarr.tika.VorbisParser".equals(p.getClass().toString())) { + vorbisIndex = i; + } + if ("class org.apache.tika.parser.xml.FictionBookParser".equals(p.getClass().toString())) { + fictIndex = i; + } + if ("class org.apache.tika.parser.xml.DcXMLParser".equals(p.getClass().toString())) { + dcxmlIndex = i; + } + i++; + } + + assertNotEquals(vorbisIndex, fictIndex); + assertNotEquals(fictIndex, dcxmlIndex); + assertTrue(vorbisIndex > fictIndex); + assertTrue(fictIndex > dcxmlIndex); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/pom.xml ---------------------------------------------------------------------- diff --git a/tika-core/pom.xml b/tika-core/pom.xml index e63f101..2c61616 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -33,8 +33,17 @@ <packaging>bundle</packaging> <name>Apache Tika core</name> <url>http://tika.apache.org/</url> + <properties> + <!-- NOTE: sync codec version with POI --> + <codec.version>1.10</codec.version> + </properties> <dependencies> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>${codec.version}</version> + </dependency> <!-- 
Optional OSGi dependencies, used only when running within OSGi --> <dependency> <groupId>org.osgi</groupId> @@ -60,6 +69,13 @@ <artifactId>junit</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-test-resources</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> <dependency> <groupId>org.ops4j.pax.exam</groupId> <artifactId>pax-exam-junit4</artifactId> @@ -108,6 +124,9 @@ <Bundle-DocURL>${project.url}</Bundle-DocURL> <Bundle-Activator>org.apache.tika.config.TikaActivator</Bundle-Activator> <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy> + <Embed-Dependency> + commons-codec + </Embed-Dependency> </instructions> </configuration> </plugin> http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java new file mode 100644 index 0000000..e7b2405 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java @@ -0,0 +1,295 @@ +package org.apache.tika.parser.digesting; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.DigestingParser; +import org.apache.tika.parser.ParseContext; + +/** + * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester} + * that relies on commons.codec.digest.DigestUtils to calculate digest hashes. + * <p> + * This digester tries to use the regular mark/reset protocol on the InputStream. + * However, this wraps an internal BoundedInputStream, and if the InputStream + * is not fully read, then this will reset the stream and + * spool the InputStream to disk (via TikaInputStream) and then digest the file. + * <p> + * If a TikaInputStream is passed in and it has an underlying file that is longer + * than the {@link #markLimit}, then this digester digests the file directly. 
+ * + */ +public class CommonsDigester implements DigestingParser.Digester { + + public enum DigestAlgorithm { + //those currently available in commons.digest + MD2, + MD5, + SHA1, + SHA256, + SHA384, + SHA512; + + String getMetadataKey() { + return TikaCoreProperties.TIKA_META_PREFIX+ + "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString(); + } + } + + private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>(); + private final int markLimit; + + public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) { + Collections.addAll(this.algorithms, algorithms); + if (markLimit < 0) { + throw new IllegalArgumentException("markLimit must be >= 0"); + } + this.markLimit = markLimit; + } + + @Override + public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException { + InputStream tis = TikaInputStream.get(is); + long sz = -1; + if (((TikaInputStream)tis).hasFile()) { + sz = ((TikaInputStream)tis).getLength(); + } + //if the file is definitely a file, + //and its size is greater than its mark limit, + //just digest the underlying file. + if (sz > markLimit) { + digestFile(((TikaInputStream)tis).getFile(), m); + return; + } + + //try the usual mark/reset stuff. 
+ //however, if you actually hit the bound, + //then stop and spool to file via TikaInputStream + SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis); + boolean finishedStream = false; + for (DigestAlgorithm algorithm : algorithms) { + bis.mark(markLimit + 1); + finishedStream = digestEach(algorithm, bis, m); + bis.reset(); + if (!finishedStream) { + break; + } + } + if (!finishedStream) { + digestFile(((TikaInputStream)tis).getFile(), m); + } + } + + private void digestFile(File f, Metadata m) throws IOException { + for (DigestAlgorithm algorithm : algorithms) { + try (InputStream is = new FileInputStream(f)) { + digestEach(algorithm, is, m); + } + } + } + + /** + * + * @param algorithm algo to use + * @param is input stream to read from + * @param metadata metadata for reporting the digest + * @return whether or not this finished the input stream + * @throws IOException + */ + private boolean digestEach(DigestAlgorithm algorithm, + InputStream is, Metadata metadata) throws IOException { + String digest = null; + try { + switch (algorithm) { + case MD2: + digest = DigestUtils.md2Hex(is); + break; + case MD5: + digest = DigestUtils.md5Hex(is); + break; + case SHA1: + digest = DigestUtils.sha1Hex(is); + break; + case SHA256: + digest = DigestUtils.sha256Hex(is); + break; + case SHA384: + digest = DigestUtils.sha384Hex(is); + break; + case SHA512: + digest = DigestUtils.sha512Hex(is); + break; + default: + throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString()); + } + } catch (IOException e) { + e.printStackTrace(); + //swallow, or should we throw this? 
+ } + if (is instanceof SimpleBoundedInputStream) { + if (((SimpleBoundedInputStream)is).hasHitBound()) { + return false; + } + } + metadata.set(algorithm.getMetadataKey(), digest); + return true; + } + + /** + * + * @param s comma-delimited (no space) list of algorithms to use: md5,sha256 + * @return + */ + public static DigestAlgorithm[] parse(String s) { + assert(s != null); + + List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>(); + for (String algoString : s.split(",")) { + String uc = algoString.toUpperCase(Locale.ROOT); + if (uc.equals(DigestAlgorithm.MD2.toString())) { + ret.add(DigestAlgorithm.MD2); + } else if (uc.equals(DigestAlgorithm.MD5.toString())) { + ret.add(DigestAlgorithm.MD5); + } else if (uc.equals(DigestAlgorithm.SHA1.toString())) { + ret.add(DigestAlgorithm.SHA1); + } else if (uc.equals(DigestAlgorithm.SHA256.toString())) { + ret.add(DigestAlgorithm.SHA256); + } else if (uc.equals(DigestAlgorithm.SHA384.toString())) { + ret.add(DigestAlgorithm.SHA384); + } else if (uc.equals(DigestAlgorithm.SHA512.toString())) { + ret.add(DigestAlgorithm.SHA512); + } else { + StringBuilder sb = new StringBuilder(); + int i = 0; + for (DigestAlgorithm algo : DigestAlgorithm.values()) { + if (i++ > 0) { + sb.append(", "); + } + sb.append(algo.toString()); + } + throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString()); + } + } + return ret.toArray(new DigestAlgorithm[ret.size()]); + } + + /** + * Very slight modification of Commons' BoundedInputStream + * so that we can figure out if this hit the bound or not. 
+ */ + private class SimpleBoundedInputStream extends InputStream { + private final static int EOF = -1; + private final long max; + private final InputStream in; + private long pos; + boolean hitBound = false; + + private SimpleBoundedInputStream(long max, InputStream in) { + this.max = max; + this.in = in; + } + + @Override + public int read() throws IOException { + if (max >= 0 && pos >= max) { + hitBound = true; + return EOF; + } + final int result = in.read(); + pos++; + return result; + } + + /** + * Invokes the delegate's <code>read(byte[])</code> method. + * @param b the buffer to read the bytes into + * @return the number of bytes read or -1 if the end of stream or + * the limit has been reached. + * @throws IOException if an I/O error occurs + */ + @Override + public int read(final byte[] b) throws IOException { + return this.read(b, 0, b.length); + } + + /** + * Invokes the delegate's <code>read(byte[], int, int)</code> method. + * @param b the buffer to read the bytes into + * @param off The start offset + * @param len The number of bytes to read + * @return the number of bytes read or -1 if the end of stream or + * the limit has been reached. + * @throws IOException if an I/O error occurs + */ + @Override + public int read(final byte[] b, final int off, final int len) throws IOException { + if (max>=0 && pos>=max) { + return EOF; + } + final long maxRead = max>=0 ? Math.min(len, max-pos) : len; + final int bytesRead = in.read(b, off, (int)maxRead); + + if (bytesRead==EOF) { + return EOF; + } + + pos+=bytesRead; + return bytesRead; + } + + /** + * Invokes the delegate's <code>skip(long)</code> method. + * @param n the number of bytes to skip + * @return the actual number of bytes skipped + * @throws IOException if an I/O error occurs + */ + @Override + public long skip(final long n) throws IOException { + final long toSkip = max>=0 ? 
Math.min(n, max-pos) : n; + final long skippedBytes = in.skip(toSkip); + pos+=skippedBytes; + return skippedBytes; + } + + @Override + public void reset() throws IOException { + in.reset(); + } + + @Override + public void mark(int readLimit) { + in.mark(readLimit); + } + + public boolean hasHitBound() { + return hitBound; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 2c6f21f..1edf91c 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -26,6 +26,9 @@ import java.io.IOException; import java.io.InputStream; import java.net.URISyntaxException; import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -74,6 +77,25 @@ public abstract class TikaTest { } } + + /** + * Copies test file from "test-documents" to a temp file. + * Consumers are responsible for deleting the temp file after use. 
+ * + * @param name + * @return + * @throws IOException + */ + public Path getTestDocumentAsTempFile(String name) throws IOException{ + Path tmp = Files.createTempFile("tika-test", ""); + Files.copy(getResourceAsStream("/test-documents/"+name), tmp, StandardCopyOption.REPLACE_EXISTING); + return tmp; + } + + public InputStream getTestDocumentAsStream(String name) { + return TikaInputStream.get(getResourceAsStream("/test-documents/"+name)); + } + public InputStream getResourceAsStream(String name) { InputStream stream = this.getClass().getResourceAsStream(name); if (stream == null) { @@ -106,36 +128,50 @@ public abstract class TikaTest { } } + protected XMLResult getXML(String filePath, Parser parser, Metadata metadata, ParseContext context) throws Exception { + return getXML(getTestDocumentAsStream(filePath), parser, metadata, context); + } + protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata); + return getXML(getTestDocumentAsStream(filePath), parser, metadata); } protected XMLResult getXML(String filePath, Metadata metadata) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata); + Parser parser = new AutoDetectParser(); + ParseContext context = new ParseContext(); + context.set(Parser.class, parser); + + return getXML(getTestDocumentAsStream(filePath), parser, metadata, context); + } + + protected XMLResult getXML(String filePath, Parser parser) throws Exception { + //send in empty parse context so that only outer parser is used + return getXML(getTestDocumentAsStream(filePath), parser, new Metadata(), new ParseContext()); } protected XMLResult getXML(String filePath) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata()); + return getXML(filePath, new Metadata()); } protected XMLResult 
getXML(InputStream input, Parser parser, Metadata metadata) throws Exception { - ParseContext context = new ParseContext(); - context.set(Parser.class, parser); - - try { - ContentHandler handler = new ToXMLContentHandler(); - parser.parse(input, handler, metadata, context); - return new XMLResult(handler.toString(), metadata); - } finally { - input.close(); - } - } + return getXML(input, parser, metadata, new ParseContext()); + } - /** - * Basic text extraction. - * <p> - * Tries to close input stream after processing. - */ + protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception { + try { + ContentHandler handler = new ToXMLContentHandler(); + parser.parse(input, handler, metadata, context); + return new XMLResult(handler.toString(), metadata); + } finally { + input.close(); + } + } + + /** + * Basic text extraction. + * <p> + * Tries to close input stream after processing. + */ public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{ ContentHandler handler = new BodyContentHandler(1000000); try { http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java index c815607..d2f3b40 100644 --- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java +++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java @@ -22,13 +22,13 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeDetectionTest; import 
org.junit.Before; import org.junit.Test; -public class MimeDetectionWithNNTest { +public class MimeDetectionWithNNTest extends TikaTest { private Detector detector; @@ -88,13 +88,13 @@ public class MimeDetectionWithNNTest { private void testUrl(String expected, String url, String file) throws IOException { - InputStream in = MimeDetectionTest.class.getResourceAsStream(file); + InputStream in = getTestDocumentAsStream(file); testStream(expected, url, in); } private void testFile(String expected, String filename) throws IOException { - InputStream in = MimeDetectionTest.class.getResourceAsStream(filename); + InputStream in = getTestDocumentAsStream(filename); testStream(expected, filename, in); } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java index 1f986da..31df3ec 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java @@ -27,12 +27,13 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import org.apache.tika.TikaTest; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.junit.Before; import org.junit.Test; -public class MimeDetectionTest { +public class MimeDetectionTest extends TikaTest { private MimeTypes mimeTypes; @@ -136,12 +137,12 @@ public class MimeDetectionTest { } private void testUrl(String expected, String url, String file) throws IOException{ - InputStream in = getClass().getResourceAsStream(file); + InputStream in = getTestDocumentAsStream(file); testStream(expected, url, in); } private void testFile(String expected, String filename) throws IOException { - InputStream in = 
getClass().getResourceAsStream(filename); + InputStream in = getTestDocumentAsStream(filename); testStream(expected, filename, in); } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java index 35c75b7..415961f 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java @@ -27,11 +27,12 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.junit.Before; import org.junit.Test; -public class ProbabilisticMimeDetectionTest { +public class ProbabilisticMimeDetectionTest extends TikaTest { private ProbabilisticMimeDetectionSelector proDetector; @@ -130,12 +131,12 @@ public class ProbabilisticMimeDetectionTest { private void testUrl(String expected, String url, String file) throws IOException { - InputStream in = getClass().getResourceAsStream(file); + InputStream in = getTestDocumentAsStream(file); testStream(expected, url, in); } private void testFile(String expected, String filename) throws IOException { - InputStream in = getClass().getResourceAsStream(filename); + InputStream in = getTestDocumentAsStream(filename); testStream(expected, filename, in); } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java 
b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java index 5605300..a6dc7f3 100644 --- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java +++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java @@ -29,6 +29,7 @@ import java.io.InputStream; import java.net.URL; import org.apache.tika.Tika; +import org.apache.tika.TikaTest; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.DefaultProbDetector; import org.apache.tika.metadata.Metadata; @@ -36,7 +37,7 @@ import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder; import org.junit.Before; import org.junit.Test; -public class ProbabilisticMimeDetectionTestWithTika { +public class ProbabilisticMimeDetectionTestWithTika extends TikaTest { private ProbabilisticMimeDetectionSelector proSelector; private MediaTypeRegistry registry; @@ -151,12 +152,12 @@ public class ProbabilisticMimeDetectionTestWithTika { private void testUrl(String expected, String url, String file) throws IOException { - InputStream in = getClass().getResourceAsStream(file); + InputStream in = getTestDocumentAsStream(file); testStream(expected, url, in); } private void testFile(String expected, String filename) throws IOException { - InputStream in = getClass().getResourceAsStream(filename); + InputStream in = getTestDocumentAsStream(filename); testStream(expected, filename, in); } http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java b/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java index f3397d9..696d5e6 100644 --- a/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java +++ b/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java @@ -18,27 +18,17 @@ package org.apache.tika.osgi; 
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import static org.junit.Assert.assertNotNull; import static org.ops4j.pax.exam.CoreOptions.bundle; import static org.ops4j.pax.exam.CoreOptions.junitBundles; import static org.ops4j.pax.exam.CoreOptions.options; -import static org.ops4j.pax.exam.CoreOptions.mavenBundle; import javax.inject.Inject; - import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.StringWriter; -import java.io.Writer; import java.net.URISyntaxException; import java.util.Set; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.osgi.TikaService; import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import org.junit.runner.RunWith; import org.ops4j.pax.exam.Configuration; @@ -48,7 +38,6 @@ import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy; import org.ops4j.pax.exam.spi.reactors.PerMethod; import org.osgi.framework.Bundle; import org.osgi.framework.BundleContext; -import org.xml.sax.ContentHandler; @RunWith(PaxExam.class) @ExamReactorStrategy(PerMethod.class) http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb ---------------------------------------------------------------------- diff --git a/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb b/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb deleted file mode 100644 index 0bffdca..0000000 Binary files a/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb and /dev/null differ
