Repository: tika Updated Branches: refs/heads/2.x 73d720a83 -> 249105aa3
TIKA-1851: move all test resources back to src/test from src/main in tika-test-resources. Sorry! Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/249105aa Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/249105aa Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/249105aa Branch: refs/heads/2.x Commit: 249105aa397f962fae8b0ac1980ae7b20ea82b25 Parents: 73d720a Author: tballison <[email protected]> Authored: Fri Feb 5 22:50:56 2016 -0500 Committer: tballison <[email protected]> Committed: Fri Feb 5 22:50:56 2016 -0500 ---------------------------------------------------------------------- tika-batch/pom.xml | 4 +- tika-parser-modules/pom.xml | 3 +- tika-parsers/pom.xml | 4 +- .../tika/config/TikaDetectorConfigTest.java | 1 - .../apache/tika/parser/mock/MockParserTest.java | 251 +++++++++++++ tika-server/pom.xml | 4 +- tika-test-resources/pom.xml | 2 +- .../src/main/java/org/apache/tika/TikaTest.java | 214 ----------- .../tika/config/AbstractTikaConfigTest.java | 50 --- .../org/apache/tika/parser/mock/MockParser.java | 365 ------------------- .../services/org.apache.tika.parser.Parser | 1 - .../src/test/java/org/apache/tika/TikaTest.java | 214 +++++++++++ .../tika/config/AbstractTikaConfigTest.java | 50 +++ .../org/apache/tika/parser/mock/MockParser.java | 365 +++++++++++++++++++ .../apache/tika/parser/mock/MockParserTest.java | 247 ------------- .../services/org.apache.tika.parser.Parser | 1 + 16 files changed, 893 insertions(+), 883 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-batch/pom.xml ---------------------------------------------------------------------- diff --git a/tika-batch/pom.xml b/tika-batch/pom.xml index ffd29b1..bd78cbf 100644 --- a/tika-batch/pom.xml +++ b/tika-batch/pom.xml @@ -81,11 +81,13 @@ <scope>test</scope> </dependency> <dependency> - 
<groupId>${project.groupId}</groupId> + <groupId>org.apache.tika</groupId> <artifactId>tika-test-resources</artifactId> <version>${project.version}</version> + <type>test-jar</type> <scope>test</scope> </dependency> + </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parser-modules/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml index 724f0f9..8e71c1b 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -61,9 +61,10 @@ <dependencies> <!-- Test dependencies --> <dependency> - <groupId>${project.groupId}</groupId> + <groupId>org.apache.tika</groupId> <artifactId>tika-test-resources</artifactId> <version>${project.version}</version> + <type>test-jar</type> <scope>test</scope> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 76a78ac..396902a 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -154,11 +154,13 @@ <scope>test</scope> </dependency> <dependency> - <groupId>${project.groupId}</groupId> + <groupId>org.apache.tika</groupId> <artifactId>tika-test-resources</artifactId> <version>${project.version}</version> + <type>test-jar</type> <scope>test</scope> </dependency> + </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java index 949107c..2125888 100644 --- a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java +++ 
b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java @@ -30,7 +30,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.mbox.OutlookPSTParser; import org.apache.tika.parser.microsoft.POIFSContainerDetector; import org.apache.tika.parser.pkg.ZipContainerDetector; -import org.junit.Ignore; import org.junit.Test; /** http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java new file mode 100644 index 0000000..d222e68 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java @@ -0,0 +1,251 @@ +package org.apache.tika.parser.mock; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintStream; +import java.util.Date; + +import org.apache.tika.TikaTest; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +/** + * Somewhat bizarrely, we can't put the test of this test resource in tika-test-resources + * or else it will be called by every module that uses it. Um, Yossarian!!! + */ +public class MockParserTest extends TikaTest { + private final static String M = "/test-documents/mock/"; + private final static Parser PARSER = new AutoDetectParser(); + + @Override + public XMLResult getXML(String path, Metadata m) throws Exception { + //note that this is specific to MockParserTest with addition of M to the path! 
+ InputStream is = getResourceAsStream(M+path); + try { + return super.getXML(is, PARSER, m); + } finally { + IOUtils.closeQuietly(is); + } + } + + @Test + public void testExample() throws Exception { + Metadata m = new Metadata(); + PrintStream out = System.out; + PrintStream err = System.err; + ByteArrayOutputStream outBos = new ByteArrayOutputStream(); + ByteArrayOutputStream errBos = new ByteArrayOutputStream(); + PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString()); + PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString()); + System.setOut(tmpOut); + System.setErr(tmpErr); + try { + assertThrowable("example.xml", m, IOException.class, "not another IOException"); + assertMockParser(m); + } finally { + System.setOut(out); + System.setErr(err); + } + String outString = new String(outBos.toByteArray(), UTF_8); + assertContains("writing to System.out", outString); + + String errString = new String(errBos.toByteArray(), UTF_8); + assertContains("writing to System.err", errString); + + } + + @Test + public void testNothingBad() throws Exception { + Metadata m = new Metadata(); + String content = getXML("nothing_bad.xml", m).xml; + assertEquals("Geoffrey Chaucer", m.get("author")); + assertContains("<p>And bathed every veyne in swich licour,</p>", content); + assertMockParser(m); + } + + @Test + public void testNullPointer() throws Exception { + Metadata m = new Metadata(); + assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception"); + assertMockParser(m); + } + + @Test + public void testNullPointerNoMsg() throws Exception { + Metadata m = new Metadata(); + assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null); + assertMockParser(m); + } + + + @Test + public void testSleep() throws Exception { + long start = new Date().getTime(); + Metadata m = new Metadata(); + String content = getXML("sleep.xml", m).xml; + assertMockParser(m); + long elapsed = new 
Date().getTime()-start; + //should sleep for at least 3000 + boolean enoughTimeHasElapsed = elapsed > 2000; + assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed); + assertMockParser(m); + } + + @Test + public void testHeavyHang() throws Exception { + long start = new Date().getTime(); + Metadata m = new Metadata(); + + String content = getXML("heavy_hang.xml", m).xml; + assertMockParser(m); + long elapsed = new Date().getTime()-start; + //should sleep for at least 3000 + boolean enoughTimeHasElapsed = elapsed > 2000; + assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed); + assertMockParser(m); + } + + @Test + public void testFakeOOM() throws Exception { + Metadata m = new Metadata(); + assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom"); + assertMockParser(m); + } + + @Test + public void testRealOOM() throws Exception { + //Note: we're not actually testing the diff between fake and real oom + //i.e. by creating child process and setting different -Xmx or + //memory profiling. + Metadata m = new Metadata(); + assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space"); + assertMockParser(m); + } + + @Test + public void testInterruptibleSleep() { + //Without static initialization of the parser, it can take ~1 second after t.start() + //before the parser actually calls parse. This is + //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc. + //This is not thread creation overhead. 
+ ParserRunnable r = new ParserRunnable("sleep_interruptible.xml"); + Thread t = new Thread(r); + t.start(); + long start = new Date().getTime(); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + //swallow + } + + t.interrupt(); + + try { + t.join(10000); + } catch (InterruptedException e) { + //swallow + } + long elapsed = new Date().getTime()-start; + boolean shortEnough = elapsed < 2000;//the xml file specifies 3000 + assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough); + } + + @Test + public void testNonInterruptibleSleep() { + ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml"); + Thread t = new Thread(r); + t.start(); + long start = new Date().getTime(); + try { + //make sure that the thread has actually started + Thread.sleep(1000); + } catch (InterruptedException e) { + //swallow + } + t.interrupt(); + try { + t.join(20000); + } catch (InterruptedException e) { + //swallow + } + long elapsed = new Date().getTime()-start; + boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000 + assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough); + } + + private class ParserRunnable implements Runnable { + private final String path; + ParserRunnable(String path) { + this.path = path; + } + @Override + public void run() { + Metadata m = new Metadata(); + try { + getXML(path, m); + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + assertMockParser(m); + } + } + } + + private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) { + + try { + getXML(path, m); + } catch (Throwable t) { + //if this is a throwable wrapped in a TikaException, use the cause + if (t instanceof TikaException && t.getCause() != null) { + t = t.getCause(); + } + if (! 
(t.getClass().isAssignableFrom(expected))){ + fail(t.getClass() +" is not assignable from "+expected); + } + if (message != null) { + assertEquals(message, t.getMessage()); + } + } + } + + private void assertMockParser(Metadata m) { + String[] parsers = m.getValues("X-Parsed-By"); + //make sure that it was actually parsed by mock. + boolean parsedByMock = false; + for (String parser : parsers) { + if (parser.equals("org.apache.tika.parser.mock.MockParser")) { + parsedByMock = true; + break; + } + } + assertTrue("mock parser should have been called", parsedByMock); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-server/pom.xml ---------------------------------------------------------------------- diff --git a/tika-server/pom.xml b/tika-server/pom.xml index 4634068..958cd74 100644 --- a/tika-server/pom.xml +++ b/tika-server/pom.xml @@ -120,11 +120,13 @@ <scope>test</scope> </dependency> <dependency> - <groupId>${project.groupId}</groupId> + <groupId>org.apache.tika</groupId> <artifactId>tika-test-resources</artifactId> <version>${project.version}</version> + <type>test-jar</type> <scope>test</scope> </dependency> + <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/pom.xml ---------------------------------------------------------------------- diff --git a/tika-test-resources/pom.xml b/tika-test-resources/pom.xml index 7574e0c..5df07f4 100644 --- a/tika-test-resources/pom.xml +++ b/tika-test-resources/pom.xml @@ -78,7 +78,7 @@ <executions> <execution> <goals> - <goal>jar</goal> + <goal>test-jar</goal> </goals> </execution> </executions> http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java 
b/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java deleted file mode 100644 index 2c6f21f..0000000 --- a/tika-test-resources/src/main/java/org/apache/tika/TikaTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.net.URISyntaxException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.apache.tika.extractor.EmbeddedResourceHandler; -import org.apache.tika.io.IOUtils; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.ToXMLContentHandler; -import org.xml.sax.ContentHandler; - -/** - * Parent class of Tika tests - */ -public abstract class TikaTest { - /** - * This method will give you back the filename incl. the absolute path name - * to the resource. If the resource does not exist it will give you back the - * resource name incl. the path. - * - * @param name - * The named resource to search for. - * @return an absolute path incl. the name which is in the same directory as - * the the class you've called it from. 
- */ - public File getResourceAsFile(String name) throws URISyntaxException { - URL url = this.getClass().getResource(name); - if (url != null) { - return new File(url.toURI()); - } else { - // We have a file which does not exists - // We got the path - url = this.getClass().getResource("."); - File file = new File(new File(url.toURI()), name); - if (file == null) { - fail("Unable to find requested file " + name); - } - return file; - } - } - - public InputStream getResourceAsStream(String name) { - InputStream stream = this.getClass().getResourceAsStream(name); - if (stream == null) { - fail("Unable to find requested resource " + name); - } - return stream; - } - - public static void assertContains(String needle, String haystack) { - assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); - } - public static <T> void assertContains(T needle, Collection<? extends T> haystack) { - assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); - } - - public static void assertNotContained(String needle, String haystack) { - assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle)); - } - public static <T> void assertNotContained(T needle, Collection<? 
extends T> haystack) { - assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle)); - } - - protected static class XMLResult { - public final String xml; - public final Metadata metadata; - - public XMLResult(String xml, Metadata metadata) { - this.xml = xml; - this.metadata = metadata; - } - } - - protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata); - } - - protected XMLResult getXML(String filePath, Metadata metadata) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata); - } - - protected XMLResult getXML(String filePath) throws Exception { - return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata()); - } - - protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception { - ParseContext context = new ParseContext(); - context.set(Parser.class, parser); - - try { - ContentHandler handler = new ToXMLContentHandler(); - parser.parse(input, handler, metadata, context); - return new XMLResult(handler.toString(), metadata); - } finally { - input.close(); - } - } - - /** - * Basic text extraction. - * <p> - * Tries to close input stream after processing. 
- */ - public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{ - ContentHandler handler = new BodyContentHandler(1000000); - try { - parser.parse(is, handler, metadata, context); - } finally { - is.close(); - } - return handler.toString(); - } - - public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{ - return getText(is, parser, new ParseContext(), metadata); - } - - public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{ - return getText(is, parser, context, new Metadata()); - } - - public String getText(InputStream is, Parser parser) throws Exception{ - return getText(is, parser, new ParseContext(), new Metadata()); - } - - /** - * Keeps track of media types and file names recursively. - * - */ - public static class TrackingHandler implements EmbeddedResourceHandler { - public List<String> filenames = new ArrayList<String>(); - public List<MediaType> mediaTypes = new ArrayList<MediaType>(); - - private final Set<MediaType> skipTypes; - - public TrackingHandler() { - skipTypes = new HashSet<MediaType>(); - } - - public TrackingHandler(Set<MediaType> skipTypes) { - this.skipTypes = skipTypes; - } - - @Override - public void handle(String filename, MediaType mediaType, - InputStream stream) { - if (skipTypes.contains(mediaType)) { - return; - } - mediaTypes.add(mediaType); - filenames.add(filename); - } - } - - /** - * Copies byte[] of embedded documents into a List. - */ - public static class ByteCopyingHandler implements EmbeddedResourceHandler { - - public List<byte[]> bytes = new ArrayList<byte[]>(); - - @Override - public void handle(String filename, MediaType mediaType, - InputStream stream) { - ByteArrayOutputStream os = new ByteArrayOutputStream(); - if (! 
stream.markSupported()) { - stream = TikaInputStream.get(stream); - } - stream.mark(0); - try { - IOUtils.copy(stream, os); - bytes.add(os.toByteArray()); - stream.reset(); - } catch (IOException e) { - //swallow - } - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java deleted file mode 100644 index 1b104f7..0000000 --- a/tika-test-resources/src/main/java/org/apache/tika/config/AbstractTikaConfigTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.config; - -import static org.junit.Assert.assertNotNull; - -import java.net.URL; - -import org.apache.tika.TikaTest; -import org.apache.tika.parser.ParseContext; -import org.junit.After; - -/** - * Parent of Junit test classes for {@link TikaConfig}, including - * Tika Core based ones, and ones in Tika Parsers that do things - * that tika-core's can't, do due to a need for the - * full set of "real" classes of parsers / detectors - */ -public abstract class AbstractTikaConfigTest extends TikaTest { - protected static ParseContext context = new ParseContext(); - - protected static String getConfigPath(String config) throws Exception { - URL url = TikaConfig.class.getResource(config); - assertNotNull("Test Tika Config not found: " + config, url); - return url.toExternalForm(); - } - protected static TikaConfig getConfig(String config) throws Exception { - System.setProperty("tika.config", getConfigPath(config)); - return new TikaConfig(); - } - - @After - public void resetConfig() { - System.clearProperty("tika.config"); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java b/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java deleted file mode 100644 index a920502..0000000 --- a/tika-test-resources/src/main/java/org/apache/tika/parser/mock/MockParser.java +++ /dev/null @@ -1,365 +0,0 @@ -package org.apache.tika.parser.mock; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import static java.nio.charset.StandardCharsets.UTF_8; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.lang.reflect.Constructor; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.EmbeddedContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; -import org.w3c.dom.Document; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -/** - * This class enables mocking of parser behavior for use in testing - * wrappers and drivers of parsers. 
- * <p> - * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation - * of all the options for this MockParser. - * <p> - * Tests for this class are in tika-parsers. - * <p> - * See also {@link org.apache.tika.parser.DummyParser} for another option. - */ - -public class MockParser extends AbstractParser { - - private static final long serialVersionUID = 1L; - - @Override - public Set<MediaType> getSupportedTypes(ParseContext context) { - Set<MediaType> types = new HashSet<MediaType>(); - MediaType type = MediaType.application("mock+xml"); - types.add(type); - return types; - } - - @Override - public void parse(InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) throws IOException, - SAXException, TikaException { - Document doc = null; - DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder = null; - try { - docBuilder = fact.newDocumentBuilder(); - doc = docBuilder.parse(stream); - } catch (ParserConfigurationException e) { - throw new IOException(e); - } catch (SAXException e) { - throw new IOException(e); - } - Node root = doc.getDocumentElement(); - NodeList actions = root.getChildNodes(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - for (int i = 0; i < actions.getLength(); i++) { - executeAction(actions.item(i), metadata, context, xhtml); - } - xhtml.endDocument(); - } - - private void executeAction(Node action, Metadata metadata, ParseContext context, - XHTMLContentHandler xhtml) throws SAXException, - IOException, TikaException { - - if (action.getNodeType() != 1) { - return; - } - - String name = action.getNodeName(); - if ("metadata".equals(name)) { - metadata(action, metadata); - } else if("write".equals(name)) { - write(action, xhtml); - } else if ("throw".equals(name)) { - throwIt(action); - } else if ("hang".equals(name)) { - hang(action); - } else if ("oom".equals(name)) { - kabOOM(); - } 
else if ("print_out".equals(name) || "print_err".equals(name)){ - print(action, name); - } else if ("embedded".equals(name)) { - handleEmbedded(action, xhtml, context); - } else if ("throwIllegalChars".equals(name)) { - throwIllegalChars(); - } else { - throw new IllegalArgumentException("Didn't recognize mock action: "+name); - } - } - - private void throwIllegalChars() throws IOException { - throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003"); - } - - private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context) - throws TikaException, SAXException, IOException { - String fileName = ""; - String contentType = ""; - NamedNodeMap attrs = action.getAttributes(); - if (attrs != null) { - Node n = attrs.getNamedItem("filename"); - if (n != null) { - fileName = n.getNodeValue(); - } - n = attrs.getNamedItem("content-type"); - if (n != null) { - contentType = n.getNodeValue(); - } - } - - String embeddedText = action.getTextContent(); - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context); - Metadata m = new Metadata(); - m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName); - if (! 
"".equals(contentType)) { - m.set(Metadata.CONTENT_TYPE, contentType); - } - InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8)); - - extractor.parseEmbedded( - is, - new EmbeddedContentHandler(handler), - m, true); - - - } - - protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { - EmbeddedDocumentExtractor extractor = - context.get(EmbeddedDocumentExtractor.class); - if (extractor == null) { - Parser p = context.get(Parser.class); - if (p == null) { - context.set(Parser.class, new MockParser()); - } - extractor = new ParsingEmbeddedDocumentExtractor(context); - } - return extractor; - } - - private void print(Node action, String name) { - String content = action.getTextContent(); - if ("print_out".equals(name)) { - System.out.println(content); - } else if ("print_err".equals(name)) { - System.err.println(content); - } else { - throw new IllegalArgumentException("must be print_out or print_err"); - } - } - private void hang(Node action) { - boolean interruptible = true; - boolean heavy = false; - long millis = -1; - long pulseMillis = -1; - NamedNodeMap attrs = action.getAttributes(); - Node iNode = attrs.getNamedItem("interruptible"); - if (iNode != null) { - interruptible = ("true".equals(iNode.getNodeValue())); - } - Node hNode = attrs.getNamedItem("heavy"); - if (hNode != null) { - heavy = ("true".equals(hNode.getNodeValue())); - } - - Node mNode = attrs.getNamedItem("millis"); - if (mNode == null) { - throw new RuntimeException("Must specify \"millis\" attribute for hang."); - } - String millisString = mNode.getNodeValue(); - try { - millis = Long.parseLong(millisString); - } catch (NumberFormatException e) { - throw new RuntimeException("Value for \"millis\" attribute must be a long."); - } - - if (heavy) { - Node pNode = attrs.getNamedItem("pulse_millis"); - if (pNode == null) { - throw new RuntimeException("Must specify attribute \"pulse_millis\" if the hang is \"heavy\""); - } - String 
pulseMillisString = mNode.getNodeValue(); - try { - pulseMillis = Long.parseLong(pulseMillisString); - } catch (NumberFormatException e) { - throw new RuntimeException("Value for \"millis\" attribute must be a long."); - } - } - if (heavy) { - hangHeavy(millis, pulseMillis, interruptible); - } else { - sleep(millis, interruptible); - } - } - - private void throwIt(Node action) throws IOException, - SAXException, TikaException { - NamedNodeMap attrs = action.getAttributes(); - String className = attrs.getNamedItem("class").getNodeValue(); - String msg = action.getTextContent(); - throwIt(className, msg); - } - - private void metadata(Node action, Metadata metadata) { - NamedNodeMap attrs = action.getAttributes(); - //throws npe unless there is a name - String name = attrs.getNamedItem("name").getNodeValue(); - String value = action.getTextContent(); - Node actionType = attrs.getNamedItem("action"); - if (actionType == null) { - metadata.add(name, value); - } else { - if ("set".equals(actionType.getNodeValue())) { - metadata.set(name, value); - } else { - metadata.add(name, value); - } - } - } - - private void write(Node action, XHTMLContentHandler xhtml) throws SAXException { - NamedNodeMap attrs = action.getAttributes(); - Node eNode = attrs.getNamedItem("element"); - String elementType = "p"; - if (eNode != null) { - elementType = eNode.getTextContent(); - } - String text = action.getTextContent(); - xhtml.startElement(elementType); - xhtml.characters(text); - xhtml.endElement(elementType); - } - - - private void throwIt(String className, String msg) throws IOException, - SAXException, TikaException { - Throwable t = null; - if (msg == null || msg.equals("")) { - try { - t = (Throwable) Class.forName(className).newInstance(); - } catch (Exception e) { - throw new RuntimeException("couldn't create throwable class:"+className, e); - } - } else { - try { - Class<?> clazz = Class.forName(className); - Constructor<?> con = clazz.getConstructor(String.class); - t = 
(Throwable) con.newInstance(msg); - } catch (Exception e) { - throw new RuntimeException("couldn't create throwable class:" + className, e); - } - } - if (t instanceof SAXException) { - throw (SAXException)t; - } else if (t instanceof IOException) { - throw (IOException) t; - } else if (t instanceof TikaException) { - throw (TikaException) t; - } else if (t instanceof Error) { - throw (Error) t; - } else if (t instanceof RuntimeException) { - throw (RuntimeException) t; - } else { - //wrap the throwable in a RuntimeException - throw new RuntimeException(t); - } - } - - private void kabOOM() { - List<int[]> ints = new ArrayList<int[]>(); - - while (true) { - int[] intArr = new int[32000]; - ints.add(intArr); - } - } - - private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) { - //do some heavy computation and occasionally check for - //whether time has exceeded maxMillis (see TIKA-1132 for inspiration) - //or whether the thread was interrupted - long start = new Date().getTime(); - int lastChecked = 0; - while (true) { - for (int i = 1; i < Integer.MAX_VALUE; i++) { - for (int j = 1; j < Integer.MAX_VALUE; j++) { - double div = (double) i / (double) j; - lastChecked++; - if (lastChecked > pulseCheckMillis) { - lastChecked = 0; - if (interruptible && Thread.currentThread().isInterrupted()) { - return; - } - long elapsed = new Date().getTime()-start; - if (elapsed > maxMillis) { - return; - } - } - } - } - } - } - - private void sleep(long maxMillis, boolean isInterruptible) { - long start = new Date().getTime(); - long millisRemaining = maxMillis; - while (true) { - try { - Thread.sleep(millisRemaining); - } catch (InterruptedException e) { - if (isInterruptible) { - return; - } - } - long elapsed = new Date().getTime()-start; - millisRemaining = maxMillis - elapsed; - if (millisRemaining <= 0) { - break; - } - } - } - -} 
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser deleted file mode 100644 index 69bfdeb..0000000 --- a/tika-test-resources/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ /dev/null @@ -1 +0,0 @@ -org.apache.tika.parser.mock.MockParser \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java b/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java new file mode 100644 index 0000000..2c6f21f --- /dev/null +++ b/tika-test-resources/src/test/java/org/apache/tika/TikaTest.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.tika.extractor.EmbeddedResourceHandler; +import org.apache.tika.io.IOUtils; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ToXMLContentHandler; +import org.xml.sax.ContentHandler; + +/** + * Parent class of Tika tests + */ +public abstract class TikaTest { + /** + * This method will give you back the filename incl. the absolute path name + * to the resource. If the resource does not exist it will give you back the + * resource name incl. the path. + * + * @param name + * The named resource to search for. + * @return an absolute path incl. the name which is in the same directory as + * the class you've called it from. 
+ */ + public File getResourceAsFile(String name) throws URISyntaxException { + URL url = this.getClass().getResource(name); + if (url != null) { + return new File(url.toURI()); + } else { + // The named resource does not exist, so build its path + // relative to the directory of the calling class + url = this.getClass().getResource("."); + if (url == null) { + fail("Unable to find requested file " + name); + } + File file = new File(new File(url.toURI()), name); + return file; + } + } + + public InputStream getResourceAsStream(String name) { + InputStream stream = this.getClass().getResourceAsStream(name); + if (stream == null) { + fail("Unable to find requested resource " + name); + } + return stream; + } + + public static void assertContains(String needle, String haystack) { + assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); + } + public static <T> void assertContains(T needle, Collection<? extends T> haystack) { + assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); + } + + public static void assertNotContained(String needle, String haystack) { + assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle)); + } + public static <T> void assertNotContained(T needle, Collection<? 
extends T> haystack) { + assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle)); + } + + protected static class XMLResult { + public final String xml; + public final Metadata metadata; + + public XMLResult(String xml, Metadata metadata) { + this.xml = xml; + this.metadata = metadata; + } + } + + protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception { + return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata); + } + + protected XMLResult getXML(String filePath, Metadata metadata) throws Exception { + return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata); + } + + protected XMLResult getXML(String filePath) throws Exception { + return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata()); + } + + protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception { + ParseContext context = new ParseContext(); + context.set(Parser.class, parser); + + try { + ContentHandler handler = new ToXMLContentHandler(); + parser.parse(input, handler, metadata, context); + return new XMLResult(handler.toString(), metadata); + } finally { + input.close(); + } + } + + /** + * Basic text extraction. + * <p> + * Tries to close input stream after processing. 
+ */ + public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{ + ContentHandler handler = new BodyContentHandler(1000000); + try { + parser.parse(is, handler, metadata, context); + } finally { + is.close(); + } + return handler.toString(); + } + + public String getText(InputStream is, Parser parser, Metadata metadata) throws Exception{ + return getText(is, parser, new ParseContext(), metadata); + } + + public String getText(InputStream is, Parser parser, ParseContext context) throws Exception{ + return getText(is, parser, context, new Metadata()); + } + + public String getText(InputStream is, Parser parser) throws Exception{ + return getText(is, parser, new ParseContext(), new Metadata()); + } + + /** + * Keeps track of media types and file names recursively. + * + */ + public static class TrackingHandler implements EmbeddedResourceHandler { + public List<String> filenames = new ArrayList<String>(); + public List<MediaType> mediaTypes = new ArrayList<MediaType>(); + + private final Set<MediaType> skipTypes; + + public TrackingHandler() { + skipTypes = new HashSet<MediaType>(); + } + + public TrackingHandler(Set<MediaType> skipTypes) { + this.skipTypes = skipTypes; + } + + @Override + public void handle(String filename, MediaType mediaType, + InputStream stream) { + if (skipTypes.contains(mediaType)) { + return; + } + mediaTypes.add(mediaType); + filenames.add(filename); + } + } + + /** + * Copies byte[] of embedded documents into a List. + */ + public static class ByteCopyingHandler implements EmbeddedResourceHandler { + + public List<byte[]> bytes = new ArrayList<byte[]>(); + + @Override + public void handle(String filename, MediaType mediaType, + InputStream stream) { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + if (! 
stream.markSupported()) { + stream = TikaInputStream.get(stream); + } + stream.mark(0); + try { + IOUtils.copy(stream, os); + bytes.add(os.toByteArray()); + stream.reset(); + } catch (IOException e) { + //deliberately swallowed; a failure to copy or reset the stream is ignored + } + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java new file mode 100644 index 0000000..1b104f7 --- /dev/null +++ b/tika-test-resources/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.config; + +import static org.junit.Assert.assertNotNull; + +import java.net.URL; + +import org.apache.tika.TikaTest; +import org.apache.tika.parser.ParseContext; +import org.junit.After; + +/** + * Parent of JUnit test classes for {@link TikaConfig}, including + * Tika Core based ones, and ones in Tika Parsers that do things + * that tika-core's can't do, due to a need for the + * full set of "real" classes of parsers / detectors + */ +public abstract class AbstractTikaConfigTest extends TikaTest { + protected static ParseContext context = new ParseContext(); + + protected static String getConfigPath(String config) throws Exception { + URL url = TikaConfig.class.getResource(config); + assertNotNull("Test Tika Config not found: " + config, url); + return url.toExternalForm(); + } + protected static TikaConfig getConfig(String config) throws Exception { + System.setProperty("tika.config", getConfigPath(config)); + return new TikaConfig(); + } + + @After + public void resetConfig() { + System.clearProperty("tika.config"); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java new file mode 100644 index 0000000..a920502 --- /dev/null +++ b/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParser.java @@ -0,0 +1,365 @@ +package org.apache.tika.parser.mock; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import static java.nio.charset.StandardCharsets.UTF_8; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.lang.reflect.Constructor; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * This class enables mocking of parser behavior for use in testing + * wrappers and drivers of parsers. 
+ * <p> + * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation + * of all the options for this MockParser. + * <p> + * Tests for this class are in tika-parsers. + * <p> + * See also {@link org.apache.tika.parser.DummyParser} for another option. + */ + +public class MockParser extends AbstractParser { + + private static final long serialVersionUID = 1L; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + Set<MediaType> types = new HashSet<MediaType>(); + MediaType type = MediaType.application("mock+xml"); + types.add(type); + return types; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) throws IOException, + SAXException, TikaException { + Document doc = null; + DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = null; + try { + docBuilder = fact.newDocumentBuilder(); + doc = docBuilder.parse(stream); + } catch (ParserConfigurationException e) { + throw new IOException(e); + } catch (SAXException e) { + throw new IOException(e); + } + Node root = doc.getDocumentElement(); + NodeList actions = root.getChildNodes(); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + for (int i = 0; i < actions.getLength(); i++) { + executeAction(actions.item(i), metadata, context, xhtml); + } + xhtml.endDocument(); + } + + private void executeAction(Node action, Metadata metadata, ParseContext context, + XHTMLContentHandler xhtml) throws SAXException, + IOException, TikaException { + + if (action.getNodeType() != Node.ELEMENT_NODE) { + return; + } + + String name = action.getNodeName(); + if ("metadata".equals(name)) { + metadata(action, metadata); + } else if("write".equals(name)) { + write(action, xhtml); + } else if ("throw".equals(name)) { + throwIt(action); + } else if ("hang".equals(name)) { + hang(action); + } else if ("oom".equals(name)) { + kabOOM(); + } 
else if ("print_out".equals(name) || "print_err".equals(name)){ + print(action, name); + } else if ("embedded".equals(name)) { + handleEmbedded(action, xhtml, context); + } else if ("throwIllegalChars".equals(name)) { + throwIllegalChars(); + } else { + throw new IllegalArgumentException("Didn't recognize mock action: "+name); + } + } + + private void throwIllegalChars() throws IOException { + throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003"); + } + + private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context) + throws TikaException, SAXException, IOException { + String fileName = ""; + String contentType = ""; + NamedNodeMap attrs = action.getAttributes(); + if (attrs != null) { + Node n = attrs.getNamedItem("filename"); + if (n != null) { + fileName = n.getNodeValue(); + } + n = attrs.getNamedItem("content-type"); + if (n != null) { + contentType = n.getNodeValue(); + } + } + + String embeddedText = action.getTextContent(); + EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context); + Metadata m = new Metadata(); + m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName); + if (! 
"".equals(contentType)) { + m.set(Metadata.CONTENT_TYPE, contentType); + } + InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8)); + + extractor.parseEmbedded( + is, + new EmbeddedContentHandler(handler), + m, true); + + + } + + protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { + EmbeddedDocumentExtractor extractor = + context.get(EmbeddedDocumentExtractor.class); + if (extractor == null) { + Parser p = context.get(Parser.class); + if (p == null) { + context.set(Parser.class, new MockParser()); + } + extractor = new ParsingEmbeddedDocumentExtractor(context); + } + return extractor; + } + + private void print(Node action, String name) { + String content = action.getTextContent(); + if ("print_out".equals(name)) { + System.out.println(content); + } else if ("print_err".equals(name)) { + System.err.println(content); + } else { + throw new IllegalArgumentException("must be print_out or print_err"); + } + } + private void hang(Node action) { + boolean interruptible = true; + boolean heavy = false; + long millis = -1; + long pulseMillis = -1; + NamedNodeMap attrs = action.getAttributes(); + Node iNode = attrs.getNamedItem("interruptible"); + if (iNode != null) { + interruptible = ("true".equals(iNode.getNodeValue())); + } + Node hNode = attrs.getNamedItem("heavy"); + if (hNode != null) { + heavy = ("true".equals(hNode.getNodeValue())); + } + + Node mNode = attrs.getNamedItem("millis"); + if (mNode == null) { + throw new RuntimeException("Must specify \"millis\" attribute for hang."); + } + String millisString = mNode.getNodeValue(); + try { + millis = Long.parseLong(millisString); + } catch (NumberFormatException e) { + throw new RuntimeException("Value for \"millis\" attribute must be a long."); + } + + if (heavy) { + Node pNode = attrs.getNamedItem("pulse_millis"); + if (pNode == null) { + throw new RuntimeException("Must specify attribute \"pulse_millis\" if the hang is \"heavy\""); + } + String 
pulseMillisString = pNode.getNodeValue(); /* pulse_millis: how many iterations hangHeavy runs between its interrupt/timeout checks */ + try { + pulseMillis = Long.parseLong(pulseMillisString); + } catch (NumberFormatException e) { + throw new RuntimeException("Value for \"pulse_millis\" attribute must be a long."); + } + } + if (heavy) { + hangHeavy(millis, pulseMillis, interruptible); + } else { + sleep(millis, interruptible); + } + } + + private void throwIt(Node action) throws IOException, + SAXException, TikaException { + NamedNodeMap attrs = action.getAttributes(); + String className = attrs.getNamedItem("class").getNodeValue(); + String msg = action.getTextContent(); + throwIt(className, msg); + } + + private void metadata(Node action, Metadata metadata) { + NamedNodeMap attrs = action.getAttributes(); + //throws npe unless there is a name + String name = attrs.getNamedItem("name").getNodeValue(); + String value = action.getTextContent(); + Node actionType = attrs.getNamedItem("action"); + if (actionType == null) { + metadata.add(name, value); + } else { + if ("set".equals(actionType.getNodeValue())) { + metadata.set(name, value); + } else { + metadata.add(name, value); + } + } + } + + private void write(Node action, XHTMLContentHandler xhtml) throws SAXException { + NamedNodeMap attrs = action.getAttributes(); + Node eNode = attrs.getNamedItem("element"); + String elementType = "p"; + if (eNode != null) { + elementType = eNode.getTextContent(); + } + String text = action.getTextContent(); + xhtml.startElement(elementType); + xhtml.characters(text); + xhtml.endElement(elementType); + } + + + private void throwIt(String className, String msg) throws IOException, + SAXException, TikaException { + Throwable t = null; + if (msg == null || msg.equals("")) { + try { + t = (Throwable) Class.forName(className).newInstance(); + } catch (Exception e) { + throw new RuntimeException("couldn't create throwable class:"+className, e); + } + } else { + try { + Class<?> clazz = Class.forName(className); + Constructor<?> con = clazz.getConstructor(String.class); + t = 
(Throwable) con.newInstance(msg); + } catch (Exception e) { + throw new RuntimeException("couldn't create throwable class:" + className, e); + } + } + if (t instanceof SAXException) { + throw (SAXException)t; + } else if (t instanceof IOException) { + throw (IOException) t; + } else if (t instanceof TikaException) { + throw (TikaException) t; + } else if (t instanceof Error) { + throw (Error) t; + } else if (t instanceof RuntimeException) { + throw (RuntimeException) t; + } else { + //wrap the throwable in a RuntimeException + throw new RuntimeException(t); + } + } + + private void kabOOM() { + List<int[]> ints = new ArrayList<int[]>(); + + while (true) { + int[] intArr = new int[32000]; + ints.add(intArr); + } + } + + private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) { + //do some heavy computation and occasionally check for + //whether time has exceeded maxMillis (see TIKA-1132 for inspiration) + //or whether the thread was interrupted + long start = new Date().getTime(); + int lastChecked = 0; + while (true) { + for (int i = 1; i < Integer.MAX_VALUE; i++) { + for (int j = 1; j < Integer.MAX_VALUE; j++) { + double div = (double) i / (double) j; + lastChecked++; + if (lastChecked > pulseCheckMillis) { + lastChecked = 0; + if (interruptible && Thread.currentThread().isInterrupted()) { + return; + } + long elapsed = new Date().getTime()-start; + if (elapsed > maxMillis) { + return; + } + } + } + } + } + } + + private void sleep(long maxMillis, boolean isInterruptible) { + long start = new Date().getTime(); + long millisRemaining = maxMillis; + while (true) { + try { + Thread.sleep(millisRemaining); + } catch (InterruptedException e) { + if (isInterruptible) { + return; + } + } + long elapsed = new Date().getTime()-start; + millisRemaining = maxMillis - elapsed; + if (millisRemaining <= 0) { + break; + } + } + } + +} 
http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java deleted file mode 100644 index 29fa3af..0000000 --- a/tika-test-resources/src/test/java/org/apache/tika/parser/mock/MockParserTest.java +++ /dev/null @@ -1,247 +0,0 @@ -package org.apache.tika.parser.mock; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.PrintStream; -import java.util.Date; - -import org.apache.tika.TikaTest; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.IOUtils; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.Parser; -import org.junit.Test; - -public class MockParserTest extends TikaTest { - private final static String M = "/test-documents/mock/"; - private final static Parser PARSER = new AutoDetectParser(); - - @Override - public XMLResult getXML(String path, Metadata m) throws Exception { - //note that this is specific to MockParserTest with addition of M to the path! - InputStream is = getResourceAsStream(M+path); - try { - return super.getXML(is, PARSER, m); - } finally { - IOUtils.closeQuietly(is); - } - } - - @Test - public void testExample() throws Exception { - Metadata m = new Metadata(); - PrintStream out = System.out; - PrintStream err = System.err; - ByteArrayOutputStream outBos = new ByteArrayOutputStream(); - ByteArrayOutputStream errBos = new ByteArrayOutputStream(); - PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString()); - PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString()); - System.setOut(tmpOut); - System.setErr(tmpErr); - try { - assertThrowable("example.xml", m, IOException.class, "not another IOException"); - assertMockParser(m); - } finally { - System.setOut(out); - System.setErr(err); - } - String outString = new String(outBos.toByteArray(), UTF_8); - assertContains("writing to System.out", outString); - - String errString = new String(errBos.toByteArray(), UTF_8); - assertContains("writing to System.err", errString); - - } - - @Test - 
public void testNothingBad() throws Exception { - Metadata m = new Metadata(); - String content = getXML("nothing_bad.xml", m).xml; - assertEquals("Geoffrey Chaucer", m.get("author")); - assertContains("<p>And bathed every veyne in swich licour,</p>", content); - assertMockParser(m); - } - - @Test - public void testNullPointer() throws Exception { - Metadata m = new Metadata(); - assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception"); - assertMockParser(m); - } - - @Test - public void testNullPointerNoMsg() throws Exception { - Metadata m = new Metadata(); - assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null); - assertMockParser(m); - } - - - @Test - public void testSleep() throws Exception { - long start = new Date().getTime(); - Metadata m = new Metadata(); - String content = getXML("sleep.xml", m).xml; - assertMockParser(m); - long elapsed = new Date().getTime()-start; - //should sleep for at least 3000 - boolean enoughTimeHasElapsed = elapsed > 2000; - assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed); - assertMockParser(m); - } - - @Test - public void testHeavyHang() throws Exception { - long start = new Date().getTime(); - Metadata m = new Metadata(); - - String content = getXML("heavy_hang.xml", m).xml; - assertMockParser(m); - long elapsed = new Date().getTime()-start; - //should sleep for at least 3000 - boolean enoughTimeHasElapsed = elapsed > 2000; - assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed); - assertMockParser(m); - } - - @Test - public void testFakeOOM() throws Exception { - Metadata m = new Metadata(); - assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom"); - assertMockParser(m); - } - - @Test - public void testRealOOM() throws Exception { - //Note: we're not actually testing the diff between fake and real oom - //i.e. by creating child process and setting different -Xmx or - //memory profiling. 
- Metadata m = new Metadata(); - assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space"); - assertMockParser(m); - } - - @Test - public void testInterruptibleSleep() { - //Without static initialization of the parser, it can take ~1 second after t.start() - //before the parser actually calls parse. This is - //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc. - //This is not thread creation overhead. - ParserRunnable r = new ParserRunnable("sleep_interruptible.xml"); - Thread t = new Thread(r); - t.start(); - long start = new Date().getTime(); - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - //swallow - } - - t.interrupt(); - - try { - t.join(10000); - } catch (InterruptedException e) { - //swallow - } - long elapsed = new Date().getTime()-start; - boolean shortEnough = elapsed < 2000;//the xml file specifies 3000 - assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough); - } - - @Test - public void testNonInterruptibleSleep() { - ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml"); - Thread t = new Thread(r); - t.start(); - long start = new Date().getTime(); - try { - //make sure that the thread has actually started - Thread.sleep(1000); - } catch (InterruptedException e) { - //swallow - } - t.interrupt(); - try { - t.join(20000); - } catch (InterruptedException e) { - //swallow - } - long elapsed = new Date().getTime()-start; - boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000 - assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough); - } - - private class ParserRunnable implements Runnable { - private final String path; - ParserRunnable(String path) { - this.path = path; - } - @Override - public void run() { - Metadata m = new Metadata(); - try { - getXML(path, m); - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - assertMockParser(m); - } - } - } - - private void 
assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) { - - try { - getXML(path, m); - } catch (Throwable t) { - //if this is a throwable wrapped in a TikaException, use the cause - if (t instanceof TikaException && t.getCause() != null) { - t = t.getCause(); - } - if (! (t.getClass().isAssignableFrom(expected))){ - fail(t.getClass() +" is not assignable from "+expected); - } - if (message != null) { - assertEquals(message, t.getMessage()); - } - } - } - - private void assertMockParser(Metadata m) { - String[] parsers = m.getValues("X-Parsed-By"); - //make sure that it was actually parsed by mock. - boolean parsedByMock = false; - for (String parser : parsers) { - if (parser.equals("org.apache.tika.parser.mock.MockParser")) { - parsedByMock = true; - break; - } - } - assertTrue("mock parser should have been called", parsedByMock); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/249105aa/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser new file mode 100644 index 0000000..69bfdeb --- /dev/null +++ b/tika-test-resources/src/test/resources/META-INF/services/org.apache.tika.parser.Parser @@ -0,0 +1 @@ +org.apache.tika.parser.mock.MockParser \ No newline at end of file
