Repository: tika Updated Branches: refs/heads/master 2df8567ff -> 361ffa40a
TIKA-2096 -- automatically add AutoDetectParser for embedded documents if the user forgets Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/361ffa40 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/361ffa40 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/361ffa40 Branch: refs/heads/master Commit: 361ffa40a5cee9f37d01f40c2074a18b04c4a6fb Parents: 2df8567 Author: tballison <[email protected]> Authored: Mon Nov 28 11:08:44 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Nov 28 11:08:44 2016 -0500 ---------------------------------------------------------------------- .../tika/extractor/EmbeddedDocumentUtil.java | 18 ++++++++ .../src/test/java/org/apache/tika/TikaTest.java | 1 - .../extractor/EmbeddedDocumentUtilTest.java | 43 ++++++++++++++++++++ .../parser/fork/ForkParserIntegrationTest.java | 2 + .../tika/parser/jdbc/SQLite3ParserTest.java | 9 ++-- 5 files changed, 68 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index 3ceba90..2ff0efe 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -30,7 +30,9 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.utils.ExceptionUtils; import org.xml.sax.ContentHandler; @@ -58,9 +60,25 @@ public class EmbeddedDocumentUtil implements Serializable { this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context); } + /** + * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. + * As of Tika 1.15, an AutoDetectParser will automatically be added to parse + * embedded documents if no Parser.class is specified in the ParseContext. + * <p/> + * If you'd prefer not to parse embedded documents, set Parser.class + * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext. + * @param context + * @return EmbeddedDocumentExtractor + */ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class); if (extractor == null) { + //ensure that an AutoDetectParser is + //available for parsing embedded docs TIKA-2096 + Parser embeddedParser = context.get(Parser.class); + if (embeddedParser == null) { + context.set(Parser.class, new AutoDetectParser()); + } extractor = new ParsingEmbeddedDocumentExtractor(context); } return extractor; http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index aa673f0..6644d86 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -192,7 +192,6 @@ public abstract class TikaTest { protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception { if (context == null) { context = new ParseContext(); - context.set(Parser.class, parser); } try { http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java b/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java new file mode 100644 index 0000000..d09cf77 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.extractor; + +import org.apache.tika.TikaTest; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +/** + * Integration tests for EmbeddedDocumentUtil + */ +public class EmbeddedDocumentUtilTest extends TikaTest { + + @Test + public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception { + String needle = "When in the Course"; + //TIKA-2096 + TikaTest.XMLResult xmlResult = getXML("test_recursive_embedded.doc", new ParseContext()); + assertContains(needle, xmlResult.xml); + + ParseContext context = new ParseContext(); + context.set(Parser.class, new EmptyParser()); + xmlResult = getXML("test_recursive_embedded.doc", context); + assertNotContained(needle, xmlResult.xml); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java index 6a7739c..45605d9 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java @@ -34,6 +34,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.fork.ForkParser; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -256,6 +257,7 @@ public class ForkParserIntegrationTest { InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream( "/test-documents/testPDF.pdf"); ParseContext context = new ParseContext(); + context.set(Parser.class, new EmptyParser()); parser.parse(stream, output, new Metadata(), context); String content = output.toString(); http://git-wip-us.apache.org/repos/asf/tika/blob/361ffa40/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java index ca31991..e28921a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java @@ -36,6 +36,7 @@ import org.apache.tika.metadata.Database; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; @@ -115,17 +116,17 @@ public class SQLite3ParserTest extends TikaTest { assertContains("tempor\n", s); } - //test what happens if the user forgets to pass in a parser via context - //to handle embedded documents + //test what happens if the user does not want embedded docs handled @Test public void testNotAddingEmbeddedParserToParseContext() throws Exception { Parser p = new AutoDetectParser(); ContentHandler handler = new ToXMLContentHandler(); Metadata metadata = new Metadata(); - + ParseContext parseContext = new ParseContext(); + parseContext.set(Parser.class, new EmptyParser()); try (InputStream is = getResourceAsStream(TEST_FILE1)) { metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); - p.parse(is, handler, metadata, new ParseContext()); + p.parse(is, handler, metadata, parseContext); } String xml = handler.toString(); //just includes headers for embedded documents
