Repository: tika Updated Branches: refs/heads/2.x a47a69933 -> e5e4d4d91
TIKA-2096 change default to extract embedded documents even if the user forgets to specify an AutoDetectParser in the ParseContext Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e5e4d4d9 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e5e4d4d9 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e5e4d4d9 Branch: refs/heads/2.x Commit: e5e4d4d9193daa001821cdf7637c023d0abe072e Parents: a47a699 Author: tballison <[email protected]> Authored: Mon Nov 28 11:25:38 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Nov 28 11:25:38 2016 -0500 ---------------------------------------------------------------------- CHANGES.txt | 5 +++ .../extractor/EmbeddedDocumentUtilTest.java | 43 ++++++++++++++++++++ .../parser/fork/ForkParserIntegrationTest.java | 2 + .../tika/extractor/EmbeddedDocumentUtil.java | 18 ++++++++ .../tika/parser/jdbc/SQLite3ParserTest.java | 9 ++-- 5 files changed, 74 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index c8443b7..82c29e2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,11 @@ Release 2.0 - ??? Release 1.15 -??? + * Change default behavior to parse embedded documents even if the user + forgets to specify a Parser.class in the ParseContext (TIKA-2096). + Users who wish to parse only the container document should set + an EmptyParser as the Parser.class in the ParseContext. + * Add mime detection and parser for Word 2006ML format (TIKA-2179). * Upgrade to POI 3.16-beta1 (TIKA-2116). http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java b/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java new file mode 100644 index 0000000..d09cf77 --- /dev/null +++ b/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.extractor; + +import org.apache.tika.TikaTest; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.junit.Test; + +/** + * Integration tests for EmbeddedDocumentUtil + */ +public class EmbeddedDocumentUtilTest extends TikaTest { + + @Test + public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception { + String needle = "When in the Course"; + //TIKA-2096 + TikaTest.XMLResult xmlResult = getXML("test_recursive_embedded.doc", new ParseContext()); + assertContains(needle, xmlResult.xml); + + ParseContext context = new ParseContext(); + context.set(Parser.class, new EmptyParser()); + xmlResult = getXML("test_recursive_embedded.doc", context); + assertNotContained(needle, xmlResult.xml); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java index 6a7739c..45605d9 100644 --- a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java +++ b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java @@ -34,6 +34,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.fork.ForkParser; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -256,6 +257,7 @@ public class ForkParserIntegrationTest { InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream( "/test-documents/testPDF.pdf"); ParseContext context = new ParseContext(); + context.set(Parser.class, new EmptyParser()); parser.parse(stream, output, new Metadata(), context); String content = output.toString(); http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index 3ceba90..2ff0efe 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -30,7 +30,9 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.utils.ExceptionUtils; import org.xml.sax.ContentHandler; @@ -58,9 +60,25 @@ public class EmbeddedDocumentUtil implements Serializable { this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context); } + /** + * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. + * As of Tika 1.15, an AutoDetectParser will automatically be added to parse + * embedded documents if no Parser.class is specified in the ParseContext. + * <p/> + * If you'd prefer not to parse embedded documents, set Parser.class + * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext. + * @param context + * @return EmbeddedDocumentExtractor + */ public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) { EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class); if (extractor == null) { + //ensure that an AutoDetectParser is + //available for parsing embedded docs TIKA-2096 + Parser embeddedParser = context.get(Parser.class); + if (embeddedParser == null) { + context.set(Parser.class, new AutoDetectParser()); + } extractor = new ParsingEmbeddedDocumentExtractor(context); } return extractor; http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java index eef071f..d6ab5ed 100644 --- a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java +++ b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java @@ -40,6 +40,7 @@ import org.apache.tika.metadata.Database; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; @@ -136,14 +137,16 @@ public class SQLite3ParserTest extends TikaTest { assertContains("tempor\n", s); } - //test what happens if the user forgets to pass in a parser via context - //to handle embedded documents + //test what happens if the user does not want embedded docs handled @Test public void testNotAddingEmbeddedParserToParseContext() throws Exception { Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME); - XMLResult r = getXML(TEST_FILE_NAME, new AutoDetectParser(), new Metadata(), new ParseContext()); + ParseContext parseContext = new ParseContext(); + parseContext.set(Parser.class, new EmptyParser()); + + XMLResult r = getXML(TEST_FILE_NAME, new AutoDetectParser(), new Metadata(), parseContext); String xml = r.xml; //just includes headers for embedded documents assertContains("<table name=\"my_table1\"><thead><tr>", xml);
