tika git commit: TIKA-2096 change default to extract embedded documents even if the user forgets to specify an AutoDetectParser in the ParseContext

tallison Mon, 28 Nov 2016 08:25:59 -0800

Repository: tika
Updated Branches:
  refs/heads/2.x a47a69933 -> e5e4d4d91



TIKA-2096 change default to extract embedded documents even if the user forgets 
to specify an AutoDetectParser in the ParseContext


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e5e4d4d9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e5e4d4d9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e5e4d4d9

Branch: refs/heads/2.x
Commit: e5e4d4d9193daa001821cdf7637c023d0abe072e
Parents: a47a699
Author: tballison <[email protected]>
Authored: Mon Nov 28 11:25:38 2016 -0500
Committer: tballison <[email protected]>
Committed: Mon Nov 28 11:25:38 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |  5 +++
 .../extractor/EmbeddedDocumentUtilTest.java     | 43 ++++++++++++++++++++
 .../parser/fork/ForkParserIntegrationTest.java  |  2 +
 .../tika/extractor/EmbeddedDocumentUtil.java    | 18 ++++++++
 .../tika/parser/jdbc/SQLite3ParserTest.java     |  9 ++--
 5 files changed, 74 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index c8443b7..82c29e2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,11 @@ Release 2.0 - ???
 
 Release 1.15 -???
 
+  * Change default behavior to parse embedded documents even if the user
+    forgets to specify a Parser.class in the ParseContext (TIKA-2096).
+    Users who wish to parse only the container document should set
+    an EmptyParser as the Parser.class in the ParseContext.
+
   * Add mime detection and parser for Word 2006ML format (TIKA-2179).
 
   * Upgrade to POI 3.16-beta1 (TIKA-2116).

http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java b/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
new file mode 100644
index 0000000..d09cf77
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/extractor/EmbeddedDocumentUtilTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.extractor;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Integration tests for EmbeddedDocumentUtil
+ */
+public class EmbeddedDocumentUtilTest extends TikaTest {
+
+    @Test
+    public void testAutomaticAdditionOfAutoDetectParserIfForgotten() throws Exception {
+        String needle = "When in the Course";
+        //TIKA-2096
+        TikaTest.XMLResult xmlResult = getXML("test_recursive_embedded.doc", new ParseContext());
+        assertContains(needle, xmlResult.xml);
+
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, new EmptyParser());
+        xmlResult = getXML("test_recursive_embedded.doc", context);
+        assertNotContained(needle, xmlResult.xml);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index 6a7739c..45605d9 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -256,6 +257,7 @@ public class ForkParserIntegrationTest {
             InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
                     "/test-documents/testPDF.pdf");
             ParseContext context = new ParseContext();
+            context.set(Parser.class, new EmptyParser());
             parser.parse(stream, output, new Metadata(), context);
 
             String content = output.toString();

http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 3ceba90..2ff0efe 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -30,7 +30,9 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.utils.ExceptionUtils;
 import org.xml.sax.ContentHandler;
@@ -58,9 +60,25 @@ public class EmbeddedDocumentUtil implements Serializable {
         this.embeddedDocumentExtractor = getEmbeddedDocumentExtractor(context);
     }
 
+    /**
+     * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
+     * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
+     * embedded documents if no Parser.class is specified in the ParseContext.
+     * <p/>
+     * If you'd prefer not to parse embedded documents, set Parser.class
+     * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
+     * @param context
+     * @return EmbeddedDocumentExtractor
+     */
     public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
         EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
         if (extractor == null) {
+            //ensure that an AutoDetectParser is
+            //available for parsing embedded docs TIKA-2096
+            Parser embeddedParser = context.get(Parser.class);
+            if (embeddedParser == null) {
+                context.set(Parser.class, new AutoDetectParser());
+            }
             extractor = new ParsingEmbeddedDocumentExtractor(context);
         }
         return extractor;

http://git-wip-us.apache.org/repos/asf/tika/blob/e5e4d4d9/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
index eef071f..d6ab5ed 100644
--- a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
+++ b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
@@ -40,6 +40,7 @@ import org.apache.tika.metadata.Database;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
@@ -136,14 +137,16 @@ public class SQLite3ParserTest extends TikaTest {
         assertContains("tempor\n", s);
     }
 
-    //test what happens if the user forgets to pass in a parser via context
-    //to handle embedded documents
+    //test what happens if the user does not want embedded docs handled
     @Test
     public void testNotAddingEmbeddedParserToParseContext() throws Exception {
 
         Metadata metadata = new Metadata();
         metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
-        XMLResult r = getXML(TEST_FILE_NAME, new AutoDetectParser(), new Metadata(), new ParseContext());
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(Parser.class, new EmptyParser());
+
+        XMLResult r = getXML(TEST_FILE_NAME, new AutoDetectParser(), new Metadata(), parseContext);
         String xml = r.xml;
         //just includes headers for embedded documents
         assertContains("<table name=\"my_table1\"><thead><tr>", xml);

tika git commit: TIKA-2096 change default to extract embedded documents even if the user forgets to specify an AutoDetectParser in the ParseContext

Reply via email to