This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new b1acb75eb TIKA-4472 -- extract macros by default in tika-app when parsing a single file (#2309)
b1acb75eb is described below

commit b1acb75ebcac28bdca1e2d82dca746867845a3c1
Author: Tim Allison <[email protected]>
AuthorDate: Thu Aug 21 11:06:38 2025 -0400

    TIKA-4472 -- extract macros by default in tika-app when parsing a single file (#2309)
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  38 +++++-----------
 .../resources/tika-config-default-single-file.xml  |  49 +++++++++++++++++++++
 .../java/org/apache/tika/cli/TikaCLIAsyncTest.java |   2 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  18 +++++++-
 .../test/resources/test-data/testPDFPackage.pdf    | Bin 0 -> 92359 bytes
 .../test/resources/test-data/testPPT_macros.ppt    | Bin 0 -> 88064 bytes
 6 files changed, 79 insertions(+), 28 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index aefc03660..96276935b 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -95,7 +95,6 @@ import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.digestutils.CommonsDigester;
-import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
@@ -339,21 +338,6 @@ public class TikaCLI {
         return false;
     }
 
-    private void configurePDFExtractSettings() {
-        if (configFilePath == null && context.get(PDFParserConfig.class) == null) {
-            PDFParserConfig pdfParserConfig = new PDFParserConfig();
-            pdfParserConfig.setExtractInlineImages(true);
-            pdfParserConfig.setExtractIncrementalUpdateInfo(true);
-            pdfParserConfig.setParseIncrementalUpdates(true);
-            String warn = "As a convenience, TikaCLI has turned on extraction of\n" +
-                    "inline images and parsing of incremental updates for the PDFParser (TIKA-2374, " +
-                    "TIKA-4017 and TIKA-4354).\n" +
-                    "This is not the default behavior in Tika generally or in tika-server.";
-            LOG.info(warn);
-            context.set(PDFParserConfig.class, pdfParserConfig);
-        }
-    }
-
     public void process(String arg) throws Exception {
         if (arg.equals("-?") || arg.equals("--help")) {
             pipeMode = false;
@@ -478,7 +462,6 @@ public class TikaCLI {
                 } else {
                     url = new URL(arg);
                 }
-                configurePDFExtractSettings();
                 if (recursiveJSON) {
                     handleRecursiveJson(url, System.out);
                 } else {
@@ -669,17 +652,21 @@ public class TikaCLI {
     }
 
     private void configure() throws TikaException, IOException, SAXException {
-
+        if (configFilePath != null) {
+            config = new TikaConfig(new File(configFilePath));
+        } else {
+            String warn = "As a convenience, TikaCLI has turned on several non-default features\n" +
+                    "as specified in tika-app/src/main/resources/tika-config-default-single-file.xml.\n" +
+                    "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" +
+                    "This is not the default behavior in Tika generally or in tika-server.";
+            LOG.info(warn);
+            try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.xml")) {
+                config = new TikaConfig(is);
+            }
+        }
         if (networkURI != null) {
             parser = new NetworkParser(networkURI);
-            config = TikaConfig.getDefaultConfig();
         } else {
-            if (configFilePath != null) {
-                config = new TikaConfig(new File(configFilePath));
-            } else {
-                config = TikaConfig.getDefaultConfig();
-            }
-
             parser = new AutoDetectParser(config);
             if (digester != null) {
                 parser = new DigestingParser(parser, digester, false);
@@ -1080,7 +1067,6 @@ public class TikaCLI {
 
     private class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
 
-        private final TikaConfig config = TikaConfig.getDefaultConfig();
         private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator();
         private int count = 0;
 
diff --git a/tika-app/src/main/resources/tika-config-default-single-file.xml b/tika-app/src/main/resources/tika-config-default-single-file.xml
new file mode 100644
index 000000000..696b555a8
--- /dev/null
+++ b/tika-app/src/main/resources/tika-config-default-single-file.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+  <service-loader initializableProblemHandler="throw"/>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+      <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+      <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractActions" type="bool">true</param>
+        <param name="extractInlineImages" type="bool">true</param>
+        <param name="extractIncrementalUpdateInfo" type="bool">true</param>
+        <param name="parseIncrementalUpdates" type="bool">true</param>
+      </params>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+      <params>
+        <param name="includeDeletedContent" type="bool">true</param>
+        <param name="includeMoveFromContent" type="bool">true</param>
+        <param name="extractMacros" type="bool">true</param>
+      </params>
+    </parser>
+    <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+      <params>
+        <param name="extractMacros" type="bool">true</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 072f2c7d7..56e1289b8 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -120,7 +120,7 @@ public class TikaCLIAsyncTest {
                 json++;
             }
         }
-        assertEquals(18, json);
+        assertEquals(20, json);
     }
 
     private void checkForPrettyPrint(File f) throws IOException {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 13bf4153d..2195685d7 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -232,7 +232,8 @@ public class TikaCLITest {
     public void testJsonMetadataPrettyPrintOutput() throws Exception {
         String json = getParamOutContent("--json", "-r", resourcePrefix + "testJsonMultipleInts.html");
 
-        assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.DefaultParser\", " + "\"org.apache.tika.parser.html.JSoupParser\" ],"));
+        assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.CompositeParser\", " +
+                "\"org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser\" ],"));
         //test pretty-print alphabetic sort of keys
         int enc = json.indexOf("\"Content-Encoding\"");
         int fb = json.indexOf("fb:admins");
@@ -249,6 +250,21 @@ public class TikaCLITest {
         assertTrue(json.contains("embeddedResourceType\":\"VERSION\""));
     }
 
+    @Test
+    public void testExtractJavascript() throws Exception {
+        String json = getParamOutContent("-J", resourcePrefix + "testPDFPackage.pdf");
+        assertTrue(json.contains("type=\\\"PDActionJavaScript\\\""));
+        assertTrue(json.contains("MACRO"));
+        assertTrue(json.contains("NAMES_TREE"));
+    }
+
+    @Test
+    public void testMacros() throws Exception {
+        String json = getParamOutContent("-J", resourcePrefix + "testPPT_macros.ppt");
+        assertTrue(json.contains("MACRO"));
+        assertTrue(json.contains("Module1"));
+    }
+
     /**
      * Tests -l option of the cli
      *
diff --git a/tika-app/src/test/resources/test-data/testPDFPackage.pdf b/tika-app/src/test/resources/test-data/testPDFPackage.pdf
new file mode 100644
index 000000000..0cd2d487a
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDFPackage.pdf differ
diff --git a/tika-app/src/test/resources/test-data/testPPT_macros.ppt b/tika-app/src/test/resources/test-data/testPPT_macros.ppt
new file mode 100644
index 000000000..7af9008dd
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPPT_macros.ppt differ

Reply via email to