This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new b1acb75eb TIKA-4472 -- extract macros by default in tika-app when parsing a single file (#2309)
b1acb75eb is described below
commit b1acb75ebcac28bdca1e2d82dca746867845a3c1
Author: Tim Allison <[email protected]>
AuthorDate: Thu Aug 21 11:06:38 2025 -0400
TIKA-4472 -- extract macros by default in tika-app when parsing a single file (#2309)
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 38 +++++-----------
.../resources/tika-config-default-single-file.xml | 49 +++++++++++++++++++++
.../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 2 +-
.../test/java/org/apache/tika/cli/TikaCLITest.java | 18 +++++++-
.../test/resources/test-data/testPDFPackage.pdf | Bin 0 -> 92359 bytes
.../test/resources/test-data/testPPT_macros.ppt | Bin 0 -> 88064 bytes
6 files changed, 79 insertions(+), 28 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index aefc03660..96276935b 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -95,7 +95,6 @@ import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.digestutils.CommonsDigester;
-import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
@@ -339,21 +338,6 @@ public class TikaCLI {
return false;
}
- private void configurePDFExtractSettings() {
- if (configFilePath == null && context.get(PDFParserConfig.class) == null) {
- PDFParserConfig pdfParserConfig = new PDFParserConfig();
- pdfParserConfig.setExtractInlineImages(true);
- pdfParserConfig.setExtractIncrementalUpdateInfo(true);
- pdfParserConfig.setParseIncrementalUpdates(true);
- String warn = "As a convenience, TikaCLI has turned on extraction of\n" +
- "inline images and parsing of incremental updates for the PDFParser (TIKA-2374, " +
- "TIKA-4017 and TIKA-4354).\n" +
- "This is not the default behavior in Tika generally or in tika-server.";
- LOG.info(warn);
- context.set(PDFParserConfig.class, pdfParserConfig);
- }
- }
-
public void process(String arg) throws Exception {
if (arg.equals("-?") || arg.equals("--help")) {
pipeMode = false;
@@ -478,7 +462,6 @@ public class TikaCLI {
} else {
url = new URL(arg);
}
- configurePDFExtractSettings();
if (recursiveJSON) {
handleRecursiveJson(url, System.out);
} else {
@@ -669,17 +652,21 @@ public class TikaCLI {
}
private void configure() throws TikaException, IOException, SAXException {
-
+ if (configFilePath != null) {
+ config = new TikaConfig(new File(configFilePath));
+ } else {
+ String warn = "As a convenience, TikaCLI has turned on several non-default features\n" +
+ "as specified in tika-app/src/main/resources/tika-config-default-single-file.xml.\n" +
+ "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" +
+ "This is not the default behavior in Tika generally or in tika-server.";
+ LOG.info(warn);
+ try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.xml")) {
+ config = new TikaConfig(is);
+ }
+ }
if (networkURI != null) {
parser = new NetworkParser(networkURI);
- config = TikaConfig.getDefaultConfig();
} else {
- if (configFilePath != null) {
- config = new TikaConfig(new File(configFilePath));
- } else {
- config = TikaConfig.getDefaultConfig();
- }
-
parser = new AutoDetectParser(config);
if (digester != null) {
parser = new DigestingParser(parser, digester, false);
@@ -1080,7 +1067,6 @@ public class TikaCLI {
private class FileEmbeddedDocumentExtractor implements
EmbeddedDocumentExtractor {
- private final TikaConfig config = TikaConfig.getDefaultConfig();
private final EmbeddedStreamTranslator embeddedStreamTranslator = new
DefaultEmbeddedStreamTranslator();
private int count = 0;
diff --git a/tika-app/src/main/resources/tika-config-default-single-file.xml b/tika-app/src/main/resources/tika-config-default-single-file.xml
new file mode 100644
index 000000000..696b555a8
--- /dev/null
+++ b/tika-app/src/main/resources/tika-config-default-single-file.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <service-loader initializableProblemHandler="throw"/>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
+ <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractActions" type="bool">true</param>
+ <param name="extractInlineImages" type="bool">true</param>
+ <param name="extractIncrementalUpdateInfo" type="bool">true</param>
+ <param name="parseIncrementalUpdates" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <params>
+ <param name="includeDeletedContent" type="bool">true</param>
+ <param name="includeMoveFromContent" type="bool">true</param>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.microsoft.OfficeParser">
+ <params>
+ <param name="extractMacros" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 072f2c7d7..56e1289b8 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -120,7 +120,7 @@ public class TikaCLIAsyncTest {
json++;
}
}
- assertEquals(18, json);
+ assertEquals(20, json);
}
private void checkForPrettyPrint(File f) throws IOException {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 13bf4153d..2195685d7 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -232,7 +232,8 @@ public class TikaCLITest {
public void testJsonMetadataPrettyPrintOutput() throws Exception {
String json = getParamOutContent("--json", "-r", resourcePrefix +
"testJsonMultipleInts.html");
- assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.DefaultParser\", " +
"\"org.apache.tika.parser.html.JSoupParser\" ],"));
+ assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.CompositeParser\", " +
+ "\"org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser\" ],"));
//test pretty-print alphabetic sort of keys
int enc = json.indexOf("\"Content-Encoding\"");
int fb = json.indexOf("fb:admins");
@@ -249,6 +250,21 @@ public class TikaCLITest {
assertTrue(json.contains("embeddedResourceType\":\"VERSION\""));
}
+ @Test
+ public void testExtractJavascript() throws Exception {
+ String json = getParamOutContent("-J", resourcePrefix + "testPDFPackage.pdf");
+ assertTrue(json.contains("type=\\\"PDActionJavaScript\\\""));
+ assertTrue(json.contains("MACRO"));
+ assertTrue(json.contains("NAMES_TREE"));
+ }
+
+ @Test
+ public void testMacros() throws Exception {
+ String json = getParamOutContent("-J", resourcePrefix + "testPPT_macros.ppt");
+ assertTrue(json.contains("MACRO"));
+ assertTrue(json.contains("Module1"));
+ }
+
/**
* Tests -l option of the cli
*
diff --git a/tika-app/src/test/resources/test-data/testPDFPackage.pdf b/tika-app/src/test/resources/test-data/testPDFPackage.pdf
new file mode 100644
index 000000000..0cd2d487a
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDFPackage.pdf differ
diff --git a/tika-app/src/test/resources/test-data/testPPT_macros.ppt b/tika-app/src/test/resources/test-data/testPPT_macros.ppt
new file mode 100644
index 000000000..7af9008dd
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPPT_macros.ppt differ