This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 56397f14a TIKA-4553 -- rm TikaConfig from tika-app (#2431)
56397f14a is described below
commit 56397f14a4a44151df7edf7d5afa8c8e901f0eda
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 9 14:40:18 2025 -0500
TIKA-4553 -- rm TikaConfig from tika-app (#2431)
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 40 +++++++++++-------
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 42 ++++++++++++-------
.../resources/tika-config-default-single-file.json | 5 ++-
.../resources/tika-config-default-single-file.xml | 49 ----------------------
.../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 2 +-
.../test/java/org/apache/tika/cli/TikaCLITest.java | 23 ++++------
.../src/test/resources/configs/tika-config1.json | 18 ++++++++
.../src/test/resources/configs/tika-config2.json | 26 ++++++++++++
.../test-data/TIKA-2389-ignore-init-problems.xml | 20 ---------
.../src/test/resources/test-data/tika-config1.xml | 13 ------
.../src/test/resources/test-data/tika-config2.xml | 14 -------
11 files changed, 109 insertions(+), 143 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 91cb313b2..adb708c2d 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -68,8 +68,8 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.Tika;
import org.apache.tika.async.cli.TikaAsyncCLI;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.TikaConfigSerializer;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@@ -129,7 +129,7 @@ public class TikaCLI {
private ParseContext context;
private Detector detector;
private Parser parser;
- private TikaConfig config;
+ private TikaLoader tikaLoader;
private String configFilePath;
private boolean recursiveJSON = false;
private URI networkURI = null;
@@ -518,9 +518,9 @@ public class TikaCLI {
private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception {
configure();
- TikaConfig localConfig = (config == null) ?
TikaConfig.getDefaultConfig() : config;
-
- TikaConfigSerializer.serialize(localConfig, mode, new
OutputStreamWriter(System.out, UTF_8), UTF_8);
+ TikaLoader localConfig = (tikaLoader == null) ?
TikaLoader.loadDefault() : tikaLoader;
+ //TODO -- implement mode
+ System.out.println(localConfig.getConfig().toString());
}
private void convertConfigXmlToJson(String paths) throws Exception {
@@ -553,14 +553,16 @@ public class TikaCLI {
private void handleRecursiveJson(URL url, OutputStream output) throws
IOException, SAXException, TikaException {
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1,
config.getMetadataFilter());
+ RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1,
+ tikaLoader.loadMetadataFilters());
try (InputStream input = TikaInputStream.get(url, metadata)) {
wrapper.parse(input, handler, metadata, context);
}
JsonMetadataList.setPrettyPrinting(prettyPrint);
try (Writer writer = getOutputWriter(output, encoding)) {
List<Metadata> metadataList = handler.getMetadataList();
- metadataList = config.getMetadataFilter().filter(metadataList);
+ metadataList = tikaLoader
+ .loadMetadataFilters().filter(metadataList);
JsonMetadataList.toJson(metadataList, writer);
}
}
@@ -710,26 +712,32 @@ public class TikaCLI {
private void configure() throws TikaException, IOException, SAXException {
if (configFilePath != null) {
- config = new TikaConfig(new File(configFilePath));
+ tikaLoader = TikaLoader.load(Paths.get(configFilePath));
} else {
String warn = "As a convenience, TikaCLI has turned on several
non-default features\n" +
- "as specified in
tika-app/src/main/resources/tika-config-default-single-file.xml.\n" +
+ "as specified in
tika-app/src/main/resources/tika-config-default-single-file.json.\n" +
"See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" +
"This is not the default behavior in Tika generally or in
tika-server.";
LOG.info(warn);
- try (InputStream is =
getClass().getResourceAsStream("/tika-config-default-single-file.xml")) {
- config = new TikaConfig(is);
+ Path tempConfig = Files.createTempFile("tika-config-", ".json");
+ try {
+ try (InputStream is =
getClass().getResourceAsStream("/tika-config-default-single-file.json")) {
+ Files.copy(is, tempConfig,
StandardCopyOption.REPLACE_EXISTING);
+ }
+ tikaLoader = TikaLoader.load(tempConfig);
+ } finally {
+ Files.deleteIfExists(tempConfig);
}
}
if (networkURI != null) {
parser = new NetworkParser(networkURI);
} else {
- parser = new AutoDetectParser(config);
+ parser = tikaLoader.loadAutoDetectParser();
if (digester != null) {
parser = new DigestingParser(parser, digester, false);
}
}
- detector = config.getDetector();
+ detector = tikaLoader.loadDetectors();
context.set(Parser.class, parser);
context.set(PasswordProvider.class, new
SimplePasswordProvider(password));
}
@@ -932,9 +940,9 @@ public class TikaCLI {
}
// See how those compare to the Tika ones
- TikaConfig config = TikaConfig.getDefaultConfig();
- MimeTypes mimeTypes = config.getMimeRepository();
- MediaTypeRegistry registry = config.getMediaTypeRegistry();
+ TikaLoader loader = TikaLoader.loadDefault();
+ MimeTypes mimeTypes = TikaLoader.getMimeTypes();
+ MediaTypeRegistry registry = loader.getMediaTypeRegistry();
for (String mime : fileMimes) {
try {
final MimeType type = mimeTypes.getRegisteredMimeType(mime);
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index a2a4d526e..5e6adb242 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -36,6 +36,8 @@ import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@@ -70,7 +72,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.io.TikaInputStream;
@@ -78,11 +81,9 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.digestutils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
@@ -154,9 +155,9 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
* File chooser.
*/
private final JFileChooser chooser = new JFileChooser();
- private final TikaConfig tikaConfig;
+ private final TikaLoader tikaConfig;
- public TikaGUI(Parser parser, TikaConfig tikaConfig) {
+ public TikaGUI(Parser parser, TikaLoader tikaConfig) {
super("Apache Tika");
this.tikaConfig = tikaConfig;
setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
@@ -194,21 +195,32 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
* @throws Exception if an error occurs
*/
public static void main(String[] args) throws Exception {
- TikaConfig config = null;
+ TikaLoader config = null;
if (args.length > 0) {
File configFile = new File(args[0]);
- config = new TikaConfig(configFile);
+ config = TikaLoader.load(configFile.toPath());
} else {
- try (InputStream is =
TikaGUI.class.getResourceAsStream("/tika-config-default-single-file.xml")) {
- config = new TikaConfig(is);
+ Path tempConfig = Files.createTempFile("tika-config-", ".json");
+ try {
+ try (InputStream is =
TikaGUI.class.getResourceAsStream("/tika-config-default-single-file.json")) {
+ Files.copy(is, tempConfig,
StandardCopyOption.REPLACE_EXISTING);
+ }
+ config = TikaLoader.load(tempConfig);
+ } finally {
+ Files.deleteIfExists(tempConfig);
}
}
UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
- final TikaConfig finalConfig = config;
- SwingUtilities.invokeLater(() -> new TikaGUI(
- new DigestingParser(new AutoDetectParser(finalConfig),
- new CommonsDigester(MAX_MARK,
CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256),
- false), finalConfig).setVisible(true));
+ final TikaLoader tikaLoader = config;
+ SwingUtilities.invokeLater(() -> {
+ try {
+ new TikaGUI(tikaLoader.loadAutoDetectParser(),
tikaLoader).setVisible(true);
+ } catch (TikaConfigException e) {
+ throw new RuntimeException(e);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
}
private void addMenuBar() {
@@ -384,7 +396,7 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
StringWriter jsonBuffer = new StringWriter();
JsonMetadataList.setPrettyPrinting(true);
List<Metadata> metadataList =
recursiveParserWrapperHandler.getMetadataList();
- metadataList = tikaConfig.getMetadataFilter().filter(metadataList);
+ metadataList =
tikaConfig.loadMetadataFilters().filter(metadataList);
JsonMetadataList.toJson(metadataList, jsonBuffer);
setText(json, jsonBuffer.toString());
}
diff --git a/tika-app/src/main/resources/tika-config-default-single-file.json
b/tika-app/src/main/resources/tika-config-default-single-file.json
index 77bdffc4f..696a8f641 100644
--- a/tika-app/src/main/resources/tika-config-default-single-file.json
+++ b/tika-app/src/main/resources/tika-config-default-single-file.json
@@ -7,7 +7,10 @@
"pdf-parser": {
"extractActions": true,
"extractInlineImages": true,
- "checkExtractAccessPermissions": true,
+ "accessChecker": {
+ "needToCheck": true,
+ "allowExtractionForAccessibility": true
+ },
"extractIncrementalUpdateInfo": true,
"parseIncrementalUpdates":true
diff --git a/tika-app/src/main/resources/tika-config-default-single-file.xml
b/tika-app/src/main/resources/tika-config-default-single-file.xml
deleted file mode 100644
index 696b555a8..000000000
--- a/tika-app/src/main/resources/tika-config-default-single-file.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<properties>
- <service-loader initializableProblemHandler="throw"/>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- <parser-exclude
class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractActions" type="bool">true</param>
- <param name="extractInlineImages" type="bool">true</param>
- <param name="extractIncrementalUpdateInfo" type="bool">true</param>
- <param name="parseIncrementalUpdates" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
- <params>
- <param name="includeDeletedContent" type="bool">true</param>
- <param name="includeMoveFromContent" type="bool">true</param>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.OfficeParser">
- <params>
- <param name="extractMacros" type="bool">true</param>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index e0679aab1..5351078e9 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -139,7 +139,7 @@ public class TikaCLIAsyncTest {
json++;
}
}
- assertEquals(21, json);
+ assertEquals(18, json);
}
private void checkForPrettyPrint(File f) throws IOException {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 5a05e37a5..7b6628887 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -42,6 +42,7 @@ import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
@@ -58,6 +59,7 @@ import org.apache.tika.utils.StringUtils;
public class TikaCLITest {
static final File TEST_DATA_FILE = new
File("src/test/resources/test-data");
+ static final File CONFIGS_DIR = new File("src/test/resources/configs");
private final URI testDataURI = TEST_DATA_FILE.toURI();
@TempDir
private Path extractDir;
@@ -246,8 +248,7 @@ public class TikaCLITest {
public void testJsonMetadataPrettyPrintOutput() throws Exception {
String json = getParamOutContent("--json", "-r", resourcePrefix +
"testJsonMultipleInts.html");
- assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [
\"org.apache.tika.parser.CompositeParser\", " +
- "\"org.apache.tika.parser.DefaultParser\",
\"org.apache.tika.parser.html.JSoupParser\" ],"));
+ assertTrue(json.contains("org.apache.tika.parser.CompositeParser\",
\"org.apache.tika.parser.html.JSoupParser"));
//test pretty-print alphabetic sort of keys
int enc = json.indexOf("\"Content-Encoding\"");
int fb = json.indexOf("fb:admins");
@@ -550,20 +551,11 @@ public class TikaCLITest {
@Test
public void testConfig() throws Exception {
- String content = getParamOutContent("--config=" +
TEST_DATA_FILE.toString() + "/tika-config1.xml", resourcePrefix +
"bad_xml.xml");
+ String content = getParamOutContent("--config=" +
CONFIGS_DIR.toString() + "/tika-config1.json", resourcePrefix + "bad_xml.xml");
assertTrue(content.contains("apple"));
assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser"));
}
- @Test
- public void testConfigIgnoreInit() throws Exception {
- String content = getParamOutContent("--config=" +
TEST_DATA_FILE.toString() + "/TIKA-2389-ignore-init-problems.xml",
resourcePrefix + "test_recursive_embedded.docx");
- assertTrue(content.contains("embed_1a"));
- //TODO: add a real unit test that configures logging to a file to test
that nothing is
- //written at the various logging levels
- }
-
-
@Test
public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception
{
String content = getParamOutContent("-m", "-J", "-r", resourcePrefix +
"test_recursive_embedded.docx");
@@ -594,6 +586,7 @@ public class TikaCLITest {
}
@Test
+ @Disabled("until we re-implement serialization")
public void testConfigSerializationStaticAndCurrent() throws Exception {
String content = getParamOutContent("--dump-static-config");
//make sure at least one detector is there
@@ -610,8 +603,9 @@ public class TikaCLITest {
}
@Test
+ @Disabled("until we re-implement serialization")
public void testConfigSerializationCustomMinimal() throws Exception {
- String content = getParamOutContent("--config=" +
TEST_DATA_FILE.toString() + "/tika-config2.xml",
"--dump-minimal-config").replaceAll("[\r\n\t ]+", " ");
+ String content = getParamOutContent("--config=" +
CONFIGS_DIR.toString() + "/tika-config2.json",
"--dump-minimal-config").replaceAll("[\r\n\t ]+", " ");
String expected =
"<parser class=\"org.apache.tika.parser.DefaultParser\">" + "
<mime-exclude>application/pdf</mime-exclude>" + "
<mime-exclude>image/jpeg</mime-exclude> " +
@@ -620,8 +614,9 @@ public class TikaCLITest {
}
@Test
+ @Disabled("until we re-implement serialization")
public void testConfigSerializationCustomStatic() throws Exception {
- String content = getParamOutContent("--config=" +
TEST_DATA_FILE.toString() + "/tika-config2.xml", "--dump-static-config");
+ String content = getParamOutContent("--config=" +
CONFIGS_DIR.toString() + "/tika-config2.json", "--dump-static-config");
assertFalse(content.contains("org.apache.tika.parser.executable.Executable"));
}
diff --git a/tika-app/src/test/resources/configs/tika-config1.json
b/tika-app/src/test/resources/configs/tika-config1.json
new file mode 100644
index 000000000..e4cdbaf96
--- /dev/null
+++ b/tika-app/src/test/resources/configs/tika-config1.json
@@ -0,0 +1,18 @@
+{
+ "parsers": [
+ {
+ "jsoup-parser": {
+ "_decorate": {
+ "mimeInclude": [
+ "application/vnd.wap.xhtml+xml",
+ "application/x-asp",
+ "application/xhtml+xml",
+ "text/html",
+ "application/xml",
+ "text/xml"
+ ]
+ }
+ }
+ }
+ ]
+}
diff --git a/tika-app/src/test/resources/configs/tika-config2.json
b/tika-app/src/test/resources/configs/tika-config2.json
new file mode 100644
index 000000000..0f3cf8ac4
--- /dev/null
+++ b/tika-app/src/test/resources/configs/tika-config2.json
@@ -0,0 +1,26 @@
+{
+ "parsers": [
+ {
+ "default-parser": {
+ "_decorate": {
+ "mimeExclude": [
+ "image/jpeg",
+ "application/pdf"
+ ],
+ "parserExclude": [
+ "org.apache.tika.parser.executable.ExecutableParser"
+ ]
+ }
+ }
+ },
+ {
+ "empty-parser": {
+ "_decorate": {
+ "mimeInclude": [
+ "application/pdf"
+ ]
+ }
+ }
+ }
+ ]
+}
diff --git
a/tika-app/src/test/resources/test-data/TIKA-2389-ignore-init-problems.xml
b/tika-app/src/test/resources/test-data/TIKA-2389-ignore-init-problems.xml
deleted file mode 100644
index 30af37d7b..000000000
--- a/tika-app/src/test/resources/test-data/TIKA-2389-ignore-init-problems.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <service-loader initializableProblemHandler="ignore"/>
-</properties>
diff --git a/tika-app/src/test/resources/test-data/tika-config1.xml
b/tika-app/src/test/resources/test-data/tika-config1.xml
deleted file mode 100644
index 52f4f0949..000000000
--- a/tika-app/src/test/resources/test-data/tika-config1.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.html.JSoupParser">
- <mime>application/vnd.wap.xhtml+xml</mime>
- <mime>application/x-asp</mime>
- <mime>application/xhtml+xml</mime>
- <mime>text/html</mime>
- <mime>application/xml</mime>
- <mime>text/xml</mime>
- </parser>
- </parsers>
-</properties>
diff --git a/tika-app/src/test/resources/test-data/tika-config2.xml
b/tika-app/src/test/resources/test-data/tika-config2.xml
deleted file mode 100644
index 3a511ed7d..000000000
--- a/tika-app/src/test/resources/test-data/tika-config2.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <mime-exclude>image/jpeg</mime-exclude>
- <mime-exclude>application/pdf</mime-exclude>
- <parser-exclude
class="org.apache.tika.parser.executable.ExecutableParser"/>
- <parser-exclu
class="org.apache.tika.parser.executable.ExecutableParser2"/>
- </parser>
- <parser class="org.apache.tika.parser.EmptyParser">
- <mime>application/pdf</mime>
- <no-mime>hello/world</no-mime>
- </parser>
- </parsers>
-</properties>
\ No newline at end of file