This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4553-rm-tika-config-tika-app in repository https://gitbox.apache.org/repos/asf/tika.git
commit 503137c847f344300ae701c87c4b3b552621714f Author: tallison <[email protected]> AuthorDate: Tue Dec 9 13:58:29 2025 -0500 TIKA-4553 -- rm TikaConfig from tika-app --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 40 +++++++++++------- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 42 ++++++++++++------- .../resources/tika-config-default-single-file.json | 5 ++- .../resources/tika-config-default-single-file.xml | 49 ---------------------- .../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 2 +- .../test/java/org/apache/tika/cli/TikaCLITest.java | 23 ++++------ .../src/test/resources/configs/tika-config1.json | 18 ++++++++ .../src/test/resources/configs/tika-config2.json | 26 ++++++++++++ .../test-data/TIKA-2389-ignore-init-problems.xml | 20 --------- .../src/test/resources/test-data/tika-config1.xml | 13 ------ .../src/test/resources/test-data/tika-config2.xml | 14 ------- 11 files changed, 109 insertions(+), 143 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 91cb313b2..adb708c2d 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -68,8 +68,8 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.Tika; import org.apache.tika.async.cli.TikaAsyncCLI; -import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfigSerializer; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -129,7 +129,7 @@ public class TikaCLI { private ParseContext context; private Detector detector; private Parser parser; - private TikaConfig config; + private TikaLoader tikaLoader; private String configFilePath; private boolean recursiveJSON = false; private URI networkURI = null; @@ -518,9 +518,9 @@ public class TikaCLI { private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception { configure(); - TikaConfig localConfig = (config == null) ? TikaConfig.getDefaultConfig() : config; - - TikaConfigSerializer.serialize(localConfig, mode, new OutputStreamWriter(System.out, UTF_8), UTF_8); + TikaLoader localConfig = (tikaLoader == null) ? TikaLoader.loadDefault() : tikaLoader; + //TODO -- implement mode + System.out.println(localConfig.getConfig().toString()); } private void convertConfigXmlToJson(String paths) throws Exception { @@ -553,14 +553,16 @@ public class TikaCLI { private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1, config.getMetadataFilter()); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1, + tikaLoader.loadMetadataFilters()); try (InputStream input = TikaInputStream.get(url, metadata)) { wrapper.parse(input, handler, metadata, context); } JsonMetadataList.setPrettyPrinting(prettyPrint); try (Writer writer = getOutputWriter(output, encoding)) { List<Metadata> metadataList = handler.getMetadataList(); - metadataList = config.getMetadataFilter().filter(metadataList); + metadataList = tikaLoader + .loadMetadataFilters().filter(metadataList); JsonMetadataList.toJson(metadataList, writer); } } @@ -710,26 +712,32 @@ public class TikaCLI { private void configure() throws TikaException, IOException, SAXException { if (configFilePath != null) { - config = new TikaConfig(new File(configFilePath)); + tikaLoader = TikaLoader.load(Paths.get(configFilePath)); } else { String warn = "As a convenience, TikaCLI has turned on several non-default features\n" + - "as specified in tika-app/src/main/resources/tika-config-default-single-file.xml.\n" + + "as specified in tika-app/src/main/resources/tika-config-default-single-file.json.\n" + "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" + "This is not the default behavior in Tika generally or in tika-server."; LOG.info(warn); - try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.xml")) { - config = new TikaConfig(is); + Path tempConfig = Files.createTempFile("tika-config-", ".json"); + try { + try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.json")) { + Files.copy(is, tempConfig, StandardCopyOption.REPLACE_EXISTING); + } + tikaLoader = TikaLoader.load(tempConfig); + } finally { + Files.deleteIfExists(tempConfig); } } if (networkURI != null) { parser = new NetworkParser(networkURI); } else { - parser = new AutoDetectParser(config); + parser = tikaLoader.loadAutoDetectParser(); if (digester != null) { parser = new DigestingParser(parser, digester, false); } } - detector = config.getDetector(); + detector = tikaLoader.loadDetectors(); context.set(Parser.class, parser); context.set(PasswordProvider.class, new SimplePasswordProvider(password)); } @@ -932,9 +940,9 @@ public class TikaCLI { } // See how those compare to the Tika ones - TikaConfig config = TikaConfig.getDefaultConfig(); - MimeTypes mimeTypes = config.getMimeRepository(); - MediaTypeRegistry registry = config.getMediaTypeRegistry(); + TikaLoader loader = TikaLoader.loadDefault(); + MimeTypes mimeTypes = TikaLoader.getMimeTypes(); + MediaTypeRegistry registry = loader.getMediaTypeRegistry(); for (String mime : fileMimes) { try { final MimeType type = mimeTypes.getRegisteredMimeType(mime); diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index a2a4d526e..5e6adb242 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -36,6 +36,8 @@ import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -70,7 +72,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import org.apache.tika.config.TikaConfig; +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.io.TikaInputStream; @@ -78,11 +81,9 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DigestingParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.parser.digestutils.CommonsDigester; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; @@ -154,9 +155,9 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener * File chooser. */ private final JFileChooser chooser = new JFileChooser(); - private final TikaConfig tikaConfig; + private final TikaLoader tikaConfig; - public TikaGUI(Parser parser, TikaConfig tikaConfig) { + public TikaGUI(Parser parser, TikaLoader tikaConfig) { super("Apache Tika"); this.tikaConfig = tikaConfig; setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); @@ -194,21 +195,32 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener * @throws Exception if an error occurs */ public static void main(String[] args) throws Exception { - TikaConfig config = null; + TikaLoader config = null; if (args.length > 0) { File configFile = new File(args[0]); - config = new TikaConfig(configFile); + config = TikaLoader.load(configFile.toPath()); } else { - try (InputStream is = TikaGUI.class.getResourceAsStream("/tika-config-default-single-file.xml")) { - config = new TikaConfig(is); + Path tempConfig = Files.createTempFile("tika-config-", ".json"); + try { + try (InputStream is = TikaGUI.class.getResourceAsStream("/tika-config-default-single-file.json")) { + Files.copy(is, tempConfig, StandardCopyOption.REPLACE_EXISTING); + } + config = TikaLoader.load(tempConfig); + } finally { + Files.deleteIfExists(tempConfig); } } UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); - final TikaConfig finalConfig = config; - SwingUtilities.invokeLater(() -> new TikaGUI( - new DigestingParser(new AutoDetectParser(finalConfig), - new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256), - false), finalConfig).setVisible(true)); + final TikaLoader tikaLoader = config; + SwingUtilities.invokeLater(() -> { + try { + new TikaGUI(tikaLoader.loadAutoDetectParser(), tikaLoader).setVisible(true); + } catch (TikaConfigException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); } private void addMenuBar() { @@ -384,7 +396,7 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); List<Metadata> metadataList = recursiveParserWrapperHandler.getMetadataList(); - metadataList = tikaConfig.getMetadataFilter().filter(metadataList); + metadataList = tikaConfig.loadMetadataFilters().filter(metadataList); JsonMetadataList.toJson(metadataList, jsonBuffer); setText(json, jsonBuffer.toString()); } diff --git a/tika-app/src/main/resources/tika-config-default-single-file.json b/tika-app/src/main/resources/tika-config-default-single-file.json index 77bdffc4f..696a8f641 100644 --- a/tika-app/src/main/resources/tika-config-default-single-file.json +++ b/tika-app/src/main/resources/tika-config-default-single-file.json @@ -7,7 +7,10 @@ "pdf-parser": { "extractActions": true, "extractInlineImages": true, - "checkExtractAccessPermissions": true, + "accessChecker": { + "needToCheck": true, + "allowExtractionForAccessibility": true + }, "extractIncrementalUpdateInfo": true, "parseIncrementalUpdates":true diff --git a/tika-app/src/main/resources/tika-config-default-single-file.xml b/tika-app/src/main/resources/tika-config-default-single-file.xml deleted file mode 100644 index 696b555a8..000000000 --- a/tika-app/src/main/resources/tika-config-default-single-file.xml +++ /dev/null @@ -1,49 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> -<properties> - <service-loader initializableProblemHandler="throw"/> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> - <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> - </parser> - <parser class="org.apache.tika.parser.pdf.PDFParser"> - <params> - <param name="extractActions" type="bool">true</param> - <param name="extractInlineImages" type="bool">true</param> - <param name="extractIncrementalUpdateInfo" type="bool">true</param> - <param name="parseIncrementalUpdates" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> - <params> - <param name="includeDeletedContent" type="bool">true</param> - <param name="includeMoveFromContent" type="bool">true</param> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - <parser class="org.apache.tika.parser.microsoft.OfficeParser"> - <params> - <param name="extractMacros" type="bool">true</param> - </params> - </parser> - </parsers> -</properties> diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java index e0679aab1..5351078e9 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java @@ -139,7 +139,7 @@ public class TikaCLIAsyncTest { json++; } } - assertEquals(21, json); + assertEquals(18, json); } private void checkForPrettyPrint(File f) throws IOException { diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 5a05e37a5..7b6628887 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -42,6 +42,7 @@ import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -58,6 +59,7 @@ import org.apache.tika.utils.StringUtils; public class TikaCLITest { static final File TEST_DATA_FILE = new File("src/test/resources/test-data"); + static final File CONFIGS_DIR = new File("src/test/resources/configs"); private final URI testDataURI = TEST_DATA_FILE.toURI(); @TempDir private Path extractDir; @@ -246,8 +248,7 @@ public class TikaCLITest { public void testJsonMetadataPrettyPrintOutput() throws Exception { String json = getParamOutContent("--json", "-r", resourcePrefix + "testJsonMultipleInts.html"); - assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.CompositeParser\", " + - "\"org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser\" ],")); + assertTrue(json.contains("org.apache.tika.parser.CompositeParser\", \"org.apache.tika.parser.html.JSoupParser")); //test pretty-print alphabetic sort of keys int enc = json.indexOf("\"Content-Encoding\""); int fb = json.indexOf("fb:admins"); @@ -550,20 +551,11 @@ public class TikaCLITest { @Test public void testConfig() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config1.xml", resourcePrefix + "bad_xml.xml"); + String content = getParamOutContent("--config=" + CONFIGS_DIR.toString() + "/tika-config1.json", resourcePrefix + "bad_xml.xml"); assertTrue(content.contains("apple")); assertTrue(content.contains("org.apache.tika.parser.html.JSoupParser")); } - @Test - public void testConfigIgnoreInit() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/TIKA-2389-ignore-init-problems.xml", resourcePrefix + "test_recursive_embedded.docx"); - assertTrue(content.contains("embed_1a")); - //TODO: add a real unit test that configures logging to a file to test that nothing is - //written at the various logging levels - } - - @Test public void testJsonRecursiveMetadataParserMetadataOnly() throws Exception { String content = getParamOutContent("-m", "-J", "-r", resourcePrefix + "test_recursive_embedded.docx"); @@ -594,6 +586,7 @@ public class TikaCLITest { } @Test + @Disabled("until we re-implement serialization") public void testConfigSerializationStaticAndCurrent() throws Exception { String content = getParamOutContent("--dump-static-config"); //make sure at least one detector is there @@ -610,8 +603,9 @@ public class TikaCLITest { } @Test + @Disabled("until we re-implement serialization") public void testConfigSerializationCustomMinimal() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config2.xml", "--dump-minimal-config").replaceAll("[\r\n\t ]+", " "); + String content = getParamOutContent("--config=" + CONFIGS_DIR.toString() + "/tika-config2.json", "--dump-minimal-config").replaceAll("[\r\n\t ]+", " "); String expected = "<parser class=\"org.apache.tika.parser.DefaultParser\">" + " <mime-exclude>application/pdf</mime-exclude>" + " <mime-exclude>image/jpeg</mime-exclude> " + @@ -620,8 +614,9 @@ public class TikaCLITest { } @Test + @Disabled("until we re-implement serialization") public void testConfigSerializationCustomStatic() throws Exception { - String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config2.xml", "--dump-static-config"); + String content = getParamOutContent("--config=" + TEST_DATA_FILE.toString() + "/tika-config2.json", "--dump-static-config"); assertFalse(content.contains("org.apache.tika.parser.executable.Executable")); } diff --git a/tika-app/src/test/resources/configs/tika-config1.json b/tika-app/src/test/resources/configs/tika-config1.json new file mode 100644 index 000000000..e4cdbaf96 --- /dev/null +++ b/tika-app/src/test/resources/configs/tika-config1.json @@ -0,0 +1,18 @@ +{ + "parsers": [ + { + "jsoup-parser": { + "_decorate": { + "mimeInclude": [ + "application/vnd.wap.xhtml+xml", + "application/x-asp", + "application/xhtml+xml", + "text/html", + "application/xml", + "text/xml" + ] + } + } + } + ] +} diff --git a/tika-app/src/test/resources/configs/tika-config2.json b/tika-app/src/test/resources/configs/tika-config2.json new file mode 100644 index 000000000..0f3cf8ac4 --- /dev/null +++ b/tika-app/src/test/resources/configs/tika-config2.json @@ -0,0 +1,26 @@ +{ + "parsers": [ + { + "default-parser": { + "_decorate": { + "mimeExclude": [ + "image/jpeg", + "application/pdf" + ], + "parserExclude": [ + "org.apache.tika.parser.executable.ExecutableParser" + ] + } + } + }, + { + "empty-parser": { + "_decorate": { + "mimeInclude": [ + "application/pdf" + ] + } + } + } + ] +} diff --git a/tika-app/src/test/resources/test-data/TIKA-2389-ignore-init-problems.xml b/tika-app/src/test/resources/test-data/TIKA-2389-ignore-init-problems.xml deleted file mode 100644 index 30af37d7b..000000000 --- a/tika-app/src/test/resources/test-data/TIKA-2389-ignore-init-problems.xml +++ /dev/null @@ -1,20 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<properties> - <service-loader initializableProblemHandler="ignore"/> -</properties> diff --git a/tika-app/src/test/resources/test-data/tika-config1.xml b/tika-app/src/test/resources/test-data/tika-config1.xml deleted file mode 100644 index 52f4f0949..000000000 --- a/tika-app/src/test/resources/test-data/tika-config1.xml +++ /dev/null @@ -1,13 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<properties> - <parsers> - <parser class="org.apache.tika.parser.html.JSoupParser"> - <mime>application/vnd.wap.xhtml+xml</mime> - <mime>application/x-asp</mime> - <mime>application/xhtml+xml</mime> - <mime>text/html</mime> - <mime>application/xml</mime> - <mime>text/xml</mime> - </parser> - </parsers> -</properties> diff --git a/tika-app/src/test/resources/test-data/tika-config2.xml b/tika-app/src/test/resources/test-data/tika-config2.xml deleted file mode 100644 index 3a511ed7d..000000000 --- a/tika-app/src/test/resources/test-data/tika-config2.xml +++ /dev/null @@ -1,14 +0,0 @@ -<properties> - <parsers> - <parser class="org.apache.tika.parser.DefaultParser"> - <mime-exclude>image/jpeg</mime-exclude> - <mime-exclude>application/pdf</mime-exclude> - <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/> - <parser-exclu class="org.apache.tika.parser.executable.ExecutableParser2"/> - </parser> - <parser class="org.apache.tika.parser.EmptyParser"> - <mime>application/pdf</mime> - <no-mime>hello/world</no-mime> - </parser> - </parsers> -</properties> \ No newline at end of file
