This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9ef776bf0 TIKA-4552 -- add tool to cover most cases of converting
parsers from … (#2423)
9ef776bf0 is described below
commit 9ef776bf02e588eeaa551cfcb716543657fc30ab
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 4 18:14:40 2025 -0500
TIKA-4552 -- add tool to cover most cases of converting parsers from …
(#2423)
* TIKA-4552 -- add tool to cover most cases of converting parsers from the
legacy xml to json.
Generated-by: Claude Sonnet 4.5 (claude-sonnet-4-5-20250929)
Significant design and implementation with Claude
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 32 ++
.../apache/tika/cli/XmlToJsonConfigConverter.java | 606 +++++++++++++++++++++
.../tika/cli/XmlToJsonConfigConverterTest.java | 330 +++++++++++
.../xml-configs/tika-config-list-map-types.xml | 38 ++
.../xml-configs/tika-config-numeric-types.xml | 30 +
.../tika-config-redundant-exclusion.xml | 42 ++
.../resources/xml-configs/tika-config-simple.xml | 30 +
.../xml-configs/tika-config-with-excludes.xml | 32 ++
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 5 +-
.../tika/config/loader/KebabCaseConverter.java | 4 +-
.../apache/tika/config/loader/ParserLoader.java | 9 +
.../apache/tika/config/loader/TikaLoaderTest.java | 38 ++
12 files changed, 1191 insertions(+), 5 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index f7d933090..91cb313b2 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -424,6 +424,9 @@ public class TikaCLI {
} else if (arg.equals("--dump-static-full-config")) {
pipeMode = false;
dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
+ } else if (arg.startsWith("--convert-config-xml-to-json=")) {
+ pipeMode = false;
+
convertConfigXmlToJson(arg.substring("--convert-config-xml-to-json=".length()));
} else if (arg.equals("--container-aware") ||
arg.equals("--container-aware-detector")) {
// ignore, as container-aware detectors are now always used
} else if (arg.equals("-f") || arg.equals("--fork")) {
@@ -520,6 +523,33 @@ public class TikaCLI {
TikaConfigSerializer.serialize(localConfig, mode, new
OutputStreamWriter(System.out, UTF_8), UTF_8);
}
+    /**
+     * Handles {@code --convert-config-xml-to-json=<input.xml>,<output.json>}.
+     * <p>
+     * The argument value is split on commas into an input and an output path. Usage
+     * problems (not exactly two paths, input file does not exist) are reported on stderr
+     * and the method returns without converting. A failing conversion is reported on
+     * stderr and the exception is rethrown.
+     *
+     * @param paths the text that followed the {@code =} on the command line
+     * @throws Exception if the conversion fails
+     */
+    private void convertConfigXmlToJson(String paths) throws Exception {
+        String[] inputAndOutput = paths.split(",");
+        if (inputAndOutput.length != 2) {
+            System.err.println("Error: --convert-config-xml-to-json requires input and output paths separated by comma");
+            System.err.println("Usage: --convert-config-xml-to-json=<input.xml>,<output.json>");
+            return;
+        }
+
+        Path source = Paths.get(inputAndOutput[0].trim());
+        Path target = Paths.get(inputAndOutput[1].trim());
+        if (!Files.exists(source)) {
+            System.err.println("Error: Input XML file not found: " + source);
+            return;
+        }
+
+        try {
+            XmlToJsonConfigConverter.convert(source, target);
+            System.out.println("Successfully converted XML config to JSON:");
+            System.out.println(" Input: " + source.toAbsolutePath());
+            System.out.println(" Output: " + target.toAbsolutePath());
+        } catch (Exception failure) {
+            // Tell the user what went wrong, then let the caller see the failure too.
+            System.err.println("Error converting config: " + failure.getMessage());
+            throw failure;
+        }
+    }
+
private void handleRecursiveJson(URL url, OutputStream output) throws
IOException, SAXException, TikaException {
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
@@ -569,6 +599,8 @@ public class TikaCLI {
out.println(" --dump-current-config Print current TikaConfig");
out.println(" --dump-static-config Print static config");
out.println(" --dump-static-full-config Print static explicit
config");
+ out.println("
--convert-config-xml-to-json=<input.xml>,<output.json>");
+ out.println(" Convert legacy XML config to JSON format (parsers
section only)");
out.println("");
out.println(" -x or --xml Output XHTML content
(default)");
out.println(" -h or --html Output HTML content");
diff --git
a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
new file mode 100644
index 000000000..c8d8945fa
--- /dev/null
+++ b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
@@ -0,0 +1,606 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import org.apache.tika.config.loader.ComponentRegistry;
+import org.apache.tika.config.loader.KebabCaseConverter;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Converts legacy XML Tika configuration files to the new JSON format.
+ * <p>
+ * Currently supports converting the "parsers" section of tika-config.xml files
+ * for parsers in the tika-parsers-standard module.
+ * <p>
+ * Supports parameter types: bool, int, long, double, float, string, list, and
map.
+ * <p>
+ * <strong>Special Case:</strong> TesseractOCR's {@code
otherTesseractSettings} list
+ * (containing space-delimited key-value pairs) is automatically converted to
the
+ * {@code otherTesseractConfig} map format expected by the JSON configuration.
+ * <p>
+ * Example usage:
+ * <pre>
+ * XmlToJsonConfigConverter.convert(
+ * Paths.get("tika-config.xml"),
+ * Paths.get("tika-config.json")
+ * );
+ * </pre>
+ *
+ * <p>XML Format (with various parameter types):
+ * <pre>
+ * <properties>
+ * <parsers>
+ * <parser class="org.apache.tika.parser.pdf.PDFParser">
+ * <params>
+ * <param name="sortByPosition" type="bool">true</param>
+ * <param name="maxPages" type="int">1000</param>
+ * </params>
+ * </parser>
+ * <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+ * <params>
+ * <!-- Special case: space-delimited key-value pairs -->
+ * <param name="otherTesseractSettings" type="list">
+ * <string>textord_initialx_ile 0.75</string>
+ * <string>textord_noise_hfract 0.15625</string>
+ * </param>
+ * <param name="envVars" type="map">
+ *
<TESSDATA_PREFIX>/usr/share/tesseract</TESSDATA_PREFIX>
+ * </param>
+ * </params>
+ * </parser>
+ * <parser class="org.apache.tika.parser.DefaultParser">
+ * <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ * </parser>
+ * </parsers>
+ * </properties>
+ * </pre>
+ *
+ * <p>JSON Format:
+ * <pre>
+ * {
+ * "parsers": [
+ * {
+ * "pdf-parser": {
+ * "sortByPosition": true,
+ * "maxPages": 1000
+ * }
+ * },
+ * {
+ * "tesseract-ocr-parser": {
+ * "otherTesseractConfig": {
+ * "textord_initialx_ile": "0.75",
+ * "textord_noise_hfract": "0.15625"
+ * },
+ * "envVars": {
+ * "TESSDATA_PREFIX": "/usr/share/tesseract"
+ * }
+ * }
+ * },
+ * {
+ * "default-parser": {
+ * "exclude": ["pdf-parser"]
+ * }
+ * }
+ * ]
+ * }
+ * </pre>
+ */
+public class XmlToJsonConfigConverter {
+
+ // Logger for the informational and warning messages emitted during conversion.
+ private static final Logger LOG =
LoggerFactory.getLogger(XmlToJsonConfigConverter.class);
+
+ // Use a plain ObjectMapper for clean JSON output without @class annotations.
+ // A single instance is shared by every conversion done through this class.
+ private static final ObjectMapper MAPPER = new ObjectMapper();
+
+ // Not instantiable: the class only exposes static conversion methods.
+ private XmlToJsonConfigConverter() {
+ // Utility class
+ }
+
+    /**
+     * Converts an XML Tika configuration file to JSON format.
+     * <p>
+     * The JSON is rendered in memory first and written to {@code jsonPath} only after the
+     * conversion has succeeded. A failed conversion therefore neither leaves an empty
+     * output file behind nor truncates an existing one, and the input is fully read
+     * before the output is touched.
+     *
+     * @param xmlPath path to the XML configuration file
+     * @param jsonPath path where the JSON output should be written
+     * @throws TikaConfigException if conversion fails
+     * @throws IOException if file I/O fails
+     */
+    public static void convert(Path xmlPath, Path jsonPath) throws TikaConfigException, IOException {
+        // Configuration files are small, so buffering the whole result is cheap.
+        ByteArrayOutputStream rendered = new ByteArrayOutputStream();
+        try (InputStream in = Files.newInputStream(xmlPath)) {
+            convert(in, rendered);
+        }
+        // Default options (create, truncate, write) match Files.newOutputStream.
+        Files.write(jsonPath, rendered.toByteArray());
+    }
+
+    /**
+     * Converts an XML Tika configuration stream to JSON format by delegating to the
+     * three-argument overload with the current thread's context class loader.
+     *
+     * @param xmlInput input stream containing XML configuration
+     * @param jsonOutput output stream where JSON will be written
+     * @throws TikaConfigException if conversion fails
+     * @throws IOException if stream I/O fails
+     */
+    public static void convert(InputStream xmlInput, OutputStream jsonOutput)
+            throws TikaConfigException, IOException {
+        ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
+        convert(xmlInput, jsonOutput, contextLoader);
+    }
+
+    /**
+     * Converts an XML Tika configuration stream to JSON format.
+     * <p>
+     * The JSON is written as pretty-printed UTF-8. The writer wrapped around
+     * {@code jsonOutput} is closed when writing finishes, which also closes
+     * {@code jsonOutput}.
+     *
+     * @param xmlInput input stream containing XML configuration
+     * @param jsonOutput output stream where JSON will be written
+     * @param classLoader class loader to use for component registry
+     * @throws TikaConfigException if the configuration is invalid or conversion fails for
+     *         any reason other than stream I/O
+     * @throws IOException if stream I/O fails
+     */
+    public static void convert(InputStream xmlInput, OutputStream jsonOutput, ClassLoader classLoader)
+            throws TikaConfigException, IOException {
+        try {
+            // Load component registry to properly map class names to component names
+            ComponentRegistry parserRegistry = new ComponentRegistry("parsers", classLoader);
+
+            Document doc = XMLReaderUtils.buildDOM(xmlInput);
+            Map<String, Object> jsonConfig = convertDocument(doc, parserRegistry);
+
+            try (Writer writer = new OutputStreamWriter(jsonOutput, StandardCharsets.UTF_8)) {
+                MAPPER.writerWithDefaultPrettyPrinter().writeValue(writer, jsonConfig);
+            }
+        } catch (TikaConfigException | IOException e) {
+            // Already one of the declared types: rethrow unchanged so the specific message
+            // (e.g. the invalid root element) is not buried under a generic wrapper and
+            // I/O failures surface as the documented IOException.
+            throw e;
+        } catch (Exception e) {
+            throw new TikaConfigException("Failed to convert XML config to JSON", e);
+        }
+    }
+
+    /**
+     * Builds the JSON-compatible map for a whole XML configuration document.
+     * <p>
+     * Only {@code <parsers>} children of the root are converted; every other child of
+     * {@code <properties>} is ignored.
+     *
+     * @throws TikaConfigException if the root element is not {@code <properties>}
+     */
+    private static Map<String, Object> convertDocument(Document doc, ComponentRegistry parserRegistry)
+            throws TikaConfigException {
+        Map<String, Object> result = new LinkedHashMap<>();
+
+        Element root = doc.getDocumentElement();
+        String rootName = root.getNodeName();
+        if (!"properties".equals(rootName)) {
+            throw new TikaConfigException(
+                    "Invalid XML config: root element must be <properties>, found: " + rootName);
+        }
+
+        for (Node node = root.getFirstChild(); node != null; node = node.getNextSibling()) {
+            boolean isParsersSection =
+                    node.getNodeType() == Node.ELEMENT_NODE && "parsers".equals(node.getNodeName());
+            if (isParsersSection) {
+                result.put("parsers", convertParsersSection((Element) node, parserRegistry));
+            }
+            // Future: add support for detectors, translators, etc.
+        }
+
+        return result;
+    }
+
+    /**
+     * Turns the {@code <parsers>} section into the list that becomes the JSON
+     * {@code "parsers"} array, then checks the result for redundant exclusions.
+     *
+     * @return one single-entry map per converted {@code <parser>} element, in document order
+     */
+    private static List<Map<String, Object>> convertParsersSection(Element parsersElement,
+            ComponentRegistry parserRegistry) throws TikaConfigException {
+        // NOTE(review): getElementsByTagName matches every descendant, so <parser>
+        // elements nested inside another <parser> are emitted as top-level entries --
+        // confirm that flattening is intended.
+        NodeList parserNodes = parsersElement.getElementsByTagName("parser");
+        int total = parserNodes.getLength();
+
+        List<Map<String, Object>> converted = new ArrayList<>();
+        for (int idx = 0; idx < total; idx++) {
+            Map<String, Object> entry =
+                    convertParserElement((Element) parserNodes.item(idx), parserRegistry);
+            if (entry == null || entry.isEmpty()) {
+                continue;
+            }
+            converted.add(entry);
+        }
+
+        // Check for redundant exclusions and inform users
+        checkForRedundantExclusions(converted);
+        return converted;
+    }
+
+    /**
+     * Reports parsers that appear both in the {@code exclude} list of
+     * {@code default-parser} and as configured entries of their own.
+     * <p>
+     * Nothing is modified. When such overlaps exist an explanatory notice is logged at
+     * INFO level; otherwise the method is silent.
+     *
+     * @param parsersList converted parser entries, each a map keyed by component name
+     */
+    private static void checkForRedundantExclusions(List<Map<String, Object>> parsersList) {
+        Set<String> excludedParsers = new HashSet<>();
+        Set<String> configuredParsers = new HashSet<>();
+
+        // One pass: default-parser entries contribute exclusions, all others are "configured".
+        for (Map<String, Object> parserEntry : parsersList) {
+            for (Map.Entry<String, Object> named : parserEntry.entrySet()) {
+                if (!"default-parser".equals(named.getKey())) {
+                    configuredParsers.add(named.getKey());
+                    continue;
+                }
+                Map<?, ?> defaultConfig = (Map<?, ?>) named.getValue();
+                if (defaultConfig.containsKey("exclude")) {
+                    @SuppressWarnings("unchecked")
+                    List<String> excludes = (List<String>) defaultConfig.get("exclude");
+                    excludedParsers.addAll(excludes);
+                }
+            }
+        }
+
+        Set<String> redundantExclusions = new HashSet<>(excludedParsers);
+        redundantExclusions.retainAll(configuredParsers);
+        if (redundantExclusions.isEmpty()) {
+            return;
+        }
+
+        String rule = "=".repeat(80);
+        LOG.info(rule);
+        LOG.info("CONFIGURATION OPTIMIZATION NOTICE");
+        LOG.info(rule);
+        LOG.info("");
+        LOG.info("The following parsers are excluded from default-parser but also configured separately:");
+        for (String parserName : redundantExclusions) {
+            LOG.info(" - {}", parserName);
+        }
+
+        String[] explanation = {
+            "",
+            "This exclusion is redundant. When you configure a parser with specific settings,",
+            "the loader excludes loading that parser from SPI. You can remove these",
+            "exclusions from your default-parser configuration.",
+            "",
+            "Example - Instead of:",
+            " {",
+            " \"default-parser\": {",
+            " \"exclude\": [\"pdf-parser\"]",
+            " }",
+            " },",
+            " {",
+            " \"pdf-parser\": {",
+            " \"sortByPosition\": true",
+            " }",
+            " }",
+            "",
+            "Simply use:",
+            " {",
+            " \"default-parser\": {},",
+            " \"pdf-parser\": {",
+            " \"sortByPosition\": true",
+            " }",
+            " }",
+            "",
+        };
+        for (String line : explanation) {
+            LOG.info(line);
+        }
+        LOG.info(rule);
+    }
+
+    /**
+     * Converts one {@code <parser>} element into a single-entry map.
+     * <p>
+     * Direct {@code <params>} children contribute configuration values; direct
+     * {@code <parser-exclude>} children are collected, by component name, into an
+     * {@code "exclude"} list that is added only when it is non-empty.
+     *
+     * @return map with single entry: { "parser-name": { config... } }
+     * @throws TikaConfigException if the element has no {@code class} attribute
+     */
+    private static Map<String, Object> convertParserElement(Element parserElement,
+            ComponentRegistry parserRegistry) throws TikaConfigException {
+        String className = parserElement.getAttribute("class");
+        if (className == null || className.isEmpty()) {
+            throw new TikaConfigException("Parser element missing 'class' attribute");
+        }
+
+        // Convert class name to component name using the registry
+        String componentName = classNameToComponentName(className, parserRegistry);
+
+        Map<String, Object> settings = new LinkedHashMap<>();
+        List<String> excluded = new ArrayList<>();
+
+        for (Node node = parserElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            if (node.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            Element child = (Element) node;
+            String tag = child.getNodeName();
+
+            if ("parser-exclude".equals(tag)) {
+                String excludedClass = child.getAttribute("class");
+                if (excludedClass != null && !excludedClass.isEmpty()) {
+                    excluded.add(classNameToComponentName(excludedClass, parserRegistry));
+                }
+            } else if ("params".equals(tag)) {
+                settings.putAll(convertParamsElement(child));
+            }
+        }
+
+        if (!excluded.isEmpty()) {
+            settings.put("exclude", excluded);
+        }
+
+        Map<String, Object> entry = new LinkedHashMap<>();
+        entry.put(componentName, settings);
+        return entry;
+    }
+
+    /**
+     * Converts a {@code <params>} element into a map from parameter name to value.
+     * <p>
+     * {@code <param>} elements without a {@code name} attribute are skipped. The
+     * {@code otherTesseractSettings} parameter of type {@code list} is special-cased: it
+     * is stored under {@code otherTesseractConfig} as a key/value map.
+     */
+    private static Map<String, Object> convertParamsElement(Element paramsElement) {
+        Map<String, Object> converted = new LinkedHashMap<>();
+
+        NodeList paramNodes = paramsElement.getElementsByTagName("param");
+        for (int idx = 0, total = paramNodes.getLength(); idx < total; idx++) {
+            Element param = (Element) paramNodes.item(idx);
+            String name = param.getAttribute("name");
+            if (name == null || name.isEmpty()) {
+                continue;
+            }
+
+            String type = param.getAttribute("type");
+            boolean isTesseractSettings =
+                    "otherTesseractSettings".equals(name) && "list".equals(type);
+            if (isTesseractSettings) {
+                // List of space-delimited key-value pairs -> otherTesseractConfig map
+                converted.put("otherTesseractConfig", convertTesseractSettingsList(param));
+            } else {
+                converted.put(name, convertParamValue(param, type));
+            }
+        }
+
+        return converted;
+    }
+
+    /**
+     * Special handler for TesseractOCR's {@code otherTesseractSettings} list.
+     * <p>
+     * Each direct {@code <string>} child holds a {@code key value} pair. The text is
+     * trimmed and split at its first space; entries without a space are logged at WARN
+     * level and skipped.
+     * <p>
+     * XML Format:
+     * <pre>
+     * <param name="otherTesseractSettings" type="list">
+     *   <string>textord_initialx_ile 0.75</string>
+     *   <string>textord_noise_hfract 0.15625</string>
+     * </param>
+     * </pre>
+     * <p>
+     * JSON Output (as otherTesseractConfig):
+     * <pre>
+     * "otherTesseractConfig": {
+     *   "textord_initialx_ile": "0.75",
+     *   "textord_noise_hfract": "0.15625"
+     * }
+     * </pre>
+     */
+    private static Map<String, String> convertTesseractSettingsList(Element paramElement) {
+        Map<String, String> settings = new LinkedHashMap<>();
+
+        // Walk direct children only; nested <string> elements are not settings.
+        for (Node node = paramElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            if (node.getNodeType() != Node.ELEMENT_NODE || !"string".equals(node.getNodeName())) {
+                continue;
+            }
+            String pair = node.getTextContent().trim();
+            int firstSpace = pair.indexOf(' ');
+            if (firstSpace <= 0) {
+                LOG.warn("Ignoring malformed Tesseract setting (expected 'key value'): {}", pair);
+                continue;
+            }
+            String key = pair.substring(0, firstSpace).trim();
+            String value = pair.substring(firstSpace + 1).trim();
+            settings.put(key, value);
+        }
+
+        return settings;
+    }
+
+    /**
+     * Converts the value of a {@code <param>} element according to its declared type.
+     * <p>
+     * The type is matched case-insensitively:
+     * <ul>
+     *   <li>list - child {@code <string>} elements become a JSON array</li>
+     *   <li>map - child elements become a JSON object keyed by element name</li>
+     *   <li>bool/boolean, int/integer, long, double/float - the trimmed text is parsed</li>
+     * </ul>
+     * A missing or unknown type, empty text, or text that does not parse as the declared
+     * numeric type yields the trimmed text as a string.
+     */
+    private static Object convertParamValue(Element paramElement, String type) {
+        // A missing type behaves like an unknown one: the text is returned unchanged.
+        String typeKey = (type == null) ? "" : type.toLowerCase(Locale.ROOT);
+
+        // Collection types are built from child elements rather than text content
+        if ("list".equals(typeKey)) {
+            return convertListParam(paramElement);
+        }
+        if ("map".equals(typeKey)) {
+            return convertMapParam(paramElement);
+        }
+
+        String text = paramElement.getTextContent().trim();
+        if (text.isEmpty()) {
+            return text;
+        }
+
+        try {
+            switch (typeKey) {
+                case "bool":
+                case "boolean":
+                    return Boolean.parseBoolean(text);
+                case "int":
+                case "integer":
+                    return Integer.parseInt(text);
+                case "long":
+                    return Long.parseLong(text);
+                case "double":
+                case "float":
+                    return Double.parseDouble(text);
+                default:
+                    // Unknown type, return as string
+                    return text;
+            }
+        } catch (NumberFormatException notNumeric) {
+            // Keep the raw text rather than failing the whole conversion.
+            return text;
+        }
+    }
+
+    /**
+     * Converts a list parameter by collecting the trimmed text of its direct
+     * {@code <string>} children, in document order.
+     * <p>
+     * XML Format:
+     * <pre>
+     * <param name="languages" type="list">
+     *   <string>en</string>
+     *   <string>fr</string>
+     * </param>
+     * </pre>
+     * <p>
+     * JSON Output: ["en", "fr"]
+     */
+    private static List<String> convertListParam(Element paramElement) {
+        List<String> values = new ArrayList<>();
+
+        // Only direct children count; <string> elements nested deeper are ignored.
+        for (Node node = paramElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            boolean isStringElement =
+                    node.getNodeType() == Node.ELEMENT_NODE && "string".equals(node.getNodeName());
+            if (isStringElement) {
+                values.add(node.getTextContent().trim());
+            }
+        }
+
+        return values;
+    }
+
+    /**
+     * Converts a map parameter: every direct child element becomes one entry whose key is
+     * the element name and whose value is the element's trimmed text content. A repeated
+     * element name overwrites the earlier value.
+     * <p>
+     * XML Format:
+     * <pre>
+     * <param name="captureMap" type="map">
+     *   <title>^Title: ([^\r\n]+)</title>
+     *   <author>^Author: ([^\r\n]+)</author>
+     * </param>
+     * </pre>
+     * <p>
+     * JSON Output: {"title": "^Title: ([^\\r\\n]+)", "author": "^Author: ([^\\r\\n]+)"}
+     */
+    private static Map<String, String> convertMapParam(Element paramElement) {
+        Map<String, String> entries = new LinkedHashMap<>();
+
+        for (Node node = paramElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            if (node.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            entries.put(node.getNodeName(), node.getTextContent().trim());
+        }
+
+        return entries;
+    }
+
+    /**
+     * Converts a full Java class name to a component name.
+     * <p>
+     * Performs a reverse lookup in the ComponentRegistry, respecting custom component
+     * names from {@code @TikaComponent} annotations. Falls back to kebab-case conversion
+     * of the simple class name if the class is not in the registry.
+     * <p>
+     * The lookup compares class <em>names</em> and never loads the class itself. Loading
+     * it through the thread context class loader could yield a {@code Class} object that
+     * differs from the one the registry obtained from its own class loader (so the match
+     * would silently fail), and would throw if the context class loader is {@code null}.
+     * <p>
+     * Examples:
+     * <ul>
+     *   <li>org.apache.tika.parser.pdf.PDFParser → pdf-parser</li>
+     *   <li>org.apache.tika.parser.DefaultParser → default-parser</li>
+     *   <li>org.apache.tika.parser.html.JSoupParser → jsoup-parser (from @TikaComponent annotation)</li>
+     * </ul>
+     *
+     * @param fullClassName fully qualified class name as written in the XML config
+     * @param registry registry mapping component names to component classes
+     * @return the registered component name, or the kebab-cased simple class name
+     */
+    private static String classNameToComponentName(String fullClassName, ComponentRegistry registry) {
+        // Reverse lookup: find the component name registered for this class name
+        for (Map.Entry<String, Class<?>> entry : registry.getAllComponents().entrySet()) {
+            if (entry.getValue().getName().equals(fullClassName)) {
+                return entry.getKey();
+            }
+        }
+
+        // Fallback: kebab-case the simple name (lastIndexOf is -1 when there is no
+        // package, so substring(0) keeps the whole name)
+        String simpleClassName = fullClassName.substring(fullClassName.lastIndexOf('.') + 1);
+        return KebabCaseConverter.toKebabCase(simpleClassName);
+    }
+}
diff --git
a/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
new file mode 100644
index 000000000..98671c37b
--- /dev/null
+++
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.JSoupParser;
+import org.apache.tika.parser.pdf.PDFParser;
+
+/**
+ * Tests for XmlToJsonConfigConverter.
+ * These tests verify that XML configurations are correctly converted to JSON
+ * and can be loaded by TikaLoader to produce properly configured parsers.
+ */
+public class XmlToJsonConfigConverterTest {
+
+ @Test
+ public void testSimpleParserConfig(@TempDir Path tempDir) throws Exception
{
+ Path xmlPath =
Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+ Path jsonPath = tempDir.resolve("simple-config.json");
+
+ // Convert XML to JSON
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Verify JSON file was created
+ assertTrue(Files.exists(jsonPath));
+
+ // Load the JSON config with TikaLoader
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser parser = loader.loadParsers();
+
+ assertNotNull(parser);
+ assertTrue(parser instanceof CompositeParser);
+
+ // Verify PDF parser is configured
+ CompositeParser compositeParser = (CompositeParser) parser;
+ ParseContext context = new ParseContext();
+ Map<MediaType, Parser> parsers = compositeParser.getParsers(context);
+
+ // Check that PDF parser is present
+ MediaType pdfType = MediaType.parse("application/pdf");
+ assertTrue(parsers.containsKey(pdfType), "PDF parser should be
configured");
+
+ Parser pdfParser = parsers.get(pdfType);
+ assertTrue(pdfParser instanceof PDFParser, "Parser for PDF should be
PDFParser");
+
+ // The actual parser configuration (sortByPosition,
extractInlineImages, etc.)
+ // is tested by the parser's behavior, not directly accessible here
+ }
+
+ @Test
+ public void testParserWithExcludes(@TempDir Path tempDir) throws Exception
{
+ Path xmlPath =
Paths.get(getClass().getResource("/xml-configs/tika-config-with-excludes.xml").toURI());
+ Path jsonPath = tempDir.resolve("excludes-config.json");
+
+ // Convert XML to JSON
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Print JSON for debugging
+ String json = new String(Files.readAllBytes(jsonPath),
StandardCharsets.UTF_8);
+ System.out.println("Generated JSON:");
+ System.out.println(json);
+
+ // Verify exclude is at the correct level (not under _decorate)
+ assertTrue(json.contains("\"exclude\""), "Should have exclude array");
+ assertFalse(json.contains("\"_decorate\""), "_decorate should not be
used for parser excludes");
+ assertTrue(json.contains("\"jsoup-parser\""), "Should exclude
jsoup-parser");
+ assertTrue(json.contains("\"pdf-parser\""), "Should exclude
pdf-parser");
+
+ // Load the JSON config with TikaLoader
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser parser = loader.loadParsers();
+
+ assertNotNull(parser);
+ assertTrue(parser instanceof CompositeParser);
+
+ // Verify parsers are configured
+ CompositeParser compositeParser = (CompositeParser) parser;
+ for (Parser p : ((CompositeParser) parser).getAllComponentParsers()) {
+ if (p instanceof PDFParser) {
+ fail("pdf parser should have been excluded");
+ }
+ }
+ ParseContext context = new ParseContext();
+ Map<MediaType, Parser> parsers = compositeParser.getParsers(context);
+
+ // Check that HTML parser is present (JSoupParser should be configured)
+ MediaType htmlType = MediaType.parse("text/html");
+ assertTrue(parsers.containsKey(htmlType), "HTML parser should be
configured");
+
+ Parser htmlParser = parsers.get(htmlType);
+ // JSoupParser extends HtmlParser, so this checks for the correct
family
+ assertTrue(htmlParser instanceof JSoupParser, "Parser for HTML should
be HtmlParser or JSoupParser");
+ }
+
+ @Test
+ public void testNumericTypes(@TempDir Path tempDir) throws Exception {
+ Path xmlPath =
Paths.get(getClass().getResource("/xml-configs/tika-config-numeric-types.xml").toURI());
+ Path jsonPath = tempDir.resolve("numeric-config.json");
+
+ // Convert XML to JSON
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Verify JSON file was created and contains proper numeric types
+ assertTrue(Files.exists(jsonPath));
+
+ // Read the JSON to verify numeric types are preserved
+ String json = new String(Files.readAllBytes(jsonPath),
StandardCharsets.UTF_8);
+
+ // Verify numbers are not quoted (they should appear as: "density":
300, not "density": "300")
+ assertTrue(json.contains("\"density\" : 300"), "density should be
numeric, not string");
+ assertFalse(json.contains("\"timeout\" : \"300\""), "timeout should
not be a quoted string");
+
+ // Load the JSON config with TikaLoader to verify it's valid
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
+ }
+
+    /**
+     * Exercises the {@code Path}-based {@code convert} overload end to end: the output
+     * file must exist and load as a {@link CompositeParser}.
+     */
+    @Test
+    public void testFileConversion(@TempDir Path tempDir) throws Exception {
+        Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+        Path jsonPath = tempDir.resolve("output.json");
+
+        XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+        assertTrue(Files.exists(jsonPath));
+
+        Parser loaded = TikaLoader.load(jsonPath).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+    }
+
+ @Test
+ public void testClassNameConversion(@TempDir Path tempDir) throws
Exception {
+ Path xmlPath =
Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+ Path jsonPath = tempDir.resolve("classname-config.json");
+
+ // Convert XML to JSON
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Read JSON and verify component name conversion
+ String json = new String(Files.readAllBytes(jsonPath),
StandardCharsets.UTF_8);
+
+ // Verify that PDFParser was converted to pdf-parser (kebab-case)
+ assertTrue(json.contains("\"pdf-parser\""), "PDFParser should be
converted to pdf-parser");
+
+ // Verify the config loads successfully
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
+ }
+
+ @Test
+ public void testAutoDetectParserLoading(@TempDir Path tempDir) throws
Exception {
+ Path xmlPath =
Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+ Path jsonPath = tempDir.resolve("autodetect-config.json");
+
+ // Convert XML to JSON
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Load via TikaLoader and get AutoDetectParser
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser autoDetectParser = loader.loadAutoDetectParser();
+
+ assertNotNull(autoDetectParser);
+
+ // Verify it supports PDF type
+ ParseContext context = new ParseContext();
+ MediaType pdfType = MediaType.parse("application/pdf");
+
assertTrue(autoDetectParser.getSupportedTypes(context).contains(pdfType),
+ "AutoDetectParser should support PDF");
+ }
+
+    /**
+     * Covers the legacy layout in which parsers are excluded from the default
+     * parser and then configured on their own. The assertions require that the
+     * "exclude" entry is retained in the generated JSON and that, after loading,
+     * parsers are registered for both application/pdf and text/html.
+     */
+    @Test
+    public void testRedundantExclusionWarning(@TempDir Path tempDir) throws Exception {
+        Path source = Paths.get(
+                getClass().getResource("/xml-configs/tika-config-redundant-exclusion.xml").toURI());
+        Path target = tempDir.resolve("redundant-config.json");
+
+        // Conversion is expected to log an INFO message flagging the redundant exclusions
+        XmlToJsonConfigConverter.convert(source, target);
+
+        // Echo the generated JSON
+        String converted = new String(Files.readAllBytes(target), StandardCharsets.UTF_8);
+        System.out.println("Generated JSON with redundant exclusions:");
+        System.out.println(converted);
+
+        // Exclusions are reported, not removed, so they must still be present
+        assertTrue(converted.contains("\"exclude\""), "Should still have exclude array");
+        assertTrue(converted.contains("\"pdf-parser\""), "Should have pdf-parser configured");
+        assertTrue(converted.contains("\"jsoup-parser\""), "Should have jsoup-parser configured");
+
+        // The JSON must load into a composite parser
+        Parser loaded = TikaLoader.load(target).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+
+        // Both separately configured parsers must be registered for their media types
+        Map<MediaType, Parser> byType =
+                ((CompositeParser) loaded).getParsers(new ParseContext());
+        assertTrue(byType.containsKey(MediaType.parse("application/pdf")),
+                "PDF parser should be configured");
+        assertTrue(byType.containsKey(MediaType.parse("text/html")),
+                "HTML parser should be configured");
+    }
+
+ /**
+ * Writes an inline legacy XML config for TesseractOCRParser to the temp
+ * directory, converts it, and asserts that the "otherTesseractSettings"
+ * list of "key value" strings appears in the JSON as an
+ * "otherTesseractConfig" object with one entry per pair. Finally checks
+ * that the generated JSON can be loaded into a parser.
+ *
+ * @param tempDir JUnit-managed temporary directory holding the XML input
+ * and the JSON output
+ */
+ @Test
+ public void testTesseractArbitrarySettings(@TempDir Path tempDir) throws
Exception {
+ // Test the special case conversion of TesseractOCR's
otherTesseractSettings
+ String xmlConfig = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+ "<properties>\n" +
+ " <parsers>\n" +
+ " <parser
class=\"org.apache.tika.parser.ocr.TesseractOCRParser\">\n" +
+ " <params>\n" +
+ " <param name=\"otherTesseractSettings\"
type=\"list\">\n" +
+ " <string>textord_initialx_ile
0.75</string>\n" +
+ " <string>textord_noise_hfract
0.15625</string>\n" +
+ " </param>\n" +
+ " </params>\n" +
+ " </parser>\n" +
+ " </parsers>\n" +
+ "</properties>";
+
+ // The converter works on files, so the inline XML is written out first
+ Path xmlPath = tempDir.resolve("tesseract-arbitrary.xml");
+ Path jsonPath = tempDir.resolve("tesseract-arbitrary.json");
+ Files.write(xmlPath, xmlConfig.getBytes(StandardCharsets.UTF_8));
+
+ // Convert
+ XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+ // Echo the generated JSON to stdout
+ String json = new String(Files.readAllBytes(jsonPath),
StandardCharsets.UTF_8);
+ System.out.println("Tesseract arbitrary settings conversion:");
+ System.out.println(json);
+
+ // Verify conversion: list of space-delimited pairs -> map
+ // NOTE: the key/value checks below match the literal " : " separator,
+ // so they depend on the exact formatting of the generated JSON text.
+ assertTrue(json.contains("\"otherTesseractConfig\""),
+ "Should convert to otherTesseractConfig");
+ assertFalse(json.contains("\"otherTesseractSettings\""),
+ "Should not keep old parameter name");
+ assertTrue(json.contains("\"textord_initialx_ile\" : \"0.75\""),
+ "Should parse key-value pairs correctly");
+ assertTrue(json.contains("\"textord_noise_hfract\" : \"0.15625\""),
+ "Should parse second pair");
+
+ // Verify it loads via TikaLoader without errors
+ TikaLoader loader = TikaLoader.load(jsonPath);
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
+ }
+
+    /**
+     * Converts the list/map XML fixture and checks two things in the generated
+     * JSON text: the "otherTesseractSettings" list is emitted as an
+     * "otherTesseractConfig" object with one entry per space-delimited pair,
+     * and the int, bool and string parameters keep their JSON types. The
+     * output must also load into a composite parser.
+     */
+    @Test
+    public void testListAndMapParameterTypes(@TempDir Path tempDir) throws Exception {
+        Path source = Paths.get(
+                getClass().getResource("/xml-configs/tika-config-list-map-types.xml").toURI());
+        Path target = tempDir.resolve("list-map-config.json");
+        XmlToJsonConfigConverter.convert(source, target);
+
+        // Echo the generated JSON to help with debugging
+        String converted = new String(Files.readAllBytes(target), StandardCharsets.UTF_8);
+        System.out.println("Generated JSON with list and map types:");
+        System.out.println(converted);
+
+        // Special case: the list of "key value" strings becomes a map under a new name
+        assertTrue(converted.contains("\"otherTesseractConfig\" : {"),
+                "Should convert otherTesseractSettings list to otherTesseractConfig map");
+        assertFalse(converted.contains("\"otherTesseractSettings\""),
+                "Should not have old otherTesseractSettings name");
+        assertTrue(converted.contains("\"textord_initialx_ile\" : \"0.75\""),
+                "Should parse first key-value pair");
+        assertTrue(converted.contains("\"textord_noise_hfract\" : \"0.15625\""),
+                "Should parse second key-value pair");
+        assertTrue(converted.contains("\"preserve_interword_spaces\" : \"1\""),
+                "Should parse third key-value pair");
+
+        // Ordinary scalar parameters are carried over with their types
+        assertTrue(converted.contains("\"timeoutSeconds\" : 300"),
+                "Should have integer parameter");
+        assertTrue(converted.contains("\"enableImagePreprocessing\" : true"),
+                "Should have boolean parameter");
+        assertTrue(converted.contains("\"language\" : \"eng\""),
+                "Should have string parameter");
+
+        // The converted file must load into a composite parser
+        Parser loaded = TikaLoader.load(target).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+    }
+}
diff --git
a/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml
b/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml
new file mode 100644
index 000000000..33ffc2119
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+ <params>
+ <!-- Special Tesseract case: list of space-delimited key-value
pairs -->
+ <!-- This will be converted to otherTesseractConfig map in
JSON -->
+ <param name="otherTesseractSettings" type="list">
+ <string>textord_initialx_ile 0.75</string>
+ <string>textord_noise_hfract 0.15625</string>
+ <string>preserve_interword_spaces 1</string>
+ </param>
+ <!-- Regular parameters -->
+ <param name="timeoutSeconds" type="int">300</param>
+ <param name="enableImagePreprocessing" type="bool">true</param>
+ <param name="language">eng</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml
b/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml
new file mode 100644
index 000000000..99da01562
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+ <params>
+ <param name="maxFileSizeToOcr" type="long">10000000</param>
+ <param name="minFileSizeToOcr" type="long">0</param>
+ <param name="density" type="int">300</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml
b/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml
new file mode 100644
index 000000000..6caeecd7a
--- /dev/null
+++
b/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <parsers>
+ <!-- Old pattern: excluding parser from default, then configuring it
separately -->
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ <parser-exclude class="org.apache.tika.parser.html.JSoupParser"/>
+ </parser>
+
+ <!-- These parsers are configured separately, making the exclusions
above redundant -->
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ <param name="extractInlineImages" type="bool">false</param>
+ </params>
+ </parser>
+
+ <parser class="org.apache.tika.parser.html.JSoupParser">
+ <params>
+ <param name="extractScripts" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-app/src/test/resources/xml-configs/tika-config-simple.xml
b/tika-app/src/test/resources/xml-configs/tika-config-simple.xml
new file mode 100644
index 000000000..1a880dd68
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-simple.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ <param name="extractInlineImages" type="bool">false</param>
+ <param name="ocrRenderingStrategy"
type="string">TEXT_ONLY</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml
b/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml
new file mode 100644
index 000000000..9089428c3
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.html.JSoupParser"/>
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.html.JSoupParser">
+ <params>
+ <param name="extractScripts" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 799cd6ffd..7cad1c7ac 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -20,7 +20,6 @@ import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
-import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -82,7 +81,7 @@ public class TesseractOCRConfig implements Serializable {
// Maximum time (seconds) to wait for the ocring process termination
private int timeoutSeconds = 120;
// See addOtherTesseractConfig.
- private Map<String, String> otherTesseractConfig = new HashMap<>();
+ private HashMap<String, String> otherTesseractConfig = new HashMap<>();
private boolean inlineContent = false;
private String tesseractPath = "";
@@ -478,7 +477,7 @@ public class TesseractOCRConfig implements Serializable {
/**
* @see #addOtherTesseractConfig(String, String)
*/
- public Map<String, String> getOtherTesseractConfig() {
+ public HashMap<String, String> getOtherTesseractConfig() {
return otherTesseractConfig;
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
index 8a12a5033..fc434361d 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
@@ -36,7 +36,7 @@ import java.util.Locale;
* <li>TesseractOCRParser → tesseract-ocr-parser</li>
* </ul>
*/
-class KebabCaseConverter {
+public class KebabCaseConverter {
private KebabCaseConverter() {
// Utility class
@@ -48,7 +48,7 @@ class KebabCaseConverter {
* @param className the simple class name (without package)
* @return the kebab-case version of the name
*/
- static String toKebabCase(String className) {
+ public static String toKebabCase(String className) {
if (className == null || className.isEmpty()) {
return className;
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index 786d2e9bb..b6bb8ebff 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -110,6 +110,15 @@ public class ParserLoader {
// Parse exclusions from default-parser config
JsonNode configNode = entry.getValue();
+
+ // Check for common mistake: using "excludes" instead of
"exclude"
+ if (configNode != null && configNode.has("excludes")) {
+ throw new TikaConfigException(
+ "Invalid configuration for default-parser: found
'excludes' but the correct " +
+ "field name is 'exclude' (singular). Please change
'excludes' to 'exclude' " +
+ "in your configuration.");
+ }
+
if (configNode != null && configNode.has("exclude")) {
JsonNode excludeNode = configNode.get("exclude");
if (excludeNode.isArray()) {
diff --git
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
index 44c145418..435282998 100644
---
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
+++
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
@@ -24,6 +24,7 @@ import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
import java.nio.file.Path;
import org.junit.jupiter.api.Test;
@@ -358,4 +359,41 @@ public class TikaLoaderTest {
assertNotNull(translator, "Translator should not be null");
// Should be DefaultTranslator since no translator configured in
test-loader-config.json
}
+
+    /**
+     * Loading a default-parser entry that uses the plural key "excludes" must
+     * fail with a TikaConfigException whose message names the offending key,
+     * names the correct key 'exclude', and says it should be singular.
+     *
+     * @throws Exception if the temporary config file cannot be created,
+     *                   written or removed
+     */
+    @Test
+    public void testExcludesInsteadOfExcludeThrowsException() throws Exception {
+        // Create a config with the common mistake: "excludes" instead of "exclude"
+        String invalidConfig = "{\n" +
+                " \"parsers\": [\n" +
+                " {\n" +
+                " \"default-parser\": {\n" +
+                " \"excludes\": [\"pdf-parser\"]\n" +
+                " }\n" +
+                " }\n" +
+                " ]\n" +
+                "}";
+
+        // Write to a temp file
+        Path tempFile = Files.createTempFile("test-invalid-excludes", ".json");
+        try {
+            Files.write(tempFile, invalidConfig.getBytes(StandardCharsets.UTF_8));
+
+            // assertThrows fails the test when nothing, or a different type, is thrown
+            org.apache.tika.exception.TikaConfigException e =
+                    org.junit.jupiter.api.Assertions.assertThrows(
+                            org.apache.tika.exception.TikaConfigException.class,
+                            () -> TikaLoader.load(tempFile).loadParsers());
+
+            // Verify the error message is helpful
+            String message = e.getMessage();
+            assertNotNull(message, "Exception should carry a message");
+            assertTrue(message.contains("excludes"),
+                    "Error message should mention 'excludes'");
+            // Match the quoted token: a bare "exclude" is a substring of "excludes",
+            // so it would pass even if the correct field name were never mentioned.
+            assertTrue(message.contains("'exclude'"),
+                    "Error message should mention the correct field 'exclude'");
+            assertTrue(message.contains("singular"),
+                    "Error message should explain it should be singular");
+        } finally {
+            Files.deleteIfExists(tempFile);
+        }
+    }
}