(tika) branch main updated: TIKA-4552 -- add tool to cover most cases of converting parsers from … (#2423)

tallison Thu, 04 Dec 2025 15:14:52 -0800

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 9ef776bf0 TIKA-4552 -- add tool to cover most cases of converting 
parsers from … (#2423)
9ef776bf0 is described below

commit 9ef776bf02e588eeaa551cfcb716543657fc30ab
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 4 18:14:40 2025 -0500

    TIKA-4552 -- add tool to cover most cases of converting parsers from … 
(#2423)
    
    * TIKA-4552 -- add tool to cover most cases of converting parsers from the 
legacy xml to json.
    
    Generated-by: Claude Sonnet 4.5 (claude-sonnet-4-5-20250929)
     Significant design and implementation with Claude
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  32 ++
 .../apache/tika/cli/XmlToJsonConfigConverter.java  | 606 +++++++++++++++++++++
 .../tika/cli/XmlToJsonConfigConverterTest.java     | 330 +++++++++++
 .../xml-configs/tika-config-list-map-types.xml     |  38 ++
 .../xml-configs/tika-config-numeric-types.xml      |  30 +
 .../tika-config-redundant-exclusion.xml            |  42 ++
 .../resources/xml-configs/tika-config-simple.xml   |  30 +
 .../xml-configs/tika-config-with-excludes.xml      |  32 ++
 .../apache/tika/parser/ocr/TesseractOCRConfig.java |   5 +-
 .../tika/config/loader/KebabCaseConverter.java     |   4 +-
 .../apache/tika/config/loader/ParserLoader.java    |   9 +
 .../apache/tika/config/loader/TikaLoaderTest.java  |  38 ++
 12 files changed, 1191 insertions(+), 5 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index f7d933090..91cb313b2 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -424,6 +424,9 @@ public class TikaCLI {
         } else if (arg.equals("--dump-static-full-config")) {
             pipeMode = false;
             dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
+        } else if (arg.startsWith("--convert-config-xml-to-json=")) {
+            pipeMode = false;
+            
convertConfigXmlToJson(arg.substring("--convert-config-xml-to-json=".length()));
         } else if (arg.equals("--container-aware") || 
arg.equals("--container-aware-detector")) {
             // ignore, as container-aware detectors are now always used
         } else if (arg.equals("-f") || arg.equals("--fork")) {
@@ -520,6 +523,33 @@ public class TikaCLI {
         TikaConfigSerializer.serialize(localConfig, mode, new 
OutputStreamWriter(System.out, UTF_8), UTF_8);
     }
 
+    /**
+     * Handles {@code --convert-config-xml-to-json=<input.xml>,<output.json>}.
+     * <p>
+     * Usage problems (wrong number of comma-separated paths, missing input file)
+     * are reported on stderr and the method returns without converting.
+     * A failure during conversion is reported on stderr and then rethrown.
+     *
+     * @param paths the text after the {@code =}: input and output path separated by a comma
+     * @throws Exception whatever the converter throws
+     */
+    private void convertConfigXmlToJson(String paths) throws Exception {
+        final String[] inputAndOutput = paths.split(",");
+        if (inputAndOutput.length != 2) {
+            System.err.println(
+                    "Error: --convert-config-xml-to-json requires input and output paths separated by comma");
+            System.err.println("Usage: --convert-config-xml-to-json=<input.xml>,<output.json>");
+            return;
+        }
+
+        // Both paths are resolved up front, before the existence check on the input.
+        final Path source = Paths.get(inputAndOutput[0].trim());
+        final Path target = Paths.get(inputAndOutput[1].trim());
+        if (!Files.exists(source)) {
+            System.err.println("Error: Input XML file not found: " + source);
+            return;
+        }
+
+        try {
+            XmlToJsonConfigConverter.convert(source, target);
+            System.out.println("Successfully converted XML config to JSON:");
+            System.out.println("  Input:  " + source.toAbsolutePath());
+            System.out.println("  Output: " + target.toAbsolutePath());
+        } catch (Exception failure) {
+            // Surface a short message for CLI users, then let the caller see the failure too.
+            System.err.println("Error converting config: " + failure.getMessage());
+            throw failure;
+        }
+    }
+
     private void handleRecursiveJson(URL url, OutputStream output) throws 
IOException, SAXException, TikaException {
         Metadata metadata = new Metadata();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
@@ -569,6 +599,8 @@ public class TikaCLI {
         out.println("    --dump-current-config  Print current TikaConfig");
         out.println("    --dump-static-config   Print static config");
         out.println("    --dump-static-full-config  Print static explicit 
config");
+        out.println("    
--convert-config-xml-to-json=<input.xml>,<output.json>");
+        out.println("        Convert legacy XML config to JSON format (parsers 
section only)");
         out.println("");
         out.println("    -x  or --xml           Output XHTML content 
(default)");
         out.println("    -h  or --html          Output HTML content");
diff --git 
a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java 
b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
new file mode 100644
index 000000000..c8d8945fa
--- /dev/null
+++ b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java
@@ -0,0 +1,606 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import org.apache.tika.config.loader.ComponentRegistry;
+import org.apache.tika.config.loader.KebabCaseConverter;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Converts legacy XML Tika configuration files to the new JSON format.
+ * <p>
+ * Currently supports converting the "parsers" section of tika-config.xml files
+ * for parsers in the tika-parsers-standard module.
+ * <p>
+ * Supports parameter types: bool, int, long, double, float, string, list, and 
map.
+ * <p>
+ * <strong>Special Case:</strong> TesseractOCR's {@code 
otherTesseractSettings} list
+ * (containing space-delimited key-value pairs) is automatically converted to 
the
+ * {@code otherTesseractConfig} map format expected by the JSON configuration.
+ * <p>
+ * Example usage:
+ * <pre>
+ * XmlToJsonConfigConverter.convert(
+ *     Paths.get("tika-config.xml"),
+ *     Paths.get("tika-config.json")
+ * );
+ * </pre>
+ *
+ * <p>XML Format (with various parameter types):
+ * <pre>
+ * &lt;properties&gt;
+ *   &lt;parsers&gt;
+ *     &lt;parser class="org.apache.tika.parser.pdf.PDFParser"&gt;
+ *       &lt;params&gt;
+ *         &lt;param name="sortByPosition" type="bool"&gt;true&lt;/param&gt;
+ *         &lt;param name="maxPages" type="int"&gt;1000&lt;/param&gt;
+ *       &lt;/params&gt;
+ *     &lt;/parser&gt;
+ *     &lt;parser class="org.apache.tika.parser.ocr.TesseractOCRParser"&gt;
+ *       &lt;params&gt;
+ *         &lt;!-- Special case: space-delimited key-value pairs --&gt;
+ *         &lt;param name="otherTesseractSettings" type="list"&gt;
+ *           &lt;string&gt;textord_initialx_ile 0.75&lt;/string&gt;
+ *           &lt;string&gt;textord_noise_hfract 0.15625&lt;/string&gt;
+ *         &lt;/param&gt;
+ *         &lt;param name="envVars" type="map"&gt;
+ *           
&lt;TESSDATA_PREFIX&gt;/usr/share/tesseract&lt;/TESSDATA_PREFIX&gt;
+ *         &lt;/param&gt;
+ *       &lt;/params&gt;
+ *     &lt;/parser&gt;
+ *     &lt;parser class="org.apache.tika.parser.DefaultParser"&gt;
+ *       &lt;parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/&gt;
+ *     &lt;/parser&gt;
+ *   &lt;/parsers&gt;
+ * &lt;/properties&gt;
+ * </pre>
+ *
+ * <p>JSON Format:
+ * <pre>
+ * {
+ *   "parsers": [
+ *     {
+ *       "pdf-parser": {
+ *         "sortByPosition": true,
+ *         "maxPages": 1000
+ *       }
+ *     },
+ *     {
+ *       "tesseract-ocr-parser": {
+ *         "otherTesseractConfig": {
+ *           "textord_initialx_ile": "0.75",
+ *           "textord_noise_hfract": "0.15625"
+ *         },
+ *         "envVars": {
+ *           "TESSDATA_PREFIX": "/usr/share/tesseract"
+ *         }
+ *       }
+ *     },
+ *     {
+ *       "default-parser": {
+ *         "exclude": ["pdf-parser"]
+ *       }
+ *     }
+ *   ]
+ * }
+ * </pre>
+ */
+public class XmlToJsonConfigConverter {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(XmlToJsonConfigConverter.class);
+
+    // Use a plain ObjectMapper for clean JSON output without @class 
annotations
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /** Private so the class cannot be instantiated from outside; the conversion entry points are static. */
+    private XmlToJsonConfigConverter() {
+        // Utility class
+    }
+
+    /**
+     * Reads the XML configuration at {@code xmlPath} and writes the converted
+     * JSON to {@code jsonPath}. Both files are opened here and closed before
+     * this method returns, whether or not the conversion succeeds.
+     *
+     * @param xmlPath path to the XML configuration file
+     * @param jsonPath path where the JSON output should be written
+     * @throws TikaConfigException if conversion fails
+     * @throws IOException if one of the files cannot be opened or closed
+     */
+    public static void convert(Path xmlPath, Path jsonPath) throws TikaConfigException, IOException {
+        // Input is opened first and closed last, mirroring a two-resource try-with-resources.
+        try (InputStream xmlStream = Files.newInputStream(xmlPath)) {
+            try (OutputStream jsonStream = Files.newOutputStream(jsonPath)) {
+                convert(xmlStream, jsonStream);
+            }
+        }
+    }
+
+    /**
+     * Converts an XML Tika configuration stream to JSON, resolving component
+     * names with the calling thread's context class loader.
+     *
+     * @param xmlInput input stream containing XML configuration
+     * @param jsonOutput output stream where JSON will be written
+     * @throws TikaConfigException if conversion fails
+     * @throws IOException if stream I/O fails
+     */
+    public static void convert(InputStream xmlInput, OutputStream jsonOutput)
+            throws TikaConfigException, IOException {
+        ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
+        convert(xmlInput, jsonOutput, contextLoader);
+    }
+
+    /**
+     * Converts an XML Tika configuration stream to JSON format.
+     * <p>
+     * The XML is parsed into a DOM, converted to a map structure and written as
+     * pretty-printed UTF-8 JSON. The writer wrapped around {@code jsonOutput}
+     * is closed when writing finishes, which also closes {@code jsonOutput}.
+     * <p>
+     * A {@link TikaConfigException} raised during conversion (for example an
+     * unexpected root element or a parser without a {@code class} attribute)
+     * is propagated unchanged so that its specific message reaches the caller.
+     * Any other failure, including I/O and XML parsing errors raised inside
+     * this method, is wrapped in a {@link TikaConfigException} with the
+     * original exception as its cause.
+     *
+     * @param xmlInput input stream containing XML configuration
+     * @param jsonOutput output stream where JSON will be written
+     * @param classLoader class loader to use for component registry
+     * @throws TikaConfigException if conversion fails
+     * @throws IOException declared for API compatibility; see above for how I/O failures surface
+     */
+    public static void convert(InputStream xmlInput, OutputStream jsonOutput, ClassLoader classLoader)
+            throws TikaConfigException, IOException {
+        try {
+            // Load component registry to properly map class names to component names
+            ComponentRegistry parserRegistry = new ComponentRegistry("parsers", classLoader);
+
+            Document doc = XMLReaderUtils.buildDOM(xmlInput);
+            Map<String, Object> jsonConfig = convertDocument(doc, parserRegistry);
+
+            try (Writer writer = new OutputStreamWriter(jsonOutput, StandardCharsets.UTF_8)) {
+                MAPPER.writerWithDefaultPrettyPrinter().writeValue(writer, jsonConfig);
+            }
+        } catch (TikaConfigException e) {
+            // Already the declared type and already carries a precise message:
+            // re-wrapping would hide it behind the generic text below.
+            throw e;
+        } catch (Exception e) {
+            throw new TikaConfigException("Failed to convert XML config to JSON", e);
+        }
+    }
+
+    /**
+     * Builds the JSON-compatible map for a whole configuration document.
+     * <p>
+     * The root element must be {@code <properties>}. Of its child elements only
+     * {@code <parsers>} is converted; every other section is ignored.
+     *
+     * @param doc parsed XML configuration
+     * @param parserRegistry registry used to map parser class names to component names
+     * @return insertion-ordered map holding the converted sections
+     * @throws TikaConfigException if the root element is not {@code <properties>}
+     *         or a parser entry cannot be converted
+     */
+    private static Map<String, Object> convertDocument(Document doc, ComponentRegistry parserRegistry)
+            throws TikaConfigException {
+        Element root = doc.getDocumentElement();
+        String rootName = root.getNodeName();
+        if (!"properties".equals(rootName)) {
+            throw new TikaConfigException(
+                    "Invalid XML config: root element must be <properties>, found: " + rootName);
+        }
+
+        Map<String, Object> converted = new LinkedHashMap<>();
+        for (Node section = root.getFirstChild(); section != null; section = section.getNextSibling()) {
+            boolean isParsersSection = section.getNodeType() == Node.ELEMENT_NODE
+                    && "parsers".equals(section.getNodeName());
+            if (isParsersSection) {
+                converted.put("parsers", convertParsersSection((Element) section, parserRegistry));
+            }
+            // Future: add support for detectors, translators, etc.
+        }
+        return converted;
+    }
+
+    /**
+     * Converts the &lt;parsers&gt; section to the JSON array form: one
+     * single-entry map per &lt;parser&gt; element, in document order.
+     * <p>
+     * Elements are collected with {@code getElementsByTagName}, which matches
+     * every descendant named {@code parser}; a &lt;parser&gt; nested inside
+     * another &lt;parser&gt; therefore becomes its own entry in the result.
+     * After conversion the list is checked for redundant exclusions.
+     *
+     * @param parsersElement the &lt;parsers&gt; element
+     * @param parserRegistry registry used to map parser class names to component names
+     * @return the converted parser entries
+     * @throws TikaConfigException if a parser element cannot be converted
+     */
+    private static List<Map<String, Object>> convertParsersSection(Element parsersElement,
+                                                                     ComponentRegistry parserRegistry)
+            throws TikaConfigException {
+        NodeList parserNodes = parsersElement.getElementsByTagName("parser");
+        List<Map<String, Object>> entries = new ArrayList<>();
+
+        for (int idx = 0; idx < parserNodes.getLength(); idx++) {
+            Map<String, Object> entry =
+                    convertParserElement((Element) parserNodes.item(idx), parserRegistry);
+            if (entry == null || entry.isEmpty()) {
+                continue;
+            }
+            entries.add(entry);
+        }
+
+        // Purely informational: tells users about exclusions they no longer need.
+        checkForRedundantExclusions(entries);
+        return entries;
+    }
+
+    /**
+     * Reports parsers that are listed in the {@code exclude} array of
+     * {@code default-parser} and are also configured as entries of their own.
+     * <p>
+     * Nothing is modified; when such overlap exists a multi-line notice is
+     * logged at INFO level explaining that the exclusion can be dropped.
+     *
+     * @param parsersList converted parser entries, each a map from component name to its config
+     */
+    private static void checkForRedundantExclusions(List<Map<String, Object>> parsersList) {
+        Set<String> excluded = new HashSet<>();
+        Set<String> configured = new HashSet<>();
+
+        for (Map<String, Object> entry : parsersList) {
+            // Names excluded via default-parser
+            if (entry.containsKey("default-parser")) {
+                Map<?, ?> defaults = (Map<?, ?>) entry.get("default-parser");
+                if (defaults.containsKey("exclude")) {
+                    @SuppressWarnings("unchecked")
+                    List<String> names = (List<String>) defaults.get("exclude");
+                    excluded.addAll(names);
+                }
+            }
+            // Names that have an entry of their own
+            for (String key : entry.keySet()) {
+                if (!"default-parser".equals(key)) {
+                    configured.add(key);
+                }
+            }
+        }
+
+        Set<String> redundant = new HashSet<>(excluded);
+        redundant.retainAll(configured);
+        if (redundant.isEmpty()) {
+            return;
+        }
+
+        final String banner = "=".repeat(80);
+        final String[] heading = {
+            banner,
+            "CONFIGURATION OPTIMIZATION NOTICE",
+            banner,
+            "",
+            "The following parsers are excluded from default-parser but also configured separately:"
+        };
+        final String[] advice = {
+            "",
+            "This exclusion is redundant. When you configure a parser with specific settings,",
+            "the loader excludes loading that parser from SPI. You can remove these",
+            "exclusions from your default-parser configuration.",
+            "",
+            "Example - Instead of:",
+            "  {",
+            "    \"default-parser\": {",
+            "      \"exclude\": [\"pdf-parser\"]",
+            "    }",
+            "  },",
+            "  {",
+            "    \"pdf-parser\": {",
+            "      \"sortByPosition\": true",
+            "    }",
+            "  }",
+            "",
+            "Simply use:",
+            "  {",
+            "    \"default-parser\": {},",
+            "    \"pdf-parser\": {",
+            "      \"sortByPosition\": true",
+            "    }",
+            "  }",
+            "",
+            banner
+        };
+
+        for (String line : heading) {
+            LOG.info(line);
+        }
+        for (String name : redundant) {
+            LOG.info("  - {}", name);
+        }
+        for (String line : advice) {
+            LOG.info(line);
+        }
+    }
+
+    /**
+     * Converts a single &lt;parser&gt; element to a JSON map entry.
+     * <p>
+     * Direct &lt;params&gt; children contribute their parameters to the parser's
+     * config; direct &lt;parser-exclude&gt; children with a non-empty
+     * {@code class} attribute are collected, as component names, under the
+     * {@code exclude} key. Other child elements are ignored.
+     *
+     * @param parserElement the &lt;parser&gt; element
+     * @param parserRegistry registry used to map class names to component names
+     * @return map with single entry: { "parser-name": { config... } }
+     * @throws TikaConfigException if the element has no {@code class} attribute
+     */
+    private static Map<String, Object> convertParserElement(Element parserElement,
+                                                             ComponentRegistry parserRegistry)
+            throws TikaConfigException {
+        String className = parserElement.getAttribute("class");
+        if (className == null || className.isEmpty()) {
+            throw new TikaConfigException("Parser element missing 'class' attribute");
+        }
+        String componentName = classNameToComponentName(className, parserRegistry);
+
+        Map<String, Object> settings = new LinkedHashMap<>();
+        List<String> excludedNames = new ArrayList<>();
+
+        for (Node node = parserElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            if (node.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            Element childElement = (Element) node;
+            switch (childElement.getNodeName()) {
+                case "params":
+                    settings.putAll(convertParamsElement(childElement));
+                    break;
+                case "parser-exclude":
+                    String excludedClass = childElement.getAttribute("class");
+                    if (excludedClass != null && !excludedClass.isEmpty()) {
+                        excludedNames.add(classNameToComponentName(excludedClass, parserRegistry));
+                    }
+                    break;
+                default:
+                    // not a section this converter understands
+                    break;
+            }
+        }
+
+        // The exclude array is only emitted when at least one exclusion was found.
+        if (!excludedNames.isEmpty()) {
+            settings.put("exclude", excludedNames);
+        }
+
+        Map<String, Object> entry = new LinkedHashMap<>();
+        entry.put(componentName, settings);
+        return entry;
+    }
+
+    /**
+     * Converts a &lt;params&gt; element to a map of parameter names to values.
+     * <p>
+     * Only &lt;param&gt; elements that are direct children of
+     * {@code paramsElement} are treated as parameters. Elements named
+     * {@code param} that appear deeper (for example as a key inside a
+     * {@code type="map"} parameter) belong to the enclosing parameter's value
+     * and are left to that parameter's conversion.
+     * <p>
+     * Parameters without a {@code name} attribute are skipped. The
+     * {@code otherTesseractSettings} list is emitted as the
+     * {@code otherTesseractConfig} map; the type check for that special case
+     * ignores case, as the general type handling does.
+     *
+     * @param paramsElement the &lt;params&gt; element
+     * @return insertion-ordered map of parameter name to converted value
+     */
+    private static Map<String, Object> convertParamsElement(Element paramsElement) {
+        Map<String, Object> params = new LinkedHashMap<>();
+
+        NodeList children = paramsElement.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node child = children.item(i);
+            if (child.getNodeType() != Node.ELEMENT_NODE || !"param".equals(child.getNodeName())) {
+                continue;
+            }
+
+            Element paramElement = (Element) child;
+            String name = paramElement.getAttribute("name");
+            String type = paramElement.getAttribute("type");
+            if (name == null || name.isEmpty()) {
+                continue;
+            }
+
+            boolean isListType = type != null && "list".equals(type.toLowerCase(Locale.ROOT));
+            if ("otherTesseractSettings".equals(name) && isListType) {
+                // Special case: otherTesseractSettings is a list of space-delimited key-value
+                // pairs that needs to be converted to the otherTesseractConfig map
+                Map<String, String> configMap = convertTesseractSettingsList(paramElement);
+                params.put("otherTesseractConfig", configMap);
+            } else {
+                params.put(name, convertParamValue(paramElement, type));
+            }
+        }
+
+        return params;
+    }
+
+    /**
+     * Turns TesseractOCR's {@code otherTesseractSettings} list into a map.
+     * <p>
+     * Each direct &lt;string&gt; child holds one setting written as
+     * {@code key value}. The trimmed text is split at its first space: the
+     * part before it becomes the key, the rest the value (both trimmed).
+     * Entries without a space after a non-empty key are logged at WARN level
+     * and skipped.
+     * <p>
+     * XML Format:
+     * <pre>
+     * &lt;param name="otherTesseractSettings" type="list"&gt;
+     *   &lt;string&gt;textord_initialx_ile 0.75&lt;/string&gt;
+     *   &lt;string&gt;textord_noise_hfract 0.15625&lt;/string&gt;
+     * &lt;/param&gt;
+     * </pre>
+     * <p>
+     * JSON Output (as otherTesseractConfig):
+     * <pre>
+     * "otherTesseractConfig": {
+     *   "textord_initialx_ile": "0.75",
+     *   "textord_noise_hfract": "0.15625"
+     * }
+     * </pre>
+     *
+     * @param paramElement the {@code otherTesseractSettings} &lt;param&gt; element
+     * @return insertion-ordered map of setting name to value
+     */
+    private static Map<String, String> convertTesseractSettingsList(Element paramElement) {
+        Map<String, String> settings = new LinkedHashMap<>();
+
+        for (Node node = paramElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            boolean isStringElement = node.getNodeType() == Node.ELEMENT_NODE
+                    && "string".equals(node.getNodeName());
+            if (!isStringElement) {
+                continue;
+            }
+
+            String setting = node.getTextContent().trim();
+            int separator = setting.indexOf(' ');
+            if (separator <= 0) {
+                LOG.warn("Ignoring malformed Tesseract setting (expected 'key value'): {}", setting);
+                continue;
+            }
+            settings.put(setting.substring(0, separator).trim(),
+                    setting.substring(separator + 1).trim());
+        }
+
+        return settings;
+    }
+
+    /**
+     * Converts the value of a &lt;param&gt; element according to its declared type.
+     * <p>
+     * The type is matched case-insensitively:
+     * <ul>
+     *   <li>missing or empty type - trimmed text content as a string</li>
+     *   <li>list - child &lt;string&gt; elements as a JSON array</li>
+     *   <li>map - child elements (element name is the key) as a JSON object</li>
+     *   <li>bool / boolean - parsed with {@link Boolean#parseBoolean(String)}</li>
+     *   <li>int / integer, long - parsed as {@code Integer} / {@code Long}</li>
+     *   <li>double / float - parsed as {@code Double}</li>
+     *   <li>anything else - trimmed text content as a string</li>
+     * </ul>
+     * Empty text, and numeric text that fails to parse, is returned as a string.
+     *
+     * @param paramElement the &lt;param&gt; element
+     * @param type value of the element's {@code type} attribute, may be null or empty
+     * @return the converted value
+     */
+    private static Object convertParamValue(Element paramElement, String type) {
+        if (type == null || type.isEmpty()) {
+            // Untyped parameters are passed through as plain strings.
+            return paramElement.getTextContent().trim();
+        }
+
+        final String normalizedType = type.toLowerCase(Locale.ROOT);
+
+        // Collections are built from child elements rather than from text content.
+        if (normalizedType.equals("list")) {
+            return convertListParam(paramElement);
+        }
+        if (normalizedType.equals("map")) {
+            return convertMapParam(paramElement);
+        }
+
+        final String text = paramElement.getTextContent().trim();
+        if (text.isEmpty()) {
+            return text;
+        }
+
+        try {
+            switch (normalizedType) {
+                case "bool":
+                case "boolean":
+                    return Boolean.parseBoolean(text);
+                case "int":
+                case "integer":
+                    return Integer.parseInt(text);
+                case "long":
+                    return Long.parseLong(text);
+                case "double":
+                case "float":
+                    return Double.parseDouble(text);
+                default:
+                    // Unknown type, return as string
+                    return text;
+            }
+        } catch (NumberFormatException notNumeric) {
+            // Keep the raw text instead of failing the whole conversion.
+            return text;
+        }
+    }
+
+    /**
+     * Collects the trimmed text of every &lt;string&gt; element that is a
+     * direct child of the given list parameter, in document order.
+     * &lt;string&gt; elements nested deeper are not included.
+     * <p>
+     * XML Format:
+     * <pre>
+     * &lt;param name="languages" type="list"&gt;
+     *   &lt;string&gt;en&lt;/string&gt;
+     *   &lt;string&gt;fr&lt;/string&gt;
+     * &lt;/param&gt;
+     * </pre>
+     * <p>
+     * JSON Output: ["en", "fr"]
+     *
+     * @param paramElement the {@code type="list"} &lt;param&gt; element
+     * @return the list items; empty when there are no &lt;string&gt; children
+     */
+    private static List<String> convertListParam(Element paramElement) {
+        List<String> items = new ArrayList<>();
+
+        for (Node node = paramElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            if (node.getNodeType() == Node.ELEMENT_NODE && "string".equals(node.getNodeName())) {
+                items.add(node.getTextContent().trim());
+            }
+        }
+
+        return items;
+    }
+
+    /**
+     * Builds a map from the direct child elements of a map parameter: the
+     * element name is the key and its trimmed text content the value. When an
+     * element name repeats, the last value wins while the key keeps the
+     * position of its first occurrence.
+     * <p>
+     * XML Format:
+     * <pre>
+     * &lt;param name="captureMap" type="map"&gt;
+     *   &lt;title&gt;^Title: ([^\r\n]+)&lt;/title&gt;
+     *   &lt;author&gt;^Author: ([^\r\n]+)&lt;/author&gt;
+     * &lt;/param&gt;
+     * </pre>
+     * <p>
+     * JSON Output: {"title": "^Title: ([^\\r\\n]+)", "author": "^Author: ([^\\r\\n]+)"}
+     *
+     * @param paramElement the {@code type="map"} &lt;param&gt; element
+     * @return insertion-ordered map of child element name to text
+     */
+    private static Map<String, String> convertMapParam(Element paramElement) {
+        Map<String, String> entries = new LinkedHashMap<>();
+
+        for (Node node = paramElement.getFirstChild(); node != null; node = node.getNextSibling()) {
+            if (node.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            entries.put(node.getNodeName(), node.getTextContent().trim());
+        }
+
+        return entries;
+    }
+
+    /**
+     * Converts a full Java class name to a component name.
+     * <p>
+     * Performs a reverse lookup in the ComponentRegistry, respecting custom
+     * component names from {@code @TikaComponent} annotations. The lookup
+     * compares class names against the classes the registry already holds,
+     * so no class is loaded here and the result does not depend on the
+     * calling thread's context class loader.
+     * Falls back to kebab-case conversion of the simple class name if the
+     * class is not in the registry.
+     * <p>
+     * Examples:
+     * <ul>
+     *   <li>org.apache.tika.parser.pdf.PDFParser → pdf-parser</li>
+     *   <li>org.apache.tika.parser.DefaultParser → default-parser</li>
+     *   <li>org.apache.tika.parser.html.JSoupParser → jsoup-parser (from @TikaComponent annotation)</li>
+     * </ul>
+     *
+     * @param fullClassName fully qualified class name as written in the XML config
+     * @param registry registry of known components
+     * @return the registered component name, or the kebab-cased simple class name
+     */
+    private static String classNameToComponentName(String fullClassName, ComponentRegistry registry) {
+        // Reverse lookup: find the component name registered for this class name
+        for (Map.Entry<String, Class<?>> entry : registry.getAllComponents().entrySet()) {
+            if (entry.getValue().getName().equals(fullClassName)) {
+                return entry.getKey();
+            }
+        }
+
+        // Fallback: use kebab-case conversion of the part after the last dot
+        int lastDot = fullClassName.lastIndexOf('.');
+        String simpleClassName = lastDot >= 0 ? fullClassName.substring(lastDot + 1) : fullClassName;
+        return KebabCaseConverter.toKebabCase(simpleClassName);
+    }
+}
diff --git 
a/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java 
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
new file mode 100644
index 000000000..98671c37b
--- /dev/null
+++ 
b/tika-app/src/test/java/org/apache/tika/cli/XmlToJsonConfigConverterTest.java
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.JSoupParser;
+import org.apache.tika.parser.pdf.PDFParser;
+
+/**
+ * Tests for XmlToJsonConfigConverter.
+ * These tests verify that XML configurations are correctly converted to JSON
+ * and can be loaded by TikaLoader to produce properly configured parsers.
+ */
+public class XmlToJsonConfigConverterTest {
+
+    @Test
+    public void testSimpleParserConfig(@TempDir Path tempDir) throws Exception {
+        // Convert the bundled simple XML config into a JSON file under the temp directory
+        Path source = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+        Path target = tempDir.resolve("simple-config.json");
+        XmlToJsonConfigConverter.convert(source, target);
+        assertTrue(Files.exists(target));
+
+        // The converted JSON must load into a composite parser
+        Parser loaded = TikaLoader.load(target).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+
+        // application/pdf must be mapped to a PDFParser instance
+        Map<MediaType, Parser> byType = ((CompositeParser) loaded).getParsers(new ParseContext());
+        MediaType pdf = MediaType.parse("application/pdf");
+        assertTrue(byType.containsKey(pdf), "PDF parser should be configured");
+        assertTrue(byType.get(pdf) instanceof PDFParser, "Parser for PDF should be PDFParser");
+
+        // The individual settings (sortByPosition, extractInlineImages, etc.) are not
+        // directly accessible here; they are covered by tests of the parser's behavior.
+    }
+
+    @Test
+    public void testParserWithExcludes(@TempDir Path tempDir) throws Exception {
+        Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-with-excludes.xml").toURI());
+        Path jsonPath = tempDir.resolve("excludes-config.json");
+
+        // Convert XML to JSON
+        XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+        // Print JSON for debugging
+        String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8);
+        System.out.println("Generated JSON:");
+        System.out.println(json);
+
+        // Verify exclude is at the correct level (not under _decorate)
+        assertTrue(json.contains("\"exclude\""), "Should have exclude array");
+        assertFalse(json.contains("\"_decorate\""), "_decorate should not be used for parser excludes");
+        assertTrue(json.contains("\"jsoup-parser\""), "Should exclude jsoup-parser");
+        assertTrue(json.contains("\"pdf-parser\""), "Should exclude pdf-parser");
+
+        // Load the JSON config with TikaLoader
+        TikaLoader loader = TikaLoader.load(jsonPath);
+        Parser parser = loader.loadParsers();
+
+        assertNotNull(parser);
+        assertTrue(parser instanceof CompositeParser);
+
+        // PDFParser is excluded and never configured separately, so it must not
+        // appear among the component parsers
+        CompositeParser compositeParser = (CompositeParser) parser;
+        for (Parser p : compositeParser.getAllComponentParsers()) {
+            if (p instanceof PDFParser) {
+                fail("pdf parser should have been excluded");
+            }
+        }
+        ParseContext context = new ParseContext();
+        Map<MediaType, Parser> parsers = compositeParser.getParsers(context);
+
+        // Check that HTML parser is present (JSoupParser should be configured)
+        MediaType htmlType = MediaType.parse("text/html");
+        assertTrue(parsers.containsKey(htmlType), "HTML parser should be configured");
+
+        // JSoupParser is excluded from default-parser but configured on its own in
+        // the XML, so it must be the parser that handles text/html
+        Parser htmlParser = parsers.get(htmlType);
+        assertTrue(htmlParser instanceof JSoupParser, "Parser for HTML should be JSoupParser");
+    }
+
+    @Test
+    public void testNumericTypes(@TempDir Path tempDir) throws Exception {
+        Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-numeric-types.xml").toURI());
+        Path jsonPath = tempDir.resolve("numeric-config.json");
+
+        // Convert XML to JSON
+        XmlToJsonConfigConverter.convert(xmlPath, jsonPath);
+
+        // Verify JSON file was created and contains proper numeric types
+        assertTrue(Files.exists(jsonPath));
+
+        // Read the JSON to verify numeric types are preserved
+        String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8);
+
+        // Verify numbers are not quoted (they should appear as: "density": 300, not "density": "300").
+        // Both checks target "density" because that parameter is actually present in the
+        // fixture; a negative check on an absent parameter would pass vacuously.
+        assertTrue(json.contains("\"density\" : 300"), "density should be numeric, not string");
+        assertFalse(json.contains("\"density\" : \"300\""), "density should not be a quoted string");
+
+        // Load the JSON config with TikaLoader to verify it's valid
+        TikaLoader loader = TikaLoader.load(jsonPath);
+        Parser parser = loader.loadParsers();
+        assertNotNull(parser);
+    }
+
+    @Test
+    public void testFileConversion(@TempDir Path tempDir) throws Exception {
+        // Exercise the Path-based conversion entry point
+        Path input = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+        Path output = tempDir.resolve("output.json");
+        XmlToJsonConfigConverter.convert(input, output);
+
+        // The output file must exist on disk ...
+        assertTrue(Files.exists(output));
+
+        // ... and must be loadable by TikaLoader as a composite parser
+        Parser loaded = TikaLoader.load(output).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+    }
+
+    @Test
+    public void testClassNameConversion(@TempDir Path tempDir) throws Exception {
+        Path input = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+        Path output = tempDir.resolve("classname-config.json");
+        XmlToJsonConfigConverter.convert(input, output);
+
+        // The fully qualified PDFParser class name must show up as its component name
+        byte[] raw = Files.readAllBytes(output);
+        String converted = new String(raw, StandardCharsets.UTF_8);
+        assertTrue(converted.contains("\"pdf-parser\""), "PDFParser should be converted to pdf-parser");
+
+        // The converted config must still load without errors
+        Parser loaded = TikaLoader.load(output).loadParsers();
+        assertNotNull(loaded);
+    }
+
+    @Test
+    public void testAutoDetectParserLoading(@TempDir Path tempDir) throws Exception {
+        Path input = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI());
+        Path output = tempDir.resolve("autodetect-config.json");
+        XmlToJsonConfigConverter.convert(input, output);
+
+        // Build the AutoDetectParser from the converted config
+        Parser autoDetect = TikaLoader.load(output).loadAutoDetectParser();
+        assertNotNull(autoDetect);
+
+        // application/pdf must be among the types it reports as supported
+        MediaType pdf = MediaType.parse("application/pdf");
+        boolean supportsPdf = autoDetect.getSupportedTypes(new ParseContext()).contains(pdf);
+        assertTrue(supportsPdf, "AutoDetectParser should support PDF");
+    }
+
+    @Test
+    public void testRedundantExclusionWarning(@TempDir Path tempDir) throws Exception {
+        // Legacy pattern under test: parsers are excluded from default-parser and then
+        // configured on their own. The converter will log an INFO message telling the
+        // user that such exclusions are redundant.
+        Path input = Paths.get(getClass().getResource("/xml-configs/tika-config-redundant-exclusion.xml").toURI());
+        Path output = tempDir.resolve("redundant-config.json");
+        XmlToJsonConfigConverter.convert(input, output);
+
+        // Dump the generated JSON to stdout
+        String converted = new String(Files.readAllBytes(output), StandardCharsets.UTF_8);
+        System.out.println("Generated JSON with redundant exclusions:");
+        System.out.println(converted);
+
+        // The exclusions are kept (reported, not removed), next to both parser entries
+        assertTrue(converted.contains("\"exclude\""), "Should still have exclude array");
+        assertTrue(converted.contains("\"pdf-parser\""), "Should have pdf-parser configured");
+        assertTrue(converted.contains("\"jsoup-parser\""), "Should have jsoup-parser configured");
+
+        // The converted config must load into a composite parser
+        Parser loaded = TikaLoader.load(output).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+
+        // Both separately configured parsers must be registered for their media types
+        Map<MediaType, Parser> byType = ((CompositeParser) loaded).getParsers(new ParseContext());
+        assertTrue(byType.containsKey(MediaType.parse("application/pdf")), "PDF parser should be configured");
+        assertTrue(byType.containsKey(MediaType.parse("text/html")), "HTML parser should be configured");
+    }
+
+    @Test
+    public void testTesseractArbitrarySettings(@TempDir Path tempDir) throws Exception {
+        // Special case under test: TesseractOCR's otherTesseractSettings list.
+        // The XML lines are joined with "\n" and there is no trailing newline.
+        String xmlConfig = String.join("\n",
+                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
+                "<properties>",
+                "    <parsers>",
+                "        <parser class=\"org.apache.tika.parser.ocr.TesseractOCRParser\">",
+                "            <params>",
+                "                <param name=\"otherTesseractSettings\" type=\"list\">",
+                "                    <string>textord_initialx_ile 0.75</string>",
+                "                    <string>textord_noise_hfract 0.15625</string>",
+                "                </param>",
+                "            </params>",
+                "        </parser>",
+                "    </parsers>",
+                "</properties>");
+
+        // Write the XML to disk and convert it
+        Path input = tempDir.resolve("tesseract-arbitrary.xml");
+        Path output = tempDir.resolve("tesseract-arbitrary.json");
+        Files.write(input, xmlConfig.getBytes(StandardCharsets.UTF_8));
+        XmlToJsonConfigConverter.convert(input, output);
+
+        String converted = new String(Files.readAllBytes(output), StandardCharsets.UTF_8);
+        System.out.println("Tesseract arbitrary settings conversion:");
+        System.out.println(converted);
+
+        // The list of space-delimited pairs must become a map under the new name
+        assertTrue(converted.contains("\"otherTesseractConfig\""),
+                "Should convert to otherTesseractConfig");
+        assertFalse(converted.contains("\"otherTesseractSettings\""),
+                "Should not keep old parameter name");
+        assertTrue(converted.contains("\"textord_initialx_ile\" : \"0.75\""),
+                "Should parse key-value pairs correctly");
+        assertTrue(converted.contains("\"textord_noise_hfract\" : \"0.15625\""),
+                "Should parse second pair");
+
+        // The converted config must load via TikaLoader without errors
+        Parser loaded = TikaLoader.load(output).loadParsers();
+        assertNotNull(loaded);
+    }
+
+    @Test
+    public void testListAndMapParameterTypes(@TempDir Path tempDir) throws Exception {
+        Path input = Paths.get(getClass().getResource("/xml-configs/tika-config-list-map-types.xml").toURI());
+        Path output = tempDir.resolve("list-map-config.json");
+        XmlToJsonConfigConverter.convert(input, output);
+
+        // Dump the generated JSON to stdout for debugging
+        String converted = new String(Files.readAllBytes(output), StandardCharsets.UTF_8);
+        System.out.println("Generated JSON with list and map types:");
+        System.out.println(converted);
+
+        // Special case: the otherTesseractSettings list of space-delimited key-value
+        // pairs must be emitted as the otherTesseractConfig map
+        assertTrue(converted.contains("\"otherTesseractConfig\" : {"),
+                "Should convert otherTesseractSettings list to otherTesseractConfig map");
+        assertFalse(converted.contains("\"otherTesseractSettings\""),
+                "Should not have old otherTesseractSettings name");
+        assertTrue(converted.contains("\"textord_initialx_ile\" : \"0.75\""),
+                "Should parse first key-value pair");
+        assertTrue(converted.contains("\"textord_noise_hfract\" : \"0.15625\""),
+                "Should parse second key-value pair");
+        assertTrue(converted.contains("\"preserve_interword_spaces\" : \"1\""),
+                "Should parse third key-value pair");
+
+        // Ordinary int, bool and string parameters must keep their JSON types
+        assertTrue(converted.contains("\"timeoutSeconds\" : 300"), "Should have integer parameter");
+        assertTrue(converted.contains("\"enableImagePreprocessing\" : true"), "Should have boolean parameter");
+        assertTrue(converted.contains("\"language\" : \"eng\""), "Should have string parameter");
+
+        // The converted config must load into a composite parser
+        Parser loaded = TikaLoader.load(output).loadParsers();
+        assertNotNull(loaded);
+        assertTrue(loaded instanceof CompositeParser);
+    }
+}
diff --git 
a/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml 
b/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml
new file mode 100644
index 000000000..33ffc2119
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+            <params>
+                <!-- Special Tesseract case: list of space-delimited key-value 
pairs -->
+                <!-- This will be converted to otherTesseractConfig map in 
JSON -->
+                <param name="otherTesseractSettings" type="list">
+                    <string>textord_initialx_ile 0.75</string>
+                    <string>textord_noise_hfract 0.15625</string>
+                    <string>preserve_interword_spaces 1</string>
+                </param>
+                <!-- Regular parameters -->
+                <param name="timeoutSeconds" type="int">300</param>
+                <param name="enableImagePreprocessing" type="bool">true</param>
+                <param name="language">eng</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml 
b/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml
new file mode 100644
index 000000000..99da01562
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+            <params>
+                <param name="maxFileSizeToOcr" type="long">10000000</param>
+                <param name="minFileSizeToOcr" type="long">0</param>
+                <param name="density" type="int">300</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml 
b/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml
new file mode 100644
index 000000000..6caeecd7a
--- /dev/null
+++ 
b/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+    <parsers>
+        <!-- Old pattern: excluding parser from default, then configuring it 
separately -->
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+            <parser-exclude class="org.apache.tika.parser.html.JSoupParser"/>
+        </parser>
+
+        <!-- These parsers are configured separately, making the exclusions 
above redundant -->
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+                <param name="extractInlineImages" type="bool">false</param>
+            </params>
+        </parser>
+
+        <parser class="org.apache.tika.parser.html.JSoupParser">
+            <params>
+                <param name="extractScripts" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git a/tika-app/src/test/resources/xml-configs/tika-config-simple.xml 
b/tika-app/src/test/resources/xml-configs/tika-config-simple.xml
new file mode 100644
index 000000000..1a880dd68
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-simple.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+                <param name="extractInlineImages" type="bool">false</param>
+                <param name="ocrRenderingStrategy" 
type="string">TEXT_ONLY</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml 
b/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml
new file mode 100644
index 000000000..9089428c3
--- /dev/null
+++ b/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.html.JSoupParser"/>
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.html.JSoupParser">
+            <params>
+                <param name="extractScripts" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 799cd6ffd..7cad1c7ac 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -20,7 +20,6 @@ import java.io.Serializable;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Locale;
-import java.util.Map;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -82,7 +81,7 @@ public class TesseractOCRConfig implements Serializable {
     // Maximum time (seconds) to wait for the ocring process termination
     private int timeoutSeconds = 120;
     // See addOtherTesseractConfig.
-    private Map<String, String> otherTesseractConfig = new HashMap<>();
+    private HashMap<String, String> otherTesseractConfig = new HashMap<>();
     private boolean inlineContent = false;
 
     private String tesseractPath = "";
@@ -478,7 +477,7 @@ public class TesseractOCRConfig implements Serializable {
     /**
      * @see #addOtherTesseractConfig(String, String)
      */
-    public Map<String, String> getOtherTesseractConfig() {
+    public HashMap<String, String> getOtherTesseractConfig() {
         return otherTesseractConfig;
     }
 
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
index 8a12a5033..fc434361d 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java
@@ -36,7 +36,7 @@ import java.util.Locale;
  *   <li>TesseractOCRParser → tesseract-ocr-parser</li>
  * </ul>
  */
-class KebabCaseConverter {
+public class KebabCaseConverter {
 
     private KebabCaseConverter() {
         // Utility class
@@ -48,7 +48,7 @@ class KebabCaseConverter {
      * @param className the simple class name (without package)
      * @return the kebab-case version of the name
      */
-    static String toKebabCase(String className) {
+    public static String toKebabCase(String className) {
         if (className == null || className.isEmpty()) {
             return className;
         }
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index 786d2e9bb..b6bb8ebff 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -110,6 +110,15 @@ public class ParserLoader {
 
                     // Parse exclusions from default-parser config
                     JsonNode configNode = entry.getValue();
+
+                    // Check for common mistake: using "excludes" instead of 
"exclude"
+                    if (configNode != null && configNode.has("excludes")) {
+                        throw new TikaConfigException(
+                            "Invalid configuration for default-parser: found 
'excludes' but the correct " +
+                            "field name is 'exclude' (singular). Please change 
'excludes' to 'exclude' " +
+                            "in your configuration.");
+                    }
+
                     if (configNode != null && configNode.has("exclude")) {
                         JsonNode excludeNode = configNode.get("exclude");
                         if (excludeNode.isArray()) {
diff --git 
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
 
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
index 44c145418..435282998 100644
--- 
a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
+++ 
b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java
@@ -24,6 +24,7 @@ import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
 import java.nio.file.Path;
 
 import org.junit.jupiter.api.Test;
@@ -358,4 +359,41 @@ public class TikaLoaderTest {
         assertNotNull(translator, "Translator should not be null");
         // Should be DefaultTranslator since no translator configured in 
test-loader-config.json
     }
+
+    @Test
+    public void testExcludesInsteadOfExcludeThrowsException() throws Exception {
+        // Create a config with the common mistake: "excludes" instead of "exclude"
+        String invalidConfig = "{\n" +
+                "  \"parsers\": [\n" +
+                "    {\n" +
+                "      \"default-parser\": {\n" +
+                "        \"excludes\": [\"pdf-parser\"]\n" +
+                "      }\n" +
+                "    }\n" +
+                "  ]\n" +
+                "}";
+
+        // Write to a temp file
+        Path tempFile = Files.createTempFile("test-invalid-excludes", ".json");
+        try {
+            Files.write(tempFile, invalidConfig.getBytes(StandardCharsets.UTF_8));
+
+            // Attempt to load should throw TikaConfigException
+            try {
+                TikaLoader loader = TikaLoader.load(tempFile);
+                loader.loadParsers();
+                throw new AssertionError("Expected TikaConfigException to be thrown");
+            } catch (org.apache.tika.exception.TikaConfigException e) {
+                // Expected - verify the error message is helpful
+                assertTrue(e.getMessage().contains("excludes"),
+                        "Error message should mention 'excludes'");
+                // Match the quoted form: a bare "exclude" is a substring of "excludes",
+                // so checking for it unquoted could never fail once the check above passed
+                assertTrue(e.getMessage().contains("'exclude'"),
+                        "Error message should mention the correct field 'exclude'");
+                assertTrue(e.getMessage().contains("singular"),
+                        "Error message should explain it should be singular");
+            }
+        } finally {
+            // Always remove the temp file, whether or not the assertions passed
+            Files.deleteIfExists(tempFile);
+        }
+    }
 }

(tika) branch main updated: TIKA-4552 -- add tool to cover most cases of converting parsers from … (#2423)

Reply via email to