This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4552 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8bfb45247cb022e08c2f4e73efeb4c9e1a7749ff Author: tallison <[email protected]> AuthorDate: Thu Dec 4 17:33:57 2025 -0500 TIKA-4552 -- add tool to cover most cases of converting parsers from the legacy xml to json. Generated-by: Claude Sonnet 4.5 (claude-sonnet-4-5-20250929) Significant design and implementation with Claude --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 32 ++ .../apache/tika/cli/XmlToJsonConfigConverter.java | 606 +++++++++++++++++++++ .../tika/cli/XmlToJsonConfigConverterTest.java | 330 +++++++++++ .../xml-configs/tika-config-list-map-types.xml | 38 ++ .../xml-configs/tika-config-numeric-types.xml | 30 + .../tika-config-redundant-exclusion.xml | 42 ++ .../resources/xml-configs/tika-config-simple.xml | 30 + .../xml-configs/tika-config-with-excludes.xml | 32 ++ .../tika/config/loader/KebabCaseConverter.java | 4 +- .../apache/tika/config/loader/ParserLoader.java | 9 + .../apache/tika/config/loader/TikaLoaderTest.java | 38 ++ 11 files changed, 1189 insertions(+), 2 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index f7d933090..91cb313b2 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -424,6 +424,9 @@ public class TikaCLI { } else if (arg.equals("--dump-static-full-config")) { pipeMode = false; dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL); + } else if (arg.startsWith("--convert-config-xml-to-json=")) { + pipeMode = false; + convertConfigXmlToJson(arg.substring("--convert-config-xml-to-json=".length())); } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) { // ignore, as container-aware detectors are now always used } else if (arg.equals("-f") || arg.equals("--fork")) { @@ -520,6 +523,33 @@ public class TikaCLI { TikaConfigSerializer.serialize(localConfig, mode, new OutputStreamWriter(System.out, UTF_8), UTF_8); } + 
private void convertConfigXmlToJson(String paths) throws Exception { + String[] parts = paths.split(","); + if (parts.length != 2) { + System.err.println("Error: --convert-config-xml-to-json requires input and output paths separated by comma"); + System.err.println("Usage: --convert-config-xml-to-json=<input.xml>,<output.json>"); + return; + } + + Path xmlPath = Paths.get(parts[0].trim()); + Path jsonPath = Paths.get(parts[1].trim()); + + if (!Files.exists(xmlPath)) { + System.err.println("Error: Input XML file not found: " + xmlPath); + return; + } + + try { + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + System.out.println("Successfully converted XML config to JSON:"); + System.out.println(" Input: " + xmlPath.toAbsolutePath()); + System.out.println(" Output: " + jsonPath.toAbsolutePath()); + } catch (Exception e) { + System.err.println("Error converting config: " + e.getMessage()); + throw e; + } + } + private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); @@ -569,6 +599,8 @@ public class TikaCLI { out.println(" --dump-current-config Print current TikaConfig"); out.println(" --dump-static-config Print static config"); out.println(" --dump-static-full-config Print static explicit config"); + out.println(" --convert-config-xml-to-json=<input.xml>,<output.json>"); + out.println(" Convert legacy XML config to JSON format (parsers section only)"); out.println(""); out.println(" -x or --xml Output XHTML content (default)"); out.println(" -h or --html Output HTML content"); diff --git a/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java new file mode 100644 index 000000000..c8d8945fa --- /dev/null +++ b/tika-app/src/main/java/org/apache/tika/cli/XmlToJsonConfigConverter.java @@ -0,0 +1,606 @@ +/* + * 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.cli;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import org.apache.tika.config.loader.ComponentRegistry;
import org.apache.tika.config.loader.KebabCaseConverter;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Converts legacy XML Tika configuration files to the new JSON format.
 * <p>
 * Currently supports converting the "parsers" section of tika-config.xml files
 * for parsers in the tika-parsers-standard module. Only {@code <params>} and
 * {@code <parser-exclude>} children of a {@code <parser>} are converted; any
 * other child element is skipped with a warning.
 * <p>
 * Supports parameter types: bool, int, long, double, float, string, list, and map.
 * <p>
 * <strong>Special Case:</strong> TesseractOCR's {@code otherTesseractSettings} list
 * (containing whitespace-delimited key-value pairs) is automatically converted to the
 * {@code otherTesseractConfig} map format expected by the JSON configuration.
 * <p>
 * Example usage:
 * <pre>{@code
 * XmlToJsonConfigConverter.convert(
 *     Paths.get("tika-config.xml"),
 *     Paths.get("tika-config.json")
 * );
 * }</pre>
 *
 * <p>XML Format (with various parameter types):
 * <pre>{@code
 * <properties>
 *   <parsers>
 *     <parser class="org.apache.tika.parser.pdf.PDFParser">
 *       <params>
 *         <param name="sortByPosition" type="bool">true</param>
 *         <param name="maxPages" type="int">1000</param>
 *       </params>
 *     </parser>
 *     <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
 *       <params>
 *         <!-- Special case: space-delimited key-value pairs -->
 *         <param name="otherTesseractSettings" type="list">
 *           <string>textord_initialx_ile 0.75</string>
 *           <string>textord_noise_hfract 0.15625</string>
 *         </param>
 *         <param name="envVars" type="map">
 *           <TESSDATA_PREFIX>/usr/share/tesseract</TESSDATA_PREFIX>
 *         </param>
 *       </params>
 *     </parser>
 *     <parser class="org.apache.tika.parser.DefaultParser">
 *       <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
 *     </parser>
 *   </parsers>
 * </properties>
 * }</pre>
 *
 * <p>JSON Format:
 * <pre>{@code
 * {
 *   "parsers": [
 *     {
 *       "pdf-parser": {
 *         "sortByPosition": true,
 *         "maxPages": 1000
 *       }
 *     },
 *     {
 *       "tesseract-ocr-parser": {
 *         "otherTesseractConfig": {
 *           "textord_initialx_ile": "0.75",
 *           "textord_noise_hfract": "0.15625"
 *         },
 *         "envVars": {
 *           "TESSDATA_PREFIX": "/usr/share/tesseract"
 *         }
 *       }
 *     },
 *     {
 *       "default-parser": {
 *         "exclude": ["pdf-parser"]
 *       }
 *     }
 *   ]
 * }
 * }</pre>
 */
public final class XmlToJsonConfigConverter {

    private static final Logger LOG = LoggerFactory.getLogger(XmlToJsonConfigConverter.class);

    // Use a plain ObjectMapper for clean JSON output without @class annotations
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private XmlToJsonConfigConverter() {
        // Utility class
    }

    /**
     * Converts an XML Tika configuration file to JSON format.
     * <p>
     * The JSON is produced in memory first and written to {@code jsonPath} only
     * after the conversion has succeeded. A failed conversion therefore neither
     * creates nor truncates the output file, and using the same path for input
     * and output does not destroy the input before it has been read.
     *
     * @param xmlPath path to the XML configuration file
     * @param jsonPath path where the JSON output should be written
     * @throws TikaConfigException if conversion fails
     * @throws IOException if file I/O fails
     */
    public static void convert(Path xmlPath, Path jsonPath) throws TikaConfigException, IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (InputStream in = Files.newInputStream(xmlPath)) {
            convert(in, buffer);
        }
        Files.write(jsonPath, buffer.toByteArray());
    }

    /**
     * Converts an XML Tika configuration stream to JSON format, using the
     * current thread's context class loader for the component registry.
     *
     * @param xmlInput input stream containing XML configuration
     * @param jsonOutput output stream where JSON will be written
     * @throws TikaConfigException if conversion fails
     * @throws IOException if stream I/O fails
     */
    public static void convert(InputStream xmlInput, OutputStream jsonOutput)
            throws TikaConfigException, IOException {
        convert(xmlInput, jsonOutput, Thread.currentThread().getContextClassLoader());
    }

    /**
     * Converts an XML Tika configuration stream to JSON format.
     * <p>
     * The JSON is written as UTF-8 and {@code jsonOutput} is closed once the
     * document has been written.
     *
     * @param xmlInput input stream containing XML configuration
     * @param jsonOutput output stream where JSON will be written
     * @param classLoader class loader to use for component registry
     * @throws TikaConfigException if conversion fails
     * @throws IOException if stream I/O fails
     */
    public static void convert(InputStream xmlInput, OutputStream jsonOutput, ClassLoader classLoader)
            throws TikaConfigException, IOException {
        try {
            // Load component registry to properly map class names to component names
            ComponentRegistry parserRegistry = new ComponentRegistry("parsers", classLoader);

            Document doc = XMLReaderUtils.buildDOM(xmlInput);
            Map<String, Object> jsonConfig = convertDocument(doc, parserRegistry);

            try (Writer writer = new OutputStreamWriter(jsonOutput, StandardCharsets.UTF_8)) {
                MAPPER.writerWithDefaultPrettyPrinter().writeValue(writer, jsonConfig);
            }
        } catch (TikaConfigException | IOException e) {
            // Already one of the declared types: rethrow untouched so the specific
            // message (e.g. the invalid-root-element error) is what callers see.
            throw e;
        } catch (Exception e) {
            throw new TikaConfigException("Failed to convert XML config to JSON", e);
        }
    }

    /**
     * Returns the direct child elements of {@code parent} in document order.
     * Text, comment and other non-element nodes are skipped, and deeper
     * descendants are not included.
     */
    private static List<Element> childElements(Element parent) {
        List<Element> elements = new ArrayList<>();
        NodeList children = parent.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                elements.add((Element) child);
            }
        }
        return elements;
    }

    /**
     * Converts the entire XML configuration document to a JSON-compatible map.
     *
     * @throws TikaConfigException if the root element is not {@code <properties>}
     */
    private static Map<String, Object> convertDocument(Document doc, ComponentRegistry parserRegistry)
            throws TikaConfigException {
        Element root = doc.getDocumentElement();
        if (!"properties".equals(root.getNodeName())) {
            throw new TikaConfigException(
                    "Invalid XML config: root element must be <properties>, found: " +
                            root.getNodeName());
        }

        Map<String, Object> result = new LinkedHashMap<>();
        for (Element section : childElements(root)) {
            if ("parsers".equals(section.getNodeName())) {
                result.put("parsers", convertParsersSection(section, parserRegistry));
            }
            // Future: add support for detectors, translators, etc.
        }
        return result;
    }

    /**
     * Converts the {@code <parsers>} section to JSON array format.
     * <p>
     * Only {@code <parser>} elements that are direct children of the section
     * are converted; a {@code <parser>} nested inside another {@code <parser>}
     * is not promoted to a top-level entry.
     */
    private static List<Map<String, Object>> convertParsersSection(Element parsersElement,
                                                                   ComponentRegistry parserRegistry)
            throws TikaConfigException {
        List<Map<String, Object>> parsersList = new ArrayList<>();

        for (Element child : childElements(parsersElement)) {
            if ("parser".equals(child.getNodeName())) {
                parsersList.add(convertParserElement(child, parserRegistry));
            }
        }

        // Check for redundant exclusions and inform users
        checkForRedundantExclusions(parsersList);

        return parsersList;
    }

    /**
     * Checks if parsers are excluded from default-parser but also configured separately,
     * which is redundant. Logs INFO messages to help users understand they can remove
     * the exclusion since configured parsers automatically override the default.
     */
    private static void checkForRedundantExclusions(List<Map<String, Object>> parsersList) {
        // Find exclusions from default-parser
        Set<String> excludedParsers = new HashSet<>();
        for (Map<String, Object> parserEntry : parsersList) {
            Object config = parserEntry.get("default-parser");
            if (config instanceof Map) {
                Object excludes = ((Map<?, ?>) config).get("exclude");
                if (excludes instanceof List) {
                    for (Object excluded : (List<?>) excludes) {
                        excludedParsers.add(String.valueOf(excluded));
                    }
                }
            }
        }

        // Find configured parsers
        Set<String> configuredParsers = new HashSet<>();
        for (Map<String, Object> parserEntry : parsersList) {
            for (String parserName : parserEntry.keySet()) {
                if (!"default-parser".equals(parserName)) {
                    configuredParsers.add(parserName);
                }
            }
        }

        // Check for overlap and log informational messages
        Set<String> redundantExclusions = new HashSet<>(excludedParsers);
        redundantExclusions.retainAll(configuredParsers);
        if (redundantExclusions.isEmpty()) {
            return;
        }

        LOG.info("=".repeat(80));
        LOG.info("CONFIGURATION OPTIMIZATION NOTICE");
        LOG.info("=".repeat(80));
        LOG.info("");
        LOG.info("The following parsers are excluded from default-parser but also configured separately:");
        for (String parserName : redundantExclusions) {
            LOG.info(" - {}", parserName);
        }
        LOG.info("");
        LOG.info("This exclusion is redundant. When you configure a parser with specific settings,");
        LOG.info("the loader excludes loading that parser from SPI. You can remove these");
        LOG.info("exclusions from your default-parser configuration.");
        LOG.info("");
        LOG.info("Example - Instead of:");
        LOG.info(" {");
        LOG.info(" \"default-parser\": {");
        LOG.info(" \"exclude\": [\"pdf-parser\"]");
        LOG.info(" }");
        LOG.info(" },");
        LOG.info(" {");
        LOG.info(" \"pdf-parser\": {");
        LOG.info(" \"sortByPosition\": true");
        LOG.info(" }");
        LOG.info(" }");
        LOG.info("");
        LOG.info("Simply use:");
        LOG.info(" {");
        LOG.info(" \"default-parser\": {},");
        LOG.info(" \"pdf-parser\": {");
        LOG.info(" \"sortByPosition\": true");
        LOG.info(" }");
        LOG.info(" }");
        LOG.info("");
        LOG.info("=".repeat(80));
    }

    /**
     * Converts a single {@code <parser>} element to a JSON map entry.
     * <p>
     * {@code <params>} children contribute their parameters, and
     * {@code <parser-exclude>} children are collected into an {@code "exclude"}
     * array. Any other child element is not converted and is reported with a
     * warning instead of being dropped silently.
     *
     * @return map with single entry: { "parser-name": { config... } }
     * @throws TikaConfigException if the element has no {@code class} attribute
     */
    private static Map<String, Object> convertParserElement(Element parserElement,
                                                            ComponentRegistry parserRegistry)
            throws TikaConfigException {
        String className = parserElement.getAttribute("class");
        if (className == null || className.isEmpty()) {
            throw new TikaConfigException("Parser element missing 'class' attribute");
        }

        // Convert class name to component name using the registry
        String componentName = classNameToComponentName(className, parserRegistry);

        Map<String, Object> config = new LinkedHashMap<>();
        List<String> excludes = new ArrayList<>();

        for (Element element : childElements(parserElement)) {
            String tagName = element.getNodeName();
            if ("params".equals(tagName)) {
                config.putAll(convertParamsElement(element));
            } else if ("parser-exclude".equals(tagName)) {
                String excludeClass = element.getAttribute("class");
                if (excludeClass != null && !excludeClass.isEmpty()) {
                    excludes.add(classNameToComponentName(excludeClass, parserRegistry));
                }
            } else {
                LOG.warn("Ignoring unsupported element <{}> in parser {}; it is not converted",
                        tagName, className);
            }
        }

        if (!excludes.isEmpty()) {
            config.put("exclude", excludes);
        }

        Map<String, Object> result = new LinkedHashMap<>();
        result.put(componentName, config);
        return result;
    }

    /**
     * Converts a {@code <params>} element to a map of parameter names to values.
     * <p>
     * Only {@code <param>} elements that are direct children are read, so an
     * element that merely happens to be called {@code param} deeper in the tree
     * (for example a key inside a map parameter) is not mistaken for a parameter.
     * Parameters without a {@code name} attribute are skipped.
     */
    private static Map<String, Object> convertParamsElement(Element paramsElement) {
        Map<String, Object> params = new LinkedHashMap<>();

        for (Element paramElement : childElements(paramsElement)) {
            if (!"param".equals(paramElement.getNodeName())) {
                continue;
            }
            String name = paramElement.getAttribute("name");
            String type = paramElement.getAttribute("type");
            if (name == null || name.isEmpty()) {
                continue;
            }

            // Special case: otherTesseractSettings is a list of space-delimited key-value pairs
            // that needs to be converted to otherTesseractConfig map. The type is matched
            // case-insensitively, as every other type is in convertParamValue.
            if ("otherTesseractSettings".equals(name) && "list".equalsIgnoreCase(type)) {
                params.put("otherTesseractConfig", convertTesseractSettingsList(paramElement));
            } else {
                params.put(name, convertParamValue(paramElement, type));
            }
        }

        return params;
    }

    /**
     * Special handler for TesseractOCR's otherTesseractSettings list.
     * <p>
     * Converts a list of whitespace-delimited key-value pairs into a map. The key
     * is everything before the first whitespace character; the value is the
     * trimmed remainder. Entries without a whitespace separator are logged and
     * skipped.
     * <p>
     * XML Format:
     * <pre>{@code
     * <param name="otherTesseractSettings" type="list">
     *   <string>textord_initialx_ile 0.75</string>
     *   <string>textord_noise_hfract 0.15625</string>
     * </param>
     * }</pre>
     * <p>
     * JSON Output (as otherTesseractConfig):
     * <pre>{@code
     * "otherTesseractConfig": {
     *   "textord_initialx_ile": "0.75",
     *   "textord_noise_hfract": "0.15625"
     * }
     * }</pre>
     */
    private static Map<String, String> convertTesseractSettingsList(Element paramElement) {
        Map<String, String> configMap = new LinkedHashMap<>();

        for (Element stringElement : childElements(paramElement)) {
            if (!"string".equals(stringElement.getNodeName())) {
                continue;
            }
            String setting = stringElement.getTextContent().trim();
            int separator = firstWhitespaceIndex(setting);
            if (separator > 0) {
                String key = setting.substring(0, separator);
                String value = setting.substring(separator + 1).trim();
                configMap.put(key, value);
            } else {
                LOG.warn("Ignoring malformed Tesseract setting (expected 'key value'): {}", setting);
            }
        }

        return configMap;
    }

    /**
     * Returns the index of the first whitespace character in {@code text},
     * or {@code -1} if it contains none.
     */
    private static int firstWhitespaceIndex(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (Character.isWhitespace(text.charAt(i))) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Converts a parameter value from XML element to the appropriate type.
     * <p>
     * Supports primitive types (bool/boolean, int/integer, long, double, float),
     * as well as collections:
     * <ul>
     * <li>list - converts child {@code <string>} elements to a JSON array</li>
     * <li>map - converts child elements (where element name is key) to a JSON object</li>
     * </ul>
     * A missing or unknown type, an empty value, or a numeric value that cannot
     * be parsed is returned as the trimmed text content.
     */
    private static Object convertParamValue(Element paramElement, String type) {
        if (type == null || type.isEmpty()) {
            // No type specified, return text content as string
            return paramElement.getTextContent().trim();
        }

        String typeKey = type.toLowerCase(Locale.ROOT);

        // Handle collection types that need child element processing
        if ("list".equals(typeKey)) {
            return convertListParam(paramElement);
        } else if ("map".equals(typeKey)) {
            return convertMapParam(paramElement);
        }

        // Handle primitive types using text content
        String valueStr = paramElement.getTextContent().trim();
        if (valueStr.isEmpty()) {
            return valueStr;
        }

        switch (typeKey) {
            case "bool":
            case "boolean":
                return Boolean.parseBoolean(valueStr);
            case "int":
            case "integer":
                try {
                    return Integer.parseInt(valueStr);
                } catch (NumberFormatException e) {
                    return valueStr;
                }
            case "long":
                try {
                    return Long.parseLong(valueStr);
                } catch (NumberFormatException e) {
                    return valueStr;
                }
            case "double":
            case "float":
                try {
                    return Double.parseDouble(valueStr);
                } catch (NumberFormatException e) {
                    return valueStr;
                }
            default:
                // Unknown type, return as string
                return valueStr;
        }
    }

    /**
     * Converts a list parameter by extracting {@code <string>} child elements.
     * Only direct children are included, not nested strings.
     * <p>
     * XML Format:
     * <pre>{@code
     * <param name="languages" type="list">
     *   <string>en</string>
     *   <string>fr</string>
     * </param>
     * }</pre>
     * <p>
     * JSON Output: ["en", "fr"]
     */
    private static List<String> convertListParam(Element paramElement) {
        List<String> list = new ArrayList<>();
        for (Element child : childElements(paramElement)) {
            if ("string".equals(child.getNodeName())) {
                list.add(child.getTextContent().trim());
            }
        }
        return list;
    }

    /**
     * Converts a map parameter by using child element names as keys and text content as values.
     * <p>
     * XML Format:
     * <pre>{@code
     * <param name="captureMap" type="map">
     *   <title>^Title: ([^\r\n]+)</title>
     *   <author>^Author: ([^\r\n]+)</author>
     * </param>
     * }</pre>
     * <p>
     * JSON Output: {@code {"title": "^Title: ([^\\r\\n]+)", "author": "^Author: ([^\\r\\n]+)"}}
     */
    private static Map<String, String> convertMapParam(Element paramElement) {
        Map<String, String> map = new LinkedHashMap<>();
        for (Element child : childElements(paramElement)) {
            map.put(child.getNodeName(), child.getTextContent().trim());
        }
        return map;
    }

    /**
     * Converts a full Java class name to a component name.
     * <p>
     * Uses the ComponentRegistry to perform a reverse lookup, respecting
     * custom component names from {@code @TikaComponent} annotations.
     * Registry entries are matched by class name, so the class itself is
     * never loaded here and the result does not depend on which class loader
     * the current thread happens to carry.
     * Falls back to kebab-case conversion of the simple class name if the
     * class is not in the registry.
     * <p>
     * Examples:
     * <ul>
     * <li>org.apache.tika.parser.pdf.PDFParser → pdf-parser</li>
     * <li>org.apache.tika.parser.DefaultParser → default-parser</li>
     * <li>org.apache.tika.parser.html.JSoupParser → jsoup-parser (from {@code @TikaComponent} annotation)</li>
     * </ul>
     */
    private static String classNameToComponentName(String fullClassName, ComponentRegistry registry) {
        // Reverse lookup: find the component name registered for this class name
        for (Map.Entry<String, Class<?>> entry : registry.getAllComponents().entrySet()) {
            Class<?> componentClass = entry.getValue();
            if (componentClass != null && fullClassName.equals(componentClass.getName())) {
                return entry.getKey();
            }
        }

        // Fallback: use kebab-case conversion of the simple class name
        int lastDot = fullClassName.lastIndexOf('.');
        String simpleClassName = lastDot >= 0 ? fullClassName.substring(lastDot + 1) : fullClassName;
        return KebabCaseConverter.toKebabCase(simpleClassName);
    }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.cli; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.JSoupParser; +import org.apache.tika.parser.pdf.PDFParser; + +/** + * Tests for XmlToJsonConfigConverter. + * These tests verify that XML configurations are correctly converted to JSON + * and can be loaded by TikaLoader to produce properly configured parsers. 
+ */ +public class XmlToJsonConfigConverterTest { + + @Test + public void testSimpleParserConfig(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()); + Path jsonPath = tempDir.resolve("simple-config.json"); + + // Convert XML to JSON + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Verify JSON file was created + assertTrue(Files.exists(jsonPath)); + + // Load the JSON config with TikaLoader + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + + assertNotNull(parser); + assertTrue(parser instanceof CompositeParser); + + // Verify PDF parser is configured + CompositeParser compositeParser = (CompositeParser) parser; + ParseContext context = new ParseContext(); + Map<MediaType, Parser> parsers = compositeParser.getParsers(context); + + // Check that PDF parser is present + MediaType pdfType = MediaType.parse("application/pdf"); + assertTrue(parsers.containsKey(pdfType), "PDF parser should be configured"); + + Parser pdfParser = parsers.get(pdfType); + assertTrue(pdfParser instanceof PDFParser, "Parser for PDF should be PDFParser"); + + // The actual parser configuration (sortByPosition, extractInlineImages, etc.) 
+ // is tested by the parser's behavior, not directly accessible here + } + + @Test + public void testParserWithExcludes(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-with-excludes.xml").toURI()); + Path jsonPath = tempDir.resolve("excludes-config.json"); + + // Convert XML to JSON + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Print JSON for debugging + String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8); + System.out.println("Generated JSON:"); + System.out.println(json); + + // Verify exclude is at the correct level (not under _decorate) + assertTrue(json.contains("\"exclude\""), "Should have exclude array"); + assertFalse(json.contains("\"_decorate\""), "_decorate should not be used for parser excludes"); + assertTrue(json.contains("\"jsoup-parser\""), "Should exclude jsoup-parser"); + assertTrue(json.contains("\"pdf-parser\""), "Should exclude pdf-parser"); + + // Load the JSON config with TikaLoader + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + + assertNotNull(parser); + assertTrue(parser instanceof CompositeParser); + + // Verify parsers are configured + CompositeParser compositeParser = (CompositeParser) parser; + for (Parser p : ((CompositeParser) parser).getAllComponentParsers()) { + if (p instanceof PDFParser) { + fail("pdf parser should have been excluded"); + } + } + ParseContext context = new ParseContext(); + Map<MediaType, Parser> parsers = compositeParser.getParsers(context); + + // Check that HTML parser is present (JSoupParser should be configured) + MediaType htmlType = MediaType.parse("text/html"); + assertTrue(parsers.containsKey(htmlType), "HTML parser should be configured"); + + Parser htmlParser = parsers.get(htmlType); + // JSoupParser extends HtmlParser, so this checks for the correct family + assertTrue(htmlParser instanceof JSoupParser, "Parser for HTML should be HtmlParser or 
JSoupParser"); + } + + @Test + public void testNumericTypes(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-numeric-types.xml").toURI()); + Path jsonPath = tempDir.resolve("numeric-config.json"); + + // Convert XML to JSON + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Verify JSON file was created and contains proper numeric types + assertTrue(Files.exists(jsonPath)); + + // Read the JSON to verify numeric types are preserved + String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8); + + // Verify numbers are not quoted (they should appear as: "density": 300, not "density": "300") + assertTrue(json.contains("\"density\" : 300"), "density should be numeric, not string"); + assertFalse(json.contains("\"timeout\" : \"300\""), "timeout should not be a quoted string"); + + // Load the JSON config with TikaLoader to verify it's valid + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + assertNotNull(parser); + } + + @Test + public void testFileConversion(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()); + Path jsonPath = tempDir.resolve("output.json"); + + // Test the Path-based conversion method + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Verify file exists + assertTrue(Files.exists(jsonPath)); + + // Verify it can be loaded by TikaLoader + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + assertNotNull(parser); + assertTrue(parser instanceof CompositeParser); + } + + @Test + public void testClassNameConversion(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()); + Path jsonPath = tempDir.resolve("classname-config.json"); + + // Convert XML to JSON + XmlToJsonConfigConverter.convert(xmlPath, 
jsonPath); + + // Read JSON and verify component name conversion + String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8); + + // Verify that PDFParser was converted to pdf-parser (kebab-case) + assertTrue(json.contains("\"pdf-parser\""), "PDFParser should be converted to pdf-parser"); + + // Verify the config loads successfully + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + assertNotNull(parser); + } + + @Test + public void testAutoDetectParserLoading(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-simple.xml").toURI()); + Path jsonPath = tempDir.resolve("autodetect-config.json"); + + // Convert XML to JSON + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Load via TikaLoader and get AutoDetectParser + TikaLoader loader = TikaLoader.load(jsonPath); + Parser autoDetectParser = loader.loadAutoDetectParser(); + + assertNotNull(autoDetectParser); + + // Verify it supports PDF type + ParseContext context = new ParseContext(); + MediaType pdfType = MediaType.parse("application/pdf"); + assertTrue(autoDetectParser.getSupportedTypes(context).contains(pdfType), + "AutoDetectParser should support PDF"); + } + + @Test + public void testRedundantExclusionWarning(@TempDir Path tempDir) throws Exception { + // This test demonstrates the old pattern where users excluded parsers from default-parser + // and then configured those same parsers separately. The converter will log an INFO message + // informing users that the exclusion is redundant. 
+ Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-redundant-exclusion.xml").toURI()); + Path jsonPath = tempDir.resolve("redundant-config.json"); + + // Convert XML to JSON (this will log the INFO message about redundant exclusions) + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Print the generated JSON + String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8); + System.out.println("Generated JSON with redundant exclusions:"); + System.out.println(json); + + // Verify the JSON still contains the exclusions (we don't remove them, just inform) + assertTrue(json.contains("\"exclude\""), "Should still have exclude array"); + assertTrue(json.contains("\"pdf-parser\""), "Should have pdf-parser configured"); + assertTrue(json.contains("\"jsoup-parser\""), "Should have jsoup-parser configured"); + + // Verify it loads correctly via TikaLoader + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + assertNotNull(parser); + assertTrue(parser instanceof CompositeParser); + + // Verify both parsers are configured and working + CompositeParser compositeParser = (CompositeParser) parser; + ParseContext context = new ParseContext(); + Map<MediaType, Parser> parsers = compositeParser.getParsers(context); + + MediaType pdfType = MediaType.parse("application/pdf"); + assertTrue(parsers.containsKey(pdfType), "PDF parser should be configured"); + + MediaType htmlType = MediaType.parse("text/html"); + assertTrue(parsers.containsKey(htmlType), "HTML parser should be configured"); + } + + @Test + public void testTesseractArbitrarySettings(@TempDir Path tempDir) throws Exception { + // Test the special case conversion of TesseractOCR's otherTesseractSettings + String xmlConfig = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + + "<properties>\n" + + " <parsers>\n" + + " <parser class=\"org.apache.tika.parser.ocr.TesseractOCRParser\">\n" + + " <params>\n" + + " <param 
name=\"otherTesseractSettings\" type=\"list\">\n" + + " <string>textord_initialx_ile 0.75</string>\n" + + " <string>textord_noise_hfract 0.15625</string>\n" + + " </param>\n" + + " </params>\n" + + " </parser>\n" + + " </parsers>\n" + + "</properties>"; + + Path xmlPath = tempDir.resolve("tesseract-arbitrary.xml"); + Path jsonPath = tempDir.resolve("tesseract-arbitrary.json"); + Files.write(xmlPath, xmlConfig.getBytes(StandardCharsets.UTF_8)); + + // Convert + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8); + System.out.println("Tesseract arbitrary settings conversion:"); + System.out.println(json); + + // Verify conversion: list of space-delimited pairs -> map + assertTrue(json.contains("\"otherTesseractConfig\""), + "Should convert to otherTesseractConfig"); + assertFalse(json.contains("\"otherTesseractSettings\""), + "Should not keep old parameter name"); + assertTrue(json.contains("\"textord_initialx_ile\" : \"0.75\""), + "Should parse key-value pairs correctly"); + assertTrue(json.contains("\"textord_noise_hfract\" : \"0.15625\""), + "Should parse second pair"); + + // Verify it loads via TikaLoader without errors + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + assertNotNull(parser); + } + + @Test + public void testListAndMapParameterTypes(@TempDir Path tempDir) throws Exception { + Path xmlPath = Paths.get(getClass().getResource("/xml-configs/tika-config-list-map-types.xml").toURI()); + Path jsonPath = tempDir.resolve("list-map-config.json"); + + // Convert XML to JSON + XmlToJsonConfigConverter.convert(xmlPath, jsonPath); + + // Print the generated JSON for debugging + String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8); + System.out.println("Generated JSON with list and map types:"); + System.out.println(json); + + // Verify otherTesseractSettings (list) is converted to otherTesseractConfig (map) + // 
This is a special case where space-delimited key-value pairs are parsed + assertTrue(json.contains("\"otherTesseractConfig\" : {"), + "Should convert otherTesseractSettings list to otherTesseractConfig map"); + assertFalse(json.contains("\"otherTesseractSettings\""), + "Should not have old otherTesseractSettings name"); + assertTrue(json.contains("\"textord_initialx_ile\" : \"0.75\""), + "Should parse first key-value pair"); + assertTrue(json.contains("\"textord_noise_hfract\" : \"0.15625\""), + "Should parse second key-value pair"); + assertTrue(json.contains("\"preserve_interword_spaces\" : \"1\""), + "Should parse third key-value pair"); + + // Verify regular parameters still work + assertTrue(json.contains("\"timeoutSeconds\" : 300"), "Should have integer parameter"); + assertTrue(json.contains("\"enableImagePreprocessing\" : true"), "Should have boolean parameter"); + assertTrue(json.contains("\"language\" : \"eng\""), "Should have string parameter"); + + // Verify it loads correctly via TikaLoader + TikaLoader loader = TikaLoader.load(jsonPath); + Parser parser = loader.loadParsers(); + assertNotNull(parser); + assertTrue(parser instanceof CompositeParser); + } +} diff --git a/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml b/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml new file mode 100644 index 000000000..33ffc2119 --- /dev/null +++ b/tika-app/src/test/resources/xml-configs/tika-config-list-map-types.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"> + <params> + <!-- Special Tesseract case: list of space-delimited key-value pairs --> + <!-- This will be converted to otherTesseractConfig map in JSON --> + <param name="otherTesseractSettings" type="list"> + <string>textord_initialx_ile 0.75</string> + <string>textord_noise_hfract 0.15625</string> + <string>preserve_interword_spaces 1</string> + </param> + <!-- Regular parameters --> + <param name="timeoutSeconds" type="int">300</param> + <param name="enableImagePreprocessing" type="bool">true</param> + <param name="language">eng</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml b/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml new file mode 100644 index 000000000..99da01562 --- /dev/null +++ b/tika-app/src/test/resources/xml-configs/tika-config-numeric-types.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"> + <params> + <param name="maxFileSizeToOcr" type="long">10000000</param> + <param name="minFileSizeToOcr" type="long">0</param> + <param name="density" type="int">300</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml b/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml new file mode 100644 index 000000000..6caeecd7a --- /dev/null +++ b/tika-app/src/test/resources/xml-configs/tika-config-redundant-exclusion.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<properties> + <parsers> + <!-- Old pattern: excluding parser from default, then configuring it separately --> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> + <parser-exclude class="org.apache.tika.parser.html.JSoupParser"/> + </parser> + + <!-- These parsers are configured separately, making the exclusions above redundant --> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="sortByPosition" type="bool">true</param> + <param name="extractInlineImages" type="bool">false</param> + </params> + </parser> + + <parser class="org.apache.tika.parser.html.JSoupParser"> + <params> + <param name="extractScripts" type="bool">true</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-app/src/test/resources/xml-configs/tika-config-simple.xml b/tika-app/src/test/resources/xml-configs/tika-config-simple.xml new file mode 100644 index 000000000..1a880dd68 --- /dev/null +++ b/tika-app/src/test/resources/xml-configs/tika-config-simple.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="sortByPosition" type="bool">true</param> + <param name="extractInlineImages" type="bool">false</param> + <param name="ocrRenderingStrategy" type="string">TEXT_ONLY</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml b/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml new file mode 100644 index 000000000..9089428c3 --- /dev/null +++ b/tika-app/src/test/resources/xml-configs/tika-config-with-excludes.xml @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.html.JSoupParser"/> + <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> + </parser> + <parser class="org.apache.tika.parser.html.JSoupParser"> + <params> + <param name="extractScripts" type="bool">true</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java index 8a12a5033..fc434361d 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/KebabCaseConverter.java @@ -36,7 +36,7 @@ import java.util.Locale; * <li>TesseractOCRParser → tesseract-ocr-parser</li> * </ul> */ -class KebabCaseConverter { +public class KebabCaseConverter { private KebabCaseConverter() { // Utility class @@ -48,7 +48,7 @@ class KebabCaseConverter { * @param className the simple class name (without package) * @return the kebab-case version of the name */ - static String toKebabCase(String className) { + public static String toKebabCase(String className) { if (className == null || className.isEmpty()) { return className; } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java index 786d2e9bb..b6bb8ebff 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java @@ -110,6 +110,15 @@ public class ParserLoader { // Parse exclusions from default-parser config JsonNode configNode = entry.getValue(); + + // Check for common mistake: using "excludes" instead of "exclude" + if (configNode != null && 
configNode.has("excludes")) { + throw new TikaConfigException( + "Invalid configuration for default-parser: found 'excludes' but the correct " + + "field name is 'exclude' (singular). Please change 'excludes' to 'exclude' " + + "in your configuration."); + } + if (configNode != null && configNode.has("exclude")) { JsonNode excludeNode = configNode.get("exclude"); if (excludeNode.isArray()) { diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java index 44c145418..435282998 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java @@ -24,6 +24,7 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; import java.net.URL; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import org.junit.jupiter.api.Test; @@ -358,4 +359,41 @@ public class TikaLoaderTest { assertNotNull(translator, "Translator should not be null"); // Should be DefaultTranslator since no translator configured in test-loader-config.json } + + @Test + public void testExcludesInsteadOfExcludeThrowsException() throws Exception { + // Create a config with the common mistake: "excludes" instead of "exclude" + String invalidConfig = "{\n" + + " \"parsers\": [\n" + + " {\n" + + " \"default-parser\": {\n" + + " \"excludes\": [\"pdf-parser\"]\n" + + " }\n" + + " }\n" + + " ]\n" + + "}"; + + // Write to a temp file + Path tempFile = Files.createTempFile("test-invalid-excludes", ".json"); + try { + Files.write(tempFile, invalidConfig.getBytes(StandardCharsets.UTF_8)); + + // Attempt to load should throw TikaConfigException + try { + TikaLoader loader = TikaLoader.load(tempFile); + loader.loadParsers(); + throw new AssertionError("Expected TikaConfigException to be thrown"); + } catch 
(org.apache.tika.exception.TikaConfigException e) { + // Expected - verify the error message is helpful + assertTrue(e.getMessage().contains("excludes"), + "Error message should mention 'excludes'"); + assertTrue(e.getMessage().contains("'exclude'"), + "Error message should mention the correct field 'exclude'"); + assertTrue(e.getMessage().contains("singular"), + "Error message should explain it should be singular"); + } + } finally { + Files.deleteIfExists(tempFile); + } + } }
