This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4566 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 25560f55acabb868e892a9782a5ffa16a8659e5d Author: tallison <[email protected]> AuthorDate: Thu Dec 11 09:40:39 2025 -0500 TIKA-4566 - allow non-config elements to standalone in array --- .../apache/tika/config/loader/TikaJsonConfig.java | 42 +++-- .../tika/config/loader/TikaJsonConfigTest.java | 193 +++++++++++++++++++++ 2 files changed, 213 insertions(+), 22 deletions(-) diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index 1a7cff5d4f..d90854666d 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -69,31 +69,26 @@ import org.apache.tika.exception.TikaConfigException; * { * // Core Tika components (validated by TikaLoader) * "parsers": [ - * { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } }, - * { "html-parser": { ... } }, - * { "default-parser": {} } + * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO" } }, + * "html-parser", // String shorthand for no-config components + * { "default-parser": { "_exclude": ["ocr-parser"] } } * ], * "detectors": [ - * { "mime-magic-detector": {} }, - * { "zip-container-detector": { "maxDepth": 10 } } + * "poifs-container-detector", // String shorthand + * { "mime-types": { "markLimit": 10000 } } * ], * * // Pipes components (validated by validateKeys()) * "plugin-roots": ["/path/to/plugins"], * "fetchers": [...], - * "emitters": [...], - * - * // Custom configurations (for testing or extensions) - * "other-configs": { - * "test-config": { ... }, - * "my-custom-config": { ... }, - * "anything": { ... } - * } + * "emitters": [...] * } * </pre> * * <p>All components use array format for explicit ordering. - * Parsers support decoration via "_decorate" field. + * Components without configuration can use string shorthand: "component-name" + * instead of { "component-name": {} }. + * Parsers support mime filtering via "_mime-include" and "_mime-exclude" fields. * Special "default-parser" entry enables SPI fallback for unlisted parsers. */ public class TikaJsonConfig { @@ -292,15 +287,18 @@ public class TikaJsonConfig { List<Map.Entry<String, JsonNode>> components = new ArrayList<>(); for (JsonNode arrayItem : typeNode) { - if (!arrayItem.isObject()) { - continue; - } - - // Each array item should have exactly one field: { "component-name": {...config...} } - for (Map.Entry<String, JsonNode> componentEntry : arrayItem.properties()) { - components.add(Map.entry(componentEntry.getKey(), componentEntry.getValue())); - break; // Only take the first field + if (arrayItem.isTextual()) { + // String shorthand: "component-name" -> treat as { "component-name": {} } + String componentName = arrayItem.asText(); + components.add(Map.entry(componentName, OBJECT_MAPPER.createObjectNode())); + } else if (arrayItem.isObject()) { + // Object syntax: { "component-name": {...config...} } + for (Map.Entry<String, JsonNode> componentEntry : arrayItem.properties()) { + components.add(Map.entry(componentEntry.getKey(), componentEntry.getValue())); + break; // Only take the first field + } } + // Skip other types (null, numbers, arrays, etc.) } if (!components.isEmpty()) { diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java new file mode 100644 index 0000000000..edd8e55634 --- /dev/null +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config.loader; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; + +import com.fasterxml.jackson.databind.JsonNode; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for TikaJsonConfig parsing functionality. + */ +public class TikaJsonConfigTest { + + @Test + public void testStringShorthandForParsers() throws Exception { + String json = """ + { + "parsers": [ + "html-parser", + { "pdf-parser": { "ocrStrategy": "AUTO" } }, + "txt-parser" + ] + } + """; + + TikaJsonConfig config = TikaJsonConfig.load( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + + List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers"); + assertEquals(3, parsers.size(), "Should have 3 parsers"); + + // First parser: string shorthand + assertEquals("html-parser", parsers.get(0).getKey()); + assertTrue(parsers.get(0).getValue().isEmpty(), "Should have empty config for shorthand"); + + // Second parser: full object syntax + assertEquals("pdf-parser", parsers.get(1).getKey()); + assertEquals("AUTO", parsers.get(1).getValue().get("ocrStrategy").asText()); + + // Third parser: string shorthand + assertEquals("txt-parser", parsers.get(2).getKey()); + assertTrue(parsers.get(2).getValue().isEmpty(), "Should have empty config for shorthand"); + } + + @Test + public void testStringShorthandForDetectors() throws Exception { + String json = """ + { + "detectors": [ + "poifs-container-detector", + { "mime-types": { "markLimit": 10000 } }, + "zip-container-detector" + ] + } + """; + + TikaJsonConfig config = TikaJsonConfig.load( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + + List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors"); + assertEquals(3, detectors.size(), "Should have 3 detectors"); + + assertEquals("poifs-container-detector", detectors.get(0).getKey()); + assertTrue(detectors.get(0).getValue().isEmpty()); + + assertEquals("mime-types", detectors.get(1).getKey()); + assertEquals(10000, detectors.get(1).getValue().get("markLimit").asInt()); + + assertEquals("zip-container-detector", detectors.get(2).getKey()); + assertTrue(detectors.get(2).getValue().isEmpty()); + } + + @Test + public void testStringShorthandForMetadataFilters() throws Exception { + String json = """ + { + "metadata-filters": [ + "date-normalizing-metadata-filter", + { "field-name-mapping-filter": { "excludeUnmapped": true } } + ] + } + """; + + TikaJsonConfig config = TikaJsonConfig.load( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + + List<Map.Entry<String, JsonNode>> filters = config.getArrayComponents("metadata-filters"); + assertEquals(2, filters.size(), "Should have 2 filters"); + + assertEquals("date-normalizing-metadata-filter", filters.get(0).getKey()); + assertTrue(filters.get(0).getValue().isEmpty()); + + assertEquals("field-name-mapping-filter", filters.get(1).getKey()); + assertTrue(filters.get(1).getValue().get("excludeUnmapped").asBoolean()); + } + + @Test + public void testMixedShorthandAndObjectSyntax() throws Exception { + String json = """ + { + "parsers": [ + "first-parser", + { "second-parser": { "option": "value" } }, + "third-parser", + { "fourth-parser": {} }, + "fifth-parser" + ] + } + """; + + TikaJsonConfig config = TikaJsonConfig.load( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + + List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers"); + assertEquals(5, parsers.size(), "Should have 5 parsers"); + + // Verify ordering is preserved + assertEquals("first-parser", parsers.get(0).getKey()); + assertEquals("second-parser", parsers.get(1).getKey()); + assertEquals("third-parser", parsers.get(2).getKey()); + assertEquals("fourth-parser", parsers.get(3).getKey()); + assertEquals("fifth-parser", parsers.get(4).getKey()); + + // Verify configs + assertTrue(parsers.get(0).getValue().isEmpty()); + assertEquals("value", parsers.get(1).getValue().get("option").asText()); + assertTrue(parsers.get(2).getValue().isEmpty()); + assertTrue(parsers.get(3).getValue().isEmpty()); + assertTrue(parsers.get(4).getValue().isEmpty()); + } + + @Test + public void testAllStringsShorthand() throws Exception { + String json = """ + { + "detectors": [ + "detector-a", + "detector-b", + "detector-c" + ] + } + """; + + TikaJsonConfig config = TikaJsonConfig.load( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + + List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors"); + assertEquals(3, detectors.size()); + + for (Map.Entry<String, JsonNode> entry : detectors) { + assertNotNull(entry.getKey()); + assertTrue(entry.getValue().isEmpty(), + "All shorthand entries should have empty config"); + } + } + + @Test + public void testEmptyArrayWithShorthand() throws Exception { + String json = """ + { + "parsers": [] + } + """; + + TikaJsonConfig config = TikaJsonConfig.load( + new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8))); + + List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers"); + assertTrue(parsers.isEmpty(), "Empty array should return empty list"); + } +}
