This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bee7682711 TIKA-4566 - allow non-config elements to standalone in array (#2442)
bee7682711 is described below
commit bee7682711bf5e967ffdaaabf902788bdc673437
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 11 10:20:24 2025 -0500
TIKA-4566 - allow non-config elements to standalone in array (#2442)
---
.../apache/tika/config/loader/TikaJsonConfig.java | 42 +++--
.../tika/config/loader/TikaJsonConfigTest.java | 193 +++++++++++++++++++++
2 files changed, 213 insertions(+), 22 deletions(-)
diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 1a7cff5d4f..d90854666d 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -69,31 +69,26 @@ import org.apache.tika.exception.TikaConfigException;
* {
* // Core Tika components (validated by TikaLoader)
* "parsers": [
- * { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } },
- * { "html-parser": { ... } },
- * { "default-parser": {} }
+ * { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO" } },
+ * "html-parser", // String shorthand for no-config components
+ * { "default-parser": { "_exclude": ["ocr-parser"] } }
* ],
* "detectors": [
- * { "mime-magic-detector": {} },
- * { "zip-container-detector": { "maxDepth": 10 } }
+ * "poifs-container-detector", // String shorthand
+ * { "mime-types": { "markLimit": 10000 } }
* ],
*
* // Pipes components (validated by validateKeys())
* "plugin-roots": ["/path/to/plugins"],
* "fetchers": [...],
- * "emitters": [...],
- *
- * // Custom configurations (for testing or extensions)
- * "other-configs": {
- * "test-config": { ... },
- * "my-custom-config": { ... },
- * "anything": { ... }
- * }
+ * "emitters": [...]
* }
* </pre>
*
* <p>All components use array format for explicit ordering.
- * Parsers support decoration via "_decorate" field.
+ * Components without configuration can use string shorthand: "component-name"
+ * instead of { "component-name": {} }.
+ * Parsers support mime filtering via "_mime-include" and "_mime-exclude" fields.
* Special "default-parser" entry enables SPI fallback for unlisted parsers.
*/
public class TikaJsonConfig {
@@ -292,15 +287,18 @@ public class TikaJsonConfig {
List<Map.Entry<String, JsonNode>> components = new ArrayList<>();
for (JsonNode arrayItem : typeNode) {
- if (!arrayItem.isObject()) {
- continue;
- }
-
- // Each array item should have exactly one field: { "component-name": {...config...} }
- for (Map.Entry<String, JsonNode> componentEntry :
arrayItem.properties()) {
- components.add(Map.entry(componentEntry.getKey(),
componentEntry.getValue()));
- break; // Only take the first field
+ if (arrayItem.isTextual()) {
+ // String shorthand: "component-name" -> treat as { "component-name": {} }
+ String componentName = arrayItem.asText();
+ components.add(Map.entry(componentName,
OBJECT_MAPPER.createObjectNode()));
+ } else if (arrayItem.isObject()) {
+ // Object syntax: { "component-name": {...config...} }
+ for (Map.Entry<String, JsonNode> componentEntry :
arrayItem.properties()) {
+ components.add(Map.entry(componentEntry.getKey(),
componentEntry.getValue()));
+ break; // Only take the first field
+ }
}
+ // Skip other types (null, numbers, arrays, etc.)
}
if (!components.isEmpty()) {
diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
new file mode 100644
index 0000000000..edd8e55634
--- /dev/null
+++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.Map;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for TikaJsonConfig parsing functionality.
+ */
+public class TikaJsonConfigTest {
+
+ @Test
+ public void testStringShorthandForParsers() throws Exception {
+ String json = """
+ {
+ "parsers": [
+ "html-parser",
+ { "pdf-parser": { "ocrStrategy": "AUTO" } },
+ "txt-parser"
+ ]
+ }
+ """;
+
+ TikaJsonConfig config = TikaJsonConfig.load(
+ new
ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+ List<Map.Entry<String, JsonNode>> parsers =
config.getArrayComponents("parsers");
+ assertEquals(3, parsers.size(), "Should have 3 parsers");
+
+ // First parser: string shorthand
+ assertEquals("html-parser", parsers.get(0).getKey());
+ assertTrue(parsers.get(0).getValue().isEmpty(), "Should have empty
config for shorthand");
+
+ // Second parser: full object syntax
+ assertEquals("pdf-parser", parsers.get(1).getKey());
+ assertEquals("AUTO",
parsers.get(1).getValue().get("ocrStrategy").asText());
+
+ // Third parser: string shorthand
+ assertEquals("txt-parser", parsers.get(2).getKey());
+ assertTrue(parsers.get(2).getValue().isEmpty(), "Should have empty
config for shorthand");
+ }
+
+ @Test
+ public void testStringShorthandForDetectors() throws Exception {
+ String json = """
+ {
+ "detectors": [
+ "poifs-container-detector",
+ { "mime-types": { "markLimit": 10000 } },
+ "zip-container-detector"
+ ]
+ }
+ """;
+
+ TikaJsonConfig config = TikaJsonConfig.load(
+ new
ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+ List<Map.Entry<String, JsonNode>> detectors =
config.getArrayComponents("detectors");
+ assertEquals(3, detectors.size(), "Should have 3 detectors");
+
+ assertEquals("poifs-container-detector", detectors.get(0).getKey());
+ assertTrue(detectors.get(0).getValue().isEmpty());
+
+ assertEquals("mime-types", detectors.get(1).getKey());
+ assertEquals(10000,
detectors.get(1).getValue().get("markLimit").asInt());
+
+ assertEquals("zip-container-detector", detectors.get(2).getKey());
+ assertTrue(detectors.get(2).getValue().isEmpty());
+ }
+
+ @Test
+ public void testStringShorthandForMetadataFilters() throws Exception {
+ String json = """
+ {
+ "metadata-filters": [
+ "date-normalizing-metadata-filter",
+ { "field-name-mapping-filter": { "excludeUnmapped": true } }
+ ]
+ }
+ """;
+
+ TikaJsonConfig config = TikaJsonConfig.load(
+ new
ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+ List<Map.Entry<String, JsonNode>> filters =
config.getArrayComponents("metadata-filters");
+ assertEquals(2, filters.size(), "Should have 2 filters");
+
+ assertEquals("date-normalizing-metadata-filter",
filters.get(0).getKey());
+ assertTrue(filters.get(0).getValue().isEmpty());
+
+ assertEquals("field-name-mapping-filter", filters.get(1).getKey());
+
assertTrue(filters.get(1).getValue().get("excludeUnmapped").asBoolean());
+ }
+
+ @Test
+ public void testMixedShorthandAndObjectSyntax() throws Exception {
+ String json = """
+ {
+ "parsers": [
+ "first-parser",
+ { "second-parser": { "option": "value" } },
+ "third-parser",
+ { "fourth-parser": {} },
+ "fifth-parser"
+ ]
+ }
+ """;
+
+ TikaJsonConfig config = TikaJsonConfig.load(
+ new
ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+ List<Map.Entry<String, JsonNode>> parsers =
config.getArrayComponents("parsers");
+ assertEquals(5, parsers.size(), "Should have 5 parsers");
+
+ // Verify ordering is preserved
+ assertEquals("first-parser", parsers.get(0).getKey());
+ assertEquals("second-parser", parsers.get(1).getKey());
+ assertEquals("third-parser", parsers.get(2).getKey());
+ assertEquals("fourth-parser", parsers.get(3).getKey());
+ assertEquals("fifth-parser", parsers.get(4).getKey());
+
+ // Verify configs
+ assertTrue(parsers.get(0).getValue().isEmpty());
+ assertEquals("value",
parsers.get(1).getValue().get("option").asText());
+ assertTrue(parsers.get(2).getValue().isEmpty());
+ assertTrue(parsers.get(3).getValue().isEmpty());
+ assertTrue(parsers.get(4).getValue().isEmpty());
+ }
+
+ @Test
+ public void testAllStringsShorthand() throws Exception {
+ String json = """
+ {
+ "detectors": [
+ "detector-a",
+ "detector-b",
+ "detector-c"
+ ]
+ }
+ """;
+
+ TikaJsonConfig config = TikaJsonConfig.load(
+ new
ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+ List<Map.Entry<String, JsonNode>> detectors =
config.getArrayComponents("detectors");
+ assertEquals(3, detectors.size());
+
+ for (Map.Entry<String, JsonNode> entry : detectors) {
+ assertNotNull(entry.getKey());
+ assertTrue(entry.getValue().isEmpty(),
+ "All shorthand entries should have empty config");
+ }
+ }
+
+ @Test
+ public void testEmptyArrayWithShorthand() throws Exception {
+ String json = """
+ {
+ "parsers": []
+ }
+ """;
+
+ TikaJsonConfig config = TikaJsonConfig.load(
+ new
ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+ List<Map.Entry<String, JsonNode>> parsers =
config.getArrayComponents("parsers");
+ assertTrue(parsers.isEmpty(), "Empty array should return empty list");
+ }
+}