This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new bee7682711 TIKA-4566 - allow non-config elements to standalone in array (#2442)
bee7682711 is described below

commit bee7682711bf5e967ffdaaabf902788bdc673437
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 11 10:20:24 2025 -0500

    TIKA-4566 - allow non-config elements to standalone in array (#2442)
---
 .../apache/tika/config/loader/TikaJsonConfig.java  |  42 +++--
 .../tika/config/loader/TikaJsonConfigTest.java     | 193 +++++++++++++++++++++
 2 files changed, 213 insertions(+), 22 deletions(-)

diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
index 1a7cff5d4f..d90854666d 100644
--- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
+++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java
@@ -69,31 +69,26 @@ import org.apache.tika.exception.TikaConfigException;
  * {
  *   // Core Tika components (validated by TikaLoader)
  *   "parsers": [
- *     { "pdf-parser": { "_decorate": {...}, "ocrStrategy": "AUTO", ... } },
- *     { "html-parser": { ... } },
- *     { "default-parser": {} }
+ *     { "pdf-parser": { "_mime-include": ["application/pdf"], "ocrStrategy": "AUTO" } },
+ *     "html-parser",                    // String shorthand for no-config components
+ *     { "default-parser": { "_exclude": ["ocr-parser"] } }
  *   ],
  *   "detectors": [
- *     { "mime-magic-detector": {} },
- *     { "zip-container-detector": { "maxDepth": 10 } }
+ *     "poifs-container-detector",       // String shorthand
+ *     { "mime-types": { "markLimit": 10000 } }
  *   ],
  *
  *   // Pipes components (validated by validateKeys())
  *   "plugin-roots": ["/path/to/plugins"],
  *   "fetchers": [...],
- *   "emitters": [...],
- *
- *   // Custom configurations (for testing or extensions)
- *   "other-configs": {
- *     "test-config": { ... },
- *     "my-custom-config": { ... },
- *     "anything": { ... }
- *   }
+ *   "emitters": [...]
  * }
  * </pre>
  *
  * <p>All components use array format for explicit ordering.
- * Parsers support decoration via "_decorate" field.
+ * Components without configuration can use string shorthand: "component-name"
+ * instead of { "component-name": {} }.
+ * Parsers support mime filtering via "_mime-include" and "_mime-exclude" fields.
  * Special "default-parser" entry enables SPI fallback for unlisted parsers.
  */
 public class TikaJsonConfig {
@@ -292,15 +287,18 @@ public class TikaJsonConfig {
             List<Map.Entry<String, JsonNode>> components = new ArrayList<>();
 
             for (JsonNode arrayItem : typeNode) {
-                if (!arrayItem.isObject()) {
-                    continue;
-                }
-
-                // Each array item should have exactly one field: { "component-name": {...config...} }
-                for (Map.Entry<String, JsonNode> componentEntry : arrayItem.properties()) {
-                    components.add(Map.entry(componentEntry.getKey(), componentEntry.getValue()));
-                    break; // Only take the first field
+                if (arrayItem.isTextual()) {
+                    // String shorthand: "component-name" -> treat as { "component-name": {} }
+                    String componentName = arrayItem.asText();
+                    components.add(Map.entry(componentName, OBJECT_MAPPER.createObjectNode()));
+                } else if (arrayItem.isObject()) {
+                    // Object syntax: { "component-name": {...config...} }
+                    for (Map.Entry<String, JsonNode> componentEntry : arrayItem.properties()) {
+                        components.add(Map.entry(componentEntry.getKey(), componentEntry.getValue()));
+                        break; // Only take the first field
+                    }
                 }
+                // Skip other types (null, numbers, arrays, etc.)
             }
 
             if (!components.isEmpty()) {
diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
new file mode 100644
index 0000000000..edd8e55634
--- /dev/null
+++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaJsonConfigTest.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config.loader;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.Map;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for TikaJsonConfig parsing functionality.
+ */
+public class TikaJsonConfigTest {
+
+    @Test
+    public void testStringShorthandForParsers() throws Exception {
+        String json = """
+            {
+              "parsers": [
+                "html-parser",
+                { "pdf-parser": { "ocrStrategy": "AUTO" } },
+                "txt-parser"
+              ]
+            }
+            """;
+
+        TikaJsonConfig config = TikaJsonConfig.load(
+                new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+        List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");
+        assertEquals(3, parsers.size(), "Should have 3 parsers");
+
+        // First parser: string shorthand
+        assertEquals("html-parser", parsers.get(0).getKey());
+        assertTrue(parsers.get(0).getValue().isEmpty(), "Should have empty config for shorthand");
+
+        // Second parser: full object syntax
+        assertEquals("pdf-parser", parsers.get(1).getKey());
+        assertEquals("AUTO", parsers.get(1).getValue().get("ocrStrategy").asText());
+
+        // Third parser: string shorthand
+        assertEquals("txt-parser", parsers.get(2).getKey());
+        assertTrue(parsers.get(2).getValue().isEmpty(), "Should have empty config for shorthand");
+    }
+
+    @Test
+    public void testStringShorthandForDetectors() throws Exception {
+        String json = """
+            {
+              "detectors": [
+                "poifs-container-detector",
+                { "mime-types": { "markLimit": 10000 } },
+                "zip-container-detector"
+              ]
+            }
+            """;
+
+        TikaJsonConfig config = TikaJsonConfig.load(
+                new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+        List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors");
+        assertEquals(3, detectors.size(), "Should have 3 detectors");
+
+        assertEquals("poifs-container-detector", detectors.get(0).getKey());
+        assertTrue(detectors.get(0).getValue().isEmpty());
+
+        assertEquals("mime-types", detectors.get(1).getKey());
+        assertEquals(10000, detectors.get(1).getValue().get("markLimit").asInt());
+
+        assertEquals("zip-container-detector", detectors.get(2).getKey());
+        assertTrue(detectors.get(2).getValue().isEmpty());
+    }
+
+    @Test
+    public void testStringShorthandForMetadataFilters() throws Exception {
+        String json = """
+            {
+              "metadata-filters": [
+                "date-normalizing-metadata-filter",
+                { "field-name-mapping-filter": { "excludeUnmapped": true } }
+              ]
+            }
+            """;
+
+        TikaJsonConfig config = TikaJsonConfig.load(
+                new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+        List<Map.Entry<String, JsonNode>> filters = config.getArrayComponents("metadata-filters");
+        assertEquals(2, filters.size(), "Should have 2 filters");
+
+        assertEquals("date-normalizing-metadata-filter", filters.get(0).getKey());
+        assertTrue(filters.get(0).getValue().isEmpty());
+
+        assertEquals("field-name-mapping-filter", filters.get(1).getKey());
+        assertTrue(filters.get(1).getValue().get("excludeUnmapped").asBoolean());
+    }
+
+    @Test
+    public void testMixedShorthandAndObjectSyntax() throws Exception {
+        String json = """
+            {
+              "parsers": [
+                "first-parser",
+                { "second-parser": { "option": "value" } },
+                "third-parser",
+                { "fourth-parser": {} },
+                "fifth-parser"
+              ]
+            }
+            """;
+
+        TikaJsonConfig config = TikaJsonConfig.load(
+                new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+        List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");
+        assertEquals(5, parsers.size(), "Should have 5 parsers");
+
+        // Verify ordering is preserved
+        assertEquals("first-parser", parsers.get(0).getKey());
+        assertEquals("second-parser", parsers.get(1).getKey());
+        assertEquals("third-parser", parsers.get(2).getKey());
+        assertEquals("fourth-parser", parsers.get(3).getKey());
+        assertEquals("fifth-parser", parsers.get(4).getKey());
+
+        // Verify configs
+        assertTrue(parsers.get(0).getValue().isEmpty());
+        assertEquals("value", parsers.get(1).getValue().get("option").asText());
+        assertTrue(parsers.get(2).getValue().isEmpty());
+        assertTrue(parsers.get(3).getValue().isEmpty());
+        assertTrue(parsers.get(4).getValue().isEmpty());
+    }
+
+    @Test
+    public void testAllStringsShorthand() throws Exception {
+        String json = """
+            {
+              "detectors": [
+                "detector-a",
+                "detector-b",
+                "detector-c"
+              ]
+            }
+            """;
+
+        TikaJsonConfig config = TikaJsonConfig.load(
+                new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+        List<Map.Entry<String, JsonNode>> detectors = config.getArrayComponents("detectors");
+        assertEquals(3, detectors.size());
+
+        for (Map.Entry<String, JsonNode> entry : detectors) {
+            assertNotNull(entry.getKey());
+            assertTrue(entry.getValue().isEmpty(),
+                    "All shorthand entries should have empty config");
+        }
+    }
+
+    @Test
+    public void testEmptyArrayWithShorthand() throws Exception {
+        String json = """
+            {
+              "parsers": []
+            }
+            """;
+
+        TikaJsonConfig config = TikaJsonConfig.load(
+                new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)));
+
+        List<Map.Entry<String, JsonNode>> parsers = config.getArrayComponents("parsers");
+        assertTrue(parsers.isEmpty(), "Empty array should return empty list");
+    }
+}

Reply via email to