This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new d09383aa4 TIKA-4577 -- improve metadata filter serialization via ParseContext. (#2461)
d09383aa4 is described below

commit d09383aa491c4951789f30628fef71e817a57cf1
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 16 15:28:31 2025 -0500

    TIKA-4577 -- improve metadata filter serialization via ParseContext. (#2461)
---
 .../apache/tika/pipes/core/PipesClientTest.java    |  70 +++++++++++
 .../serialization/ParseContextDeserializer.java    |   3 +
 .../tika/serialization/ParseContextUtils.java      | 138 ++++++++++++++++++++-
 .../TestParseContextSerialization.java             |  26 ++++
 4 files changed, 234 insertions(+), 3 deletions(-)

diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index 7be90f8f8..9fcd2dd06 100644
--- a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 
+import org.apache.tika.config.ConfigContainer;
 import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.config.loader.TikaJsonConfig;
 import org.apache.tika.metadata.Metadata;
@@ -41,6 +42,7 @@ import org.apache.tika.pipes.api.FetchEmitTuple;
 import org.apache.tika.pipes.api.PipesResult;
 import org.apache.tika.pipes.api.emitter.EmitKey;
 import org.apache.tika.pipes.api.fetcher.FetchKey;
+import org.apache.tika.serialization.ParseContextUtils;
 
 public class PipesClientTest {
     String fetcherName = "fsf";
@@ -104,6 +106,74 @@ public class PipesClientTest {
         assertEquals(4, 
Integer.parseInt(metadata.get("X-TIKA:attachment_count")));
     }
 
+    @Test
+    public void testMetadataFilterFromJsonConfig(@TempDir Path tmp) throws 
Exception {
+        // Test that metadata filters specified as JSON array in 
ConfigContainer
+        // are properly resolved and applied during pipe processing.
+        // This tests the full serialization/deserialization flow.
+        ParseContext parseContext = new ParseContext();
+        ConfigContainer configContainer = new ConfigContainer();
+        configContainer.set("metadata-filters", """
+            [
+              "mock-upper-case-filter"
+            ]
+        """);
+        parseContext.set(ConfigContainer.class, configContainer);
+
+        // Resolve the config to actual MetadataFilter instances
+        ParseContextUtils.resolveAll(parseContext, 
PipesClientTest.class.getClassLoader());
+
+        // Verify the filter was resolved
+        MetadataFilter resolvedFilter = parseContext.get(MetadataFilter.class);
+        Assertions.assertNotNull(resolvedFilter, "MetadataFilter should be 
resolved from ConfigContainer");
+        assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass());
+
+        PipesClient pipesClient = init(tmp, testDoc);
+        PipesResult pipesResult = pipesClient.process(
+                new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc),
+                        new EmitKey(), new Metadata(), parseContext, 
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
+
+        Assertions.assertNotNull(pipesResult.emitData().getMetadataList());
+        assertEquals(1, pipesResult.emitData().getMetadataList().size());
+        Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
+        // MockUpperCaseFilter uppercases all metadata values
+        assertEquals("TESTOVERLAPPINGTEXT.PDF", 
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+
+    @Test
+    public void testMultipleMetadataFiltersFromJsonConfig(@TempDir Path tmp) 
throws Exception {
+        // Test multiple filters specified as JSON array
+        ParseContext parseContext = new ParseContext();
+        ConfigContainer configContainer = new ConfigContainer();
+        configContainer.set("metadata-filters", """
+            [
+              "attachment-counting-list-filter",
+              "mock-upper-case-filter"
+            ]
+        """);
+        parseContext.set(ConfigContainer.class, configContainer);
+
+        // Resolve the config to actual MetadataFilter instances
+        ParseContextUtils.resolveAll(parseContext, 
PipesClientTest.class.getClassLoader());
+
+        String testFile = "mock-embedded.xml";
+        PipesClient pipesClient = init(tmp, testFile);
+
+        PipesResult pipesResult = pipesClient.process(
+                new FetchEmitTuple(testFile, new FetchKey(fetcherName, 
testFile),
+                        new EmitKey(), new Metadata(), parseContext, 
FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
+
+        Assertions.assertNotNull(pipesResult.emitData().getMetadataList());
+        assertEquals(5, pipesResult.emitData().getMetadataList().size());
+        Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
+
+        // AttachmentCountingListFilter should have added the count
+        assertEquals(4, 
Integer.parseInt(metadata.get("X-TIKA:attachment_count")));
+
+        // MockUpperCaseFilter should have uppercased the resource name
+        assertEquals("MOCK-EMBEDDED.XML", 
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+
     @Test
     public void testTimeout(@TempDir Path tmp) throws Exception {
         //TODO -- figure out how to test pipes server timeout alone
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
index 437bca933..d5d9fc601 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
@@ -200,6 +200,9 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> {
             parseContext.set(ConfigContainer.class, configContainer);
         }
 
+        // Resolve array configs (e.g., "metadata-filters") and 
non-SelfConfiguring components
+        ParseContextUtils.resolveAll(parseContext, 
ParseContextDeserializer.class.getClassLoader());
+
         return parseContext;
     }
 }
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index 354e8d178..4638b160b 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -19,7 +19,9 @@ package org.apache.tika.serialization;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 
+import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -27,9 +29,11 @@ import org.slf4j.LoggerFactory;
 import org.apache.tika.config.ConfigContainer;
 import org.apache.tika.config.JsonConfig;
 import org.apache.tika.config.loader.ComponentInfo;
+import org.apache.tika.config.loader.ComponentInstantiator;
 import org.apache.tika.config.loader.ComponentRegistry;
 import org.apache.tika.config.loader.TikaObjectMapperFactory;
 import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
 import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.parser.ParseContext;
 
@@ -68,6 +72,20 @@ public class ParseContextUtils {
             // Add other known interfaces as needed
     );
 
+    /**
+     * Mapping of array config keys to their context keys and composite 
wrapper factories.
+     * Key: config name (e.g., "metadata-filters")
+     * Value: (contextKey, componentInterface)
+     */
+    private static final Map<String, ArrayConfigInfo> ARRAY_CONFIGS = Map.of(
+            "metadata-filters", new ArrayConfigInfo(MetadataFilter.class, 
MetadataFilter.class)
+    );
+
+    /**
+     * Holds information about how to process array configs.
+     */
+    private record ArrayConfigInfo(Class<?> contextKey, Class<?> 
componentInterface) {}
+
     /**
      * Resolves all friendly-named components from ConfigContainer and adds 
them to ParseContext.
      * <p>
@@ -102,12 +120,27 @@ public class ParseContextUtils {
 
         List<String> resolvedKeys = new ArrayList<>();
 
+        // First, process known array configs (e.g., "metadata-filters")
+        // These don't depend on the other-configs registry
+        for (String friendlyName : new ArrayList<>(container.getKeys())) {
+            if (ARRAY_CONFIGS.containsKey(friendlyName)) {
+                JsonConfig jsonConfig = container.get(friendlyName, null);
+                if (jsonConfig != null && resolveArrayConfig(friendlyName, 
jsonConfig, context, classLoader)) {
+                    resolvedKeys.add(friendlyName);
+                }
+            }
+        }
+
+        // Then, try to load the "other-configs" registry for single component 
configs
         try {
-            // Load the "other-configs" registry which includes parse-context 
components
             ComponentRegistry registry = new 
ComponentRegistry("other-configs", classLoader);
 
-            // Iterate through all configs in the container
             for (String friendlyName : container.getKeys()) {
+                // Skip already resolved array configs
+                if (resolvedKeys.contains(friendlyName)) {
+                    continue;
+                }
+
                 JsonConfig jsonConfig = container.get(friendlyName, null);
                 if (jsonConfig == null) {
                     continue;
@@ -143,7 +176,8 @@ public class ParseContextUtils {
                 }
             }
         } catch (TikaConfigException e) {
-            LOG.warn("Failed to load other-configs registry for parse-context 
resolution", e);
+            // other-configs registry not available - that's okay, array 
configs were still processed
+            LOG.debug("other-configs registry not available: {}", 
e.getMessage());
         }
 
         // Remove resolved configs from the container
@@ -191,4 +225,102 @@ public class ParseContextUtils {
         // Use the single matched interface, or fall back to the component 
class
         return matches.isEmpty() ? info.componentClass() : matches.get(0);
     }
+
+    /**
+     * Resolves an array config entry (e.g., "metadata-filters") to a 
composite component.
+     * <p>
+     * The array can contain either strings (friendly names) or objects:
+     * <pre>
+     * ["filter-name-1", "filter-name-2"]              // String shorthand
+     * [{"filter-name-1": {}}, {"filter-name-2": {}}]  // Object format
+     * </pre>
+     *
+     * @param configName the config name (e.g., "metadata-filters")
+     * @param jsonConfig the JSON configuration (should be an array)
+     * @param context the ParseContext to add the resolved component to
+     * @param classLoader the ClassLoader to use for loading component classes
+     * @return true if resolution was successful
+     */
+    @SuppressWarnings("unchecked")
+    private static boolean resolveArrayConfig(String configName, JsonConfig 
jsonConfig,
+                                              ParseContext context, 
ClassLoader classLoader) {
+        ArrayConfigInfo configInfo = ARRAY_CONFIGS.get(configName);
+        if (configInfo == null) {
+            return false;
+        }
+
+        try {
+            JsonNode arrayNode = MAPPER.readTree(jsonConfig.json());
+            if (!arrayNode.isArray()) {
+                LOG.warn("Expected array for '{}', got: {}", configName, 
arrayNode.getNodeType());
+                return false;
+            }
+
+            List<Object> components = new ArrayList<>();
+
+            for (JsonNode item : arrayNode) {
+                String typeName;
+                JsonNode configNode;
+
+                if (item.isTextual()) {
+                    // String shorthand: "component-name"
+                    typeName = item.asText();
+                    configNode = MAPPER.createObjectNode();
+                } else if (item.isObject() && item.size() == 1) {
+                    // Object format: {"component-name": {...}}
+                    typeName = item.fieldNames().next();
+                    configNode = item.get(typeName);
+                } else {
+                    LOG.warn("Unexpected item format in '{}': {}", configName, 
item);
+                    continue;
+                }
+
+                try {
+                    Object component = ComponentInstantiator.instantiate(
+                            typeName, configNode, MAPPER, classLoader);
+                    components.add(component);
+                    LOG.debug("Instantiated '{}' for '{}'", typeName, 
configName);
+                } catch (TikaConfigException e) {
+                    LOG.warn("Failed to instantiate '{}' for '{}': {}", 
typeName, configName, e.getMessage());
+                }
+            }
+
+            // Create the composite and add to ParseContext
+            if (!components.isEmpty()) {
+                Object composite = createComposite(configName, components, 
configInfo);
+                if (composite != null) {
+                    context.set((Class) configInfo.contextKey(), composite);
+                    LOG.debug("Resolved '{}' -> {} with {} components",
+                            configName, composite.getClass().getSimpleName(), 
components.size());
+                    return true;
+                }
+            }
+        } catch (IOException e) {
+            LOG.warn("Failed to parse array config '{}': {}", configName, 
e.getMessage());
+        }
+
+        return false;
+    }
+
+    /**
+     * Creates a composite component from a list of individual components.
+     *
+     * @param configName the config name (for error messages)
+     * @param components the list of components
+     * @param configInfo the array config info
+     * @return the composite component, or null if creation failed
+     */
+    @SuppressWarnings("unchecked")
+    private static Object createComposite(String configName, List<Object> 
components,
+                                          ArrayConfigInfo configInfo) {
+        // Handle known composite types
+        if (configInfo.componentInterface() == MetadataFilter.class) {
+            List<MetadataFilter> filters = (List<MetadataFilter>) (List<?>) 
components;
+            return new CompositeMetadataFilter(filters);
+        }
+
+        // Add more composite types as needed
+        LOG.warn("No composite factory for '{}'", configName);
+        return null;
+    }
 }
diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index 0c9c457b5..3b06f4079 100644
--- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -35,6 +35,9 @@ import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.config.loader.TikaObjectMapperFactory;
 import org.apache.tika.extractor.DocumentSelector;
 import org.apache.tika.extractor.SkipEmbeddedDocumentSelector;
+import org.apache.tika.metadata.filter.AttachmentCountingListFilter;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.parser.ParseContext;
 
 /**
@@ -296,6 +299,29 @@ public class TestParseContextSerialization {
         assertEquals(0, root.size(), "Objects without friendly names should 
not be serialized");
     }
 
+    @Test
+    public void testMetadataList() throws Exception {
+        ConfigContainer configContainer = new ConfigContainer();
+        configContainer.set("metadata-filters", """
+            [
+              "attachment-counting-list-filter",
+              "mock-upper-case-filter"
+            ]
+        """);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(ConfigContainer.class, configContainer);
+
+        ObjectMapper mapper = createMapper();
+        String json = mapper.writeValueAsString(parseContext);
+
+        ParseContext deser = mapper.readValue(json, ParseContext.class);
+        MetadataFilter resolvedFilter = deser.get(MetadataFilter.class);
+        assertNotNull(resolvedFilter, "MetadataFilter should be resolved");
+        assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass());
+        CompositeMetadataFilter deserFilter = (CompositeMetadataFilter) 
resolvedFilter;
+        assertEquals(AttachmentCountingListFilter.class, 
deserFilter.getFilters().get(0).getClass());
+    }
+
     @Test
     public void testContextKeyDeserialization() throws Exception {
         // Test that components with @TikaComponent(contextKey=...) are stored

Reply via email to