This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d09383aa4 TIKA-4577 -- improve metadata filter serialization via ParseContext. (#2461)
d09383aa4 is described below
commit d09383aa491c4951789f30628fef71e817a57cf1
Author: Tim Allison <[email protected]>
AuthorDate: Tue Dec 16 15:28:31 2025 -0500
TIKA-4577 -- improve metadata filter serialization via ParseContext. (#2461)
---
.../apache/tika/pipes/core/PipesClientTest.java | 70 +++++++++++
.../serialization/ParseContextDeserializer.java | 3 +
.../tika/serialization/ParseContextUtils.java | 138 ++++++++++++++++++++-
.../TestParseContextSerialization.java | 26 ++++
4 files changed, 234 insertions(+), 3 deletions(-)
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
index 7be90f8f8..9fcd2dd06 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java
@@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
+import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.metadata.Metadata;
@@ -41,6 +42,7 @@ import org.apache.tika.pipes.api.FetchEmitTuple;
import org.apache.tika.pipes.api.PipesResult;
import org.apache.tika.pipes.api.emitter.EmitKey;
import org.apache.tika.pipes.api.fetcher.FetchKey;
+import org.apache.tika.serialization.ParseContextUtils;
public class PipesClientTest {
String fetcherName = "fsf";
@@ -104,6 +106,74 @@ public class PipesClientTest {
assertEquals(4,
Integer.parseInt(metadata.get("X-TIKA:attachment_count")));
}
+    @Test
+    public void testMetadataFilterFromJsonConfig(@TempDir Path tmp) throws Exception {
+        // End-to-end check of the serialization/deserialization flow: a "metadata-filters"
+        // JSON array stored in the ConfigContainer must be resolved into a filter and
+        // applied while the pipes client processes the document.
+        ConfigContainer filterConfig = new ConfigContainer();
+        filterConfig.set("metadata-filters", """
+                [
+                  "mock-upper-case-filter"
+                ]
+                """);
+        ParseContext ctx = new ParseContext();
+        ctx.set(ConfigContainer.class, filterConfig);
+
+        // Turn the JSON config into concrete MetadataFilter instances
+        ParseContextUtils.resolveAll(ctx, PipesClientTest.class.getClassLoader());
+
+        // The resolved filter must be present and wrapped in a composite
+        MetadataFilter filter = ctx.get(MetadataFilter.class);
+        Assertions.assertNotNull(filter, "MetadataFilter should be resolved from ConfigContainer");
+        assertEquals(CompositeMetadataFilter.class, filter.getClass());
+
+        PipesClient client = init(tmp, testDoc);
+        FetchEmitTuple tuple = new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc),
+                new EmitKey(), new Metadata(), ctx, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
+        PipesResult result = client.process(tuple);
+
+        Assertions.assertNotNull(result.emitData().getMetadataList());
+        assertEquals(1, result.emitData().getMetadataList().size());
+        Metadata first = result.emitData().getMetadataList().get(0);
+        // MockUpperCaseFilter uppercases all metadata values
+        assertEquals("TESTOVERLAPPINGTEXT.PDF", first.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+
+    @Test
+    public void testMultipleMetadataFiltersFromJsonConfig(@TempDir Path tmp) throws Exception {
+        // Two filters listed in one "metadata-filters" JSON array; the assertions below
+        // expect the effect of each of them on the emitted metadata.
+        ConfigContainer filterConfig = new ConfigContainer();
+        filterConfig.set("metadata-filters", """
+                [
+                  "attachment-counting-list-filter",
+                  "mock-upper-case-filter"
+                ]
+                """);
+        ParseContext ctx = new ParseContext();
+        ctx.set(ConfigContainer.class, filterConfig);
+
+        // Turn the JSON config into concrete MetadataFilter instances
+        ParseContextUtils.resolveAll(ctx, PipesClientTest.class.getClassLoader());
+
+        String testFile = "mock-embedded.xml";
+        PipesClient client = init(tmp, testFile);
+
+        FetchEmitTuple tuple = new FetchEmitTuple(testFile, new FetchKey(fetcherName, testFile),
+                new EmitKey(), new Metadata(), ctx, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
+        PipesResult result = client.process(tuple);
+
+        Assertions.assertNotNull(result.emitData().getMetadataList());
+        assertEquals(5, result.emitData().getMetadataList().size());
+        Metadata container = result.emitData().getMetadataList().get(0);
+
+        // AttachmentCountingListFilter should have added the count
+        assertEquals(4, Integer.parseInt(container.get("X-TIKA:attachment_count")));
+
+        // MockUpperCaseFilter should have uppercased the resource name
+        assertEquals("MOCK-EMBEDDED.XML", container.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+
@Test
public void testTimeout(@TempDir Path tmp) throws Exception {
//TODO -- figure out how to test pipes server timeout alone
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
index 437bca933..d5d9fc601 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
@@ -200,6 +200,9 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
parseContext.set(ConfigContainer.class, configContainer);
}
+ // Resolve array configs (e.g., "metadata-filters") and
non-SelfConfiguring components
+ ParseContextUtils.resolveAll(parseContext,
ParseContextDeserializer.class.getClassLoader());
+
return parseContext;
}
}
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index 354e8d178..4638b160b 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -19,7 +19,9 @@ package org.apache.tika.serialization;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
+import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -27,9 +29,11 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.loader.ComponentInfo;
+import org.apache.tika.config.loader.ComponentInstantiator;
import org.apache.tika.config.loader.ComponentRegistry;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
@@ -68,6 +72,20 @@ public class ParseContextUtils {
// Add other known interfaces as needed
);
+ /**
+ * Registry of config names whose JSON value is an array of components.
+ * Key: config name (e.g., "metadata-filters")
+ * Value: (contextKey, componentInterface) -- the class under which the resolved
+ * composite is stored in the ParseContext, and the interface used to select
+ * which composite wrapper is built. No factory object is stored here.
+ */
+ private static final Map<String, ArrayConfigInfo> ARRAY_CONFIGS = Map.of(
+ "metadata-filters", new ArrayConfigInfo(MetadataFilter.class,
MetadataFilter.class)
+ );
+
+ /**
+ * Holds information about how to process array configs.
+ *
+ * @param contextKey class used as the ParseContext key for the resolved composite
+ * @param componentInterface interface that determines which composite wrapper is built
+ */
+ private record ArrayConfigInfo(Class<?> contextKey, Class<?>
componentInterface) {}
+
/**
* Resolves all friendly-named components from ConfigContainer and adds
them to ParseContext.
* <p>
@@ -102,12 +120,27 @@ public class ParseContextUtils {
List<String> resolvedKeys = new ArrayList<>();
+ // First, process known array configs (e.g., "metadata-filters")
+ // These don't depend on the other-configs registry
+ for (String friendlyName : new ArrayList<>(container.getKeys())) {
+ if (ARRAY_CONFIGS.containsKey(friendlyName)) {
+ JsonConfig jsonConfig = container.get(friendlyName, null);
+ if (jsonConfig != null && resolveArrayConfig(friendlyName,
jsonConfig, context, classLoader)) {
+ resolvedKeys.add(friendlyName);
+ }
+ }
+ }
+
+ // Then, try to load the "other-configs" registry for single component
configs
try {
- // Load the "other-configs" registry which includes parse-context
components
ComponentRegistry registry = new
ComponentRegistry("other-configs", classLoader);
- // Iterate through all configs in the container
for (String friendlyName : container.getKeys()) {
+ // Skip already resolved array configs
+ if (resolvedKeys.contains(friendlyName)) {
+ continue;
+ }
+
JsonConfig jsonConfig = container.get(friendlyName, null);
if (jsonConfig == null) {
continue;
@@ -143,7 +176,8 @@ public class ParseContextUtils {
}
}
} catch (TikaConfigException e) {
- LOG.warn("Failed to load other-configs registry for parse-context
resolution", e);
+ // other-configs registry not available - that's okay, array
configs were still processed
+ LOG.debug("other-configs registry not available: {}",
e.getMessage());
}
// Remove resolved configs from the container
@@ -191,4 +225,102 @@ public class ParseContextUtils {
// Use the single matched interface, or fall back to the component
class
return matches.isEmpty() ? info.componentClass() : matches.get(0);
}
+
+    /**
+     * Resolves an array config entry (e.g., "metadata-filters") to a composite component
+     * and stores it in the ParseContext under the configured context key.
+     * <p>
+     * The array can contain either strings (friendly names) or single-field objects:
+     * <pre>
+     *   ["filter-name-1", "filter-name-2"]                // String shorthand
+     *   [{"filter-name-1": {}}, {"filter-name-2": {}}]    // Object format
+     * </pre>
+     * Resolution is best-effort: items that are malformed, fail to instantiate, or do
+     * not implement the expected component interface are logged and skipped, and the
+     * composite is built from the remaining items.
+     *
+     * @param configName  the config name (e.g., "metadata-filters")
+     * @param jsonConfig  the JSON configuration (should be an array)
+     * @param context     the ParseContext to add the resolved component to
+     * @param classLoader the ClassLoader to use for loading component classes
+     * @return true if a composite was created and set on the context; false if the
+     *         config name is unknown, the JSON is not an array or cannot be parsed,
+     *         no item could be resolved, or no composite could be created
+     */
+    @SuppressWarnings({"unchecked", "rawtypes"})
+    private static boolean resolveArrayConfig(String configName, JsonConfig jsonConfig,
+                                              ParseContext context, ClassLoader classLoader) {
+        ArrayConfigInfo configInfo = ARRAY_CONFIGS.get(configName);
+        if (configInfo == null) {
+            return false;
+        }
+
+        try {
+            JsonNode arrayNode = MAPPER.readTree(jsonConfig.json());
+            if (!arrayNode.isArray()) {
+                LOG.warn("Expected array for '{}', got: {}", configName, arrayNode.getNodeType());
+                return false;
+            }
+
+            List<Object> components = new ArrayList<>();
+
+            for (JsonNode item : arrayNode) {
+                String typeName;
+                JsonNode configNode;
+
+                if (item.isTextual()) {
+                    // String shorthand: "component-name"
+                    typeName = item.asText();
+                    configNode = MAPPER.createObjectNode();
+                } else if (item.isObject() && item.size() == 1) {
+                    // Object format: {"component-name": {...}}
+                    typeName = item.fieldNames().next();
+                    configNode = item.get(typeName);
+                } else {
+                    LOG.warn("Unexpected item format in '{}': {}", configName, item);
+                    continue;
+                }
+
+                try {
+                    Object component = ComponentInstantiator.instantiate(
+                            typeName, configNode, MAPPER, classLoader);
+                    // The composite is built through an unchecked list cast, so reject
+                    // anything that is not of the expected interface (this also rejects
+                    // null) here rather than failing later with a ClassCastException.
+                    if (!configInfo.componentInterface().isInstance(component)) {
+                        LOG.warn("Component '{}' in '{}' is not a {}; skipping",
+                                typeName, configName, configInfo.componentInterface().getName());
+                        continue;
+                    }
+                    components.add(component);
+                    LOG.debug("Instantiated '{}' for '{}'", typeName, configName);
+                } catch (TikaConfigException e) {
+                    // Trailing throwable keeps the cause and stack trace in the log
+                    LOG.warn("Failed to instantiate '{}' for '{}': {}",
+                            typeName, configName, e.getMessage(), e);
+                }
+            }
+
+            // Create the composite and add to ParseContext
+            if (!components.isEmpty()) {
+                Object composite = createComposite(configName, components, configInfo);
+                if (composite != null) {
+                    // Raw cast: the context key type is only known at runtime
+                    context.set((Class) configInfo.contextKey(), composite);
+                    LOG.debug("Resolved '{}' -> {} with {} components",
+                            configName, composite.getClass().getSimpleName(), components.size());
+                    return true;
+                }
+            }
+        } catch (IOException e) {
+            LOG.warn("Failed to parse array config '{}': {}", configName, e.getMessage(), e);
+        }
+
+        return false;
+    }
+
+    /**
+     * Wraps individually resolved components in the composite type that matches the
+     * array config's component interface.
+     *
+     * @param configName the config name (for error messages)
+     * @param components the list of components
+     * @param configInfo the array config info
+     * @return the composite component, or null if no composite type is known for
+     *         the config's component interface
+     */
+    @SuppressWarnings("unchecked")
+    private static Object createComposite(String configName, List<Object> components,
+                                          ArrayConfigInfo configInfo) {
+        // Only MetadataFilter composites are supported so far; add more types as needed
+        if (!MetadataFilter.class.equals(configInfo.componentInterface())) {
+            LOG.warn("No composite factory for '{}'", configName);
+            return null;
+        }
+
+        // Unchecked view of the same list instance; no copy is made
+        List<MetadataFilter> metadataFilters = (List<MetadataFilter>) (List<?>) components;
+        return new CompositeMetadataFilter(metadataFilters);
+    }
}
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index 0c9c457b5..3b06f4079 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -35,6 +35,9 @@ import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.extractor.SkipEmbeddedDocumentSelector;
+import org.apache.tika.metadata.filter.AttachmentCountingListFilter;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.ParseContext;
/**
@@ -296,6 +299,29 @@ public class TestParseContextSerialization {
assertEquals(0, root.size(), "Objects without friendly names should
not be serialized");
}
+    @Test
+    public void testMetadataList() throws Exception {
+        // A "metadata-filters" array in the ConfigContainer is written to JSON and read
+        // back; the deserialized context must expose a resolved CompositeMetadataFilter.
+        ConfigContainer filterConfig = new ConfigContainer();
+        filterConfig.set("metadata-filters", """
+                [
+                  "attachment-counting-list-filter",
+                  "mock-upper-case-filter"
+                ]
+                """);
+        ParseContext original = new ParseContext();
+        original.set(ConfigContainer.class, filterConfig);
+
+        ObjectMapper mapper = createMapper();
+        String serialized = mapper.writeValueAsString(original);
+        ParseContext roundTripped = mapper.readValue(serialized, ParseContext.class);
+
+        MetadataFilter resolved = roundTripped.get(MetadataFilter.class);
+        assertNotNull(resolved, "MetadataFilter should be resolved");
+        assertEquals(CompositeMetadataFilter.class, resolved.getClass());
+
+        // The first configured filter must come back first
+        CompositeMetadataFilter composite = (CompositeMetadataFilter) resolved;
+        assertEquals(AttachmentCountingListFilter.class,
+                composite.getFilters().get(0).getClass());
+    }
+
@Test
public void testContextKeyDeserialization() throws Exception {
// Test that components with @TikaComponent(contextKey=...) are stored