This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new c0266e145b serialization tweaks (#2603)
c0266e145b is described below
commit c0266e145b09e1ce6db5a58edca0c300515bb474
Author: Tim Allison <[email protected]>
AuthorDate: Wed Feb 11 12:56:46 2026 -0500
serialization tweaks (#2603)
---
.../tika/parser/AutoDetectParserConfigTest.java | 13 ++++++----
.../resources/configs/tika-config-no-names.json | 6 ++---
...a-config-upcasing-custom-handler-decorator.json | 2 +-
.../resources/configs/tika-config-with-names.json | 6 ++---
.../org/apache/tika/pipes/core/PassbackFilter.java | 18 ++++++++++++--
.../resources/configs/tika-config-truncate.json | 4 +++-
.../serdes/ParseContextDeserializer.java | 9 +++++++
.../TestParseContextSerialization.java | 28 ++++++++++++++++++++++
8 files changed, 71 insertions(+), 15 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 1e21fbef25..f8d86905ce 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -36,13 +36,16 @@ public class AutoDetectParserConfigTest extends TikaTest {
@Test
public void testConfiguringEmbeddedDocExtractor() throws Exception {
-
- Parser p =
TikaLoaderHelper.getLoader("tika-config-no-names.json").loadAutoDetectParser();
- String xml = getXML("testEmbedded.zip", p).xml;
+ TikaLoader noNamesLoader =
TikaLoaderHelper.getLoader("tika-config-no-names.json");
+ Parser p = noNamesLoader.loadAutoDetectParser();
+ ParseContext noNamesContext = noNamesLoader.loadParseContext();
+ String xml = getXML("testPPT_EmbeddedPDF.pptx", p, new Metadata(),
noNamesContext).xml;
assertNotContained("<h1>image3.jpg</h1>", xml);
- p =
TikaLoaderHelper.getLoader("tika-config-with-names.json").loadAutoDetectParser();
- xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
+ TikaLoader withNamesLoader =
TikaLoaderHelper.getLoader("tika-config-with-names.json");
+ p = withNamesLoader.loadAutoDetectParser();
+ ParseContext withNamesContext = withNamesLoader.loadParseContext();
+ xml = getXML("testPPT_EmbeddedPDF.pptx", p, new Metadata(),
withNamesContext).xml;
assertContains("<h1>image3.jpg</h1>", xml);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
index feaa6f4494..58af2892bb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
@@ -1,7 +1,7 @@
{
"parse-context": {
- "standard-extractor-factory": {
- "writeFileNameToContent": false
- }
+ "sax-output-config": {
+ "writeFileNameToContent": false
}
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
index 66f81f80a7..99d0ed132c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
@@ -5,7 +5,7 @@
},
"parse-context": {
"commons-digester-factory": {},
- "standard-extractor-factory": {
+ "sax-output-config": {
"writeFileNameToContent": true
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
index 721ee36e35..c935a6d1ce 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
@@ -1,7 +1,7 @@
{
"parse-context": {
- "standard-extractor-factory": {
- "writeFileNameToContent": true
- }
+ "sax-output-config": {
+ "writeFileNameToContent": true
}
+ }
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
index 5fa033929e..8d0a7968ff 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/PassbackFilter.java
@@ -17,12 +17,26 @@
package org.apache.tika.pipes.core;
import java.io.Serializable;
+import java.util.List;
-import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
/**
* Filter/Select some of the emitted output and pass it back to the client
parser.
+ * <p>
+ * This is intentionally NOT a MetadataFilter. MetadataFilter is applied
before emission
+ * to transform metadata (e.g., remove fields, compute digests).
PassbackFilter is applied
+ * after emission to select metadata to pass back from the forked PipesServer
to the parent.
+ * They share a method signature but serve entirely different purposes.
*/
-public abstract class PassbackFilter extends MetadataFilter implements
Serializable {
+public abstract class PassbackFilter implements Serializable {
+ /**
+ * Filters the metadata list in place, selecting which data to pass back
to the client.
+ *
+ * @param metadataList the list to filter (must be mutable)
+ * @throws TikaException if filtering fails
+ */
+ public abstract void filter(List<Metadata> metadataList) throws
TikaException;
}
diff --git
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
index 88b4cc1978..b6303b0cfc 100644
---
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
+++
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
@@ -48,8 +48,10 @@
},
"parse-context": {
"mock-digester-factory": {},
+ "sax-output-config": {
+ "writeFileNameToContent": false
+ },
"runpack-extractor-factory": {
- "writeFileNameToContent": false,
"maxEmbeddedBytesForExtraction": 10
}
},
diff --git
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
index bacbb40741..c8141c47d9 100644
---
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
+++
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
@@ -163,6 +163,15 @@ public class ParseContextDeserializer extends
JsonDeserializer<ParseContext> {
}
ComponentInfo info = infoOpt.get();
+
+ // Self-configuring components (e.g., parsers) stay as JSON configs
and are
+ // accessed by string key at runtime via
ParseContextConfig.getConfig().
+ // They never get resolved to typed objects in the context map, so
multiple
+ // self-configuring components with the same context key are not
duplicates.
+ if (info.selfConfiguring()) {
+ return;
+ }
+
Class<?> contextKey = determineContextKey(info);
String existingName = seenContextKeys.get(contextKey);
diff --git
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index db9b75ce9c..960f1a2e7a 100644
---
a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++
b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -464,6 +464,34 @@ public class TestParseContextSerialization {
"Exception should mention the conflicting key: " +
ex.getMessage());
}
+ /**
+ * Test that multiple self-configuring components (e.g., parsers) with the
same
+ * context key are allowed. Self-configuring components stay as JSON
configs and
+ * are accessed by string key at runtime, so they never conflict in the
context map.
+ */
+ @Test
+ public void testSelfConfiguringComponentsAllowDuplicateContextKeys()
throws Exception {
+ // Both parsers resolve to Parser.class as context key, but Parser
extends
+ // SelfConfiguring, so they should be allowed to coexist.
+ String json = """
+ {
+ "configurable-test-parser": {
+ "maxItems": 5
+ },
+ "minimal-test-parser": {}
+ }
+ """;
+
+ ObjectMapper mapper = createMapper();
+ // Should NOT throw - self-configuring components skip duplicate
detection
+ ParseContext deserialized = mapper.readValue(json, ParseContext.class);
+
+ assertTrue(deserialized.hasJsonConfig("configurable-test-parser"),
+ "configurable-test-parser should be stored as JSON config");
+ assertTrue(deserialized.hasJsonConfig("minimal-test-parser"),
+ "minimal-test-parser should be stored as JSON config");
+ }
+
/**
* Test that a single component per context key is allowed (no false
positives).
*/