This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4545-jsonify-all-the-things in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1ccaa1b3f351228c25aa8a014a214cd84c89014f Author: tallison <[email protected]> AuthorDate: Sun Nov 30 10:09:55 2025 -0500 TIKA-4545 -- simplify parse context serialization --- .../org/apache/tika/parser/pdf/AccessChecker.java | 16 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +- .../apache/tika/parser/pdf/PDFParserConfig.java | 40 ++++- .../pdf/image/ImageGraphicsEngineFactory.java | 20 +++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 37 +++++ .../core/serialization/JsonFetchEmitTuple.java | 2 + .../core/serialization/JsonFetchEmitTupleList.java | 2 + .../serialization/ParseContextDeserializer.java | 22 ++- .../tika/serialization/ParseContextSerializer.java | 104 +++++++++++-- .../CustomClassSerializationTest.java | 163 +++++++++++++++++++++ 10 files changed, 374 insertions(+), 34 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java index 2527f7388..2d4244032 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java @@ -31,8 +31,8 @@ public class AccessChecker implements Serializable { private static final long serialVersionUID = 6492570218190936986L; - private final boolean needToCheck; - private final boolean allowExtractionForAccessibility; + private boolean needToCheck; + private boolean allowExtractionForAccessibility; /** * This constructs an {@link AccessChecker} that @@ -58,10 +58,22 @@ public class AccessChecker implements Serializable { this.allowExtractionForAccessibility = allowExtractionForAccessibility; } + public boolean isNeedToCheck() { + return needToCheck; + } + + public void setNeedToCheck(boolean needToCheck) { + this.needToCheck = needToCheck; + } + public boolean isAllowExtractionForAccessibility() { return allowExtractionForAccessibility; } + public void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) { + this.allowExtractionForAccessibility = allowExtractionForAccessibility; + } + /** * Checks to see if a document's content should be extracted based * on metadata values and the value of {@link #allowExtractionForAccessibility} in the constructor. diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 6c29c364a..2d8445d21 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -817,7 +817,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable { @Field public void setOcrStrategyAuto(String ocrStrategyAuto) { - defaultConfig.setOcrStrategyAuto(ocrStrategyAuto); + defaultConfig.setOcrStrategyAutoFromString(ocrStrategyAuto); } public String getOcrStrategyAuto() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 2df300e7d..e0ac00b33 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -564,7 +564,25 @@ public class PDFParserConfig implements Serializable { } - public void setOcrStrategyAuto(String ocrStrategyAuto) { + /** + * Sets the OCR strategy auto configuration from an object. + * Used by Jackson deserialization. + * + * @param ocrStrategyAuto the OCR strategy auto configuration + */ + public void setOcrStrategyAuto(OCRStrategyAuto ocrStrategyAuto) { + this.ocrStrategyAuto = ocrStrategyAuto; + userConfigured.add("ocrStrategyAuto"); + } + + /** + * Sets the OCR strategy auto configuration from a string. + * Used for configuration parsing from XML/text via PDFParser's @Field annotation. + * Package-private to prevent Jackson from discovering it during bean introspection. + * + * @param ocrStrategyAuto string representation of OCR strategy + */ + void setOcrStrategyAutoFromString(String ocrStrategyAuto) { final String regex = "^\\s*(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?\\s*$"; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(ocrStrategyAuto); @@ -960,8 +978,16 @@ public class PDFParserConfig implements Serializable { * percentage of unmappedCharactersPerPage/totalCharsPerPage */ public static class OCRStrategyAuto implements Serializable { - private final float unmappedUnicodeCharsPerPage; - private final int totalCharsPerPage; + private float unmappedUnicodeCharsPerPage; + private int totalCharsPerPage; + + /** + * No-arg constructor for Jackson deserialization. + * Uses default "better" strategy values. + */ + public OCRStrategyAuto() { + this(10, 10); + } public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) { this.totalCharsPerPage = totalCharsPerPage; @@ -972,10 +998,18 @@ public class PDFParserConfig implements Serializable { return unmappedUnicodeCharsPerPage; } + public void setUnmappedUnicodeCharsPerPage(float unmappedUnicodeCharsPerPage) { + this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage; + } + public int getTotalCharsPerPage() { return totalCharsPerPage; } + public void setTotalCharsPerPage(int totalCharsPerPage) { + this.totalCharsPerPage = totalCharsPerPage; + } + @Override public String toString() { //TODO -- figure out if this is actual BEST or whatever diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java index db30f3b4b..f0cdd0811 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java @@ -41,4 +41,24 @@ public class ImageGraphicsEngineFactory implements Serializable { return new ImageGraphicsEngine(page, pageNumber, embeddedDocumentExtractor, pdfParserConfig, processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext); } + + /** + * Returns the factory type for serialization purposes. + * This allows Jackson to serialize the factory object without requiring additional dependencies. + * + * @return the fully qualified class name of this factory + */ + public String getFactoryType() { + return getClass().getName(); + } + + /** + * Setter for factory type to complete the JavaBean pattern for Jackson deserialization. + * This is a no-op since the factory type is derived from the class itself. + * + * @param factoryType the factory type (ignored) + */ + public void setFactoryType(String factoryType) { + // No-op: factory type is determined by the class, not set externally + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index d4d9df116..e6cf3d121 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -584,4 +584,41 @@ public class PDFParserTest extends TikaTest { //test that the additional actions on the 3d object are processed assertContains("this.notify3DAnnotPageOpen()", metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT)); } + + @Test + public void testPDFParserConfigSerialization() throws Exception { + // Test that PDFParserConfig can be serialized and deserialized through ParseContext + PDFParserConfig config = new PDFParserConfig(); + config.setSortByPosition(true); + config.setExtractInlineImages(true); + config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO); + + ParseContext parseContext = new ParseContext(); + parseContext.set(PDFParserConfig.class, config); + + // Serialize using ParseContextSerializer + com.fasterxml.jackson.databind.ObjectMapper mapper = org.apache.tika.serialization.ParseContextSerializer.createMapper(); + com.fasterxml.jackson.databind.module.SimpleModule module = new com.fasterxml.jackson.databind.module.SimpleModule(); + module.addSerializer(ParseContext.class, new org.apache.tika.serialization.ParseContextSerializer()); + module.addDeserializer(ParseContext.class, new org.apache.tika.serialization.ParseContextDeserializer()); + mapper.registerModule(module); + + String json = mapper.writeValueAsString(parseContext); + System.out.println("Serialized PDFParserConfig in ParseContext:"); + System.out.println(json); + + // Deserialize + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + + // Verify PDFParserConfig was preserved - get it directly from ParseContext + PDFParserConfig deserializedConfig = deserialized.get(PDFParserConfig.class); + + assertNotNull(deserializedConfig, "PDFParserConfig should not be null after deserialization"); + assertTrue(deserializedConfig.isSortByPosition(), + "sortByPosition should be preserved"); + assertTrue(deserializedConfig.isExtractInlineImages(), + "extractInlineImages should be preserved"); + assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, deserializedConfig.getOcrStrategy(), + "ocrStrategy should be preserved"); + } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java index b35ed7b05..72ae9c225 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java @@ -28,6 +28,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.serialization.MetadataSerializer; +import org.apache.tika.serialization.ParseContextDeserializer; import org.apache.tika.serialization.ParseContextSerializer; public class JsonFetchEmitTuple { @@ -40,6 +41,7 @@ public class JsonFetchEmitTuple { module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer()); module.addSerializer(Metadata.class, new MetadataSerializer()); module.addSerializer(ParseContext.class, new ParseContextSerializer()); + module.addDeserializer(ParseContext.class, new ParseContextDeserializer()); OBJECT_MAPPER.registerModule(module); } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java index 86f4a3560..9c6c6369e 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java @@ -30,6 +30,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.serialization.MetadataSerializer; +import org.apache.tika.serialization.ParseContextDeserializer; import org.apache.tika.serialization.ParseContextSerializer; public class JsonFetchEmitTupleList { @@ -42,6 +43,7 @@ public class JsonFetchEmitTupleList { module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer()); module.addSerializer(Metadata.class, new MetadataSerializer()); module.addSerializer(ParseContext.class, new ParseContextSerializer()); + module.addDeserializer(ParseContext.class, new ParseContextDeserializer()); OBJECT_MAPPER.registerModule(module); } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java index ff1742ec3..4cfb95ff7 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java @@ -27,7 +27,6 @@ import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.ConfigContainer; import org.apache.tika.parser.ParseContext; @@ -38,21 +37,20 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { public ParseContext deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException { - ObjectMapper mapper = (ObjectMapper) jsonParser.getCodec(); - JsonNode root = mapper.readTree(jsonParser); - return readParseContext(root, mapper); + JsonNode root = jsonParser.readValueAsTree(); + return readParseContext(root); } /** - * Backwards-compatible version that creates its own ObjectMapper. - * Prefer {@link #readParseContext(JsonNode, ObjectMapper)} when possible. + * Deserializes a ParseContext from a JsonNode. + * Uses a properly configured ObjectMapper with polymorphic type handling + * to ensure objects in the ParseContext are deserialized correctly. + * + * @param jsonNode the JSON node containing the ParseContext data + * @return the deserialized ParseContext + * @throws IOException if deserialization fails */ public static ParseContext readParseContext(JsonNode jsonNode) throws IOException { - return readParseContext(jsonNode, ParseContextSerializer.createMapper()); - } - - public static ParseContext readParseContext(JsonNode jsonNode, ObjectMapper mapper) - throws IOException { // Some use cases include the wrapper node, e.g. { "parseContext": {}} // Some include the contents only. // Try to find "parseContext" to start. If that doesn't exist, assume jsonNode is the contents. @@ -75,7 +73,7 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { // Let Jackson handle polymorphic deserialization with type info // Security is enforced by the PolymorphicTypeValidator in the mapper - Object deserializedObject = mapper.treeToValue(objectNode, Object.class); + Object deserializedObject = ParseContextSerializer.POLYMORPHIC_MAPPER.treeToValue(objectNode, Object.class); parseContext.set((Class) superClass, deserializedObject); } catch (ClassNotFoundException ex) { diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java index 096710a45..89de70757 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java @@ -16,7 +16,15 @@ */ package org.apache.tika.serialization; +import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.List; import java.util.Map; import com.fasterxml.jackson.core.JsonGenerator; @@ -25,31 +33,100 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializerProvider; import com.fasterxml.jackson.databind.jsontype.BasicPolymorphicTypeValidator; import com.fasterxml.jackson.databind.jsontype.PolymorphicTypeValidator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.config.ConfigContainer; import org.apache.tika.parser.ParseContext; public class ParseContextSerializer extends JsonSerializer<ParseContext> { + private static final Logger LOG = LoggerFactory.getLogger(ParseContextSerializer.class); + public static final String PARSE_CONTEXT = "parseContext"; + /** + * Classpath resource file where users can specify additional package prefixes + * to allow for polymorphic deserialization. One package prefix per line. + * Comments (lines starting with #) and blank lines are ignored. + * + * Example content: + * <pre> + * # Allow com.acme classes + * com.acme + * # Allow com.example classes + * com.example + * </pre> + */ + public static final String ALLOWED_PACKAGES_RESOURCE = "META-INF/tika-serialization-allowlist.txt"; + + /** + * Static ObjectMapper configured for polymorphic serialization/deserialization. + * Initialized once when the class is loaded to avoid creating a new mapper on each call. + * Package-private to allow ParseContextDeserializer to use the same mapper. + */ + static final ObjectMapper POLYMORPHIC_MAPPER = createMapper(); + + /** + * Loads additional package prefixes from classpath resources. + * Scans all {@link #ALLOWED_PACKAGES_RESOURCE} files on the classpath. + * + * @return list of additional package prefixes to allow + */ + static List<String> loadAllowedPackages() { + List<String> packages = new ArrayList<>(); + try { + Enumeration<URL> resources = ParseContextSerializer.class.getClassLoader() + .getResources(ALLOWED_PACKAGES_RESOURCE); + + while (resources.hasMoreElements()) { + URL resource = resources.nextElement(); + LOG.debug("Loading allowed packages from: {}", resource); + + try (InputStream is = resource.openStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + + String line; + while ((line = reader.readLine()) != null) { + line = line.trim(); + // Skip comments and empty lines + if (line.isEmpty() || line.startsWith("#")) { + continue; + } + packages.add(line); + LOG.info("Allowing polymorphic deserialization for package: {}", line); + } + } catch (IOException e) { + LOG.warn("Failed to read allowed packages from: {}", resource, e); + } + } + } catch (IOException e) { + LOG.warn("Failed to load allowed packages resources", e); + } + return packages; + } + /** * Creates an ObjectMapper for serialization with polymorphic type handling. - * Configures security validation to allow only Tika classes. + * Configures security validation to allow Tika classes and any additional + * packages specified via {@link #ALLOWED_PACKAGES_RESOURCE} files on the classpath. * Uses OBJECT_AND_NON_CONCRETE typing to add type info for Object and abstract types, * which avoids interfering with custom serializers for concrete types like ParseContext. */ - static ObjectMapper createMapper() { + public static ObjectMapper createMapper() { ObjectMapper mapper = new ObjectMapper(); - // Configure polymorphic type validator for security - // Use allowIfSubType to allow: - // - org.apache.tika.* classes (all Tika types) - // - java.util.* classes (collections, dates, etc.) - // This is needed because we deserialize with base type Object.class - PolymorphicTypeValidator typeValidator = BasicPolymorphicTypeValidator.builder() + // Start with Tika and Java standard packages + BasicPolymorphicTypeValidator.Builder builder = BasicPolymorphicTypeValidator.builder() .allowIfSubType("org.apache.tika.") - .allowIfSubType("java.util.") - .build(); + .allowIfSubType("java.util."); + + // Add user-specified packages from classpath + List<String> additionalPackages = loadAllowedPackages(); + for (String packagePrefix : additionalPackages) { + builder.allowIfSubType(packagePrefix); + } + + PolymorphicTypeValidator typeValidator = builder.build(); // Use OBJECT_AND_NON_CONCRETE to add type info when static type is: // - Object.class (for objects in the "objects" map) @@ -75,11 +152,6 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { jsonGenerator.writeFieldName("objects"); jsonGenerator.writeStartObject(); - ObjectMapper mapper = (ObjectMapper) jsonGenerator.getCodec(); - if (mapper == null) { - mapper = createMapper(); - } - for (Map.Entry<String, Object> entry : contextMap.entrySet()) { String className = entry.getKey(); if (className.equals(ConfigContainer.class.getName())) { @@ -93,7 +165,7 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { // Let Jackson handle type information and serialization // Use writerFor(Object.class) to ensure polymorphic type info is added - mapper.writerFor(Object.class).writeValue(jsonGenerator, value); + POLYMORPHIC_MAPPER.writerFor(Object.class).writeValue(jsonGenerator, value); } jsonGenerator.writeEndObject(); diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java new file mode 100644 index 000000000..f518ff305 --- /dev/null +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.serialization; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.StringWriter; +import java.io.Writer; +import java.util.Locale; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; +import org.junit.jupiter.api.Test; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.parser.ParseContext; + +/** + * Tests that users can serialize their own custom classes in ParseContext + * by adding a META-INF/tika-serialization-allowlist.txt file to their JAR. + * + * <p>Custom classes MUST implement Serializable because the Pipes parser + * uses Java serialization to pass ParseContext between processes.</p> + * + * <p>To enable JSON serialization of custom classes:</p> + * <ol> + * <li>Implement Serializable</li> + * <li>Provide a no-arg constructor</li> + * <li>Follow JavaBean conventions (getters/setters)</li> + * <li>Add your package prefix to META-INF/tika-serialization-allowlist.txt</li> + * </ol> + */ +public class CustomClassSerializationTest { + + /** + * Example custom metadata filter that uppercases all values. + * This simulates a user's custom class (e.g., in package com.acme). + * + * <p>Note: Extends Serializable MetadataFilter - this is REQUIRED for use with Pipes parser.</p> + */ + public static class MyUpperCasingMetadataFilter extends MetadataFilter { + private String prefix = ""; + + public MyUpperCasingMetadataFilter() { + } + + public MyUpperCasingMetadataFilter(String prefix) { + this.prefix = prefix; + } + + public String getPrefix() { + return prefix; + } + + public void setPrefix(String prefix) { + this.prefix = prefix; + } + + @Override + public java.util.List<Metadata> filter(java.util.List<Metadata> metadataList) { + for (Metadata metadata : metadataList) { + for (String name : metadata.names()) { + String[] values = metadata.getValues(name); + metadata.remove(name); + for (String value : values) { + metadata.add(name, prefix + value.toUpperCase(Locale.ROOT)); + } + } + } + return metadataList; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof MyUpperCasingMetadataFilter)) return false; + MyUpperCasingMetadataFilter that = (MyUpperCasingMetadataFilter) o; + return prefix.equals(that.prefix); + } + + @Override + public int hashCode() { + return prefix.hashCode(); + } + } + + private ObjectMapper createMapper() { + ObjectMapper mapper = ParseContextSerializer.createMapper(); + SimpleModule module = new SimpleModule(); + module.addDeserializer(ParseContext.class, new ParseContextDeserializer()); + module.addSerializer(ParseContext.class, new ParseContextSerializer()); + mapper.registerModule(module); + return mapper; + } + + @Test + public void testCustomMetadataFilterSerialization() throws Exception { + // Create a custom metadata filter + MyUpperCasingMetadataFilter customFilter = new MyUpperCasingMetadataFilter("TEST_"); + + // Put it in ParseContext - store as MetadataFilter (the abstract base type) + ParseContext pc = new ParseContext(); + pc.set(MetadataFilter.class, (MetadataFilter) customFilter); + + // Serialize + ObjectMapper mapper = createMapper(); + String json; + try (Writer writer = new StringWriter()) { + try (JsonGenerator jsonGenerator = mapper + .getFactory() + .createGenerator(writer)) { + ParseContextSerializer serializer = new ParseContextSerializer(); + serializer.serialize(pc, jsonGenerator, null); + } + json = writer.toString(); + } + + System.out.println("Serialized custom class:"); + System.out.println(json); + + // Verify JSON contains type information + assertTrue(json.contains("MyUpperCasingMetadataFilter"), + "JSON should contain the custom class name"); + + // Deserialize + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + MetadataFilter deserializedFilter = deserialized.get(MetadataFilter.class); + + // Verify polymorphic deserialization worked - we get back the concrete type + assertNotNull(deserializedFilter, "MetadataFilter should not be null"); + assertTrue(deserializedFilter instanceof MyUpperCasingMetadataFilter, + "Filter should be MyUpperCasingMetadataFilter (polymorphic deserialization)"); + + MyUpperCasingMetadataFilter typedFilter = (MyUpperCasingMetadataFilter) deserializedFilter; + assertEquals("TEST_", typedFilter.getPrefix(), "Prefix should be preserved"); + + // Verify it works + Metadata metadata = new Metadata(); + metadata.add("test", "value"); + java.util.List<Metadata> metadataList = new java.util.ArrayList<>(); + metadataList.add(metadata); + typedFilter.filter(metadataList); + assertEquals("TEST_VALUE", metadata.get("test"), "Filter should uppercase with prefix"); + } +}
