This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4545-jsonify-all-the-things
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1ccaa1b3f351228c25aa8a014a214cd84c89014f
Author: tallison <[email protected]>
AuthorDate: Sun Nov 30 10:09:55 2025 -0500

    TIKA-4545 -- simplify parse context serialization
---
 .../org/apache/tika/parser/pdf/AccessChecker.java  |  16 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java |   2 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java    |  40 ++++-
 .../pdf/image/ImageGraphicsEngineFactory.java      |  20 +++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  37 +++++
 .../core/serialization/JsonFetchEmitTuple.java     |   2 +
 .../core/serialization/JsonFetchEmitTupleList.java |   2 +
 .../serialization/ParseContextDeserializer.java    |  22 ++-
 .../tika/serialization/ParseContextSerializer.java | 104 +++++++++++--
 .../CustomClassSerializationTest.java              | 163 +++++++++++++++++++++
 10 files changed, 374 insertions(+), 34 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
index 2527f7388..2d4244032 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
@@ -31,8 +31,8 @@ public class AccessChecker implements Serializable {
 
     private static final long serialVersionUID = 6492570218190936986L;
 
-    private final boolean needToCheck;
-    private final boolean allowExtractionForAccessibility;
+    private boolean needToCheck;
+    private boolean allowExtractionForAccessibility;
 
     /**
      * This constructs an {@link AccessChecker} that
@@ -58,10 +58,22 @@ public class AccessChecker implements Serializable {
         this.allowExtractionForAccessibility = allowExtractionForAccessibility;
     }
 
+    public boolean isNeedToCheck() {
+        return needToCheck;
+    }
+
+    public void setNeedToCheck(boolean needToCheck) {
+        this.needToCheck = needToCheck;
+    }
+
     public boolean isAllowExtractionForAccessibility() {
         return allowExtractionForAccessibility;
     }
 
+    public void setAllowExtractionForAccessibility(boolean allowExtractionForAccessibility) {
+        this.allowExtractionForAccessibility = allowExtractionForAccessibility;
+    }
+
     /**
      * Checks to see if a document's content should be extracted based
      * on metadata values and the value of {@link #allowExtractionForAccessibility} in the constructor.
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 6c29c364a..2d8445d21 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -817,7 +817,7 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
 
     @Field
     public void setOcrStrategyAuto(String ocrStrategyAuto) {
-        defaultConfig.setOcrStrategyAuto(ocrStrategyAuto);
+        defaultConfig.setOcrStrategyAutoFromString(ocrStrategyAuto);
     }
 
     public String getOcrStrategyAuto() {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 2df300e7d..e0ac00b33 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -564,7 +564,25 @@ public class PDFParserConfig implements Serializable {
     }
 
 
-    public void setOcrStrategyAuto(String ocrStrategyAuto) {
+    /**
+     * Sets the OCR strategy auto configuration from an object.
+     * Used by Jackson deserialization.
+     *
+     * @param ocrStrategyAuto the OCR strategy auto configuration
+     */
+    public void setOcrStrategyAuto(OCRStrategyAuto ocrStrategyAuto) {
+        this.ocrStrategyAuto = ocrStrategyAuto;
+        userConfigured.add("ocrStrategyAuto");
+    }
+
+    /**
+     * Sets the OCR strategy auto configuration from a string.
+     * Used for configuration parsing from XML/text via PDFParser's @Field annotation.
+     * Package-private to prevent Jackson from discovering it during bean introspection.
+     *
+     * @param ocrStrategyAuto string representation of OCR strategy
+     */
+    void setOcrStrategyAutoFromString(String ocrStrategyAuto) {
         final String regex = "^\\s*(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?\\s*$";
         Pattern pattern = Pattern.compile(regex);
         Matcher matcher = pattern.matcher(ocrStrategyAuto);
@@ -960,8 +978,16 @@ public class PDFParserConfig implements Serializable {
      * percentage of unmappedCharactersPerPage/totalCharsPerPage
      */
     public static class OCRStrategyAuto implements Serializable {
-        private final float unmappedUnicodeCharsPerPage;
-        private final int totalCharsPerPage;
+        private float unmappedUnicodeCharsPerPage;
+        private int totalCharsPerPage;
+
+        /**
+         * No-arg constructor for Jackson deserialization.
+         * Uses default "better" strategy values.
+         */
+        public OCRStrategyAuto() {
+            this(10, 10);
+        }
 
         public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) {
             this.totalCharsPerPage = totalCharsPerPage;
@@ -972,10 +998,18 @@ public class PDFParserConfig implements Serializable {
             return unmappedUnicodeCharsPerPage;
         }
 
+        public void setUnmappedUnicodeCharsPerPage(float unmappedUnicodeCharsPerPage) {
+            this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
+        }
+
         public int getTotalCharsPerPage() {
             return totalCharsPerPage;
         }
 
+        public void setTotalCharsPerPage(int totalCharsPerPage) {
+            this.totalCharsPerPage = totalCharsPerPage;
+        }
+
         @Override
         public String toString() {
             //TODO -- figure out if this is actual BEST or whatever
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
index db30f3b4b..f0cdd0811 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
@@ -41,4 +41,24 @@ public class ImageGraphicsEngineFactory implements 
Serializable {
         return new ImageGraphicsEngine(page, pageNumber, 
embeddedDocumentExtractor, pdfParserConfig,
                 processedInlineImages, imageCounter, xhtml, parentMetadata, 
parseContext);
     }
+
+    /**
+     * Returns the factory type for serialization purposes.
+     * This allows Jackson to serialize the factory object without requiring 
additional dependencies.
+     *
+     * @return the fully qualified class name of this factory
+     */
+    public String getFactoryType() {
+        return getClass().getName();
+    }
+
+    /**
+     * Setter for factory type to complete the JavaBean pattern for Jackson 
deserialization.
+     * This is a no-op since the factory type is derived from the class itself.
+     *
+     * @param factoryType the factory type (ignored)
+     */
+    public void setFactoryType(String factoryType) {
+        // No-op: factory type is determined by the class, not set externally
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index d4d9df116..e6cf3d121 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -584,4 +584,41 @@ public class PDFParserTest extends TikaTest {
         //test that the additional actions on the 3d object are processed
         assertContains("this.notify3DAnnotPageOpen()", 
metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT));
     }
+
+    @Test
+    public void testPDFParserConfigSerialization() throws Exception {
+        // Test that PDFParserConfig can be serialized and deserialized 
through ParseContext
+        PDFParserConfig config = new PDFParserConfig();
+        config.setSortByPosition(true);
+        config.setExtractInlineImages(true);
+        config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, config);
+
+        // Serialize using ParseContextSerializer
+        com.fasterxml.jackson.databind.ObjectMapper mapper = 
org.apache.tika.serialization.ParseContextSerializer.createMapper();
+        com.fasterxml.jackson.databind.module.SimpleModule module = new 
com.fasterxml.jackson.databind.module.SimpleModule();
+        module.addSerializer(ParseContext.class, new 
org.apache.tika.serialization.ParseContextSerializer());
+        module.addDeserializer(ParseContext.class, new 
org.apache.tika.serialization.ParseContextDeserializer());
+        mapper.registerModule(module);
+
+        String json = mapper.writeValueAsString(parseContext);
+        System.out.println("Serialized PDFParserConfig in ParseContext:");
+        System.out.println(json);
+
+        // Deserialize
+        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
+
+        // Verify PDFParserConfig was preserved - get it directly from 
ParseContext
+        PDFParserConfig deserializedConfig = 
deserialized.get(PDFParserConfig.class);
+
+        assertNotNull(deserializedConfig, "PDFParserConfig should not be null 
after deserialization");
+        assertTrue(deserializedConfig.isSortByPosition(),
+                "sortByPosition should be preserved");
+        assertTrue(deserializedConfig.isExtractInlineImages(),
+                "extractInlineImages should be preserved");
+        assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, 
deserializedConfig.getOcrStrategy(),
+                "ocrStrategy should be preserved");
+    }
 }
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
index b35ed7b05..72ae9c225 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
@@ -28,6 +28,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.pipes.api.FetchEmitTuple;
 import org.apache.tika.serialization.MetadataSerializer;
+import org.apache.tika.serialization.ParseContextDeserializer;
 import org.apache.tika.serialization.ParseContextSerializer;
 
 public class JsonFetchEmitTuple {
@@ -40,6 +41,7 @@ public class JsonFetchEmitTuple {
         module.addSerializer(FetchEmitTuple.class, new 
FetchEmitTupleSerializer());
         module.addSerializer(Metadata.class, new MetadataSerializer());
         module.addSerializer(ParseContext.class, new ParseContextSerializer());
+        module.addDeserializer(ParseContext.class, new ParseContextDeserializer());
         OBJECT_MAPPER.registerModule(module);
     }
 
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
index 86f4a3560..9c6c6369e 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
@@ -30,6 +30,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.pipes.api.FetchEmitTuple;
 import org.apache.tika.serialization.MetadataSerializer;
+import org.apache.tika.serialization.ParseContextDeserializer;
 import org.apache.tika.serialization.ParseContextSerializer;
 
 public class JsonFetchEmitTupleList {
@@ -42,6 +43,7 @@ public class JsonFetchEmitTupleList {
         module.addSerializer(FetchEmitTuple.class, new 
FetchEmitTupleSerializer());
         module.addSerializer(Metadata.class, new MetadataSerializer());
         module.addSerializer(ParseContext.class, new ParseContextSerializer());
+        module.addDeserializer(ParseContext.class, new ParseContextDeserializer());
         OBJECT_MAPPER.registerModule(module);
     }
 
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
index ff1742ec3..4cfb95ff7 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java
@@ -27,7 +27,6 @@ import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.databind.DeserializationContext;
 import com.fasterxml.jackson.databind.JsonDeserializer;
 import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
 
 import org.apache.tika.config.ConfigContainer;
 import org.apache.tika.parser.ParseContext;
@@ -38,21 +37,20 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
     public ParseContext deserialize(JsonParser jsonParser,
                                     DeserializationContext 
deserializationContext)
             throws IOException, JacksonException {
-        ObjectMapper mapper = (ObjectMapper) jsonParser.getCodec();
-        JsonNode root = mapper.readTree(jsonParser);
-        return readParseContext(root, mapper);
+        JsonNode root = jsonParser.readValueAsTree();
+        return readParseContext(root);
     }
 
     /**
-     * Backwards-compatible version that creates its own ObjectMapper.
-     * Prefer {@link #readParseContext(JsonNode, ObjectMapper)} when possible.
+     * Deserializes a ParseContext from a JsonNode.
+     * Uses a properly configured ObjectMapper with polymorphic type handling
+     * to ensure objects in the ParseContext are deserialized correctly.
+     *
+     * @param jsonNode the JSON node containing the ParseContext data
+     * @return the deserialized ParseContext
+     * @throws IOException if deserialization fails
      */
     public static ParseContext readParseContext(JsonNode jsonNode) throws 
IOException {
-        return readParseContext(jsonNode, 
ParseContextSerializer.createMapper());
-    }
-
-    public static ParseContext readParseContext(JsonNode jsonNode, 
ObjectMapper mapper)
-            throws IOException {
         // Some use cases include the wrapper node, e.g. { "parseContext": {}}
         // Some include the contents only.
         // Try to find "parseContext" to start. If that doesn't exist, assume 
jsonNode is the contents.
@@ -75,7 +73,7 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
 
                     // Let Jackson handle polymorphic deserialization with 
type info
                     // Security is enforced by the PolymorphicTypeValidator in 
the mapper
-                    Object deserializedObject = mapper.treeToValue(objectNode, 
Object.class);
+                    Object deserializedObject = ParseContextSerializer.POLYMORPHIC_MAPPER.treeToValue(objectNode, Object.class);
 
                     parseContext.set((Class) superClass, deserializedObject);
                 } catch (ClassNotFoundException ex) {
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
index 096710a45..89de70757 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
@@ -16,7 +16,15 @@
  */
 package org.apache.tika.serialization;
 
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.List;
 import java.util.Map;
 
 import com.fasterxml.jackson.core.JsonGenerator;
@@ -25,31 +33,100 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializerProvider;
 import com.fasterxml.jackson.databind.jsontype.BasicPolymorphicTypeValidator;
 import com.fasterxml.jackson.databind.jsontype.PolymorphicTypeValidator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.ConfigContainer;
 import org.apache.tika.parser.ParseContext;
 
 public class ParseContextSerializer extends JsonSerializer<ParseContext> {
+    private static final Logger LOG = 
LoggerFactory.getLogger(ParseContextSerializer.class);
+
     public static final String PARSE_CONTEXT = "parseContext";
 
+    /**
+     * Classpath resource file where users can specify additional package 
prefixes
+     * to allow for polymorphic deserialization. One package prefix per line.
+     * Comments (lines starting with #) and blank lines are ignored.
+     *
+     * Example content:
+     * <pre>
+     * # Allow com.acme classes
+     * com.acme
+     * # Allow com.example classes
+     * com.example
+     * </pre>
+     */
+    public static final String ALLOWED_PACKAGES_RESOURCE = 
"META-INF/tika-serialization-allowlist.txt";
+
+    /**
+     * Static ObjectMapper configured for polymorphic 
serialization/deserialization.
+     * Initialized once when the class is loaded to avoid creating a new 
mapper on each call.
+     * Package-private to allow ParseContextDeserializer to use the same 
mapper.
+     */
+    static final ObjectMapper POLYMORPHIC_MAPPER = createMapper();
+
+    /**
+     * Loads additional package prefixes from classpath resources.
+     * Scans all {@link #ALLOWED_PACKAGES_RESOURCE} files on the classpath.
+     *
+     * @return list of additional package prefixes to allow
+     */
+    static List<String> loadAllowedPackages() {
+        List<String> packages = new ArrayList<>();
+        try {
+            Enumeration<URL> resources = 
ParseContextSerializer.class.getClassLoader()
+                    .getResources(ALLOWED_PACKAGES_RESOURCE);
+
+            while (resources.hasMoreElements()) {
+                URL resource = resources.nextElement();
+                LOG.debug("Loading allowed packages from: {}", resource);
+
+                try (InputStream is = resource.openStream();
+                        BufferedReader reader = new BufferedReader(new 
InputStreamReader(is, StandardCharsets.UTF_8))) {
+
+                    String line;
+                    while ((line = reader.readLine()) != null) {
+                        line = line.trim();
+                        // Skip comments and empty lines
+                        if (line.isEmpty() || line.startsWith("#")) {
+                            continue;
+                        }
+                        packages.add(line);
+                        LOG.info("Allowing polymorphic deserialization for 
package: {}", line);
+                    }
+                } catch (IOException e) {
+                    LOG.warn("Failed to read allowed packages from: {}", 
resource, e);
+                }
+            }
+        } catch (IOException e) {
+            LOG.warn("Failed to load allowed packages resources", e);
+        }
+        return packages;
+    }
+
     /**
      * Creates an ObjectMapper for serialization with polymorphic type 
handling.
-     * Configures security validation to allow only Tika classes.
+     * Configures security validation to allow Tika classes and any additional
+     * packages specified via {@link #ALLOWED_PACKAGES_RESOURCE} files on the 
classpath.
      * Uses OBJECT_AND_NON_CONCRETE typing to add type info for Object and 
abstract types,
      * which avoids interfering with custom serializers for concrete types 
like ParseContext.
      */
-    static ObjectMapper createMapper() {
+    public static ObjectMapper createMapper() {
         ObjectMapper mapper = new ObjectMapper();
 
-        // Configure polymorphic type validator for security
-        // Use allowIfSubType to allow:
-        // - org.apache.tika.* classes (all Tika types)
-        // - java.util.* classes (collections, dates, etc.)
-        // This is needed because we deserialize with base type Object.class
-        PolymorphicTypeValidator typeValidator = 
BasicPolymorphicTypeValidator.builder()
+        // Start with Tika and Java standard packages
+        BasicPolymorphicTypeValidator.Builder builder = 
BasicPolymorphicTypeValidator.builder()
                 .allowIfSubType("org.apache.tika.")
-                .allowIfSubType("java.util.")
-                .build();
+                .allowIfSubType("java.util.");
+
+        // Add user-specified packages from classpath
+        List<String> additionalPackages = loadAllowedPackages();
+        for (String packagePrefix : additionalPackages) {
+            builder.allowIfSubType(packagePrefix);
+        }
+
+        PolymorphicTypeValidator typeValidator = builder.build();
 
         // Use OBJECT_AND_NON_CONCRETE to add type info when static type is:
         // - Object.class (for objects in the "objects" map)
@@ -75,11 +152,6 @@ public class ParseContextSerializer extends 
JsonSerializer<ParseContext> {
             jsonGenerator.writeFieldName("objects");
             jsonGenerator.writeStartObject();
 
-            ObjectMapper mapper = (ObjectMapper) jsonGenerator.getCodec();
-            if (mapper == null) {
-                mapper = createMapper();
-            }
-
             for (Map.Entry<String, Object> entry : contextMap.entrySet()) {
                 String className = entry.getKey();
                 if (className.equals(ConfigContainer.class.getName())) {
@@ -93,7 +165,7 @@ public class ParseContextSerializer extends 
JsonSerializer<ParseContext> {
 
                 // Let Jackson handle type information and serialization
                 // Use writerFor(Object.class) to ensure polymorphic type info 
is added
-                mapper.writerFor(Object.class).writeValue(jsonGenerator, value);
+                POLYMORPHIC_MAPPER.writerFor(Object.class).writeValue(jsonGenerator, value);
             }
 
             jsonGenerator.writeEndObject();
diff --git 
a/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
 
b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
new file mode 100644
index 000000000..f518ff305
--- /dev/null
+++ 
b/tika-serialization/src/test/java/org/apache/tika/serialization/CustomClassSerializationTest.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.serialization;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.StringWriter;
+import java.io.Writer;
+import java.util.Locale;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.module.SimpleModule;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Tests that users can serialize their own custom classes in ParseContext
+ * by adding a META-INF/tika-serialization-allowlist.txt file to their JAR.
+ *
+ * <p>Custom classes MUST implement Serializable because the Pipes parser
+ * uses Java serialization to pass ParseContext between processes.</p>
+ *
+ * <p>To enable JSON serialization of custom classes:</p>
+ * <ol>
+ *   <li>Implement Serializable</li>
+ *   <li>Provide a no-arg constructor</li>
+ *   <li>Follow JavaBean conventions (getters/setters)</li>
+ *   <li>Add your package prefix to 
META-INF/tika-serialization-allowlist.txt</li>
+ * </ol>
+ */
+public class CustomClassSerializationTest {
+
+    /**
+     * Example custom metadata filter that uppercases all values.
+     * This simulates a user's custom class (e.g., in package com.acme).
+     *
+     * <p>Note: Extends Serializable MetadataFilter - this is REQUIRED for use 
with Pipes parser.</p>
+     */
+    public static class MyUpperCasingMetadataFilter extends MetadataFilter {
+        private String prefix = "";
+
+        public MyUpperCasingMetadataFilter() {
+        }
+
+        public MyUpperCasingMetadataFilter(String prefix) {
+            this.prefix = prefix;
+        }
+
+        public String getPrefix() {
+            return prefix;
+        }
+
+        public void setPrefix(String prefix) {
+            this.prefix = prefix;
+        }
+
+        @Override
+        public java.util.List<Metadata> filter(java.util.List<Metadata> 
metadataList) {
+            for (Metadata metadata : metadataList) {
+                for (String name : metadata.names()) {
+                    String[] values = metadata.getValues(name);
+                    metadata.remove(name);
+                    for (String value : values) {
+                        metadata.add(name, prefix + 
value.toUpperCase(Locale.ROOT));
+                    }
+                }
+            }
+            return metadataList;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (!(o instanceof MyUpperCasingMetadataFilter)) return false;
+            MyUpperCasingMetadataFilter that = (MyUpperCasingMetadataFilter) o;
+            return prefix.equals(that.prefix);
+        }
+
+        @Override
+        public int hashCode() {
+            return prefix.hashCode();
+        }
+    }
+
+    private ObjectMapper createMapper() {
+        ObjectMapper mapper = ParseContextSerializer.createMapper();
+        SimpleModule module = new SimpleModule();
+        module.addDeserializer(ParseContext.class, new 
ParseContextDeserializer());
+        module.addSerializer(ParseContext.class, new ParseContextSerializer());
+        mapper.registerModule(module);
+        return mapper;
+    }
+
+    @Test
+    public void testCustomMetadataFilterSerialization() throws Exception {
+        // Create a custom metadata filter
+        MyUpperCasingMetadataFilter customFilter = new 
MyUpperCasingMetadataFilter("TEST_");
+
+        // Put it in ParseContext - store as MetadataFilter (the abstract base 
type)
+        ParseContext pc = new ParseContext();
+        pc.set(MetadataFilter.class, (MetadataFilter) customFilter);
+
+        // Serialize
+        ObjectMapper mapper = createMapper();
+        String json;
+        try (Writer writer = new StringWriter()) {
+            try (JsonGenerator jsonGenerator = mapper
+                    .getFactory()
+                    .createGenerator(writer)) {
+                ParseContextSerializer serializer = new 
ParseContextSerializer();
+                serializer.serialize(pc, jsonGenerator, null);
+            }
+            json = writer.toString();
+        }
+
+        System.out.println("Serialized custom class:");
+        System.out.println(json);
+
+        // Verify JSON contains type information
+        assertTrue(json.contains("MyUpperCasingMetadataFilter"),
+                "JSON should contain the custom class name");
+
+        // Deserialize
+        ParseContext deserialized = mapper.readValue(json, ParseContext.class);
+        MetadataFilter deserializedFilter = 
deserialized.get(MetadataFilter.class);
+
+        // Verify polymorphic deserialization worked - we get back the 
concrete type
+        assertNotNull(deserializedFilter, "MetadataFilter should not be null");
+        assertTrue(deserializedFilter instanceof MyUpperCasingMetadataFilter,
+                "Filter should be MyUpperCasingMetadataFilter (polymorphic 
deserialization)");
+
+        MyUpperCasingMetadataFilter typedFilter = 
(MyUpperCasingMetadataFilter) deserializedFilter;
+        assertEquals("TEST_", typedFilter.getPrefix(), "Prefix should be 
preserved");
+
+        // Verify it works
+        Metadata metadata = new Metadata();
+        metadata.add("test", "value");
+        java.util.List<Metadata> metadataList = new java.util.ArrayList<>();
+        metadataList.add(metadata);
+        typedFilter.filter(metadataList);
+        assertEquals("TEST_VALUE", metadata.get("test"), "Filter should 
uppercase with prefix");
+    }
+}

Reply via email to