This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch clean-up-metadata-list-json-settings in repository https://gitbox.apache.org/repos/asf/tika.git
commit bc52af0cabcc541cc6c995f6a64a5fa48f1ac00f Author: tallison <[email protected]> AuthorDate: Wed Dec 17 09:55:25 2025 -0500 Clean up jackson settings on metadata list serialization/deserialization --- .../org/apache/tika/config/GlobalSettings.java | 57 -------------- .../apache/tika/config/loader/TikaJsonConfig.java | 2 +- .../org/apache/tika/config/loader/TikaLoader.java | 60 ++++++++++++--- .../apache/tika/serialization/JsonMetadata.java | 89 +++++++++++++--------- .../tika/serialization/JsonMetadataList.java | 74 ++++++++++++------ .../test/resources/configs/tika-config-json.json | 6 +- .../test/resources/configs/tika-config-json.json | 4 +- 7 files changed, 163 insertions(+), 129 deletions(-) diff --git a/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java b/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java index 7d07c3b9e..7493000ae 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java @@ -25,7 +25,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; * <p>Example JSON: * <pre> * { - * "maxJsonStringFieldLength": 50000000, * "xml-reader-utils": { * "maxEntityExpansions": 1000, * "maxNumReuses": 100, @@ -36,20 +35,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; */ public class GlobalSettings { - /** - * Static maximum length for JSON string fields. - * Default: 20,000,000 (Jackson's default) - * This is static because it's a global setting that affects all JSON parsing. - */ - private static Integer maxJsonStringFieldLength = 20_000_000; - - /** - * Instance field for deserialization from JSON. - * The value is copied to the static field when set. - */ - @JsonProperty("maxJsonStringFieldLength") - private Integer instanceMaxJsonStringFieldLength = 20_000_000; - /** * Service loader configuration for handling initialization problems. */ @@ -62,48 +47,6 @@ public class GlobalSettings { @JsonProperty("xml-reader-utils") private XmlReaderUtilsConfig xmlReaderUtils; - /** - * Gets the static maximum JSON string field length. - * - * @return the max length, or null if not set - */ - public static Integer getMaxJsonStringFieldLength() { - return maxJsonStringFieldLength; - } - - /** - * Sets the static maximum JSON string field length. - * This affects all JSON parsing globally. - * - * @param length the max length to set - */ - public static void setMaxJsonStringFieldLength(Integer length) { - maxJsonStringFieldLength = length; - } - - /** - * Instance getter for deserialization. - * Returns the instance value which may differ from the static value. - * - * @return the instance max length - */ - public Integer getInstanceMaxJsonStringFieldLength() { - return instanceMaxJsonStringFieldLength; - } - - /** - * Instance setter for deserialization. - * Automatically updates the static field when set. - * - * @param length the max length to set - */ - public void setInstanceMaxJsonStringFieldLength(Integer length) { - this.instanceMaxJsonStringFieldLength = length; - if (length != null) { - setMaxJsonStringFieldLength(length); - } - } - public ServiceLoaderConfig getServiceLoader() { return serviceLoader; } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index 2eeb8bc7a..8ce14a30f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -103,7 +103,7 @@ public class TikaJsonConfig { */ private static final Set<String> KNOWN_KEYS = Set.of( // Globals - "maxJsonStringFieldLength", + "metadata-list", "service-loader", "xml-reader-utils", // Core Tika component keys diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 52e17d9d1..b82beb7d8 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -21,6 +21,8 @@ import java.nio.file.Path; import java.util.Collections; import java.util.List; +import com.fasterxml.jackson.core.StreamReadConstraints; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.GlobalSettings; @@ -39,6 +41,8 @@ import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; +import org.apache.tika.serialization.JsonMetadata; +import org.apache.tika.serialization.JsonMetadataList; /** * Main entry point for loading Tika components from JSON configuration. @@ -396,17 +400,18 @@ public class TikaLoader { * * <p>Settings include: * <ul> - * <li>maxJsonStringFieldLength - Maximum JSON string field length (static, affects all JSON parsing)</li> - * <li>service-loader.initializableProblemHandler - How to handle initialization problems</li> + * <li>metadata-list - Jackson StreamReadConstraints for JsonMetadata/JsonMetadataList serialization</li> + * <li>service-loader - Service loader configuration</li> * <li>xml-reader-utils - XML parser security settings</li> * </ul> * * <p>Example JSON: * <pre> * { - * "maxJsonStringFieldLength": 50000000, - * "service-loader": { - * "initializableProblemHandler": "ignore" + * "metadata-list": { + * "maxStringLength": 50000000, + * "maxNestingDepth": 10, + * "maxNumberLength": 500 * }, * "xml-reader-utils": { * "maxEntityExpansions": 1000, @@ -423,11 +428,8 @@ public class TikaLoader { if (globalSettings == null) { globalSettings = new GlobalSettings(); - // Load maxJsonStringFieldLength from top level and set it statically - if (config.getRootNode().has("maxJsonStringFieldLength")) { - GlobalSettings.setMaxJsonStringFieldLength( - config.getRootNode().get("maxJsonStringFieldLength").asInt()); - } + // Load metadata-list config for JsonMetadata/JsonMetadataList serialization + loadMetadataListConfig(); // Load service-loader config (official Tika config at root level) GlobalSettings.ServiceLoaderConfig serviceLoaderConfig = @@ -446,6 +448,44 @@ public class TikaLoader { return globalSettings; } + /** + * Loads the metadata-list configuration section and applies it to + * JsonMetadata and JsonMetadataList serializers. + * <p> + * Configuration uses Jackson's StreamReadConstraints property names: + * <pre> + * { + * "metadata-list": { + * "maxStringLength": 20000000, + * "maxNestingDepth": 10, + * "maxNumberLength": 500 + * } + * } + * </pre> + */ + private void loadMetadataListConfig() { + JsonNode metadataListNode = config.getRootNode().get("metadata-list"); + if (metadataListNode == null) { + return; + } + + StreamReadConstraints.Builder builder = StreamReadConstraints.builder(); + + if (metadataListNode.has("maxStringLength")) { + builder.maxStringLength(metadataListNode.get("maxStringLength").asInt()); + } + if (metadataListNode.has("maxNestingDepth")) { + builder.maxNestingDepth(metadataListNode.get("maxNestingDepth").asInt()); + } + if (metadataListNode.has("maxNumberLength")) { + builder.maxNumberLength(metadataListNode.get("maxNumberLength").asInt()); + } + + StreamReadConstraints constraints = builder.build(); + JsonMetadata.setStreamReadConstraints(constraints); + JsonMetadataList.setStreamReadConstraints(constraints); + } + /** * Gets the global settings if they have been loaded. * diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java index e9adec234..504fb4f19 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java @@ -26,22 +26,69 @@ import com.fasterxml.jackson.core.StreamReadConstraints; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.tika.config.GlobalSettings; import org.apache.tika.metadata.Metadata; public class JsonMetadata { static volatile boolean PRETTY_PRINT = false; - private static ObjectMapper OBJECT_MAPPER; - private static final ObjectMapper PRETTY_SERIALIZER; + /** + * Default stream read constraints for metadata serialization. + */ + private static final StreamReadConstraints DEFAULT_CONSTRAINTS = StreamReadConstraints + .builder() + .maxNestingDepth(10) + .maxStringLength(20_000_000) + .maxNumberLength(500) + .build(); + + private static volatile StreamReadConstraints streamReadConstraints = DEFAULT_CONSTRAINTS; + private static volatile ObjectMapper OBJECT_MAPPER; + private static volatile ObjectMapper PRETTY_SERIALIZER; static { - OBJECT_MAPPER = buildObjectMapper(StreamReadConstraints.DEFAULT_MAX_STRING_LEN); - PRETTY_SERIALIZER = new ObjectMapper(); + rebuildObjectMappers(); + } + + private static void rebuildObjectMappers() { + JsonFactory factory = new JsonFactory(); + factory.setStreamReadConstraints(streamReadConstraints); + + ObjectMapper mapper = new ObjectMapper(factory); + SimpleModule baseModule = new SimpleModule(); + baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); + baseModule.addSerializer(Metadata.class, new MetadataSerializer()); + mapper.registerModule(baseModule); + OBJECT_MAPPER = mapper; + + ObjectMapper prettyMapper = new ObjectMapper(factory); SimpleModule prettySerializerModule = new SimpleModule(); prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); - PRETTY_SERIALIZER.registerModule(prettySerializerModule); + prettyMapper.registerModule(prettySerializerModule); + PRETTY_SERIALIZER = prettyMapper; + } + + /** + * Sets the stream read constraints for JSON parsing of metadata. + * This affects all subsequent calls to {@link #fromJson(Reader)}. + * <p> + * Typically called by TikaLoader during initialization based on the + * "metadata-list" configuration section. + * + * @param constraints the constraints to use + */ + public static synchronized void setStreamReadConstraints(StreamReadConstraints constraints) { + streamReadConstraints = constraints; + rebuildObjectMappers(); + } + + /** + * Gets the current stream read constraints. + * + * @return the current constraints + */ + public static StreamReadConstraints getStreamReadConstraints() { + return streamReadConstraints; } /** @@ -62,46 +109,20 @@ public class JsonMetadata { } /** - * Read metadata from reader. - * <p> - * This does not close the reader. - * <p> - * This will reset the OBJECT_MAPPER if the max string length differs from that in TikaConfig. + * Read metadata from reader. This does not close the reader. * * @param reader reader to read from - * @return Metadata or null if nothing could be read from the reader + * @return Metadata or null if reader is null * @throws IOException in case of parse failure or IO failure with Reader */ public static Metadata fromJson(Reader reader) throws IOException { if (reader == null) { return null; } - if (OBJECT_MAPPER - .getFactory() - .streamReadConstraints() - .getMaxStringLength() != GlobalSettings.getMaxJsonStringFieldLength()) { - OBJECT_MAPPER = buildObjectMapper(GlobalSettings.getMaxJsonStringFieldLength()); - } return OBJECT_MAPPER.readValue(reader, Metadata.class); } public static void setPrettyPrinting(boolean prettyPrint) { PRETTY_PRINT = prettyPrint; } - - static ObjectMapper buildObjectMapper(int maxStringLen) { - JsonFactory factory = new JsonFactory(); - factory.setStreamReadConstraints(StreamReadConstraints - .builder() - .maxNestingDepth(10) - .maxStringLength(maxStringLen) - .maxNumberLength(500) - .build()); - ObjectMapper objectMapper = new ObjectMapper(factory); - SimpleModule baseModule = new SimpleModule(); - baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); - baseModule.addSerializer(Metadata.class, new MetadataSerializer()); - objectMapper.registerModule(baseModule); - return objectMapper; - } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java index 71427947b..7611cdfea 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java @@ -16,8 +16,6 @@ */ package org.apache.tika.serialization; -import static org.apache.tika.serialization.JsonMetadata.buildObjectMapper; - import java.io.IOException; import java.io.Reader; import java.io.Writer; @@ -29,36 +27,69 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.tika.config.GlobalSettings; import org.apache.tika.metadata.Metadata; public class JsonMetadataList { static volatile boolean PRETTY_PRINT = false; - private static ObjectMapper OBJECT_MAPPER; - private static final ObjectMapper PRETTY_SERIALIZER; + /** + * Default stream read constraints for metadata list serialization. + */ + private static final StreamReadConstraints DEFAULT_CONSTRAINTS = StreamReadConstraints + .builder() + .maxNestingDepth(10) + .maxStringLength(20_000_000) + .maxNumberLength(500) + .build(); + + private static volatile StreamReadConstraints streamReadConstraints = DEFAULT_CONSTRAINTS; + private static volatile ObjectMapper OBJECT_MAPPER; + private static volatile ObjectMapper PRETTY_SERIALIZER; static { + rebuildObjectMappers(); + } + + private static void rebuildObjectMappers() { JsonFactory factory = new JsonFactory(); - factory.setStreamReadConstraints(StreamReadConstraints - .builder() - .maxNestingDepth(10) - .maxStringLength(GlobalSettings.getMaxJsonStringFieldLength()) - .maxNumberLength(500) -// .maxDocumentLength(1000000) - .build()); - OBJECT_MAPPER = new ObjectMapper(factory); + factory.setStreamReadConstraints(streamReadConstraints); + + ObjectMapper mapper = new ObjectMapper(factory); SimpleModule baseModule = new SimpleModule(); baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); baseModule.addSerializer(Metadata.class, new MetadataSerializer()); - OBJECT_MAPPER.registerModule(baseModule); + mapper.registerModule(baseModule); + OBJECT_MAPPER = mapper; - PRETTY_SERIALIZER = new ObjectMapper(factory); + ObjectMapper prettyMapper = new ObjectMapper(factory); SimpleModule prettySerializerModule = new SimpleModule(); prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); - PRETTY_SERIALIZER.registerModule(prettySerializerModule); + prettyMapper.registerModule(prettySerializerModule); + PRETTY_SERIALIZER = prettyMapper; + } + + /** + * Sets the stream read constraints for JSON parsing of metadata lists. + * This affects all subsequent calls to {@link #fromJson(Reader)}. + * <p> + * Typically called by TikaLoader during initialization based on the + * "metadata-list" configuration section. + * + * @param constraints the constraints to use + */ + public static synchronized void setStreamReadConstraints(StreamReadConstraints constraints) { + streamReadConstraints = constraints; + rebuildObjectMappers(); + } + /** + * Gets the current stream read constraints. + * + * @return the current constraints + */ + public static StreamReadConstraints getStreamReadConstraints() { + return streamReadConstraints; } /** @@ -89,21 +120,16 @@ public class JsonMetadataList { } /** - * Read metadata from reader. This does not close the reader + * Read metadata from reader. This does not close the reader. * - * @param reader - * @return Metadata or null if nothing could be read from the reader + * @param reader the reader to read from + * @return Metadata list or null if reader is null * @throws IOException in case of parse failure or IO failure with Reader */ public static List<Metadata> fromJson(Reader reader) throws IOException { if (reader == null) { return null; } - if (OBJECT_MAPPER.getFactory().streamReadConstraints().getMaxStringLength() - != GlobalSettings.getMaxJsonStringFieldLength()) { - OBJECT_MAPPER = buildObjectMapper(GlobalSettings.getMaxJsonStringFieldLength()); - } - return OBJECT_MAPPER.readValue(reader, new TypeReference<List<Metadata>>(){}); } diff --git a/tika-serialization/src/test/resources/configs/tika-config-json.json b/tika-serialization/src/test/resources/configs/tika-config-json.json index 8d1e5feb0..3650aab64 100644 --- a/tika-serialization/src/test/resources/configs/tika-config-json.json +++ b/tika-serialization/src/test/resources/configs/tika-config-json.json @@ -1,3 +1,5 @@ { - "maxJsonStringFieldLength": 50000000 -} \ No newline at end of file + "metadata-list": { + "maxStringLength": 50000000 + } +} diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json index 419a225e6..3650aab64 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json @@ -1,3 +1,5 @@ { - "maxJsonStringFieldLength": 50000000 + "metadata-list": { + "maxStringLength": 50000000 + } }
