This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4503
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6e63d30115e7a80a9703b45aef6fb6dd7ce39139 Author: tallison <[email protected]> AuthorDate: Fri Oct 3 11:06:27 2025 -0400 TIKA-4503 -- refactor serialization --- ...tTuple.java => FetchEmitTupleDeserializer.java} | 113 +++----------- .../serialization/FetchEmitTupleSerializer.java | 65 +++++++++ .../pipes/core/serialization/JsonEmitData.java | 45 ++---- .../core/serialization/JsonFetchEmitTuple.java | 162 ++------------------- .../core/serialization/JsonFetchEmitTupleList.java | 52 +++---- .../apache/tika/serialization/JsonMetadata.java | 140 ++++++------------ .../tika/serialization/JsonMetadataList.java | 102 +++++-------- .../serialization/JsonStreamingSerializer.java | 64 -------- .../tika/serialization/MetadataDeserializer.java | 71 +++++++++ .../tika/serialization/MetadataSerializer.java | 76 ++++++++++ .../tika/serialization/ParseContextSerializer.java | 1 - .../tika/serialization/JsonMetadataListTest.java | 77 ++++------ .../tika/serialization/JsonMetadataTest.java | 14 +- .../TestParseContextSerialization.java | 2 - 14 files changed, 406 insertions(+), 578 deletions(-) diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java similarity index 53% copy from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java copy to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java index b92685521..23b2d36e5 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java @@ -16,47 +16,39 @@ */ package org.apache.tika.pipes.core.serialization; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.EMITKEY; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.EMITTER; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCHER; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCHKEY; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCH_RANGE_END; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCH_RANGE_START; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.ID; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.METADATAKEY; +import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.ON_PARSE_EXCEPTION; +import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT; + import java.io.IOException; -import java.io.Reader; -import java.io.StringWriter; -import java.io.Writer; -import java.util.Locale; import java.util.Map; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.JacksonException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import 
org.apache.tika.pipes.core.FetchEmitTuple; import org.apache.tika.pipes.core.emitter.EmitKey; import org.apache.tika.pipes.core.fetcher.FetchKey; -import org.apache.tika.serialization.JsonMetadata; import org.apache.tika.serialization.ParseContextDeserializer; -import org.apache.tika.serialization.ParseContextSerializer; -import org.apache.tika.utils.StringUtils; - -public class JsonFetchEmitTuple { - public static final String ID = "id"; - public static final String FETCHER = "fetcher"; - public static final String FETCHKEY = "fetchKey"; - public static final String FETCH_RANGE_START = "fetchRangeStart"; - public static final String FETCH_RANGE_END = "fetchRangeEnd"; - public static final String EMITTER = "emitter"; - public static final String EMITKEY = "emitKey"; - public static final String METADATAKEY = "metadata"; - public static final String ON_PARSE_EXCEPTION = "onParseException"; +public class FetchEmitTupleDeserializer extends JsonDeserializer<FetchEmitTuple> { - public static FetchEmitTuple fromJson(Reader reader) throws IOException { - JsonNode root = new ObjectMapper().readTree(reader); - return parseFetchEmitTuple(root); - } + @Override + public FetchEmitTuple deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException { + JsonNode root = jsonParser.readValueAsTree(); - - static FetchEmitTuple parseFetchEmitTuple(JsonNode root) throws IOException { String id = readVal(ID, root, null, true); String fetcherName = readVal(FETCHER, root, null, true); String fetchKey = readVal(FETCHKEY, root, null, true); @@ -65,11 +57,12 @@ public class JsonFetchEmitTuple { long fetchRangeStart = readLong(FETCH_RANGE_START, root, -1l, false); long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false); Metadata metadata = readMetadata(root); - JsonNode parseContextNode = root.get(ParseContextSerializer.PARSE_CONTEXT); + JsonNode parseContextNode = root.get(PARSE_CONTEXT); ParseContext parseContext = parseContextNode == null ? 
new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = readOnParseException(root); - return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd), new EmitKey(emitterName, emitKey), metadata, parseContext, + return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd), + new EmitKey(emitterName, emitKey), metadata, parseContext, onParseException); } @@ -130,66 +123,4 @@ public class JsonFetchEmitTuple { return val.longValue(); } - public static String toJson(FetchEmitTuple t) throws IOException { - StringWriter writer = new StringWriter(); - toJson(t, writer); - return writer.toString(); - } - - public static void toJson(FetchEmitTuple t, Writer writer) throws IOException { - - try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) { - writeTuple(t, jsonGenerator); - } - } - - static void writeTuple(FetchEmitTuple t, JsonGenerator jsonGenerator) throws IOException { - jsonGenerator.writeStartObject(); - jsonGenerator.writeStringField(ID, t.getId()); - jsonGenerator.writeStringField(FETCHER, t - .getFetchKey() - .getFetcherName()); - jsonGenerator.writeStringField(FETCHKEY, t - .getFetchKey() - .getFetchKey()); - if (t - .getFetchKey() - .hasRange()) { - jsonGenerator.writeNumberField(FETCH_RANGE_START, t - .getFetchKey() - .getRangeStart()); - jsonGenerator.writeNumberField(FETCH_RANGE_END, t - .getFetchKey() - .getRangeEnd()); - } - jsonGenerator.writeStringField(EMITTER, t - .getEmitKey() - .getEmitterName()); - if (!StringUtils.isBlank(t - .getEmitKey() - .getEmitKey())) { - jsonGenerator.writeStringField(EMITKEY, t - .getEmitKey() - .getEmitKey()); - } - if (t - .getMetadata() - .size() > 0) { - jsonGenerator.writeFieldName(METADATAKEY); - JsonMetadata.writeMetadataObject(t.getMetadata(), jsonGenerator, false); - } - - jsonGenerator.writeStringField(ON_PARSE_EXCEPTION, t - .getOnParseException() - .name() - .toLowerCase(Locale.US)); - if (!t - .getParseContext() - .isEmpty()) { - ParseContextSerializer s = new ParseContextSerializer(); - s.serialize(t.getParseContext(), jsonGenerator, null); - } - jsonGenerator.writeEndObject(); - - } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleSerializer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleSerializer.java new file mode 100644 index 000000000..3f203ae25 --- /dev/null +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleSerializer.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.pipes.core.serialization; + +import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT; + +import java.io.IOException; +import java.util.Locale; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonSerializer; +import com.fasterxml.jackson.databind.SerializerProvider; + +import org.apache.tika.pipes.core.FetchEmitTuple; +import org.apache.tika.utils.StringUtils; + +public class FetchEmitTupleSerializer extends JsonSerializer<FetchEmitTuple> { + public static final String ID = "id"; + public static final String FETCHER = "fetcher"; + public static final String FETCHKEY = "fetchKey"; + public static final String FETCH_RANGE_START = "fetchRangeStart"; + public static final String FETCH_RANGE_END = "fetchRangeEnd"; + public static final String EMITTER = "emitter"; + public static final String EMITKEY = "emitKey"; + public static final String METADATAKEY = "metadata"; + public static final String ON_PARSE_EXCEPTION = "onParseException"; + + public void serialize(FetchEmitTuple t, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { + + jsonGenerator.writeStartObject(); + jsonGenerator.writeStringField(ID, t.getId()); + jsonGenerator.writeStringField(FETCHER, t.getFetchKey().getFetcherName()); + jsonGenerator.writeStringField(FETCHKEY, t.getFetchKey().getFetchKey()); + if (t.getFetchKey().hasRange()) { + jsonGenerator.writeNumberField(FETCH_RANGE_START, t.getFetchKey().getRangeStart()); + jsonGenerator.writeNumberField(FETCH_RANGE_END, t.getFetchKey().getRangeEnd()); + } + jsonGenerator.writeStringField(EMITTER, t.getEmitKey().getEmitterName()); + if (!StringUtils.isBlank(t.getEmitKey().getEmitKey())) { + jsonGenerator.writeStringField(EMITKEY, t.getEmitKey().getEmitKey()); + } + if (t.getMetadata().size() > 0) { + jsonGenerator.writeObjectField(METADATAKEY, t.getMetadata()); + } + jsonGenerator.writeStringField(ON_PARSE_EXCEPTION, t.getOnParseException().name().toLowerCase(Locale.US)); + if (!t.getParseContext().isEmpty()) { + jsonGenerator.writeObjectField(PARSE_CONTEXT, t.getParseContext()); + } + jsonGenerator.writeEndObject(); + } +} diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java index 8eb03c51d..2ec5f9343 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java @@ -19,41 +19,28 @@ package org.apache.tika.pipes.core.serialization; import java.io.IOException; import java.io.Writer; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.StreamReadConstraints; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.core.FetchEmitTuple; import org.apache.tika.pipes.core.emitter.EmitData; -import org.apache.tika.pipes.core.emitter.EmitKey; -import org.apache.tika.serialization.JsonMetadata; +import org.apache.tika.serialization.MetadataSerializer; +import org.apache.tika.serialization.ParseContextSerializer; public class JsonEmitData { + private static 
final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + static { + SimpleModule module = new SimpleModule(); + module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer()); + module.addSerializer(ParseContext.class, new ParseContextSerializer()); + module.addSerializer(Metadata.class, new MetadataSerializer()); + OBJECT_MAPPER.registerModule(module); + } public static void toJson(EmitData emitData, Writer writer) throws IOException { - try (JsonGenerator jsonGenerator = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createGenerator(writer)) { - jsonGenerator.writeStartObject(); - EmitKey key = emitData.getEmitKey(); - jsonGenerator.writeStringField(JsonFetchEmitTuple.EMITTER, key.getEmitterName()); - jsonGenerator.writeStringField(JsonFetchEmitTuple.EMITKEY, key.getEmitKey()); - if (!emitData - .getParseContext() - .isEmpty()) { - jsonGenerator.writeObject(emitData.getParseContext()); - } - jsonGenerator.writeFieldName("data"); - jsonGenerator.writeStartArray(); - for (Metadata m : emitData.getMetadataList()) { - JsonMetadata.writeMetadataObject(m, jsonGenerator, false); - } - jsonGenerator.writeEndArray(); - jsonGenerator.writeEndObject(); - } + OBJECT_MAPPER.writeValue(writer, emitData); } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java index b92685521..6841379a0 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java @@ -20,114 +20,31 @@ import java.io.IOException; import java.io.Reader; import java.io.StringWriter; import java.io.Writer; -import java.util.Locale; -import java.util.Map; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.core.FetchEmitTuple; -import org.apache.tika.pipes.core.emitter.EmitKey; -import org.apache.tika.pipes.core.fetcher.FetchKey; -import org.apache.tika.serialization.JsonMetadata; -import org.apache.tika.serialization.ParseContextDeserializer; +import org.apache.tika.serialization.MetadataSerializer; import org.apache.tika.serialization.ParseContextSerializer; -import org.apache.tika.utils.StringUtils; public class JsonFetchEmitTuple { - public static final String ID = "id"; - public static final String FETCHER = "fetcher"; - public static final String FETCHKEY = "fetchKey"; - public static final String FETCH_RANGE_START = "fetchRangeStart"; - public static final String FETCH_RANGE_END = "fetchRangeEnd"; - public static final String EMITTER = "emitter"; - public static final String EMITKEY = "emitKey"; - public static final String METADATAKEY = "metadata"; - public static final String ON_PARSE_EXCEPTION = "onParseException"; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static FetchEmitTuple fromJson(Reader reader) throws IOException { - JsonNode root = new ObjectMapper().readTree(reader); - return parseFetchEmitTuple(root); - } - 
- - static FetchEmitTuple parseFetchEmitTuple(JsonNode root) throws IOException { - String id = readVal(ID, root, null, true); - String fetcherName = readVal(FETCHER, root, null, true); - String fetchKey = readVal(FETCHKEY, root, null, true); - String emitterName = readVal(EMITTER, root, "", false); - String emitKey = readVal(EMITKEY, root, "", false); - long fetchRangeStart = readLong(FETCH_RANGE_START, root, -1l, false); - long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false); - Metadata metadata = readMetadata(root); - JsonNode parseContextNode = root.get(ParseContextSerializer.PARSE_CONTEXT); - ParseContext parseContext = parseContextNode == null ? new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode); - FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = readOnParseException(root); - - return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd), new EmitKey(emitterName, emitKey), metadata, parseContext, - onParseException); - } - - private static FetchEmitTuple.ON_PARSE_EXCEPTION readOnParseException(JsonNode root) throws IOException { - JsonNode onParseExNode = root.get(ON_PARSE_EXCEPTION); - if (onParseExNode == null) { - return FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT; - } - String txt = onParseExNode.asText(); - if ("skip".equalsIgnoreCase(txt)) { - return FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP; - } else if ("emit".equalsIgnoreCase(txt)) { - return FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT; - } else { - throw new IOException(ON_PARSE_EXCEPTION + " must be either 'skip' or 'emit'"); - } - } - - private static Metadata readMetadata(JsonNode root) { - JsonNode metadataNode = root.get(METADATAKEY); - if (metadataNode == null) { - return new Metadata(); - } - Metadata metadata = new Metadata(); - for (Map.Entry<String, JsonNode> e : metadataNode.properties()) { - JsonNode vals = e.getValue(); - String k = e.getKey(); - if (vals.isArray()) { - for (JsonNode arrVal : vals) { - metadata.add(k, arrVal.textValue()); - } - } else { - metadata.set(k, vals.asText()); - } - } - return metadata; + static { + SimpleModule module = new SimpleModule(); + module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer()); + module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer()); + module.addSerializer(Metadata.class, new MetadataSerializer()); + module.addSerializer(ParseContext.class, new ParseContextSerializer()); + OBJECT_MAPPER.registerModule(module); } - private static String readVal(String key, JsonNode jsonObj, String defaultRet, boolean isRequired) throws IOException { - JsonNode valNode = jsonObj.get(key); - if (valNode == null) { - if (isRequired) { - throw new IOException("required value string, but see: " + key); - } - return defaultRet; - } - return valNode.asText(); - } - - private static long readLong(String key, JsonNode jsonObj, long defaultVal, boolean isRequired) throws IOException { - JsonNode val = jsonObj.get(key); - if (val == null) { - if (isRequired) { - throw new IOException("required value long, but see: " + key); - } - return defaultVal; - } - return val.longValue(); + public static FetchEmitTuple fromJson(Reader reader) throws IOException { + return OBJECT_MAPPER.readValue(reader, FetchEmitTuple.class); } public static String toJson(FetchEmitTuple t) throws IOException { @@ -137,59 +54,6 @@ public class JsonFetchEmitTuple { } public static void toJson(FetchEmitTuple t, Writer writer) throws IOException { - - try (JsonGenerator jsonGenerator = new 
JsonFactory().createGenerator(writer)) { - writeTuple(t, jsonGenerator); - } - } - - static void writeTuple(FetchEmitTuple t, JsonGenerator jsonGenerator) throws IOException { - jsonGenerator.writeStartObject(); - jsonGenerator.writeStringField(ID, t.getId()); - jsonGenerator.writeStringField(FETCHER, t - .getFetchKey() - .getFetcherName()); - jsonGenerator.writeStringField(FETCHKEY, t - .getFetchKey() - .getFetchKey()); - if (t - .getFetchKey() - .hasRange()) { - jsonGenerator.writeNumberField(FETCH_RANGE_START, t - .getFetchKey() - .getRangeStart()); - jsonGenerator.writeNumberField(FETCH_RANGE_END, t - .getFetchKey() - .getRangeEnd()); - } - jsonGenerator.writeStringField(EMITTER, t - .getEmitKey() - .getEmitterName()); - if (!StringUtils.isBlank(t - .getEmitKey() - .getEmitKey())) { - jsonGenerator.writeStringField(EMITKEY, t - .getEmitKey() - .getEmitKey()); - } - if (t - .getMetadata() - .size() > 0) { - jsonGenerator.writeFieldName(METADATAKEY); - JsonMetadata.writeMetadataObject(t.getMetadata(), jsonGenerator, false); - } - - jsonGenerator.writeStringField(ON_PARSE_EXCEPTION, t - .getOnParseException() - .name() - .toLowerCase(Locale.US)); - if (!t - .getParseContext() - .isEmpty()) { - ParseContextSerializer s = new ParseContextSerializer(); - s.serialize(t.getParseContext(), jsonGenerator, null); - } - jsonGenerator.writeEndObject(); - + OBJECT_MAPPER.writeValue(writer, t); } } diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java index 26a1cc501..8f53c8a87 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java @@ -20,35 +20,33 @@ import java.io.IOException; import java.io.Reader; import java.io.StringWriter; import java.io.Writer; -import java.util.ArrayList; -import java.util.Iterator; import java.util.List; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.StreamReadConstraints; -import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.core.FetchEmitTuple; +import org.apache.tika.serialization.MetadataSerializer; +import org.apache.tika.serialization.ParseContextSerializer; public class JsonFetchEmitTupleList { - public static List<FetchEmitTuple> fromJson(Reader reader) throws IOException { - JsonNode root = new ObjectMapper().readTree(reader); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - if (!root.isArray()) { - throw new IOException("FetchEmitTupleList must be an array"); - } - List<FetchEmitTuple> list = new ArrayList<>(); - Iterator<JsonNode> it = root.iterator(); - while (it.hasNext()) { - JsonNode n = it.next(); - FetchEmitTuple t = JsonFetchEmitTuple.parseFetchEmitTuple(n); - list.add(t); - } - return list; + static { + SimpleModule module = new SimpleModule(); + module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer()); + module.addSerializer(FetchEmitTuple.class, new 
FetchEmitTupleSerializer()); + module.addSerializer(Metadata.class, new MetadataSerializer()); + module.addSerializer(ParseContext.class, new ParseContextSerializer()); + OBJECT_MAPPER.registerModule(module); + } + + public static List<FetchEmitTuple> fromJson(Reader reader) throws IOException { + return OBJECT_MAPPER.readValue(reader, new TypeReference<List<FetchEmitTuple>>() {}); } public static String toJson(List<FetchEmitTuple> list) throws IOException { @@ -58,18 +56,6 @@ public class JsonFetchEmitTupleList { } public static void toJson(List<FetchEmitTuple> list, Writer writer) throws IOException { - - try (JsonGenerator jsonGenerator = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createGenerator(writer)) { - jsonGenerator.writeStartArray(); - for (FetchEmitTuple t : list) { - JsonFetchEmitTuple.writeTuple(t, jsonGenerator); - } - jsonGenerator.writeEndArray(); - } + OBJECT_MAPPER.writeValue(writer, list); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java index 72a9c3f9a..b47acde2e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java @@ -20,15 +20,11 @@ package org.apache.tika.serialization; import java.io.IOException; import java.io.Reader; import java.io.Writer; -import java.util.Arrays; import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.StreamReadConstraints; -import org.apache.commons.io.input.CloseShieldReader; -import org.apache.commons.io.output.CloseShieldWriter; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; @@ -37,6 +33,17 @@ public class JsonMetadata { static volatile boolean PRETTY_PRINT = false; + private static ObjectMapper OBJECT_MAPPER; + private static final ObjectMapper PRETTY_SERIALIZER; + + static { + OBJECT_MAPPER = buildObjectMapper(StreamReadConstraints.DEFAULT_MAX_STRING_LEN); + PRETTY_SERIALIZER = new ObjectMapper(); + SimpleModule prettySerializerModule = new SimpleModule(); + prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); + PRETTY_SERIALIZER.registerModule(prettySerializerModule); + } + /** * Serializes a Metadata object to Json. This does not flush or close the writer. 
* @@ -45,115 +52,56 @@ public class JsonMetadata { * @throws java.io.IOException if there is an IOException during writing */ public static void toJson(Metadata metadata, Writer writer) throws IOException { - if (metadata == null) { - writer.write("null"); - return; - } - long max = TikaConfig.getMaxJsonStringFieldLength(); - try (JsonGenerator jsonGenerator = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createGenerator(CloseShieldWriter.wrap(writer))) { - if (PRETTY_PRINT) { - jsonGenerator.useDefaultPrettyPrinter(); - } - writeMetadataObject(metadata, jsonGenerator, PRETTY_PRINT); + if (PRETTY_PRINT) { + PRETTY_SERIALIZER + .writerWithDefaultPrettyPrinter() + .writeValue(writer, metadata); + } else { + OBJECT_MAPPER.writeValue(writer, metadata); } } - public static void writeMetadataObject(Metadata metadata, JsonGenerator jsonGenerator, boolean prettyPrint) throws IOException { - jsonGenerator.writeStartObject(); - String[] names = metadata.names(); - if (prettyPrint) { - Arrays.sort(names, new PrettyMetadataKeyComparator()); - } - for (String n : names) { - String[] vals = metadata.getValues(n); - if (vals.length == 0) { - continue; - } else if (vals.length == 1) { - jsonGenerator.writeStringField(n, vals[0]); - } else if (vals.length > 1) { - jsonGenerator.writeArrayFieldStart(n); - for (String val : vals) { - jsonGenerator.writeString(val); - } - jsonGenerator.writeEndArray(); - } - } - jsonGenerator.writeEndObject(); - } - /** * Read metadata from reader. * <p> * This does not close the reader. + * <p> + * This will reset the OBJECT_MAPPER if the max string length differs from that in TikaConfig. * * @param reader reader to read from * @return Metadata or null if nothing could be read from the reader * @throws IOException in case of parse failure or IO failure with Reader */ public static Metadata fromJson(Reader reader) throws IOException { - Metadata m = null; - try (JsonParser jParser = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createParser(CloseShieldReader.wrap(reader))) { - m = readMetadataObject(jParser); + if (reader == null) { + return null; } - return m; - } - - /** - * expects that jParser has not yet started on object or - * for jParser to be pointing to the start object. 
- * - * @param jParser - * @return - * @throws IOException - */ - public static Metadata readMetadataObject(JsonParser jParser) throws IOException { - Metadata metadata = new Metadata(); - JsonToken token = jParser.currentToken(); - if (token == null) { - token = jParser.nextToken(); - if (token != JsonToken.START_OBJECT) { - throw new IOException("expected start object, but got: " + token.name()); - } - token = jParser.nextToken(); - } else if (token == JsonToken.START_OBJECT) { - token = jParser.nextToken(); - } - - while (token != JsonToken.END_OBJECT) { - token = jParser.currentToken(); - if (token != JsonToken.FIELD_NAME) { - throw new IOException("expected field name, but got: " + token.name()); - } - String key = jParser.currentName(); - token = jParser.nextToken(); - if (token == JsonToken.START_ARRAY) { - while (jParser.nextToken() != JsonToken.END_ARRAY) { - metadata.add(key, jParser.getText()); - } - } else { - if (token != JsonToken.VALUE_STRING) { - throw new IOException("expected string value, but found: " + token.name()); - } - String value = jParser.getValueAsString(); - metadata.set(key, value); - } - token = jParser.nextToken(); + if (OBJECT_MAPPER + .getFactory() + .streamReadConstraints() + .getMaxStringLength() != TikaConfig.getMaxJsonStringFieldLength()) { + OBJECT_MAPPER = buildObjectMapper(TikaConfig.getMaxJsonStringFieldLength()); } - return metadata; + return OBJECT_MAPPER.readValue(reader, Metadata.class); } public static void setPrettyPrinting(boolean prettyPrint) { PRETTY_PRINT = prettyPrint; } + static ObjectMapper buildObjectMapper(int maxStringLen) { + JsonFactory factory = new JsonFactory(); + factory.setStreamReadConstraints(StreamReadConstraints + .builder() + .maxNestingDepth(10) + .maxStringLength(maxStringLen) + .maxNumberLength(500) + .build()); + ObjectMapper objectMapper = new ObjectMapper(factory); + SimpleModule baseModule = new SimpleModule(); + baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); + baseModule.addSerializer(Metadata.class, new MetadataSerializer()); + objectMapper.registerModule(baseModule); + return objectMapper; + } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java index a7a3803e5..4b84e9f3a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java @@ -16,27 +16,51 @@ */ package org.apache.tika.serialization; +import static org.apache.tika.serialization.JsonMetadata.buildObjectMapper; + import java.io.IOException; import java.io.Reader; import java.io.Writer; -import java.util.ArrayList; import java.util.List; import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.StreamReadConstraints; -import org.apache.commons.io.input.CloseShieldReader; -import org.apache.commons.io.output.CloseShieldWriter; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.module.SimpleModule; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; public class JsonMetadataList { + static volatile boolean PRETTY_PRINT = false; + 
private static ObjectMapper OBJECT_MAPPER; + private static final ObjectMapper PRETTY_SERIALIZER; + + static { + JsonFactory factory = new JsonFactory(); + factory.setStreamReadConstraints(StreamReadConstraints + .builder() + .maxNestingDepth(10) + .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) + .maxNumberLength(500) +// .maxDocumentLength(1000000) + .build()); + OBJECT_MAPPER = new ObjectMapper(factory); + SimpleModule baseModule = new SimpleModule(); + baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); + baseModule.addSerializer(Metadata.class, new MetadataSerializer()); + OBJECT_MAPPER.registerModule(baseModule); + + PRETTY_SERIALIZER = new ObjectMapper(factory); + SimpleModule prettySerializerModule = new SimpleModule(); + prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); + PRETTY_SERIALIZER.registerModule(prettySerializerModule); + + } + /** * Serializes a Metadata object to Json. This does not flush or close the writer. * @@ -46,24 +70,10 @@ public class JsonMetadataList { * @throws org.apache.tika.exception.TikaException if there is an IOException during writing */ public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint) throws IOException { - if (metadataList == null) { - writer.write("null"); - return; - } - try (JsonGenerator jsonGenerator = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createGenerator(CloseShieldWriter.wrap(writer))) { - if (prettyPrint) { - jsonGenerator.useDefaultPrettyPrinter(); - } - jsonGenerator.writeStartArray(); - for (Metadata m : metadataList) { - JsonMetadata.writeMetadataObject(m, jsonGenerator, prettyPrint); - } - jsonGenerator.writeEndArray(); + if (prettyPrint) { + PRETTY_SERIALIZER.writerWithDefaultPrettyPrinter().writeValue(writer, metadataList); + } else { + OBJECT_MAPPER.writeValue(writer, metadataList); } } @@ -86,51 +96,19 @@ public class JsonMetadataList { * @throws IOException in case of parse failure or IO failure with Reader */ public static List<Metadata> fromJson(Reader reader) throws IOException { - List<Metadata> ms = null; if (reader == null) { - return ms; - } - ms = new ArrayList<>(); - try (JsonParser jParser = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createParser(CloseShieldReader.wrap(reader))) { - - JsonToken token = jParser.nextToken(); - if (token != JsonToken.START_ARRAY) { - throw new IOException("metadata list must start with an array, but I see: " + token.name()); - } - token = jParser.nextToken(); - while (token != JsonToken.END_ARRAY) { - Metadata m = JsonMetadata.readMetadataObject(jParser); - ms.add(m); - token = jParser.nextToken(); - } - - } - if (ms == null) { return null; } - //if the last object is the main document, - //as happens with the streaming serializer, - //flip it to be the first element. 
- if (ms.size() > 1) { - Metadata last = ms.get(ms.size() - 1); - String embResourcePath = last.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); - if (embResourcePath == null && ms - .get(0) - .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH) != null) { - ms.add(0, ms.remove(ms.size() - 1)); - } + if (OBJECT_MAPPER.getFactory().streamReadConstraints().getMaxStringLength() + != TikaConfig.getMaxJsonStringFieldLength()) { + OBJECT_MAPPER = buildObjectMapper(TikaConfig.getMaxJsonStringFieldLength()); } - return ms; + + return OBJECT_MAPPER.readValue(reader, new TypeReference<List<Metadata>>(){}); } public static void setPrettyPrinting(boolean prettyPrint) { PRETTY_PRINT = prettyPrint; } - } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java deleted file mode 100644 index 259695ada..000000000 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.serialization; - - -import java.io.IOException; -import java.io.Writer; -import java.util.Arrays; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.StreamReadConstraints; - -import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.Metadata; - - -public class JsonStreamingSerializer implements AutoCloseable { - - private final Writer writer; - boolean hasStartedArray = false; - private JsonGenerator jsonGenerator; - - public JsonStreamingSerializer(Writer writer) { - this.writer = writer; - } - - public void add(Metadata metadata) throws IOException { - if (!hasStartedArray) { - jsonGenerator = new JsonFactory() - .setStreamReadConstraints(StreamReadConstraints - .builder() - .maxStringLength(TikaConfig.getMaxJsonStringFieldLength()) - .build()) - .createGenerator(writer); - jsonGenerator.writeStartArray(); - hasStartedArray = true; - } - String[] names = metadata.names(); - Arrays.sort(names); - JsonMetadata.writeMetadataObject(metadata, jsonGenerator, false); - } - - @Override - public void close() throws IOException { - jsonGenerator.writeEndArray(); - jsonGenerator.flush(); - jsonGenerator.close(); - } -} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataDeserializer.java new file mode 100644 index 000000000..4dc7c3a7e --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataDeserializer.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.serialization; + +import java.io.IOException; + +import com.fasterxml.jackson.core.JacksonException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; + +import org.apache.tika.metadata.Metadata; + +public class MetadataDeserializer extends JsonDeserializer<Metadata> { + + @Override + public Metadata deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException { + Metadata metadata = new Metadata(); + boolean keepGoing = true; + while (keepGoing) { + keepGoing = addField(jsonParser, metadata); + } + return metadata; + } + + private boolean addField(JsonParser jsonParser, Metadata metadata) throws IOException { + String field = jsonParser.nextFieldName(); + if (field == null) { + return false; + } + JsonToken token = jsonParser.nextValue(); + + if (token == null) { + return false; + } + + if (token.isScalarValue()) { + metadata.set(field, jsonParser.getText()); + } else if (jsonParser.isExpectedStartArrayToken()) { + token = jsonParser.nextToken(); + while (token != null) { + if (token == JsonToken.END_ARRAY) { + return true; + } else if (token.isScalarValue()) { + metadata.add(field, jsonParser.getText()); + } else { + break; + } + token = jsonParser.nextToken(); + } + } else { + return false; + } + return true; + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataSerializer.java new file mode 100644 index 000000000..68e7d6593 --- /dev/null +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataSerializer.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.serialization; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonSerializer; +import com.fasterxml.jackson.databind.SerializerProvider; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; + +public class MetadataSerializer extends JsonSerializer<Metadata> { + private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName(); + + //always sort the content at the end + private static final Comparator<String> METADATA_KEY_COMPARATOR = new Comparator<String>() { + @Override + public int compare(String o1, String o2) { + if (o1.equals(TIKA_CONTENT_KEY)) { + return 1; + } + if (o2.equals(TIKA_CONTENT_KEY)) { + return -1; + } + return o1.compareTo(o2); + } + }; + + private boolean prettyPrint = false; + + public MetadataSerializer() { + + } + + public MetadataSerializer(boolean prettyPrint) { + this.prettyPrint = prettyPrint; + } + @Override + public void serialize(Metadata metadata, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { + jsonGenerator.writeStartObject(); + String[] names = metadata.names(); + if (prettyPrint) { + Arrays.sort(names, METADATA_KEY_COMPARATOR); + } + for (String n : names) { + String[] v = metadata.getValues(n); + if (v.length == 0) { + continue; + } else if (v.length == 1) { + jsonGenerator.writeStringField(n, v[0]); + } else { + jsonGenerator.writeFieldName(n); + jsonGenerator.writeArray(v, 0, v.length); + } + } + jsonGenerator.writeEndObject(); + } +} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java index 584920e5c..adc0c4691 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java @@ -30,7 +30,6 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { @Override public void serialize(ParseContext parseContext, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { - jsonGenerator.writeFieldName(PARSE_CONTEXT); jsonGenerator.writeStartObject(); for (String className : parseContext.keySet()) { try { diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java index 79ecf8df4..8c1c45a27 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java @@ -18,17 +18,20 @@ package org.apache.tika.serialization; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.io.Reader; +import java.io.InputStream; import java.io.StringReader; import java.io.StringWriter; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; +import com.fasterxml.jackson.databind.JsonMappingException; import org.junit.jupiter.api.Test; +import org.apache.tika.config.TikaConfig; import 
org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -60,16 +63,6 @@ public class JsonMetadataListTest { JsonMetadataList.toJson(metadataList, writer); List<Metadata> deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString())); assertEquals(metadataList, deserialized); - - //now test streaming serializer - writer = new StringWriter(); - try (JsonStreamingSerializer streamingSerializer = new JsonStreamingSerializer(writer)) { - streamingSerializer.add(m1); - streamingSerializer.add(m2); - } - deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString())); - assertEquals(metadataList, deserialized); - } @Test @@ -87,8 +80,7 @@ public class JsonMetadataListTest { @Test public void testListCorrupted() throws Exception { String json = "[{\"k1\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k2\":\"v1\"}," + "\"k3\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k4\":\"v1\"}]"; - List<Metadata> m = JsonMetadataList.fromJson(null); - assertNull(m); + Exception ex = assertThrows(JsonMappingException.class, () -> JsonMetadataList.fromJson(new StringReader(json))); } @Test @@ -119,13 +111,17 @@ public class JsonMetadataListTest { .toString() .startsWith("[")); writer = new StringWriter(); + JsonMetadata.setPrettyPrinting(true); + + JsonMetadataList.setPrettyPrinting(true); JsonMetadataList.toJson(metadataList, writer); - assertTrue(writer + String expected = "[ {[NEWLINE] \"zk1\" : [ \"v1\", \"v2\", \"v3\", \"v4\", \"v4\" ],[NEWLINE] \"zk2\" : \"v1\",[NEWLINE]" + + " \"X-TIKA:content\" : \"this is the content\"[NEWLINE]}, " + + "{[NEWLINE] \"k3\" : [ \"v1\", \"v2\", \"v3\", \"v4\", \"v4\" ],[NEWLINE] \"k4\" : \"v1\"[NEWLINE]} ]"; + assertEquals(expected, writer .toString() - .replaceAll("\r\n", "\n") - .startsWith("[ {\n" + " \"zk1\" : [ \"v1\", \"v2\", \"v3\", \"v4\", \"v4\" ],\n" + " \"zk2\" : \"v1\",\n" + " \"X-TIKA:content\" : \"this is the content\"\n" + - "},")); + .replaceAll("[\r\n]+", "[NEWLINE]")); //now set it back to false @@ -138,35 +134,24 @@ public class JsonMetadataListTest { } @Test - public void testSwitchingOrderOfMainDoc() throws Exception { - Metadata m1 = new Metadata(); - m1.add("k1", "v1"); - m1.add("k1", "v2"); - m1.add("k1", "v3"); - m1.add("k1", "v4"); - m1.add("k1", "v4"); - m1.add("k2", "v1"); - m1.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, "/embedded-1"); - - Metadata m2 = new Metadata(); - m2.add("k3", "v1"); - m2.add("k3", "v2"); - m2.add("k3", "v3"); - m2.add("k3", "v4"); - m2.add("k3", "v4"); - m2.add("k4", "v1"); - - List<Metadata> truth = new ArrayList<>(); - truth.add(m2); - truth.add(m1); - StringWriter stringWriter = new StringWriter(); - try (JsonStreamingSerializer serializer = new JsonStreamingSerializer(stringWriter)) { - serializer.add(m1); - serializer.add(m2); + public void testLargeValues() throws Exception { + //TIKA-4154 + TikaConfig tikaConfig = null; + try (InputStream is = JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) { + tikaConfig = new TikaConfig(is); } - Reader reader = new StringReader(stringWriter.toString()); - List<Metadata> deserialized = JsonMetadataList.fromJson(reader); - assertEquals(truth, deserialized); - + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 30000000; i++) { + sb.append("v"); + } + Metadata m = new Metadata(); + m.add("large_value", sb.toString()); + List<Metadata> list = new ArrayList<>(); + list.add(m); + list.add(m); + StringWriter writer = new StringWriter(); + JsonMetadataList.toJson(list, writer); + List<Metadata> 
deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString())); + assertEquals(list, deserialized); } } diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java index aa9b8ccad..80d32bdc4 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java @@ -29,12 +29,14 @@ import org.junit.jupiter.api.Test; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; public class JsonMetadataTest { @Test public void testBasicSerializationAndDeserialization() throws Exception { Metadata metadata = new Metadata(); + metadata.add(TikaCoreProperties.TIKA_CONTENT, "this is the content"); metadata.add("k1", "v1"); metadata.add("k1", "v2"); //test duplicate value @@ -54,7 +56,7 @@ public class JsonMetadataTest { StringWriter writer = new StringWriter(); JsonMetadata.toJson(metadata, writer); Metadata deserialized = JsonMetadata.fromJson(new StringReader(writer.toString())); - assertEquals(7, deserialized.names().length); + assertEquals(8, deserialized.names().length); assertEquals(metadata, deserialized); //test that this really is 6 Chinese characters @@ -66,11 +68,13 @@ public class JsonMetadataTest { writer = new StringWriter(); JsonMetadata.setPrettyPrinting(true); JsonMetadata.toJson(metadata, writer); - assertTrue(writer + String expected = "{[NEWLINE] \"alma_mater\" : \"普林斯顿大学\",[NEWLINE] \"html\" : \"<html><body>& </body></html>\"," + + "[NEWLINE] \"json_escapes\" : \"the: \\\"quick\\\" brown, fox\"," + + "[NEWLINE] \"k1\" : [ \"v1\", \"v2\" ],[NEWLINE] \"k3\" : [ \"v3\", \"v3\" ],[NEWLINE] \"k4\" : \"500,000\"," + + "[NEWLINE] \"url\" : \"/myApp/myAction.html?method=router&cmd=1\",[NEWLINE] \"X-TIKA:content\" : \"this is the content\"[NEWLINE]}"; + assertEquals(expected, writer .toString() - .replaceAll("\r\n", "\n") - .contains("\"json_escapes\" : \"the: \\\"quick\\\" brown, fox\",\n" + " \"k1\" : [ \"v1\", \"v2\" ],\n" + " \"k3\" : [ \"v3\", \"v3\" ],\n" + - " \"k4\" : \"500,000\",\n" + " \"url\" : \"/myApp/myAction.html?method=router&cmd=1\"\n" + "}")); + .replaceAll("[\r\n]+", "[NEWLINE]")); } @Test diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index f6bf0a95f..89913d4b6 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -45,10 +45,8 @@ public class TestParseContextSerialization { String json; try (Writer writer = new StringWriter()) { try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) { - jsonGenerator.writeStartObject(); ParseContextSerializer serializer = new ParseContextSerializer(); serializer.serialize(pc, jsonGenerator, null); - jsonGenerator.writeEndObject(); } json = writer.toString(); }
