This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new baee4e384 TIKA-4503 -- refactor serialization (#2351)
baee4e384 is described below
commit baee4e3845663915c979747f03a27491bcde714f
Author: Tim Allison <[email protected]>
AuthorDate: Fri Oct 3 12:42:17 2025 -0400
TIKA-4503 -- refactor serialization (#2351)
* TIKA-4503 -- refactor serialization
---
...tTuple.java => FetchEmitTupleDeserializer.java} | 119 ++++-----------
.../serialization/FetchEmitTupleSerializer.java | 65 +++++++++
.../pipes/core/serialization/JsonEmitData.java | 45 ++----
.../core/serialization/JsonFetchEmitTuple.java | 162 ++-------------------
.../core/serialization/JsonFetchEmitTupleList.java | 52 +++----
.../apache/tika/serialization/JsonMetadata.java | 140 ++++++------------
.../tika/serialization/JsonMetadataList.java | 102 +++++--------
.../serialization/JsonStreamingSerializer.java | 64 --------
.../tika/serialization/MetadataDeserializer.java | 71 +++++++++
.../tika/serialization/MetadataSerializer.java | 76 ++++++++++
.../tika/serialization/ParseContextSerializer.java | 1 -
.../tika/serialization/JsonMetadataListTest.java | 77 ++++------
.../tika/serialization/JsonMetadataTest.java | 14 +-
.../TestParseContextSerialization.java | 2 -
14 files changed, 409 insertions(+), 581 deletions(-)
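The refactor replaces the hand-rolled JsonGenerator/JsonParser code with Jackson JsonSerializer/JsonDeserializer implementations registered on an ObjectMapper through a SimpleModule. A minimal sketch of that registration pattern, mirroring the static initializers added in the diff below (the class name SerializationSketch and the sample field values are illustrative only):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.module.SimpleModule;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.serialization.MetadataDeserializer;
    import org.apache.tika.serialization.MetadataSerializer;

    public class SerializationSketch {
        public static void main(String[] args) throws Exception {
            // One mapper, one module: the custom (de)serializers added in this commit
            // are registered once and reused, instead of driving JsonGenerator and
            // JsonParser by hand at every call site.
            ObjectMapper mapper = new ObjectMapper();
            SimpleModule module = new SimpleModule();
            module.addSerializer(Metadata.class, new MetadataSerializer());
            module.addDeserializer(Metadata.class, new MetadataDeserializer());
            mapper.registerModule(module);

            Metadata metadata = new Metadata();
            metadata.add("dc:creator", "author-1");
            metadata.add("dc:creator", "author-2");
            // Prints {"dc:creator":["author-1","author-2"]}
            System.out.println(mapper.writeValueAsString(metadata));
        }
    }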
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
similarity index 51%
copy from tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
copy to tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
index b92685521..8fe867289 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
@@ -16,60 +16,53 @@
*/
package org.apache.tika.pipes.core.serialization;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.EMIT_KEY;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.EMITTER;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCHER;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCH_KEY;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCH_RANGE_END;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.FETCH_RANGE_START;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.ID;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.METADATA_KEY;
+import static org.apache.tika.pipes.core.serialization.FetchEmitTupleSerializer.ON_PARSE_EXCEPTION;
+import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT;
+
import java.io.IOException;
-import java.io.Reader;
-import java.io.StringWriter;
-import java.io.Writer;
-import java.util.Locale;
import java.util.Map;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JacksonException;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.core.FetchEmitTuple;
import org.apache.tika.pipes.core.emitter.EmitKey;
import org.apache.tika.pipes.core.fetcher.FetchKey;
-import org.apache.tika.serialization.JsonMetadata;
import org.apache.tika.serialization.ParseContextDeserializer;
-import org.apache.tika.serialization.ParseContextSerializer;
-import org.apache.tika.utils.StringUtils;
-
-public class JsonFetchEmitTuple {
- public static final String ID = "id";
- public static final String FETCHER = "fetcher";
- public static final String FETCHKEY = "fetchKey";
- public static final String FETCH_RANGE_START = "fetchRangeStart";
- public static final String FETCH_RANGE_END = "fetchRangeEnd";
- public static final String EMITTER = "emitter";
- public static final String EMITKEY = "emitKey";
- public static final String METADATAKEY = "metadata";
- public static final String ON_PARSE_EXCEPTION = "onParseException";
+public class FetchEmitTupleDeserializer extends JsonDeserializer<FetchEmitTuple> {
- public static FetchEmitTuple fromJson(Reader reader) throws IOException {
- JsonNode root = new ObjectMapper().readTree(reader);
- return parseFetchEmitTuple(root);
- }
+ @Override
+ public FetchEmitTuple deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException {
+ JsonNode root = jsonParser.readValueAsTree();
-
- static FetchEmitTuple parseFetchEmitTuple(JsonNode root) throws IOException {
String id = readVal(ID, root, null, true);
String fetcherName = readVal(FETCHER, root, null, true);
- String fetchKey = readVal(FETCHKEY, root, null, true);
+ String fetchKey = readVal(FETCH_KEY, root, null, true);
String emitterName = readVal(EMITTER, root, "", false);
- String emitKey = readVal(EMITKEY, root, "", false);
+ String emitKey = readVal(EMIT_KEY, root, "", false);
long fetchRangeStart = readLong(FETCH_RANGE_START, root, -1l, false);
long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false);
Metadata metadata = readMetadata(root);
- JsonNode parseContextNode = root.get(ParseContextSerializer.PARSE_CONTEXT);
+ JsonNode parseContextNode = root.get(PARSE_CONTEXT);
ParseContext parseContext = parseContextNode == null ? new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode);
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = readOnParseException(root);
- return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd), new EmitKey(emitterName, emitKey), metadata, parseContext,
+ return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd),
+ new EmitKey(emitterName, emitKey), metadata, parseContext,
onParseException);
}
@@ -89,7 +82,7 @@ public class JsonFetchEmitTuple {
}
private static Metadata readMetadata(JsonNode root) {
- JsonNode metadataNode = root.get(METADATAKEY);
+ JsonNode metadataNode = root.get(METADATA_KEY);
if (metadataNode == null) {
return new Metadata();
}
@@ -130,66 +123,4 @@ public class JsonFetchEmitTuple {
return val.longValue();
}
- public static String toJson(FetchEmitTuple t) throws IOException {
- StringWriter writer = new StringWriter();
- toJson(t, writer);
- return writer.toString();
- }
-
- public static void toJson(FetchEmitTuple t, Writer writer) throws IOException {
-
- try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
- writeTuple(t, jsonGenerator);
- }
- }
-
- static void writeTuple(FetchEmitTuple t, JsonGenerator jsonGenerator) throws IOException {
- jsonGenerator.writeStartObject();
- jsonGenerator.writeStringField(ID, t.getId());
- jsonGenerator.writeStringField(FETCHER, t
- .getFetchKey()
- .getFetcherName());
- jsonGenerator.writeStringField(FETCHKEY, t
- .getFetchKey()
- .getFetchKey());
- if (t
- .getFetchKey()
- .hasRange()) {
- jsonGenerator.writeNumberField(FETCH_RANGE_START, t
- .getFetchKey()
- .getRangeStart());
- jsonGenerator.writeNumberField(FETCH_RANGE_END, t
- .getFetchKey()
- .getRangeEnd());
- }
- jsonGenerator.writeStringField(EMITTER, t
- .getEmitKey()
- .getEmitterName());
- if (!StringUtils.isBlank(t
- .getEmitKey()
- .getEmitKey())) {
- jsonGenerator.writeStringField(EMITKEY, t
- .getEmitKey()
- .getEmitKey());
- }
- if (t
- .getMetadata()
- .size() > 0) {
- jsonGenerator.writeFieldName(METADATAKEY);
- JsonMetadata.writeMetadataObject(t.getMetadata(), jsonGenerator, false);
- }
-
- jsonGenerator.writeStringField(ON_PARSE_EXCEPTION, t
- .getOnParseException()
- .name()
- .toLowerCase(Locale.US));
- if (!t
- .getParseContext()
- .isEmpty()) {
- ParseContextSerializer s = new ParseContextSerializer();
- s.serialize(t.getParseContext(), jsonGenerator, null);
- }
- jsonGenerator.writeEndObject();
-
- }
}
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleSerializer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleSerializer.java
new file mode 100644
index 000000000..b994d179d
--- /dev/null
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleSerializer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.core.serialization;
+
+import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT;
+
+import java.io.IOException;
+import java.util.Locale;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+
+import org.apache.tika.pipes.core.FetchEmitTuple;
+import org.apache.tika.utils.StringUtils;
+
+public class FetchEmitTupleSerializer extends JsonSerializer<FetchEmitTuple> {
+ public static final String ID = "id";
+ public static final String FETCHER = "fetcher";
+ public static final String FETCH_KEY = "fetchKey";
+ public static final String FETCH_RANGE_START = "fetchRangeStart";
+ public static final String FETCH_RANGE_END = "fetchRangeEnd";
+ public static final String EMITTER = "emitter";
+ public static final String EMIT_KEY = "emitKey";
+ public static final String METADATA_KEY = "metadata";
+ public static final String ON_PARSE_EXCEPTION = "onParseException";
+
+ public void serialize(FetchEmitTuple t, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException {
+
+ jsonGenerator.writeStartObject();
+ jsonGenerator.writeStringField(ID, t.getId());
+ jsonGenerator.writeStringField(FETCHER, t.getFetchKey().getFetcherName());
+ jsonGenerator.writeStringField(FETCH_KEY, t.getFetchKey().getFetchKey());
+ if (t.getFetchKey().hasRange()) {
+ jsonGenerator.writeNumberField(FETCH_RANGE_START, t.getFetchKey().getRangeStart());
+ jsonGenerator.writeNumberField(FETCH_RANGE_END, t.getFetchKey().getRangeEnd());
+ }
+ jsonGenerator.writeStringField(EMITTER, t.getEmitKey().getEmitterName());
+ if (!StringUtils.isBlank(t.getEmitKey().getEmitKey())) {
+ jsonGenerator.writeStringField(EMIT_KEY, t.getEmitKey().getEmitKey());
+ }
+ if (t.getMetadata().size() > 0) {
+ jsonGenerator.writeObjectField(METADATA_KEY, t.getMetadata());
+ }
+ jsonGenerator.writeStringField(ON_PARSE_EXCEPTION, t.getOnParseException().name().toLowerCase(Locale.US));
+ if (!t.getParseContext().isEmpty()) {
+ jsonGenerator.writeObjectField(PARSE_CONTEXT, t.getParseContext());
+ }
+ jsonGenerator.writeEndObject();
+ }
+}
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java
index 8eb03c51d..2ec5f9343 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonEmitData.java
@@ -19,41 +19,28 @@ package org.apache.tika.pipes.core.serialization;
import java.io.IOException;
import java.io.Writer;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.core.StreamReadConstraints;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.module.SimpleModule;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.pipes.core.FetchEmitTuple;
import org.apache.tika.pipes.core.emitter.EmitData;
-import org.apache.tika.pipes.core.emitter.EmitKey;
-import org.apache.tika.serialization.JsonMetadata;
+import org.apache.tika.serialization.MetadataSerializer;
+import org.apache.tika.serialization.ParseContextSerializer;
public class JsonEmitData {
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ static {
+ SimpleModule module = new SimpleModule();
+ module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
+ module.addSerializer(ParseContext.class, new ParseContextSerializer());
+ module.addSerializer(Metadata.class, new MetadataSerializer());
+ OBJECT_MAPPER.registerModule(module);
+ }
public static void toJson(EmitData emitData, Writer writer) throws IOException {
- try (JsonGenerator jsonGenerator = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createGenerator(writer)) {
- jsonGenerator.writeStartObject();
- EmitKey key = emitData.getEmitKey();
- jsonGenerator.writeStringField(JsonFetchEmitTuple.EMITTER, key.getEmitterName());
- jsonGenerator.writeStringField(JsonFetchEmitTuple.EMITKEY, key.getEmitKey());
- if (!emitData
- .getParseContext()
- .isEmpty()) {
- jsonGenerator.writeObject(emitData.getParseContext());
- }
- jsonGenerator.writeFieldName("data");
- jsonGenerator.writeStartArray();
- for (Metadata m : emitData.getMetadataList()) {
- JsonMetadata.writeMetadataObject(m, jsonGenerator, false);
- }
- jsonGenerator.writeEndArray();
- jsonGenerator.writeEndObject();
- }
+ OBJECT_MAPPER.writeValue(writer, emitData);
}
}
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
index b92685521..6841379a0 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTuple.java
@@ -20,114 +20,31 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
-import java.util.Locale;
-import java.util.Map;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.core.FetchEmitTuple;
-import org.apache.tika.pipes.core.emitter.EmitKey;
-import org.apache.tika.pipes.core.fetcher.FetchKey;
-import org.apache.tika.serialization.JsonMetadata;
-import org.apache.tika.serialization.ParseContextDeserializer;
+import org.apache.tika.serialization.MetadataSerializer;
import org.apache.tika.serialization.ParseContextSerializer;
-import org.apache.tika.utils.StringUtils;
public class JsonFetchEmitTuple {
- public static final String ID = "id";
- public static final String FETCHER = "fetcher";
- public static final String FETCHKEY = "fetchKey";
- public static final String FETCH_RANGE_START = "fetchRangeStart";
- public static final String FETCH_RANGE_END = "fetchRangeEnd";
- public static final String EMITTER = "emitter";
- public static final String EMITKEY = "emitKey";
- public static final String METADATAKEY = "metadata";
- public static final String ON_PARSE_EXCEPTION = "onParseException";
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
- public static FetchEmitTuple fromJson(Reader reader) throws IOException {
- JsonNode root = new ObjectMapper().readTree(reader);
- return parseFetchEmitTuple(root);
- }
-
-
- static FetchEmitTuple parseFetchEmitTuple(JsonNode root) throws IOException {
- String id = readVal(ID, root, null, true);
- String fetcherName = readVal(FETCHER, root, null, true);
- String fetchKey = readVal(FETCHKEY, root, null, true);
- String emitterName = readVal(EMITTER, root, "", false);
- String emitKey = readVal(EMITKEY, root, "", false);
- long fetchRangeStart = readLong(FETCH_RANGE_START, root, -1l, false);
- long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false);
- Metadata metadata = readMetadata(root);
- JsonNode parseContextNode = root.get(ParseContextSerializer.PARSE_CONTEXT);
- ParseContext parseContext = parseContextNode == null ? new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode);
- FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = readOnParseException(root);
-
- return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd), new EmitKey(emitterName, emitKey), metadata, parseContext,
- onParseException);
- }
-
- private static FetchEmitTuple.ON_PARSE_EXCEPTION readOnParseException(JsonNode root) throws IOException {
- JsonNode onParseExNode = root.get(ON_PARSE_EXCEPTION);
- if (onParseExNode == null) {
- return FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
- }
- String txt = onParseExNode.asText();
- if ("skip".equalsIgnoreCase(txt)) {
- return FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP;
- } else if ("emit".equalsIgnoreCase(txt)) {
- return FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT;
- } else {
- throw new IOException(ON_PARSE_EXCEPTION + " must be either 'skip' or 'emit'");
- }
- }
-
- private static Metadata readMetadata(JsonNode root) {
- JsonNode metadataNode = root.get(METADATAKEY);
- if (metadataNode == null) {
- return new Metadata();
- }
- Metadata metadata = new Metadata();
- for (Map.Entry<String, JsonNode> e : metadataNode.properties()) {
- JsonNode vals = e.getValue();
- String k = e.getKey();
- if (vals.isArray()) {
- for (JsonNode arrVal : vals) {
- metadata.add(k, arrVal.textValue());
- }
- } else {
- metadata.set(k, vals.asText());
- }
- }
- return metadata;
+ static {
+ SimpleModule module = new SimpleModule();
+ module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer());
+ module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
+ module.addSerializer(Metadata.class, new MetadataSerializer());
+ module.addSerializer(ParseContext.class, new ParseContextSerializer());
+ OBJECT_MAPPER.registerModule(module);
}
- private static String readVal(String key, JsonNode jsonObj, String defaultRet, boolean isRequired) throws IOException {
- JsonNode valNode = jsonObj.get(key);
- if (valNode == null) {
- if (isRequired) {
- throw new IOException("required value string, but see: " +
key);
- }
- return defaultRet;
- }
- return valNode.asText();
- }
-
- private static long readLong(String key, JsonNode jsonObj, long defaultVal, boolean isRequired) throws IOException {
- JsonNode val = jsonObj.get(key);
- if (val == null) {
- if (isRequired) {
- throw new IOException("required value long, but see: " + key);
- }
- return defaultVal;
- }
- return val.longValue();
+ public static FetchEmitTuple fromJson(Reader reader) throws IOException {
+ return OBJECT_MAPPER.readValue(reader, FetchEmitTuple.class);
}
public static String toJson(FetchEmitTuple t) throws IOException {
@@ -137,59 +54,6 @@ public class JsonFetchEmitTuple {
}
public static void toJson(FetchEmitTuple t, Writer writer) throws IOException {
-
- try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
- writeTuple(t, jsonGenerator);
- }
- }
-
- static void writeTuple(FetchEmitTuple t, JsonGenerator jsonGenerator) throws IOException {
- jsonGenerator.writeStartObject();
- jsonGenerator.writeStringField(ID, t.getId());
- jsonGenerator.writeStringField(FETCHER, t
- .getFetchKey()
- .getFetcherName());
- jsonGenerator.writeStringField(FETCHKEY, t
- .getFetchKey()
- .getFetchKey());
- if (t
- .getFetchKey()
- .hasRange()) {
- jsonGenerator.writeNumberField(FETCH_RANGE_START, t
- .getFetchKey()
- .getRangeStart());
- jsonGenerator.writeNumberField(FETCH_RANGE_END, t
- .getFetchKey()
- .getRangeEnd());
- }
- jsonGenerator.writeStringField(EMITTER, t
- .getEmitKey()
- .getEmitterName());
- if (!StringUtils.isBlank(t
- .getEmitKey()
- .getEmitKey())) {
- jsonGenerator.writeStringField(EMITKEY, t
- .getEmitKey()
- .getEmitKey());
- }
- if (t
- .getMetadata()
- .size() > 0) {
- jsonGenerator.writeFieldName(METADATAKEY);
- JsonMetadata.writeMetadataObject(t.getMetadata(), jsonGenerator, false);
- }
-
- jsonGenerator.writeStringField(ON_PARSE_EXCEPTION, t
- .getOnParseException()
- .name()
- .toLowerCase(Locale.US));
- if (!t
- .getParseContext()
- .isEmpty()) {
- ParseContextSerializer s = new ParseContextSerializer();
- s.serialize(t.getParseContext(), jsonGenerator, null);
- }
- jsonGenerator.writeEndObject();
-
+ OBJECT_MAPPER.writeValue(writer, t);
}
}
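With FetchEmitTupleSerializer and FetchEmitTupleDeserializer registered on the static mapper, the JsonFetchEmitTuple facade keeps its old signatures. A small round-trip sketch using only the field names defined in FetchEmitTupleSerializer (the sample id, fetcher and emitter values are illustrative):

    import java.io.StringReader;

    import org.apache.tika.pipes.core.FetchEmitTuple;
    import org.apache.tika.pipes.core.serialization.JsonFetchEmitTuple;

    public class TupleRoundTripSketch {
        public static void main(String[] args) throws Exception {
            // "id", "fetcher" and "fetchKey" are required by the deserializer;
            // "emitter"/"emitKey" default to "" and onParseException defaults to EMIT.
            String json = "{\"id\":\"t-1\",\"fetcher\":\"file-system-fetcher\","
                    + "\"fetchKey\":\"input/doc.pdf\",\"emitter\":\"file-system-emitter\","
                    + "\"emitKey\":\"output/doc.json\"}";
            FetchEmitTuple tuple = JsonFetchEmitTuple.fromJson(new StringReader(json));
            System.out.println(JsonFetchEmitTuple.toJson(tuple));
        }
    }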
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
index 26a1cc501..8f53c8a87 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/JsonFetchEmitTupleList.java
@@ -20,35 +20,33 @@ import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.core.StreamReadConstraints;
-import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.module.SimpleModule;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.core.FetchEmitTuple;
+import org.apache.tika.serialization.MetadataSerializer;
+import org.apache.tika.serialization.ParseContextSerializer;
public class JsonFetchEmitTupleList {
- public static List<FetchEmitTuple> fromJson(Reader reader) throws IOException {
- JsonNode root = new ObjectMapper().readTree(reader);
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
- if (!root.isArray()) {
- throw new IOException("FetchEmitTupleList must be an array");
- }
- List<FetchEmitTuple> list = new ArrayList<>();
- Iterator<JsonNode> it = root.iterator();
- while (it.hasNext()) {
- JsonNode n = it.next();
- FetchEmitTuple t = JsonFetchEmitTuple.parseFetchEmitTuple(n);
- list.add(t);
- }
- return list;
+ static {
+ SimpleModule module = new SimpleModule();
+ module.addDeserializer(FetchEmitTuple.class, new FetchEmitTupleDeserializer());
+ module.addSerializer(FetchEmitTuple.class, new FetchEmitTupleSerializer());
+ module.addSerializer(Metadata.class, new MetadataSerializer());
+ module.addSerializer(ParseContext.class, new ParseContextSerializer());
+ OBJECT_MAPPER.registerModule(module);
+ }
+
+ public static List<FetchEmitTuple> fromJson(Reader reader) throws IOException {
+ return OBJECT_MAPPER.readValue(reader, new TypeReference<List<FetchEmitTuple>>() {});
}
public static String toJson(List<FetchEmitTuple> list) throws IOException {
@@ -58,18 +56,6 @@ public class JsonFetchEmitTupleList {
}
public static void toJson(List<FetchEmitTuple> list, Writer writer) throws IOException {
-
- try (JsonGenerator jsonGenerator = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createGenerator(writer)) {
- jsonGenerator.writeStartArray();
- for (FetchEmitTuple t : list) {
- JsonFetchEmitTuple.writeTuple(t, jsonGenerator);
- }
- jsonGenerator.writeEndArray();
- }
+ OBJECT_MAPPER.writeValue(writer, list);
}
}
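Because List&lt;FetchEmitTuple&gt; loses its element type to erasure, the new fromJson binds through Jackson's TypeReference. A short sketch of the same list round trip through the facade (the sample JSON values are illustrative):

    import java.io.StringReader;
    import java.util.List;

    import org.apache.tika.pipes.core.FetchEmitTuple;
    import org.apache.tika.pipes.core.serialization.JsonFetchEmitTupleList;

    public class TupleListSketch {
        public static void main(String[] args) throws Exception {
            // The registered deserializer is applied to every element of the JSON array.
            String json = "[{\"id\":\"t-1\",\"fetcher\":\"fs\",\"fetchKey\":\"a.pdf\"},"
                    + "{\"id\":\"t-2\",\"fetcher\":\"fs\",\"fetchKey\":\"b.pdf\"}]";
            List<FetchEmitTuple> tuples = JsonFetchEmitTupleList.fromJson(new StringReader(json));
            System.out.println(JsonFetchEmitTupleList.toJson(tuples));
        }
    }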
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
index 72a9c3f9a..b47acde2e 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java
@@ -20,15 +20,11 @@ package org.apache.tika.serialization;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
-import java.util.Arrays;
import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.StreamReadConstraints;
-import org.apache.commons.io.input.CloseShieldReader;
-import org.apache.commons.io.output.CloseShieldWriter;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
@@ -37,6 +33,17 @@ public class JsonMetadata {
static volatile boolean PRETTY_PRINT = false;
+ private static ObjectMapper OBJECT_MAPPER;
+ private static final ObjectMapper PRETTY_SERIALIZER;
+
+ static {
+ OBJECT_MAPPER = buildObjectMapper(StreamReadConstraints.DEFAULT_MAX_STRING_LEN);
+ PRETTY_SERIALIZER = new ObjectMapper();
+ SimpleModule prettySerializerModule = new SimpleModule();
+ prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true));
+ PRETTY_SERIALIZER.registerModule(prettySerializerModule);
+ }
+
/**
* Serializes a Metadata object to Json. This does not flush or close the writer.
*
@@ -45,115 +52,56 @@ public class JsonMetadata {
* @throws java.io.IOException if there is an IOException during writing
*/
public static void toJson(Metadata metadata, Writer writer) throws IOException {
- if (metadata == null) {
- writer.write("null");
- return;
- }
- long max = TikaConfig.getMaxJsonStringFieldLength();
- try (JsonGenerator jsonGenerator = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createGenerator(CloseShieldWriter.wrap(writer))) {
- if (PRETTY_PRINT) {
- jsonGenerator.useDefaultPrettyPrinter();
- }
- writeMetadataObject(metadata, jsonGenerator, PRETTY_PRINT);
+ if (PRETTY_PRINT) {
+ PRETTY_SERIALIZER
+ .writerWithDefaultPrettyPrinter()
+ .writeValue(writer, metadata);
+ } else {
+ OBJECT_MAPPER.writeValue(writer, metadata);
}
}
- public static void writeMetadataObject(Metadata metadata, JsonGenerator jsonGenerator, boolean prettyPrint) throws IOException {
- jsonGenerator.writeStartObject();
- String[] names = metadata.names();
- if (prettyPrint) {
- Arrays.sort(names, new PrettyMetadataKeyComparator());
- }
- for (String n : names) {
- String[] vals = metadata.getValues(n);
- if (vals.length == 0) {
- continue;
- } else if (vals.length == 1) {
- jsonGenerator.writeStringField(n, vals[0]);
- } else if (vals.length > 1) {
- jsonGenerator.writeArrayFieldStart(n);
- for (String val : vals) {
- jsonGenerator.writeString(val);
- }
- jsonGenerator.writeEndArray();
- }
- }
- jsonGenerator.writeEndObject();
- }
-
/**
* Read metadata from reader.
* <p>
* This does not close the reader.
+ * <p>
+ * This will reset the OBJECT_MAPPER if the max string length differs from that in TikaConfig.
*
* @param reader reader to read from
* @return Metadata or null if nothing could be read from the reader
* @throws IOException in case of parse failure or IO failure with Reader
*/
public static Metadata fromJson(Reader reader) throws IOException {
- Metadata m = null;
- try (JsonParser jParser = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createParser(CloseShieldReader.wrap(reader))) {
- m = readMetadataObject(jParser);
+ if (reader == null) {
+ return null;
}
- return m;
- }
-
- /**
- * expects that jParser has not yet started on object or
- * for jParser to be pointing to the start object.
- *
- * @param jParser
- * @return
- * @throws IOException
- */
- public static Metadata readMetadataObject(JsonParser jParser) throws IOException {
- Metadata metadata = new Metadata();
- JsonToken token = jParser.currentToken();
- if (token == null) {
- token = jParser.nextToken();
- if (token != JsonToken.START_OBJECT) {
- throw new IOException("expected start object, but got: " +
token.name());
- }
- token = jParser.nextToken();
- } else if (token == JsonToken.START_OBJECT) {
- token = jParser.nextToken();
- }
-
- while (token != JsonToken.END_OBJECT) {
- token = jParser.currentToken();
- if (token != JsonToken.FIELD_NAME) {
- throw new IOException("expected field name, but got: " +
token.name());
- }
- String key = jParser.currentName();
- token = jParser.nextToken();
- if (token == JsonToken.START_ARRAY) {
- while (jParser.nextToken() != JsonToken.END_ARRAY) {
- metadata.add(key, jParser.getText());
- }
- } else {
- if (token != JsonToken.VALUE_STRING) {
- throw new IOException("expected string value, but found: "
+ token.name());
- }
- String value = jParser.getValueAsString();
- metadata.set(key, value);
- }
- token = jParser.nextToken();
+ if (OBJECT_MAPPER
+ .getFactory()
+ .streamReadConstraints()
+ .getMaxStringLength() != TikaConfig.getMaxJsonStringFieldLength()) {
+ OBJECT_MAPPER = buildObjectMapper(TikaConfig.getMaxJsonStringFieldLength());
}
- return metadata;
+ return OBJECT_MAPPER.readValue(reader, Metadata.class);
}
public static void setPrettyPrinting(boolean prettyPrint) {
PRETTY_PRINT = prettyPrint;
}
+ static ObjectMapper buildObjectMapper(int maxStringLen) {
+ JsonFactory factory = new JsonFactory();
+ factory.setStreamReadConstraints(StreamReadConstraints
+ .builder()
+ .maxNestingDepth(10)
+ .maxStringLength(maxStringLen)
+ .maxNumberLength(500)
+ .build());
+ ObjectMapper objectMapper = new ObjectMapper(factory);
+ SimpleModule baseModule = new SimpleModule();
+ baseModule.addDeserializer(Metadata.class, new MetadataDeserializer());
+ baseModule.addSerializer(Metadata.class, new MetadataSerializer());
+ objectMapper.registerModule(baseModule);
+ return objectMapper;
+ }
}
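The per-call JsonFactory setup is now centralized in buildObjectMapper, and fromJson rebuilds the mapper when TikaConfig's max JSON string field length changes. A sketch of an equivalent constrained mapper, using the same limits buildObjectMapper hard-codes above (the class and variable names are illustrative):

    import com.fasterxml.jackson.core.JsonFactory;
    import com.fasterxml.jackson.core.StreamReadConstraints;
    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.module.SimpleModule;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.serialization.MetadataDeserializer;
    import org.apache.tika.serialization.MetadataSerializer;

    public class ConstrainedMapperSketch {

        static ObjectMapper build(int maxStringLen) {
            // Same shape as JsonMetadata.buildObjectMapper: read constraints go on
            // the factory, the Metadata (de)serializers go on a module.
            JsonFactory factory = new JsonFactory();
            factory.setStreamReadConstraints(StreamReadConstraints.builder()
                    .maxNestingDepth(10)
                    .maxStringLength(maxStringLen)
                    .maxNumberLength(500)
                    .build());
            ObjectMapper mapper = new ObjectMapper(factory);
            SimpleModule module = new SimpleModule();
            module.addDeserializer(Metadata.class, new MetadataDeserializer());
            module.addSerializer(Metadata.class, new MetadataSerializer());
            mapper.registerModule(module);
            return mapper;
        }

        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = build(100);
            // A value longer than maxStringLength is rejected at parse time
            // instead of being read fully into memory.
            String json = "{\"k\":\"" + "v".repeat(200) + "\"}";
            try {
                mapper.readValue(json, Metadata.class);
            } catch (Exception e) {
                System.out.println("rejected: " + e.getMessage());
            }
        }
    }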
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
index a7a3803e5..4b84e9f3a 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java
@@ -16,27 +16,51 @@
*/
package org.apache.tika.serialization;
+import static org.apache.tika.serialization.JsonMetadata.buildObjectMapper;
+
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
-import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.StreamReadConstraints;
-import org.apache.commons.io.input.CloseShieldReader;
-import org.apache.commons.io.output.CloseShieldWriter;
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.module.SimpleModule;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
public class JsonMetadataList {
+
static volatile boolean PRETTY_PRINT = false;
+ private static ObjectMapper OBJECT_MAPPER;
+ private static final ObjectMapper PRETTY_SERIALIZER;
+
+ static {
+ JsonFactory factory = new JsonFactory();
+ factory.setStreamReadConstraints(StreamReadConstraints
+ .builder()
+ .maxNestingDepth(10)
+ .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
+ .maxNumberLength(500)
+// .maxDocumentLength(1000000)
+ .build());
+ OBJECT_MAPPER = new ObjectMapper(factory);
+ SimpleModule baseModule = new SimpleModule();
+ baseModule.addDeserializer(Metadata.class, new MetadataDeserializer());
+ baseModule.addSerializer(Metadata.class, new MetadataSerializer());
+ OBJECT_MAPPER.registerModule(baseModule);
+
+ PRETTY_SERIALIZER = new ObjectMapper(factory);
+ SimpleModule prettySerializerModule = new SimpleModule();
+ prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true));
+ PRETTY_SERIALIZER.registerModule(prettySerializerModule);
+
+ }
+
/**
* Serializes a Metadata object to Json. This does not flush or close the writer.
*
@@ -46,24 +70,10 @@ public class JsonMetadataList {
* @throws org.apache.tika.exception.TikaException if there is an IOException during writing
*/
public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint) throws IOException {
- if (metadataList == null) {
- writer.write("null");
- return;
- }
- try (JsonGenerator jsonGenerator = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createGenerator(CloseShieldWriter.wrap(writer))) {
- if (prettyPrint) {
- jsonGenerator.useDefaultPrettyPrinter();
- }
- jsonGenerator.writeStartArray();
- for (Metadata m : metadataList) {
- JsonMetadata.writeMetadataObject(m, jsonGenerator, prettyPrint);
- }
- jsonGenerator.writeEndArray();
+ if (prettyPrint) {
+ PRETTY_SERIALIZER.writerWithDefaultPrettyPrinter().writeValue(writer, metadataList);
+ } else {
+ OBJECT_MAPPER.writeValue(writer, metadataList);
}
}
@@ -86,51 +96,19 @@ public class JsonMetadataList {
* @throws IOException in case of parse failure or IO failure with Reader
*/
public static List<Metadata> fromJson(Reader reader) throws IOException {
- List<Metadata> ms = null;
if (reader == null) {
- return ms;
- }
- ms = new ArrayList<>();
- try (JsonParser jParser = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createParser(CloseShieldReader.wrap(reader))) {
-
- JsonToken token = jParser.nextToken();
- if (token != JsonToken.START_ARRAY) {
- throw new IOException("metadata list must start with an array,
but I see: " + token.name());
- }
- token = jParser.nextToken();
- while (token != JsonToken.END_ARRAY) {
- Metadata m = JsonMetadata.readMetadataObject(jParser);
- ms.add(m);
- token = jParser.nextToken();
- }
-
- }
- if (ms == null) {
return null;
}
- //if the last object is the main document,
- //as happens with the streaming serializer,
- //flip it to be the first element.
- if (ms.size() > 1) {
- Metadata last = ms.get(ms.size() - 1);
- String embResourcePath = last.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
- if (embResourcePath == null && ms
- .get(0)
- .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH) != null) {
- ms.add(0, ms.remove(ms.size() - 1));
- }
+ if (OBJECT_MAPPER.getFactory().streamReadConstraints().getMaxStringLength()
+ != TikaConfig.getMaxJsonStringFieldLength()) {
+ OBJECT_MAPPER = buildObjectMapper(TikaConfig.getMaxJsonStringFieldLength());
}
- return ms;
+
+ return OBJECT_MAPPER.readValue(reader, new TypeReference<List<Metadata>>(){});
}
public static void setPrettyPrinting(boolean prettyPrint) {
PRETTY_PRINT = prettyPrint;
}
-
}
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java
deleted file mode 100644
index 259695ada..000000000
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonStreamingSerializer.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.serialization;
-
-
-import java.io.IOException;
-import java.io.Writer;
-import java.util.Arrays;
-
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.fasterxml.jackson.core.StreamReadConstraints;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-
-
-public class JsonStreamingSerializer implements AutoCloseable {
-
- private final Writer writer;
- boolean hasStartedArray = false;
- private JsonGenerator jsonGenerator;
-
- public JsonStreamingSerializer(Writer writer) {
- this.writer = writer;
- }
-
- public void add(Metadata metadata) throws IOException {
- if (!hasStartedArray) {
- jsonGenerator = new JsonFactory()
- .setStreamReadConstraints(StreamReadConstraints
- .builder()
- .maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
- .build())
- .createGenerator(writer);
- jsonGenerator.writeStartArray();
- hasStartedArray = true;
- }
- String[] names = metadata.names();
- Arrays.sort(names);
- JsonMetadata.writeMetadataObject(metadata, jsonGenerator, false);
- }
-
- @Override
- public void close() throws IOException {
- jsonGenerator.writeEndArray();
- jsonGenerator.flush();
- jsonGenerator.close();
- }
-}
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataDeserializer.java
new file mode 100644
index 000000000..4dc7c3a7e
--- /dev/null
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataDeserializer.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.serialization;
+
+import java.io.IOException;
+
+import com.fasterxml.jackson.core.JacksonException;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+
+import org.apache.tika.metadata.Metadata;
+
+public class MetadataDeserializer extends JsonDeserializer<Metadata> {
+
+ @Override
+ public Metadata deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException {
+ Metadata metadata = new Metadata();
+ boolean keepGoing = true;
+ while (keepGoing) {
+ keepGoing = addField(jsonParser, metadata);
+ }
+ return metadata;
+ }
+
+ private boolean addField(JsonParser jsonParser, Metadata metadata) throws IOException {
+ String field = jsonParser.nextFieldName();
+ if (field == null) {
+ return false;
+ }
+ JsonToken token = jsonParser.nextValue();
+
+ if (token == null) {
+ return false;
+ }
+
+ if (token.isScalarValue()) {
+ metadata.set(field, jsonParser.getText());
+ } else if (jsonParser.isExpectedStartArrayToken()) {
+ token = jsonParser.nextToken();
+ while (token != null) {
+ if (token == JsonToken.END_ARRAY) {
+ return true;
+ } else if (token.isScalarValue()) {
+ metadata.add(field, jsonParser.getText());
+ } else {
+ break;
+ }
+ token = jsonParser.nextToken();
+ }
+ } else {
+ return false;
+ }
+ return true;
+ }
+}
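The deserializer accepts the shape the serializer writes: a single JSON object whose values are either a scalar or an array of scalars. A small sketch of the resulting mapping (the field names and values are illustrative):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.module.SimpleModule;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.serialization.MetadataDeserializer;

    public class MetadataReadSketch {
        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper();
            SimpleModule module = new SimpleModule();
            module.addDeserializer(Metadata.class, new MetadataDeserializer());
            mapper.registerModule(module);

            // A scalar value becomes a single-valued field; an array becomes a
            // multi-valued field (Metadata.add per element).
            Metadata metadata = mapper.readValue(
                    "{\"dc:title\":\"report\",\"dc:creator\":[\"a\",\"b\"]}", Metadata.class);
            System.out.println(metadata.get("dc:title"));                // report
            System.out.println(metadata.getValues("dc:creator").length); // 2
        }
    }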
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataSerializer.java
new file mode 100644
index 000000000..68e7d6593
--- /dev/null
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/MetadataSerializer.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.serialization;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class MetadataSerializer extends JsonSerializer<Metadata> {
+ private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName();
+
+ //always sort the content at the end
+ private static final Comparator<String> METADATA_KEY_COMPARATOR = new Comparator<String>() {
+ @Override
+ public int compare(String o1, String o2) {
+ if (o1.equals(TIKA_CONTENT_KEY)) {
+ return 1;
+ }
+ if (o2.equals(TIKA_CONTENT_KEY)) {
+ return -1;
+ }
+ return o1.compareTo(o2);
+ }
+ };
+
+ private boolean prettyPrint = false;
+
+ public MetadataSerializer() {
+
+ }
+
+ public MetadataSerializer(boolean prettyPrint) {
+ this.prettyPrint = prettyPrint;
+ }
+ @Override
+ public void serialize(Metadata metadata, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException {
+ jsonGenerator.writeStartObject();
+ String[] names = metadata.names();
+ if (prettyPrint) {
+ Arrays.sort(names, METADATA_KEY_COMPARATOR);
+ }
+ for (String n : names) {
+ String[] v = metadata.getValues(n);
+ if (v.length == 0) {
+ continue;
+ } else if (v.length == 1) {
+ jsonGenerator.writeStringField(n, v[0]);
+ } else {
+ jsonGenerator.writeFieldName(n);
+ jsonGenerator.writeArray(v, 0, v.length);
+ }
+ }
+ jsonGenerator.writeEndObject();
+ }
+}
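The prettyPrint flag only affects key ordering: keys are sorted and X-TIKA:content is pushed to the end, which is what the PRETTY_SERIALIZER instances in JsonMetadata and JsonMetadataList rely on. A short sketch of that mode (the class and variable names are illustrative):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.module.SimpleModule;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.TikaCoreProperties;
    import org.apache.tika.serialization.MetadataSerializer;

    public class PrettyMetadataSketch {
        public static void main(String[] args) throws Exception {
            ObjectMapper prettyMapper = new ObjectMapper();
            SimpleModule module = new SimpleModule();
            // prettyPrint=true sorts keys and writes X-TIKA:content last
            module.addSerializer(Metadata.class, new MetadataSerializer(true));
            prettyMapper.registerModule(module);

            Metadata metadata = new Metadata();
            metadata.add(TikaCoreProperties.TIKA_CONTENT, "body text");
            metadata.add("zz:last-alphabetically", "v");
            metadata.add("aa:first-alphabetically", "v");

            System.out.println(prettyMapper
                    .writerWithDefaultPrettyPrinter()
                    .writeValueAsString(metadata));
        }
    }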
diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
index 584920e5c..adc0c4691 100644
--- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
+++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java
@@ -30,7 +30,6 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> {
@Override
public void serialize(ParseContext parseContext, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException {
- jsonGenerator.writeFieldName(PARSE_CONTEXT);
jsonGenerator.writeStartObject();
for (String className : parseContext.keySet()) {
try {
diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
index 79ecf8df4..8c1c45a27 100644
--- a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
+++ b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataListTest.java
@@ -18,17 +18,20 @@ package org.apache.tika.serialization;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
-import java.io.Reader;
+import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
+import com.fasterxml.jackson.databind.JsonMappingException;
import org.junit.jupiter.api.Test;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -60,16 +63,6 @@ public class JsonMetadataListTest {
JsonMetadataList.toJson(metadataList, writer);
List<Metadata> deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
assertEquals(metadataList, deserialized);
-
- //now test streaming serializer
- writer = new StringWriter();
- try (JsonStreamingSerializer streamingSerializer = new JsonStreamingSerializer(writer)) {
- streamingSerializer.add(m1);
- streamingSerializer.add(m2);
- }
- deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
- assertEquals(metadataList, deserialized);
-
}
@Test
@@ -87,8 +80,7 @@ public class JsonMetadataListTest {
@Test
public void testListCorrupted() throws Exception {
String json = "[{\"k1\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k2\":\"v1\"}," +
"\"k3\":[\"v1\",\"v2\",\"v3\",\"v4\",\"v4\"],\"k4\":\"v1\"}]";
- List<Metadata> m = JsonMetadataList.fromJson(null);
- assertNull(m);
+ Exception ex = assertThrows(JsonMappingException.class, () -> JsonMetadataList.fromJson(new StringReader(json)));
}
@Test
@@ -119,13 +111,17 @@ public class JsonMetadataListTest {
.toString()
.startsWith("["));
writer = new StringWriter();
+ JsonMetadata.setPrettyPrinting(true);
+
+
JsonMetadataList.setPrettyPrinting(true);
JsonMetadataList.toJson(metadataList, writer);
- assertTrue(writer
+ String expected = "[ {[NEWLINE] \"zk1\" : [ \"v1\", \"v2\", \"v3\",
\"v4\", \"v4\" ],[NEWLINE] \"zk2\" : \"v1\",[NEWLINE]" +
+ " \"X-TIKA:content\" : \"this is the content\"[NEWLINE]}, " +
+ "{[NEWLINE] \"k3\" : [ \"v1\", \"v2\", \"v3\", \"v4\", \"v4\"
],[NEWLINE] \"k4\" : \"v1\"[NEWLINE]} ]";
+ assertEquals(expected, writer
.toString()
- .replaceAll("\r\n", "\n")
- .startsWith("[ {\n" + " \"zk1\" : [ \"v1\", \"v2\", \"v3\",
\"v4\", \"v4\" ],\n" + " \"zk2\" : \"v1\",\n" + " \"X-TIKA:content\" : \"this
is the content\"\n" +
- "},"));
+ .replaceAll("[\r\n]+", "[NEWLINE]"));
//now set it back to false
@@ -138,35 +134,24 @@ public class JsonMetadataListTest {
}
@Test
- public void testSwitchingOrderOfMainDoc() throws Exception {
- Metadata m1 = new Metadata();
- m1.add("k1", "v1");
- m1.add("k1", "v2");
- m1.add("k1", "v3");
- m1.add("k1", "v4");
- m1.add("k1", "v4");
- m1.add("k2", "v1");
- m1.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, "/embedded-1");
-
- Metadata m2 = new Metadata();
- m2.add("k3", "v1");
- m2.add("k3", "v2");
- m2.add("k3", "v3");
- m2.add("k3", "v4");
- m2.add("k3", "v4");
- m2.add("k4", "v1");
-
- List<Metadata> truth = new ArrayList<>();
- truth.add(m2);
- truth.add(m1);
- StringWriter stringWriter = new StringWriter();
- try (JsonStreamingSerializer serializer = new JsonStreamingSerializer(stringWriter)) {
- serializer.add(m1);
- serializer.add(m2);
+ public void testLargeValues() throws Exception {
+ //TIKA-4154
+ TikaConfig tikaConfig = null;
+ try (InputStream is = JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) {
+ tikaConfig = new TikaConfig(is);
}
- Reader reader = new StringReader(stringWriter.toString());
- List<Metadata> deserialized = JsonMetadataList.fromJson(reader);
- assertEquals(truth, deserialized);
-
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 30000000; i++) {
+ sb.append("v");
+ }
+ Metadata m = new Metadata();
+ m.add("large_value", sb.toString());
+ List<Metadata> list = new ArrayList<>();
+ list.add(m);
+ list.add(m);
+ StringWriter writer = new StringWriter();
+ JsonMetadataList.toJson(list, writer);
+ List<Metadata> deserialized = JsonMetadataList.fromJson(new StringReader(writer.toString()));
+ assertEquals(list, deserialized);
}
}
diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
index aa9b8ccad..80d32bdc4 100644
--- a/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
+++ b/tika-serialization/src/test/java/org/apache/tika/serialization/JsonMetadataTest.java
@@ -29,12 +29,14 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
public class JsonMetadataTest {
@Test
public void testBasicSerializationAndDeserialization() throws Exception {
Metadata metadata = new Metadata();
+ metadata.add(TikaCoreProperties.TIKA_CONTENT, "this is the content");
metadata.add("k1", "v1");
metadata.add("k1", "v2");
//test duplicate value
@@ -54,7 +56,7 @@ public class JsonMetadataTest {
StringWriter writer = new StringWriter();
JsonMetadata.toJson(metadata, writer);
Metadata deserialized = JsonMetadata.fromJson(new StringReader(writer.toString()));
- assertEquals(7, deserialized.names().length);
+ assertEquals(8, deserialized.names().length);
assertEquals(metadata, deserialized);
//test that this really is 6 Chinese characters
@@ -66,11 +68,13 @@ public class JsonMetadataTest {
writer = new StringWriter();
JsonMetadata.setPrettyPrinting(true);
JsonMetadata.toJson(metadata, writer);
- assertTrue(writer
+ String expected = "{[NEWLINE] \"alma_mater\" : \"普林斯顿大学\",[NEWLINE]
\"html\" : \"<html><body>& </body></html>\"," +
+ "[NEWLINE] \"json_escapes\" : \"the: \\\"quick\\\" brown,
fox\"," +
+ "[NEWLINE] \"k1\" : [ \"v1\", \"v2\" ],[NEWLINE] \"k3\" : [
\"v3\", \"v3\" ],[NEWLINE] \"k4\" : \"500,000\"," +
+ "[NEWLINE] \"url\" :
\"/myApp/myAction.html?method=router&cmd=1\",[NEWLINE] \"X-TIKA:content\" :
\"this is the content\"[NEWLINE]}";
+ assertEquals(expected, writer
.toString()
- .replaceAll("\r\n", "\n")
- .contains("\"json_escapes\" : \"the: \\\"quick\\\" brown,
fox\",\n" + " \"k1\" : [ \"v1\", \"v2\" ],\n" + " \"k3\" : [ \"v3\", \"v3\"
],\n" +
- " \"k4\" : \"500,000\",\n" + " \"url\" :
\"/myApp/myAction.html?method=router&cmd=1\"\n" + "}"));
+ .replaceAll("[\r\n]+", "[NEWLINE]"));
}
@Test
diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
index f6bf0a95f..89913d4b6 100644
--- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
+++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java
@@ -45,10 +45,8 @@ public class TestParseContextSerialization {
String json;
try (Writer writer = new StringWriter()) {
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
- jsonGenerator.writeStartObject();
ParseContextSerializer serializer = new ParseContextSerializer();
serializer.serialize(pc, jsonGenerator, null);
- jsonGenerator.writeEndObject();
}
json = writer.toString();
}