This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 41d3eb13d3e5f608bc9a21f1e1b946bf1c7bf46d Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Tue Jun 9 14:17:40 2020 +0200 NUTCH-2787 CrawlDb JSON dump does not export metadata primitive data types correctly - add JsonSerializer to write common Writable types (null, boolean, numbers) - remaining "unknown" Writables are written after calling toString() --- src/java/org/apache/nutch/crawl/CrawlDbReader.java | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java index 603b2e3..1bb8160 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java @@ -79,8 +79,11 @@ import org.apache.commons.jexl2.Expression; import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.util.MinimalPrettyPrinter; +import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.SerializerProvider; +import com.fasterxml.jackson.databind.module.SimpleModule; /** * Read utility for the CrawlDB. 
@@ -243,6 +246,9 @@ public class CrawlDbReader extends AbstractChecker implements Closeable {
         this.out = out;
         jsonMapper.getFactory()
             .configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
+        SimpleModule module = new SimpleModule();
+        module.addSerializer(Writable.class, new WritableSerializer());
+        jsonMapper.registerModule(module);
         jsonWriter = jsonMapper.writer(new JsonIndenter());
       }
 
@@ -295,6 +301,36 @@ public class CrawlDbReader extends AbstractChecker implements Closeable {
       DataOutputStream fileOut = fs.create(new Path(dir, name), context);
       return new LineRecordWriter(fileOut);
     }
+
+    /** Writes common Writables as native JSON values, all others via toString(). */
+    public static class WritableSerializer extends JsonSerializer<Writable> {
+      @Override public void serialize(Writable obj, JsonGenerator jgen,
+          SerializerProvider provider) throws IOException {
+        if (obj instanceof org.apache.hadoop.io.NullWritable) {
+          jgen.writeNull();
+        } else if (obj instanceof org.apache.hadoop.io.BooleanWritable) {
+          jgen.writeBoolean(((org.apache.hadoop.io.BooleanWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.IntWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.IntWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.VIntWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.VIntWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.LongWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.LongWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.VLongWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.VLongWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.ByteWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.ByteWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.FloatWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.FloatWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.DoubleWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.DoubleWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.BytesWritable) { // valid bytes only, no buffer padding
+          jgen.writeBinary(((org.apache.hadoop.io.BytesWritable) obj).copyBytes());
+        } else {
+          jgen.writeString(obj.toString());
+        }
+      }
+    }
   }
 
   public static class CrawlDbStatMapper