This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 41d3eb13d3e5f608bc9a21f1e1b946bf1c7bf46d
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Tue Jun 9 14:17:40 2020 +0200

    NUTCH-2787 CrawlDb JSON dump does not export metadata primitive data types correctly
    - add JsonSerializer to write common Writable types (null, boolean, numbers)
    - remaining "unknown" Writables are written after calling toString()
---
 src/java/org/apache/nutch/crawl/CrawlDbReader.java | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 603b2e3..1bb8160 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -79,8 +79,11 @@ import org.apache.commons.jexl2.Expression;
 import com.fasterxml.jackson.core.JsonGenerationException;
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
+import com.fasterxml.jackson.databind.JsonSerializer;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.ObjectWriter;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.module.SimpleModule;
 
 /**
  * Read utility for the CrawlDB.
@@ -243,6 +246,9 @@ public class CrawlDbReader extends AbstractChecker implements Closeable {
         this.out = out;
         jsonMapper.getFactory()
             .configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
+        SimpleModule module = new SimpleModule();
+        module.addSerializer(Writable.class, new WritableSerializer());
+        jsonMapper.registerModule(module);
         jsonWriter = jsonMapper.writer(new JsonIndenter());
       }
 
@@ -295,6 +301,36 @@ public class CrawlDbReader extends AbstractChecker implements Closeable {
       DataOutputStream fileOut = fs.create(new Path(dir, name), context);
       return new LineRecordWriter(fileOut);
     }
+
+    /**
+     * Jackson serializer for Hadoop {@link Writable} values.
+     * <p>
+     * {@code NullWritable}, {@code BooleanWritable} and the numeric Writables
+     * handled below are written as JSON null, boolean and number values.
+     * The content of a {@code BytesWritable} is written as binary data
+     * (Base64-encoded by Jackson). Any other Writable is written as a JSON
+     * string holding the result of its {@code toString()} method.
+     */
+    public static class WritableSerializer extends JsonSerializer<Writable> {
+      /**
+       * Writes a single Writable as one JSON value.
+       *
+       * @param obj
+       *          the Writable to serialize
+       * @param jgen
+       *          generator the JSON value is written to
+       * @param provider
+       *          serializer provider (not used)
+       * @throws IOException
+       *           if the generator fails to write the value
+       */
+      @Override
+      public void serialize(Writable obj, JsonGenerator jgen,
+          SerializerProvider provider) throws IOException {
+        if (obj instanceof org.apache.hadoop.io.NullWritable) {
+          jgen.writeNull();
+        } else if (obj instanceof org.apache.hadoop.io.BooleanWritable) {
+          jgen.writeBoolean(((org.apache.hadoop.io.BooleanWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.IntWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.IntWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.VIntWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.VIntWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.LongWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.LongWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.VLongWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.VLongWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.ByteWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.ByteWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.FloatWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.FloatWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.DoubleWritable) {
+          jgen.writeNumber(((org.apache.hadoop.io.DoubleWritable) obj).get());
+        } else if (obj instanceof org.apache.hadoop.io.BytesWritable) {
+          org.apache.hadoop.io.BytesWritable bytes =
+              (org.apache.hadoop.io.BytesWritable) obj;
+          // getBytes() exposes the backing array, which may be longer than the
+          // valid data: write only the first getLength() bytes
+          jgen.writeBinary(bytes.getBytes(), 0, bytes.getLength());
+        } else {
+          // unknown Writable type: fall back to its string representation
+          jgen.writeString(obj.toString());
+        }
+      }
+    }
   }
 
   public static class CrawlDbStatMapper

Reply via email to