Author: mattmann
Date: Sat Apr 25 15:56:39 2015
New Revision: 1676029

URL: http://svn.apache.org/r1676029
Log:
NUTCH-1997: Fix for Add CBOR magic header to CommonCrawlDataDumper output 
contributed by Giuseppe Totaro, and Luke Sh.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 25 15:56:39 2015
@@ -2,6 +2,9 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1997 Add CBOR "magic header" to CommonCrawlDataDumper 
+  output (Giuseppe Totaro, Luke Sh via mattmann)
+
 * NUTCH-1991 Tika mime detection not using Nutch supplied tika-mimetypes.xml for content based 
   detection (Iain Lopata, snagel via mattmann)
 

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat Apr 25 15:56:39 2015
@@ -515,6 +515,31 @@ public class CommonCrawlDataDumper {
            tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
        }
        
+       /**
+        * Writes the CBOR "Self-Describe Tag" (value 55799, serialized as 3-byte
+        * sequence of {@code 0xd9d9f7}) at the current position. This method must
+        * be used to write the CBOR magic number at the beginning of the document.
+        * Since version 2.5, <a
+        * href="https://github.com/FasterXML/jackson-dataformat-cbor"
+        * >jackson-dataformat-cbor</a> will support the {@code WRITE_TYPE_HEADER}
+        * feature to write that type tag at the beginning of the document.
+        * 
+        * @see <a href="https://tools.ietf.org/html/rfc7049#section-2.4.5">RFC
+        *      7049</a>
+        * @param generator {@link CBORGenerator} object used to create a CBOR-encoded document.
+        * @throws IOException if any I/O error occurs.
+        */
+       private void writeMagicHeader(CBORGenerator generator) throws 
IOException {
+               // Writes the self-describe CBOR tag (55799) as raw bytes 0xd9 0xd9 0xf7
+               // https://tools.ietf.org/html/rfc7049#section-2.4.5
+               // It will be supported in jackson-cbor since 2.5
+               byte[] header = new byte[3];
+               header[0] = (byte) 0xd9;
+               header[1] = (byte) 0xd9;
+               header[2] = (byte) 0xf7;
+               generator.writeBytes(header, 0, header.length);
+       }
+       
        private byte[] serializeCBORData(String jsonData) {
                CBORFactory factory = new CBORFactory();
                
@@ -524,6 +549,8 @@ public class CommonCrawlDataDumper {
                try {
                        stream = new ByteArrayOutputStream();
                        generator = factory.createGenerator(stream);
+                       // Writes CBOR tag
+                       writeMagicHeader(generator);
                        generator.writeString(jsonData);
                        generator.flush();
                        stream.flush();


Reply via email to