Author: mattmann
Date: Sat Apr 25 15:56:39 2015
New Revision: 1676029
URL: http://svn.apache.org/r1676029
Log:
NUTCH-1997: Add CBOR magic header to CommonCrawlDataDumper output,
contributed by Giuseppe Totaro and Luke Sh.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 25 15:56:39 2015
@@ -2,6 +2,9 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1997 Add CBOR "magic header" to CommonCrawlDataDumper
+ output (Giuseppe Totaro, Luke Sh via mattmann)
+
* NUTCH-1991 Tika mime detection not using Nutch supplied tika-mimetypes.xml for content based
detection (Iain Lopata, snagel via mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat Apr 25 15:56:39 2015
@@ -515,6 +515,31 @@ public class CommonCrawlDataDumper {
tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
}
+ /**
+ * Writes the CBOR "Self-Describe Tag" (value 55799, serialized as
3-byte
+ * sequence of {@code 0xd9d9f7}) at the current position. This method
must
+ * be used to write the CBOR magic number at the beginning of the
document.
+ * Since version 2.5, <a
+ * href="https://github.com/FasterXML/jackson-dataformat-cbor"
+ * >jackson-dataformat-cbor</a> will support the {@code
WRITE_TYPE_HEADER}
+ * feature to write that type tag at the beginning of the document.
+ *
+ * @see <a href="https://tools.ietf.org/html/rfc7049#section-2.4.5">RFC
+ * 7049</a>
+ * @param generator {@link CBORGenerator} object used to create a
CBOR-encoded document.
+ * @throws IOException if any I/O error occurs.
+ */
+ private void writeMagicHeader(CBORGenerator generator) throws
IOException {
+ // Writes self-describe CBOR
+ // https://tools.ietf.org/html/rfc7049#section-2.4.5
+ // It will be supported in jackson-cbor since 2.5
+ byte[] header = new byte[3];
+ header[0] = (byte) 0xd9;
+ header[1] = (byte) 0xd9;
+ header[2] = (byte) 0xf7;
+ generator.writeBytes(header, 0, header.length);
+ }
+
private byte[] serializeCBORData(String jsonData) {
CBORFactory factory = new CBORFactory();
@@ -524,6 +549,8 @@ public class CommonCrawlDataDumper {
try {
stream = new ByteArrayOutputStream();
generator = factory.createGenerator(stream);
+ // Writes CBOR tag
+ writeMagicHeader(generator);
generator.writeString(jsonData);
generator.flush();
stream.flush();