Author: lewismc
Date: Wed Mar 4 18:48:32 2015
New Revision: 1664109
URL: http://svn.apache.org/r1664109
Log:
NUTCH-1949 Dump out the Nutch data into the Common Crawl format
Added:
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/bin/nutch
nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 4 18:48:32 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1949 Dump out the Nutch data into the Common Crawl format (Giuseppe
Totaro via lewismc)
+
* NUTCH-1950 File name too long (Jiaheng Zhang, Chong Li via mattmann)
* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Mar 4 18:48:32 2015
@@ -49,7 +49,8 @@
rev="3.1" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
-
+ <dependency org="org.apache.commons" name="commons-compress"
rev="1.9"
+ conf="*->default" />
<dependency org="org.apache.hadoop" name="hadoop-core"
rev="1.2.0"
conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
@@ -70,6 +71,9 @@
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons"
name="crawler-commons"
rev="0.5" />
+
+ <dependency org="com.fasterxml.jackson.core"
name="jackson-databind" rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
<!--Configuration: test -->
Modified: nutch/trunk/src/bin/nutch
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed Mar 4 18:48:32 2015
@@ -71,7 +71,8 @@ if [ $# = 0 ]; then
echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the plugin-based indexer on parsed segments
and linkdb"
echo " dedup deduplicate entries in the crawldb and give them a
special status"
- echo " dump exports cralwed data from segments into files"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common
crawl data format encoded as CBOR"
echo " solrindex run the solr indexer on parsed segments and linkdb
- DEPRECATED use the index command instead"
echo " solrdedup remove duplicates from solr - DEPRECATED use the
dedup command instead"
echo " solrclean remove HTTP 301 and 404 documents from solr -
DEPRECATED use the clean command instead"
@@ -233,6 +234,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; th
CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "dump" ] ; then
CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+ CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
elif [ "$COMMAND" = "solrindex" ] ; then
CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
shift
Added:
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Abstract class that implements the {@link CommonCrawlFormat} interface.
+ *
+ */
+public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+ protected String url;
+
+ protected byte[] content;
+
+ protected Metadata metadata;
+
+ protected Configuration conf;
+
+ public AbstractCommonCrawlFormat(String url, byte[] content, Metadata
metadata, Configuration conf) {
+ this.url = url;
+ this.content = content;
+ this.metadata = metadata;
+ this.conf = conf;
+ }
+
+ @Override
+ public String getJsonData(boolean mapAll) throws IOException {
+ if (mapAll) {
+ return getJsonDataAll();
+ }
+ else {
+ return getJsonDataSet();
+ }
+ }
+
+ protected abstract String getJsonDataSet() throws IOException;
+
+ protected abstract String getJsonDataAll() throws IOException;
+
+ protected String ifNullString(String value) {
+ return (value != null) ? value : "";
+ }
+
+ protected static String getHostName() {
+ String hostName = "";
+ try {
+ hostName = InetAddress.getLocalHost().getHostName();
+ } catch (UnknownHostException uhe) {
+
+ }
+ return hostName;
+ }
+
+ protected static String getHostAddress() {
+ String hostAddress = "";
+ try {
+ hostAddress =
InetAddress.getLocalHost().getHostAddress();
+ } catch (UnknownHostException uhe) {
+
+ }
+ return hostAddress;
+ }
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Wed Mar 4
18:48:32 2015
@@ -53,6 +53,7 @@ public class Benchmark extends Configure
System.exit(res);
}
+ @SuppressWarnings("unused")
private static String getDate() {
return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
.currentTimeMillis()));
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed
Mar 4 18:48:32 2015
@@ -0,0 +1,470 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+//Tika imports
+import org.apache.tika.Tika;
+import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.SimpleDateFormat;
+
+/**
+ * <p>
+ * The Common Crawl Data Dumper tool enables one to reverse generate the raw
+ * content from Nutch segment data directories into a common crawling data
+ * format, consumed by many applications. The data is then serialized as <a
+ * href="http://cbor.io">CBOR</a>
+ * </p>
+ * <p>
+ * Text content will be stored in a structured document format. Below is a
+ * schema for storage of data and metadata related to a crawling request, with
+ * the response body truncated for readability. This document must be encoded
+ * using CBOR and should be compressed with gzip after encoding. The
timestamped
+ * URL key for these records' keys follows the same layout as the media file
+ * directory structure, with underscores in place of directory separators.
+ * </p>
+ * <p>
+ * Thus, the timestamped url key for the record is provided below followed by
an
+ * example record:
+ *
+ * <pre>
+ * {@code
+ * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
+ *
+ * {
+ * "url": "http:\/\/somepage.com\/22\/14560817",
+ * "timestamp": "1411623696000",
+ * "request": {
+ * "method": "GET",
+ * "client": {
+ * "hostname": "crawler01.local",
+ * "address": "74.347.129.200",
+ * "software": "Apache Nutch v1.10",
+ * "robots": "classic",
+ * "contact": {
+ * "name": "Nutch Admin",
+ * "email": "[email protected]"
+ * }
+ * },
+ * "headers": {
+ * "Accept":
"text\/html,application\/xhtml+xml,application\/xml",
+ * "Accept-Encoding": "gzip,deflate,sdch",
+ * "Accept-Language": "en-US,en",
+ * "User-Agent": "Mozilla\/5.0",
+ * "...": "..."
+ * },
+ * "body": null
+ * },
+ * "response": {
+ * "status": "200",
+ * "server": {
+ * "hostname": "somepage.com",
+ * "address": "55.33.51.19",
+ * },
+ * "headers": {
+ * "Content-Encoding": "gzip",
+ * "Content-Type": "text\/html",
+ * "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
+ * "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
+ * "Server": "nginx",
+ * "...": "..."
+ * },
+ * "body": "\r\n <!DOCTYPE html PUBLIC ... \r\n\r\n \r\n
</body>\r\n </html>\r\n \r\n\r\n",
+ * },
+ * "key":
"com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
+ * "imported": "1411623698000"
+ * }
+ * }
+ * </pre>
+ *
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which
fall
+ * into those classifications. An example is as follows:
+ * </p>
+ *
+ * <pre>
+ * {@code
+ * INFO: File Types:
+ * TOTAL Stats: {
+ * {"mimeType":"application/xml","count":19"}
+ * {"mimeType":"image/png","count":47"}
+ * {"mimeType":"image/jpeg","count":141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ * {"mimeType":"text/plain","count":89"}
+ * {"mimeType":"video/quicktime","count":2"}
+ * {"mimeType":"image/gif","count":63"}
+ * {"mimeType":"application/xhtml+xml","count":1670"}
+ * {"mimeType":"application/octet-stream","count":40"}
+ * {"mimeType":"text/html","count":1863"}
+ * }
+ * }
+ * </pre>
+ *
+ */
+public class CommonCrawlDataDumper {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+
+ /**
+ * Main method for invoking this tool
+ *
+ * @param args
+ * 1) output directory (which will be created if it does not
+ * already exist) to host the CBOR data and 2) a directory
+ * containing one or more segments from which we wish to
generate
+ * CBOR data from. Optionally, 3) a list of mimetypes and
the 4)
+ * the gzip option may be provided.
+ * @throws Exception
+ */
+ @SuppressWarnings("static-access")
+ public static void main(String[] args) throws Exception {
+ Option helpOpt = new Option("h", "help", false,
+ "show this help message");
+ // argument options
+ Option outputOpt = OptionBuilder
+ .withArgName("outputDir")
+ .hasArg()
+ .withDescription(
+ "output directory (which will
be created) to host the CBOR data")
+ .create("outputDir");
+ Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
+ .withDescription("the segment(s) to
use").create("segment");
+ // GIUSEPPE: create mimetype and gzip options
+ Option mimeOpt = OptionBuilder
+ .isRequired(false)
+ .withArgName("mimetype")
+ .hasArgs()
+ .withDescription(
+ "an optional list of mimetypes
to dump, excluding all others. Defaults to all.")
+ .create("mimetype");
+ Option gzipOpt = OptionBuilder
+ .isRequired(false)
+ .hasArg(false)
+ .withDescription(
+ "an optional flag indicating
whether to additionally gzip the data")
+ .create("gzip");
+
+ // create the options
+ Options options = new Options();
+ options.addOption(helpOpt);
+ options.addOption(outputOpt);
+ options.addOption(segOpt);
+ // create mimetypes and gzip options
+ options.addOption(mimeOpt);
+ options.addOption(gzipOpt);
+
+ CommandLineParser parser = new GnuParser();
+ try {
+ CommandLine line = parser.parse(options, args);
+ if (line.hasOption("help") ||
!line.hasOption("outputDir") || (!line.hasOption("segment"))) {
+ HelpFormatter formatter = new HelpFormatter();
+
formatter.printHelp(CommonCrawlDataDumper.class.getName(), options, true);
+ return;
+ }
+
+ File outputDir = new
File(line.getOptionValue("outputDir"));
+ File segmentRootDir = new
File(line.getOptionValue("segment"));
+ String[] mimeTypes = line.getOptionValues("mimetype");
+ boolean gzip = line.hasOption("gzip");
+
+ if (!outputDir.exists()) {
+ LOG.warn("Output directory: [" +
outputDir.getAbsolutePath() + "]: does not exist, creating it.");
+ if (!outputDir.mkdirs())
+ throw new Exception("Unable to create:
[" + outputDir.getAbsolutePath() + "]");
+ }
+
+ CommonCrawlDataDumper dumper = new
CommonCrawlDataDumper();
+
+ dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes);
+
+ } catch (Exception e) {
+ LOG.error(CommonCrawlDataDumper.class.getName() + ": "
+ StringUtils.stringifyException(e));
+ e.printStackTrace();
+ return;
+ }
+ }
+
+ /**
+ * Dumps the reverse engineered CBOR content from the provided segment
+ * directories if a parent directory contains more than one segment,
+ * otherwise a single segment can be passed as an argument. If the
boolean
+ * argument is provided then the CBOR is also zipped.
+ *
+ * @param outputDir
+ * the directory you wish to dump the raw content to. This
+ * directory will be created.
+ * @param segmentRootDir
+ * a directory containing one or more segments.
+ * @param gzip
+ * a boolean flag indicating whether the CBOR content should
also
+ * be gzipped.
+ * @param mimetypes
+ * an array of mime types we have to dump, all others will be
+ * filtered out.
+ * @throws Exception
+ */
+ public void dump(File outputDir, File segmentRootDir, boolean gzip,
String[] mimeTypes) throws Exception {
+ if (!gzip) {
+ LOG.info("Gzipping CBOR data has been skipped");
+ }
+ // total file counts
+ Map<String, Integer> typeCounts = new HashMap<String,
Integer>();
+ // filtered file counters
+ Map<String, Integer> filteredCounts = new HashMap<String,
Integer>();
+
+ Configuration conf = NutchConfiguration.create();
+ FileSystem fs = FileSystem.get(conf);
+ File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
+ @Override
+ public boolean accept(File file) {
+ return file.canRead() && file.isDirectory();
+ }
+ });
+
+ if (segmentDirs == null) {
+ LOG.error("No segment directories found in [" +
segmentRootDir.getAbsolutePath() + "]");
+ System.exit(1);
+ }
+
+ // Gzip initialization
+ FileOutputStream fileOutput = null;
+ BufferedOutputStream bufOutput = null;
+ GzipCompressorOutputStream gzipOutput = null;
+ TarArchiveOutputStream tarOutput = null;
+
+ ArrayList<String> fileList = null;
+
+ if (gzip) {
+ String archiveName = new
SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
+ fileOutput = new FileOutputStream(new File(outputDir +
File.separator + archiveName));
+ bufOutput = new BufferedOutputStream(fileOutput);
+ gzipOutput = new GzipCompressorOutputStream(bufOutput);
+ tarOutput = new TarArchiveOutputStream(gzipOutput);
+
+ fileList = new ArrayList<String>();
+ }
+
+ for (File segment : segmentDirs) {
+ LOG.info("Processing segment: [" +
segment.getAbsolutePath() + "]");
+ // GIUSEPPE: Never used (also in FileDumper.java)!
+ //DataOutputStream doutputStream = null;
+ try {
+ String segmentContentPath =
segment.getAbsolutePath() + File.separator + Content.DIR_NAME +
"/part-00000/data";
+ Path file = new Path(segmentContentPath);
+
+ if (!new File(file.toString()).exists()) {
+ LOG.warn("Skipping segment: [" +
segmentContentPath + "]: no data directory present");
+ continue;
+ }
+ SequenceFile.Reader reader = new
SequenceFile.Reader(fs, file, conf);
+
+ if (!new File(file.toString()).exists()) {
+ LOG.warn("Skipping segment: [" +
segmentContentPath + "]: no data directory present");
+ continue;
+ }
+ Writable key = (Writable)
reader.getKeyClass().newInstance();
+
+ Content content = null;
+
+ while (reader.next(key)) {
+ content = new Content();
+ reader.getCurrentValue(content);
+ String url = key.toString();
+ String baseName =
FilenameUtils.getBaseName(url);
+ String extension =
FilenameUtils.getExtension(url);
+ if (extension == null ||
extension.equals("")) {
+ extension = "html";
+ }
+
+ String filename = baseName + "." +
extension;
+
+ // Encode all filetypes if no mimetypes
have been given
+ Boolean filter = (mimeTypes == null);
+
+ String jsonData = "";
+ try {
+ String mimeType = new
Tika().detect(content.getContent());
+ // Maps file to JSON-based
structure
+ CommonCrawlFormat format =
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url,
content.getContent(), content.getMetadata(), conf);
+ jsonData =
format.getJsonData(false);
+
+ collectStats(typeCounts,
mimeType);
+ // collects statistics for the
given mimetypes
+ if ((mimeType != null) &&
(mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
+
collectStats(filteredCounts, mimeType);
+ filter = true;
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ LOG.warn("Tika is unable to
detect type for: [" + url
+ + "]");
+ }
+
+ if (filter) {
+
+ byte[] byteData =
serializeCBORData(jsonData);
+
+ if (!gzip) {
+ String outputFullPath =
outputDir + File.separator + filename;
+ File outputFile = new
File(outputFullPath);
+ if
(outputFile.exists()) {
+
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
+ }
+ else {
+
LOG.info("Writing: [" + outputFullPath + "]");
+
IOUtils.copy(new ByteArrayInputStream(byteData), new
FileOutputStream(outputFile));
+ }
+ }
+ else {
+ if
(fileList.contains(filename)) {
+
LOG.info("Skipping compressing: [" + filename + "]: file already exists");
+ }
+ else {
+
fileList.add(filename);
+
LOG.info("Compressing: [" + filename + "]");
+ TarArchiveEntry
tarEntry = new TarArchiveEntry(filename);
+
tarEntry.setSize(byteData.length);
+
tarOutput.putArchiveEntry(tarEntry);
+
IOUtils.copy(new ByteArrayInputStream(byteData), tarOutput);
+
tarOutput.closeArchiveEntry();
+ }
+ }
+ }
+ }
+ reader.close();
+ } finally {
+ fs.close();
+ }
+ }
+
+ if (gzip) {
+ tarOutput.finish();
+
+ tarOutput.close();
+ gzipOutput.close();
+ bufOutput.close();
+ fileOutput.close();
+ }
+
+ LOG.info("CommonsCrawlDataDumper File Stats: " +
displayFileTypes(typeCounts, filteredCounts));
+ }
+
+ private byte[] serializeCBORData(String jsonData) {
+ CBORFactory factory = new CBORFactory();
+
+ CBORGenerator generator = null;
+ ByteArrayOutputStream stream = null;
+
+ try {
+ stream = new ByteArrayOutputStream();
+ generator = factory.createGenerator(stream);
+ generator.writeString(jsonData);
+ generator.flush();
+ stream.flush();
+
+ return stream.toByteArray();
+
+ } catch (Exception e) {
+ LOG.warn("CBOR encoding failed: " + e.getMessage());
+ } finally {
+ try {
+ generator.close();
+ stream.close();
+ } catch (IOException e) {
+ // nothing to do
+ }
+ }
+
+ return null;
+ }
+
+ private void collectStats(Map<String, Integer> typeCounts, String
mimeType) {
+ typeCounts.put(mimeType, typeCounts.containsKey(mimeType) ?
typeCounts.get(mimeType) + 1 : 1);
+ }
+
+ private String displayFileTypes(Map<String, Integer> typeCounts,
Map<String, Integer> filteredCounts) {
+ StringBuilder builder = new StringBuilder();
+ // print total stats
+ builder.append("\n TOTAL Stats:\n");
+ builder.append(" {\n");
+ for (String mimeType : typeCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":");
+ builder.append(typeCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("}\n");
+ // filtered types stats
+ if (!filteredCounts.isEmpty()) {
+ builder.append("\n FILTERED Stats:\n");
+ builder.append(" {\n");
+ for (String mimeType : filteredCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":");
+ builder.append(filteredCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("}\n");
+ }
+ return builder.toString();
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Wed Mar
4 18:48:32 2015
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+/**
+ * Interface for all CommonCrawl formatters. It provides the signature for the
+ * method used to get JSON data.
+ *
+ * @author gtotaro
+ *
+ */
+public interface CommonCrawlFormat {
+
+ /**
+ *
+ * @param mapAll If {@code true}, maps all metadata onto the JSON structure.
+ * @return the JSON data
+ */
+ public String getJsonData(boolean mapAll) throws IOException;
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Factory class that creates new {@link CommonCrawlFormat} objects (a.k.a.
formatters) that map crawled files to CommonCrawl format.
+ *
+ */
+public class CommonCrawlFormatFactory {
+
+ /**
+ * Returns a new instance of a {@link CommonCrawlFormat} object
specifying the type of formatter.
+ * @param formatType the type of formatter to be created.
+ * @param url the url.
+ * @param content the content.
+ * @param metadata the metadata.
+ * @param conf the configuration.
+ * @return the new {@link CommonCrawlFormat} object.
+ */
+ public static CommonCrawlFormat getCommonCrawlFormat(String formatType,
String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ if (formatType == null) {
+ return null;
+ }
+
+ if (formatType.equalsIgnoreCase("jackson")) {
+ return new CommonCrawlFormatJackson(url, content,
metadata, conf);
+ }
+ else if (formatType.equalsIgnoreCase("jettinson")) {
+ return new CommonCrawlFormatJettinson(url, content,
metadata, conf);
+ }
+ else if (formatType.equalsIgnoreCase("simple")) {
+ return new CommonCrawlFormatSimple(url, content,
metadata, conf);
+ }
+
+ return null;
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,253 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jackson
Streaming APIs.
+ *
+ */
+public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+
+ public CommonCrawlFormatJackson(String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() throws IOException {
+ JsonFactory factory = new JsonFactory();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ JsonGenerator generator = null;
+
+ try {
+ generator = factory.createGenerator(out);
+ generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+ generator.writeStartObject();
+
+ // url
+ generator.writeFieldName("url");
+ generator.writeString(url);
+
+ // timestamp
+ generator.writeFieldName("timestamp");
+ generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
+
+
+ //request
+ generator.writeFieldName("request");
+ generator.writeStartObject();
+ generator.writeFieldName("method");
+ generator.writeString("GET");
+ generator.writeFieldName("client");
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(getHostName());
+ generator.writeFieldName("address");
+ generator.writeString(getHostAddress());
+ generator.writeFieldName("software");
+ generator.writeString(conf.get("http.agent.version", ""));
+ generator.writeFieldName("robots");
+ generator.writeString("classic");
+ generator.writeFieldName("contact");
+ generator.writeStartObject();
+ generator.writeFieldName("name");
+ generator.writeString(conf.get("http.agent.name", ""));
+ generator.writeFieldName("email");
+ generator.writeString(conf.get("http.agent.email", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Accept");
+ generator.writeString(conf.get("accept", ""));
+ generator.writeFieldName("Accept-Encoding");
+ generator.writeString(""); // TODO
+ generator.writeFieldName("Accept-Language");
+ generator.writeString(conf.get("http.accept.language", ""));
+ generator.writeFieldName("User-Agent");
+ generator.writeString(conf.get("http.robots.agents", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("body");
+ generator.writeNull();
+ generator.writeEndObject();
+
+ //response
+ generator.writeFieldName("response");
+ generator.writeStartObject();
+ generator.writeFieldName("status");
+ generator.writeString(ifNullString(metadata.get("status")));
+ generator.writeFieldName("server");
+
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(URLUtil.getHost(url));
+ generator.writeFieldName("address");
+ generator.writeString(ifNullString(metadata.get("_ip_")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ for (String name : metadata.names()) {
+ generator.writeFieldName(name);
+ generator.writeString(ifNullString(metadata.get(name)));
+ }
+ generator.writeEndObject();
+
+ generator.writeFieldName("body");
+ generator.writeString(new String(content));
+ generator.writeEndObject();
+
+ generator.writeFieldName("key");
+ generator.writeString(url);
+
+ generator.writeFieldName("imported"); // TODO
+ generator.writeString("");
+
+ generator.writeEndObject();
+
+ generator.flush();
+
+ return out.toString();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON using Jackson:" +
ioe.getMessage());
+ }
+ }
+
+ @Override
+ protected String getJsonDataSet() throws IOException {
+ JsonFactory factory = new JsonFactory();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ JsonGenerator generator = null;
+
+ try {
+ generator = factory.createGenerator(out);
+ generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+ generator.writeStartObject();
+
+ // url
+ generator.writeFieldName("url");
+ generator.writeString(url);
+
+ // timestamp
+ generator.writeFieldName("timestamp");
+ generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
+
+ //request
+ generator.writeFieldName("request");
+ generator.writeStartObject();
+ generator.writeFieldName("method");
+ generator.writeString("GET");
+ generator.writeFieldName("client");
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(getHostName());
+ generator.writeFieldName("address");
+ generator.writeString(getHostAddress());
+ generator.writeFieldName("software");
+ generator.writeString(conf.get("http.agent.version", ""));
+ generator.writeFieldName("robots");
+ generator.writeString("CLASSIC");
+ generator.writeFieldName("contact");
+ generator.writeStartObject();
+ generator.writeFieldName("name");
+ generator.writeString(conf.get("http.agent.name", ""));
+ generator.writeFieldName("email");
+ generator.writeString(conf.get("http.agent.email", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Accept");
+ generator.writeString(conf.get("accept", ""));
+ generator.writeFieldName("Accept-Encoding");
+ generator.writeString(""); // TODO
+ generator.writeFieldName("Accept-Language");
+ generator.writeString(conf.get("http.accept.language", ""));
+ generator.writeFieldName("User-Agent");
+ generator.writeString(conf.get("http.robots.agents", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("body");
+ generator.writeNull();
+ generator.writeEndObject();
+
+ //response
+ generator.writeFieldName("response");
+ generator.writeStartObject();
+ generator.writeFieldName("status");
+ generator.writeString(ifNullString(metadata.get("status")));
+ generator.writeFieldName("server");
+
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(URLUtil.getHost(url));
+ generator.writeFieldName("address");
+ generator.writeString(ifNullString(metadata.get("_ip_")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Content-Encoding");
+ generator.writeString(ifNullString(metadata.get("Content-Encoding")));
+ generator.writeFieldName("Content-Type");
+ generator.writeString(ifNullString(metadata.get("Content-Type")));
+ generator.writeFieldName("Date");
+ generator.writeString(ifNullString(metadata.get("Date")));
+ generator.writeFieldName("Server");
+ generator.writeString(ifNullString(metadata.get("Server")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("body");
+ generator.writeString(new String(content));
+ generator.writeEndObject();
+
+ generator.writeFieldName("key");
+ generator.writeString(url);
+
+ generator.writeFieldName("imported"); // TODO
+ generator.writeString("");
+
+ generator.writeEndObject();
+
+ generator.flush();
+
+ return out.toString();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON using Jackson:" +
ioe.getMessage());
+ }
+ }
+}
Added:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jettinson
APIs.
+ *
+ */
+public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+
+ public CommonCrawlFormatJettinson(String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() throws IOException {
+ JSONObject object = new JSONObject();
+
+ try {
+ // url
+ object.put("url", url);
+
+ // timestamp
+ object.put("timestamp",
metadata.get(Metadata.LAST_MODIFIED));
+
+ // request
+ JSONObject requestObject = new JSONObject();
+ requestObject.put("method", "GET");
+ JSONObject clientObject = new JSONObject();
+ clientObject.put("hostname", getHostName());
+ clientObject.put("address", getHostAddress());
+ clientObject.put("software",
conf.get("http.agent.version", ""));
+ clientObject.put("robots", "CLASSIC");
+ JSONObject contactObject = new JSONObject();
+ contactObject.put("name", conf.get("http.agent.name",
""));
+ contactObject.put("email", conf.get("http.agent.email",
""));
+ clientObject.put("contact", contactObject);
+ requestObject.put("client", clientObject);
+ JSONObject reqHeadersObject = new JSONObject();
+ reqHeadersObject.put("Accept", conf.get("http.accept",
""));
+ reqHeadersObject.put("Accept-Encoding", ""); // TODO
+ reqHeadersObject.put("Accept-Language",
conf.get("http.accept.language", ""));
+ reqHeadersObject.put("User-Agent",
conf.get("http.robots.agents", ""));
+ requestObject.put("headers", reqHeadersObject);
+ requestObject.put("body", JSONObject.NULL);
+ object.put("request", requestObject);
+
+ // response
+ JSONObject responseObject = new JSONObject();
+ responseObject.put("status",
ifNullString(metadata.get("status")));
+ JSONObject serverObject = new JSONObject();
+ serverObject.put("hostname", URLUtil.getHost(url));
+ serverObject.put("address",
ifNullString(metadata.get("_ip_")));
+ responseObject.put("client", serverObject);
+ JSONObject respHeadersObject = new JSONObject();
+ for (String name : metadata.names()) {
+ respHeadersObject.put(name,
ifNullString(metadata.get(name)));
+ }
+ responseObject.put("headers", respHeadersObject);
+ responseObject.put("body", new String(content));
+ object.put("response", responseObject);
+
+ // key
+ object.put("key", url);
+
+ // imported
+ object.put("imported", ""); // TODO
+
+ return object.toString(2); // INDENTED OUTPUT
+
+ } catch (JSONException jsone) {
+ LOG.warn("Error in processing file " + url + ": " +
jsone.getMessage());
+ throw new IOException("Error in generating JSON using
Jettinson:" + jsone.getMessage());
+ }
+ }
+
+ @Override
+ protected String getJsonDataSet() throws IOException {
+ JSONObject object = new JSONObject();
+
+ try {
+ // url
+ object.put("url", url);
+
+ // timestamp
+ object.put("timestamp",
metadata.get(Metadata.LAST_MODIFIED));
+
+ // request
+ JSONObject requestObject = new JSONObject();
+ requestObject.put("method", "GET");
+ JSONObject clientObject = new JSONObject();
+ clientObject.put("hostname", getHostName());
+ clientObject.put("address", getHostAddress());
+ clientObject.put("software",
conf.get("http.agent.version", ""));
+ clientObject.put("robots", "CLASSIC");
+ JSONObject contactObject = new JSONObject();
+ contactObject.put("name", conf.get("http.agent.name",
""));
+ contactObject.put("email", conf.get("http.agent.email",
""));
+ clientObject.put("contact", contactObject);
+ requestObject.put("client", clientObject);
+ JSONObject reqHeadersObject = new JSONObject();
+ reqHeadersObject.put("Accept", conf.get("http.accept",
""));
+ reqHeadersObject.put("Accept-Encoding", ""); // TODO
+ reqHeadersObject.put("Accept-Language",
conf.get("http.accept.language", ""));
+ reqHeadersObject.put("User-Agent",
conf.get("http.robots.agents", ""));
+ requestObject.put("headers", reqHeadersObject);
+ requestObject.put("body", JSONObject.NULL);
+ object.put("request", requestObject);
+
+ // response
+ JSONObject responseObject = new JSONObject();
+ responseObject.put("status",
ifNullString(metadata.get("status")));
+ JSONObject serverObject = new JSONObject();
+ serverObject.put("hostname", URLUtil.getHost(url));
+ serverObject.put("address",
ifNullString(metadata.get("_ip_")));
+ responseObject.put("client", serverObject);
+ JSONObject respHeadersObject = new JSONObject();
+ respHeadersObject.put("Content-Encoding",
ifNullString(metadata.get("Content-Encoding")));
+ respHeadersObject.put("Content-Type",
ifNullString(metadata.get("Content-Type")));
+ respHeadersObject.put("Date",
ifNullString(metadata.get("Date")));
+ respHeadersObject.put("Server",
ifNullString(metadata.get("Server")));
+ responseObject.put("headers", respHeadersObject);
+ responseObject.put("body", new String(content));
+ object.put("response", responseObject);
+
+ // key
+ object.put("key", url);
+
+ // imported
+ object.put("imported", ""); // TODO
+
+ return object.toString(2); // INDENTED OUTPUT
+
+ } catch (JSONException jsone) {
+ LOG.warn("Error in processing file " + url + ": " +
jsone.getMessage());
+ throw new IOException("Error in generating JSON using
Jettinson:" + jsone.getMessage());
+ }
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * This class provides methods to map crawled data on JSON using a {@see
StringBuilder} object.
+ *
+ */
+public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
+
+ public CommonCrawlFormatSimple(String url, byte[] content, Metadata
metadata,
+ Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() {
+ // TODO character escaping
+ StringBuilder sb = new StringBuilder();
+ sb.append("{\n");
+
+ // url
+ sb.append("\t\"url\": \"" + url + "\",\n");
+
+ // timstamp
+ sb.append("\t\"timstamp\": \"" +
metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
+
+ // request
+ sb.append("\t\"request\": {\n");
+ sb.append("\t\t\"method\": \"GET\",\n");
+ sb.append("\t\t\"client\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
+ sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
+ sb.append("\t\t\t\"software\": \"" +
conf.get("http.agent.version", "") + "\",\n");
+ sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
+ sb.append("\t\t\t\"contact\": {\n");
+ sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name",
"") + "\",\n");
+ sb.append("\t\t\t\t\"email\": \"" +
conf.get("http.agent.email", "") + "\",\n");
+ sb.append("\t\t\t}\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "")
+ "\",\n");
+ sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); //TODO
+ sb.append("\t\t\t\"Accept-Language\": \"" +
conf.get("http.accept.language", "") + "\",\n");
+ sb.append("\t\t\t\"User-Agent\": \"" +
conf.get("http.robots.agents", "") + "\",\n");
+ sb.append("\t},\n");
+
+ // response
+ sb.append("\t\"response\": {\n");
+ sb.append("\t\t\"status\": \"" +
ifNullString(metadata.get("status")) + "\",\n");
+ sb.append("\t\t\"server\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) +
"\"\n");
+ sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") +
"\"\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ for (String name : metadata.names()) {
+ sb.append("\t\t\t\"" + name + "\": \"" +
metadata.get(name) + "\"\n");
+ }
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"body\": " + new String(content) + "\",\n");
+ sb.append("\t},\n");
+
+ // key
+ sb.append("\t\"key\": \"" + url + "\",\n");
+
+ // imported
+ sb.append("\t\"imported\": \"\"\n"); //TODO
+
+ sb.append("}");
+
+ return sb.toString();
+ }
+
+ @Override
+ protected String getJsonDataSet() {
+ // TODO character escaping
+ StringBuilder sb = new StringBuilder();
+ sb.append("{\n");
+
+ // url
+ sb.append("\t\"url\": \"" + url + "\",\n");
+
+ // timstamp
+ sb.append("\t\"timestamp\": \"" +
metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
+
+ // request
+ sb.append("\t\"request\": {\n");
+ sb.append("\t\t\"method\": \"GET\",\n");
+ sb.append("\t\t\"client\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
+ sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
+ sb.append("\t\t\t\"software\": \"" +
conf.get("http.agent.version", "") + "\",\n");
+ sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
+ sb.append("\t\t\t\"contact\": {\n");
+ sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name",
"") + "\",\n");
+ sb.append("\t\t\t\t\"email\": \"" +
conf.get("http.agent.email", "") + "\",\n");
+ sb.append("\t\t\t}\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "")
+ "\",\n");
+ sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); // TODO
+ sb.append("\t\t\t\"Accept-Language\": \"" +
conf.get("http.accept.language", "") + "\",\n");
+ sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "")
+ "\",\n");
+ sb.append("\t},\n");
+
+ // response
+ sb.append("\t\"response\": {\n");
+ sb.append("\t\t\"status\": \"" +
ifNullString(metadata.get("status")) + "\",\n");
+ sb.append("\t\t\"server\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n");
+ sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") +
"\"\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ sb.append("\t\t\t\"Content-Encoding\": " +
ifNullString(metadata.get("Content-Encoding")));
+ sb.append("\t\t\t\"Content-Type\": " +
ifNullString(metadata.get("Content-Type")));
+ sb.append("\t\t\t\"Date\": " +
ifNullString(metadata.get("Date")));
+ sb.append("\t\t\t\"Server\": " +
ifNullString(metadata.get("Server")));
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"body\": " + new String(content) + "\",\n");
+ sb.append("\t},\n");
+
+ // key
+ sb.append("\t\"key\": \"" + url + "\",\n");
+
+ // imported
+ sb.append("\t\"imported\": \"\"\n"); // TODO
+
+ sb.append("}");
+
+ return sb.toString();
+ }
+
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Mar 4
18:48:32 2015
@@ -100,7 +100,7 @@ import org.slf4j.LoggerFactory;
* }
* </pre>
* <p>
- * In the case above the tool would have been run with the <b>-mimeType
+ * In the case above, the tool would have been run with the <b>-mimeType
* image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
* flag and corresponding values activated.
*