Author: mattmann
Date: Thu Mar 26 02:56:20 2015
New Revision: 1669248

URL: http://svn.apache.org/r1669248
Log:
Fix for NUTCH-1974: add keyPrefix option to the CommonCrawlDataDumper tool
(Giuseppe Totaro via mattmann).

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
    nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Mar 26 02:56:20 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann)
+
 * NUTCH-1968 File Name too long issue of DumpFileUtil.java file (Xin Zhang, Renxia Wang via mattmann)
 
 * NUTCH-1966 Configuration endpoint for 1x REST API (Sujen Shah via mattmann)

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java 
Thu Mar 26 02:56:20 2015
@@ -23,12 +23,17 @@ import java.net.UnknownHostException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Abstract class that implements {@see CommonCrawlFormat} interface. 
  *
  */
 public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+       private static final Logger LOG = 
LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName());
+       
        protected String url;
        
        protected byte[] content;
@@ -37,32 +42,117 @@ public abstract class AbstractCommonCraw
        
        protected Configuration conf;
        
-       public AbstractCommonCrawlFormat(String url, byte[] content, Metadata 
metadata, Configuration conf) {
+       protected String keyPrefix;
+       
+       public AbstractCommonCrawlFormat(String url, byte[] content, Metadata 
metadata, Configuration conf, String keyPrefix) throws IOException {
                this.url = url;
                this.content = content;
                this.metadata = metadata;
                this.conf = conf;
+               this.keyPrefix = keyPrefix;
        }
-
+       
        @Override
-       public String getJsonData(boolean mapAll) throws IOException {
-               if (mapAll) {
-                       return getJsonDataAll();
-               }
-               else {
-                       return getJsonDataSet();
+       public String getJsonData() throws IOException {
+               try {
+                       startObject(null);
+                       
+                       // url
+                       writeKeyValue("url", getUrl());
+                       
+                       // timestamp
+                       writeKeyValue("timestamp", getTimestamp());
+                       
+                       // request
+                       startObject("request");
+                       writeKeyValue("method", getMethod());
+                       startObject("client");
+                       writeKeyValue("hostname", getRequestHostName());
+                       writeKeyValue("address", getRequestHostAddress());
+                       writeKeyValue("software", getRequestSoftware());
+                       writeKeyValue("robots", getRequestRobots());
+                       startObject("contact");
+                       writeKeyValue("name", getRequestContactName());
+                       writeKeyValue("email", getRequestContactEmail());
+                       closeObject("contact");
+                       closeObject("client");
+                       startObject("headers");
+                       writeKeyValue("Accept", getRequestAccept());
+                       writeKeyValue("Accept-Encoding", 
getRequestAcceptEncoding());
+                       writeKeyValue("Accept-Language", 
getRequestAcceptLanguage());
+                       writeKeyValue("User-Agent", getRequestUserAgent());
+                       closeObject("headers");
+                       writeKeyNull("body");
+                       closeObject("request");
+                       
+                       // response
+                       startObject("response");
+                       writeKeyValue("status", getResponseStatus());
+                       startObject("server");
+                       writeKeyValue("hostname", getResponseHostName());
+                       writeKeyValue("address", getResponseAddress());
+                       closeObject("server");
+                       startObject("headers");
+                       writeKeyValue("Content-Encoding", 
getResponseContentEncoding());
+                       writeKeyValue("Content-Type", getResponseContentType());
+                       writeKeyValue("Date", getResponseDate());
+                       writeKeyValue("Server", getResponseServer());
+                       for (String name : metadata.names()) {
+                               if (name.equalsIgnoreCase("Content-Encoding") 
|| name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || 
name.equalsIgnoreCase("Server")) {
+                                       continue;
+                               }
+                               writeKeyValue(name, metadata.get(name));
+                       }
+                       closeObject("headers");
+                       writeKeyValue("body", getResponseContent());
+                       closeObject("response");
+                       
+                       // key
+                       if (!this.keyPrefix.isEmpty()) {
+                               this.keyPrefix += "-";
+                       }
+                       writeKeyValue("key", this.keyPrefix + getKey());
+                       
+                       // imported
+                       writeKeyValue("imported", getImported());
+                       
+                       closeObject(null);
+                       
+                       return generateJson();
+               
+               } catch (IOException ioe) {
+                       LOG.warn("Error in processing file " + url + ": " + 
ioe.getMessage());
+                       throw new IOException("Error in generating JSON:" + 
ioe.getMessage()); 
                }
        }
        
-       protected abstract String getJsonDataSet() throws IOException;
+       // abstract methods
        
-       protected abstract String getJsonDataAll() throws IOException;
+       protected abstract void writeKeyValue(String key, String value) throws 
IOException;
        
-       protected String ifNullString(String value) {
-               return (value != null) ? value : "";
+       protected abstract void writeKeyNull(String key) throws IOException;
+       
+       protected abstract void startObject(String key) throws IOException;
+       
+       protected abstract void closeObject(String key) throws IOException;
+       
+       protected abstract String generateJson() throws IOException;
+       
+       // getters
+       
+       protected String getUrl() {
+               return url;
+       }
+       
+       protected String getTimestamp() {
+               return metadata.get(ifNullString(Metadata.LAST_MODIFIED));
+       }
+       
+       protected String getMethod() {
+               return new String("GET");
        }
        
-       protected static String getHostName() {
+       protected String getRequestHostName() {
                String hostName = "";
                try {
                        hostName = InetAddress.getLocalHost().getHostName();
@@ -72,7 +162,7 @@ public abstract class AbstractCommonCraw
                return hostName;
        }
        
-       protected static String getHostAddress() {
+       protected String getRequestHostAddress() {
                String hostAddress = "";
                try {
                        hostAddress = 
InetAddress.getLocalHost().getHostAddress();
@@ -81,4 +171,80 @@ public abstract class AbstractCommonCraw
                }
                return hostAddress;
        }
+       
+       protected String getRequestSoftware() {
+               return conf.get("http.agent.version", "");
+       }
+       
+       protected String getRequestRobots() {
+               return new String("CLASSIC");
+       }
+       
+       protected String getRequestContactName() {
+               return conf.get("http.agent.name", "");
+       }
+       
+       protected String getRequestContactEmail() {
+               return conf.get("http.agent.email", "");
+       }
+       
+       protected String getRequestAccept() {
+               return conf.get("http.accept", "");
+       }
+       
+       protected String getRequestAcceptEncoding() {
+               return new String(""); // TODO
+       }
+       
+       protected String getRequestAcceptLanguage() {
+               return conf.get("http.accept.language", "");
+       }
+       
+       protected String getRequestUserAgent() {
+               return conf.get("http.robots.agents", "");
+       }
+       
+       protected String getResponseStatus() {
+               return ifNullString(metadata.get("status"));
+       }
+       
+       protected String getResponseHostName() {
+               return URLUtil.getHost(url);
+       }
+       
+       protected String getResponseAddress() {
+               return ifNullString(metadata.get("_ip_"));
+       }
+       
+       protected String getResponseContentEncoding() {
+               return ifNullString(metadata.get("Content-Encoding"));
+       }
+       
+       protected String getResponseContentType() {
+               return ifNullString(metadata.get("Content-Type"));
+       }
+       
+       protected String getResponseDate() {
+               return ifNullString(metadata.get("Date"));
+       }
+       
+       protected String getResponseServer() {
+               return ifNullString(metadata.get("Server"));
+       }
+       
+       protected String getResponseContent() {
+               return new String(content);
+       }
+       
+       protected String getKey() {
+               return url;
+       }
+       
+       protected String getImported() {
+               return new String(""); // TODO
+       }
+       
+       private static String ifNullString(String value) {
+               return (value != null) ? value : "";
+       }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Thu 
Mar 26 02:56:20 2015
@@ -30,7 +30,6 @@ import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Map;
-import java.security.MessageDigest;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -54,12 +53,15 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 //Tika imports
 import org.apache.tika.Tika;
+
 import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
 import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -162,6 +164,13 @@ import com.ibm.icu.text.SimpleDateFormat
 public class CommonCrawlDataDumper {
 
        private static final Logger LOG = 
LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+       
+       // Gzip initialization
+       private FileOutputStream fileOutput = null;
+       private BufferedOutputStream bufOutput = null;
+       private GzipCompressorOutputStream gzipOutput = null;
+       private TarArchiveOutputStream tarOutput = null;
+       private ArrayList<String> fileList = null;
 
        /**
         * Main method for invoking this tool
@@ -177,17 +186,20 @@ public class CommonCrawlDataDumper {
        @SuppressWarnings("static-access")
        public static void main(String[] args) throws Exception {
                Option helpOpt = new Option("h", "help", false,
-                               "show this help message");
+                               "show this help message.");
                // argument options
                Option outputOpt = OptionBuilder
                                .withArgName("outputDir")
                                .hasArg()
                                .withDescription(
-                                               "output directory (which will 
be created) to host the CBOR data")
+                                               "output directory (which will 
be created) to host the CBOR data.")
                                .create("outputDir");
-               Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
-                               .withDescription("the segment(s) to 
use").create("segment");
-               // GIUSEPPE: create mimetype and gzip options
+               Option segOpt = OptionBuilder
+                               .withArgName("segment")
+                               .hasArgs()
+                               .withDescription("the segment(s) to use")
+                               .create("segment");
+               // create mimetype and gzip options
                Option mimeOpt = OptionBuilder
                                .isRequired(false)
                                .withArgName("mimetype")
@@ -196,11 +208,16 @@ public class CommonCrawlDataDumper {
                                                "an optional list of mimetypes 
to dump, excluding all others. Defaults to all.")
                                .create("mimetype");
                Option gzipOpt = OptionBuilder
-                               .isRequired(false)
+                               .withArgName("gzip")
                                .hasArg(false)
                                .withDescription(
-                                               "an optional flag indicating 
whether to additionally gzip the data")
+                                               "an optional flag indicating 
whether to additionally gzip the data.")
                                .create("gzip");
+               Option keyPrefixOpt = OptionBuilder
+                               .withArgName("keyPrefix")
+                               .hasArg(true)
+                               .withDescription("an optional prefix for key in 
the output format.")
+                               .create("keyPrefix");
 
                // create the options
                Options options = new Options();
@@ -210,6 +227,8 @@ public class CommonCrawlDataDumper {
                // create mimetypes and gzip options
                options.addOption(mimeOpt);
                options.addOption(gzipOpt);
+               // create keyPrefix option
+               options.addOption(keyPrefixOpt);
 
                CommandLineParser parser = new GnuParser();
                try {
@@ -224,6 +243,7 @@ public class CommonCrawlDataDumper {
                        File segmentRootDir = new 
File(line.getOptionValue("segment"));
                        String[] mimeTypes = line.getOptionValues("mimetype");
                        boolean gzip = line.hasOption("gzip");
+                       String keyPrefix = line.getOptionValue("keyPrefix", "");
 
                        if (!outputDir.exists()) {
                                LOG.warn("Output directory: [" + 
outputDir.getAbsolutePath() + "]: does not exist, creating it.");
@@ -233,7 +253,7 @@ public class CommonCrawlDataDumper {
 
                        CommonCrawlDataDumper dumper = new 
CommonCrawlDataDumper();
                        
-                       dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes);
+                       dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, 
keyPrefix);
                        
                } catch (Exception e) {
                        LOG.error(CommonCrawlDataDumper.class.getName() + ": " 
+ StringUtils.stringifyException(e));
@@ -261,7 +281,7 @@ public class CommonCrawlDataDumper {
      *            filtered out.
         * @throws Exception
         */
-       public void dump(File outputDir, File segmentRootDir, boolean gzip,     
String[] mimeTypes) throws Exception {
+       public void dump(File outputDir, File segmentRootDir, boolean gzip,     
String[] mimeTypes, String keyPrefix) throws Exception {
                if (!gzip) {
                        LOG.info("Gzipping CBOR data has been skipped");
                }
@@ -284,22 +304,9 @@ public class CommonCrawlDataDumper {
                        System.exit(1);
                }
                
-               // Gzip initialization
-               FileOutputStream fileOutput = null;
-           BufferedOutputStream bufOutput = null;
-           GzipCompressorOutputStream gzipOutput = null;
-           TarArchiveOutputStream tarOutput = null;
-           
-           ArrayList<String> fileList = null;
-           
                if (gzip) {
-                       String archiveName = new 
SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
-                   fileOutput = new FileOutputStream(new File(outputDir + 
File.separator + archiveName));
-                   bufOutput = new BufferedOutputStream(fileOutput);
-                   gzipOutput = new GzipCompressorOutputStream(bufOutput);
-                   tarOutput = new TarArchiveOutputStream(gzipOutput);
-                   
-                   fileList = new ArrayList<String>();
+                       fileList = new ArrayList<String>();
+                   constructNewStream(outputDir);
                }
 
                for (File segment : segmentDirs) {
@@ -334,7 +341,14 @@ public class CommonCrawlDataDumper {
                                                extension = "html";
                                        }
 
-                                       String filename = baseName + "." + 
extension;
+                                       String md5Ofurl = 
DumpFileUtil.getUrlMD5(url);
+                                       String fullDir = 
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, 
!gzip);
+                                       String filename = 
DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
+                                       String outputFullPath = 
String.format("%s/%s", fullDir, filename);
+
+                                       String [] fullPathLevels = 
fullDir.split(File.separator);
+                                       String firstLevelDirName = 
fullPathLevels[fullPathLevels.length-2]; 
+                                       String secondLevelDirName = 
fullPathLevels[fullPathLevels.length-1];
                                        
                                        // Encode all filetypes if no mimetypes 
have been given
                                        Boolean filter = (mimeTypes == null);
@@ -343,8 +357,8 @@ public class CommonCrawlDataDumper {
                                        try {
                                                String mimeType = new 
Tika().detect(content.getContent());
                                                // Maps file to JSON-based 
structure
-                                               CommonCrawlFormat format = 
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, 
content.getContent(), content.getMetadata(), conf);
-                                               jsonData = 
format.getJsonData(false);
+                                               CommonCrawlFormat format = 
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, 
content.getContent(), content.getMetadata(), conf, keyPrefix);
+                                               jsonData = format.getJsonData();
 
                                                collectStats(typeCounts, 
mimeType);
                                                // collects statistics for the 
given mimetypes
@@ -352,53 +366,36 @@ public class CommonCrawlDataDumper {
                                                        
collectStats(filteredCounts, mimeType);
                                                        filter = true;
                                                }
-                                       } catch (Exception e) {
-                                               e.printStackTrace();
-                                               LOG.warn("Tika is unable to 
detect type for: [" + url
-                                                               + "]");
+                                       } catch (IOException ioe) { 
+                                               LOG.error("Fatal error in 
creating JSON data: " + ioe.getMessage());
+                                               return;
                                        }
 
                                        if (filter) {
-                                               
                                                byte[] byteData = 
serializeCBORData(jsonData);
                                                
                                                if (!gzip) {
-                                                       String outputFullPath = 
outputDir + File.separator + filename;
+                                                       //String outputFullPath 
= outputDir + File.separator + filename;
                                                        File outputFile = new 
File(outputFullPath);
                                                        if 
(outputFile.exists()) {
                                                                
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                                                        }
                                                        else {
                                                                
LOG.info("Writing: [" + outputFullPath + "]");
-                                                               try{
-                                                                   
IOUtils.copy(new ByteArrayInputStream(byteData), new 
FileOutputStream(outputFile));
-                                                               }
-                                                               catch 
(Exception e){
-                                                                   
MessageDigest md = MessageDigest.getInstance("MD5");
-                                                                   
md.update(outputFullPath.getBytes());
-                                                                   byte[] 
digest = md.digest();
-                                                                   
StringBuffer sb = new StringBuffer();
-                                                                   for (byte b 
: digest) {
-                                                                          
sb.append(String.format("%02x", b & 0xff));
-                                                                   }
-                                                                   
outputFullPath = outputFullPath.substring(0, 32) + "_" + sb.toString();
-                                                                   File 
newOutPutFile = new File(outputFullPath);
-                                                                   
IOUtils.copy(new ByteArrayInputStream(byteData), new 
FileOutputStream(newOutPutFile));
-                                                                   
LOG.info("File name is too long. Truncated and MD5 appended.");
-                                                               }
+                                                               
IOUtils.copy(new ByteArrayInputStream(byteData), new 
FileOutputStream(outputFile));
                                                        }
                                                }
                                                else {
-                                                       if 
(fileList.contains(filename)) {
-                                                               
LOG.info("Skipping compressing: [" + filename   + "]: file already exists");
+                                                       if 
(fileList.contains(outputFullPath)) {
+                                                               
LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                                                        }
                                                        else {
-                                                               
fileList.add(filename);
-                                                               
LOG.info("Compressing: [" + filename + "]");
-                                                               TarArchiveEntry 
tarEntry = new TarArchiveEntry(filename);
+                                                               
fileList.add(outputFullPath);
+                                                               
LOG.info("Compressing: [" + outputFullPath + "]");
+                                                               TarArchiveEntry 
tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + 
secondLevelDirName + File.separator + filename);
                                                                
tarEntry.setSize(byteData.length);
                                                                
tarOutput.putArchiveEntry(tarEntry);
-                                                               
IOUtils.copy(new ByteArrayInputStream(byteData), tarOutput);
+                                                               
tarOutput.write(byteData);
                                                                
tarOutput.closeArchiveEntry();
                                                        }
                                                }
@@ -411,15 +408,35 @@ public class CommonCrawlDataDumper {
                }
                
                if (gzip) {
+               closeStream();
+               }
+               
+               if (!typeCounts.isEmpty()) {
+                       LOG.info("CommonsCrawlDataDumper File Stats: " + 
displayFileTypes(typeCounts, filteredCounts));
+               }
+       }
+       
+       private void closeStream() {
+               try {
                        tarOutput.finish();
-                        
+                       
                tarOutput.close();
                gzipOutput.close();
                bufOutput.close();
                fileOutput.close();
+               } catch (IOException ioe) {
+                       LOG.warn("Error in closing stream: " + 
ioe.getMessage());
                }
-               
-               LOG.info("CommonsCrawlDataDumper File Stats: " + 
displayFileTypes(typeCounts, filteredCounts));
+       }
+       
+       private void constructNewStream(File outputDir) throws IOException {    
+               String archiveName = new 
SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
+               LOG.info("Creating a new gzip archive: " + archiveName);
+           fileOutput = new FileOutputStream(new File(outputDir + 
File.separator + archiveName));
+           bufOutput = new BufferedOutputStream(fileOutput);
+           gzipOutput = new GzipCompressorOutputStream(bufOutput);
+           tarOutput = new TarArchiveOutputStream(gzipOutput);
+           tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
        }
        
        private byte[] serializeCBORData(String jsonData) {
@@ -458,8 +475,8 @@ public class CommonCrawlDataDumper {
        private String displayFileTypes(Map<String, Integer> typeCounts, 
Map<String, Integer> filteredCounts) {
                StringBuilder builder = new StringBuilder();
                // print total stats
-               builder.append("\n  TOTAL Stats:\n");
-               builder.append("                {\n");
+               builder.append("\nTOTAL Stats:\n");
+               builder.append("{\n");
                for (String mimeType : typeCounts.keySet()) {
                        builder.append("    {\"mimeType\":\"");
                        builder.append(mimeType);
@@ -470,8 +487,8 @@ public class CommonCrawlDataDumper {
                builder.append("}\n");
                // filtered types stats
                if (!filteredCounts.isEmpty()) {
-                       builder.append("\n  FILTERED Stats:\n");
-                       builder.append("                {\n");
+                       builder.append("\nFILTERED Stats:\n");
+                       builder.append("{\n");
                        for (String mimeType : filteredCounts.keySet()) {
                                builder.append("    {\"mimeType\":\"");
                                builder.append(mimeType);

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Thu Mar 
26 02:56:20 2015
@@ -33,5 +33,6 @@ public interface CommonCrawlFormat {
         * @param mapAll If {@code true} maps all metdata on the JSON structure.
         * @return the JSON data
         */
-       public String getJsonData(boolean mapAll) throws IOException;
+       //public String getJsonData(boolean mapAll) throws IOException;
+       public String getJsonData() throws IOException;
 }

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java 
Thu Mar 26 02:56:20 2015
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.tools;
 
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 
@@ -34,21 +36,22 @@ public class CommonCrawlFormatFactory {
         * @param metadata the metadata.
         * @param conf the configuration.
         * @return the new {@link CommonCrawlFormat} object.
+        * @throws IOException If any I/O error occurs.
         */
        public static CommonCrawlFormat getCommonCrawlFormat(String formatType, 
String url, byte[] content,
-                       Metadata metadata, Configuration conf) {
+                       Metadata metadata, Configuration conf, String 
keyPrefix) throws IOException {
                if (formatType == null) {
                        return null;
                }
                
                if (formatType.equalsIgnoreCase("jackson")) {
-                       return new CommonCrawlFormatJackson(url, content, 
metadata, conf);
+                       return new CommonCrawlFormatJackson(url, content, 
metadata, conf, keyPrefix);
                }
                else if (formatType.equalsIgnoreCase("jettinson")) {
-                       return new CommonCrawlFormatJettinson(url, content, 
metadata, conf);
+                       return new CommonCrawlFormatJettinson(url, content, 
metadata, conf, keyPrefix);
                }
                else if (formatType.equalsIgnoreCase("simple")) {
-                       return new CommonCrawlFormatSimple(url, content, 
metadata, conf);
+                       return new CommonCrawlFormatSimple(url, content, 
metadata, conf, keyPrefix);
                }
                
                return null;

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java 
Thu Mar 26 02:56:20 2015
@@ -22,9 +22,6 @@ import java.io.IOException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.core.JsonFactory;
 import com.fasterxml.jackson.core.JsonGenerator;
@@ -34,220 +31,52 @@ import com.fasterxml.jackson.core.JsonGe
  *
  */
 public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
-
-  private static final Logger LOG = 
LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
-
-  public CommonCrawlFormatJackson(String url, byte[] content,
-      Metadata metadata, Configuration conf) {
-    super(url, content, metadata, conf);
-  }
-
-  @Override
-  protected String getJsonDataAll() throws IOException {
-    JsonFactory factory = new JsonFactory();
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    JsonGenerator generator = null;
-
-    try {
-      generator = factory.createGenerator(out);
-      generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
-
-      generator.writeStartObject();
-
-      // url
-      generator.writeFieldName("url");
-      generator.writeString(url);
-
-      // timestamp
-      generator.writeFieldName("timestamp");
-      generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
-
-
-      //request
-      generator.writeFieldName("request");
-      generator.writeStartObject();
-      generator.writeFieldName("method");
-      generator.writeString("GET"); 
-      generator.writeFieldName("client");
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(getHostName());
-      generator.writeFieldName("address");
-      generator.writeString(getHostAddress());
-      generator.writeFieldName("software");
-      generator.writeString(conf.get("http.agent.version", ""));
-      generator.writeFieldName("robots");
-      generator.writeString("classic");
-      generator.writeFieldName("contact");
-      generator.writeStartObject();
-      generator.writeFieldName("name");
-      generator.writeString(conf.get("http.agent.name", ""));
-      generator.writeFieldName("email");
-      generator.writeString(conf.get("http.agent.email", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      generator.writeFieldName("Accept");
-      generator.writeString(conf.get("accept", ""));
-      generator.writeFieldName("Accept-Encoding");
-      generator.writeString(""); // TODO
-      generator.writeFieldName("Accept-Language");
-      generator.writeString(conf.get("http.accept.language", ""));
-      generator.writeFieldName("User-Agent");
-      generator.writeString(conf.get("http.robots.agents", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("body");
-      generator.writeNull();
-      generator.writeEndObject();
-
-      //response
-      generator.writeFieldName("response");
-      generator.writeStartObject();
-      generator.writeFieldName("status");
-      generator.writeString(ifNullString(metadata.get("status")));
-      generator.writeFieldName("server");
-
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(URLUtil.getHost(url)); 
-      generator.writeFieldName("address");
-      generator.writeString(ifNullString(metadata.get("_ip_")));
-      generator.writeEndObject();
-
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      for (String name : metadata.names()) {
-        generator.writeFieldName(name);
-        generator.writeString(ifNullString(metadata.get(name)));
-      }
-      generator.writeEndObject();
-
-      generator.writeFieldName("body");
-      generator.writeString(new String(content));
-      generator.writeEndObject();
-
-      generator.writeFieldName("key"); 
-      generator.writeString(url);
-
-      generator.writeFieldName("imported"); // TODO
-      generator.writeString("");
-
-      generator.writeEndObject();
-
-      generator.flush();
-
-      return out.toString();
-
-    } catch (IOException ioe) {
-      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
-      throw new IOException("Error in generating JSON using Jackson:" + 
ioe.getMessage()); 
-    }
-  }
-
-  @Override
-  protected String getJsonDataSet() throws IOException {
-    JsonFactory factory = new JsonFactory();
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    JsonGenerator generator = null;
-
-    try {
-      generator = factory.createGenerator(out);
-      generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
-
-      generator.writeStartObject();
-
-      // url
-      generator.writeFieldName("url");
-      generator.writeString(url);
-
-      // timestamp
-      generator.writeFieldName("timestamp");
-      generator.writeString(metadata.get(Metadata.LAST_MODIFIED)); 
-
-      //request
-      generator.writeFieldName("request");
-      generator.writeStartObject();
-      generator.writeFieldName("method");
-      generator.writeString("GET");
-      generator.writeFieldName("client");
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(getHostName());
-      generator.writeFieldName("address");
-      generator.writeString(getHostAddress());
-      generator.writeFieldName("software");
-      generator.writeString(conf.get("http.agent.version", ""));
-      generator.writeFieldName("robots");
-      generator.writeString("CLASSIC"); 
-      generator.writeFieldName("contact");
-      generator.writeStartObject();
-      generator.writeFieldName("name");
-      generator.writeString(conf.get("http.agent.name", ""));
-      generator.writeFieldName("email");
-      generator.writeString(conf.get("http.agent.email", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      generator.writeFieldName("Accept");
-      generator.writeString(conf.get("accept", ""));
-      generator.writeFieldName("Accept-Encoding");
-      generator.writeString(""); // TODO
-      generator.writeFieldName("Accept-Language");
-      generator.writeString(conf.get("http.accept.language", ""));
-      generator.writeFieldName("User-Agent");
-      generator.writeString(conf.get("http.robots.agents", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("body");
-      generator.writeNull();
-      generator.writeEndObject();
-
-      //response
-      generator.writeFieldName("response");
-      generator.writeStartObject();
-      generator.writeFieldName("status");
-      generator.writeString(ifNullString(metadata.get("status")));
-      generator.writeFieldName("server");
-
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(URLUtil.getHost(url)); 
-      generator.writeFieldName("address");
-      generator.writeString(ifNullString(metadata.get("_ip_")));
-      generator.writeEndObject();
-
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      generator.writeFieldName("Content-Encoding");
-      generator.writeString(ifNullString(metadata.get("Content-Encoding")));
-      generator.writeFieldName("Content-Type");
-      generator.writeString(ifNullString(metadata.get("Content-Type")));
-      generator.writeFieldName("Date");
-      generator.writeString(ifNullString(metadata.get("Date")));
-      generator.writeFieldName("Server");
-      generator.writeString(ifNullString(metadata.get("Server")));
-      generator.writeEndObject();
-
-      generator.writeFieldName("body");
-      generator.writeString(new String(content));
-      generator.writeEndObject();
-
-      generator.writeFieldName("key");
-      generator.writeString(url);
-
-      generator.writeFieldName("imported"); // TODO
-      generator.writeString("");
-
-      generator.writeEndObject();
-
-      generator.flush();
-
-      return out.toString();
-
-    } catch (IOException ioe) {
-      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
-      throw new IOException("Error in generating JSON using Jackson:" + 
ioe.getMessage()); 
-    }
-  }
+       
+       //private static final Logger LOG = 
LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+       
+       private ByteArrayOutputStream out;
+       
+       private JsonGenerator generator;
+
+       public CommonCrawlFormatJackson(String url, byte[] content,
+                       Metadata metadata, Configuration conf, String 
keyPrefix) throws IOException {
+               super(url, content, metadata, conf, keyPrefix);
+               
+               JsonFactory factory = new JsonFactory();
+               this.out = new ByteArrayOutputStream();
+               this.generator = factory.createGenerator(out);
+               
+               this.generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+       }
+       
+       @Override
+       protected void writeKeyValue(String key, String value) throws 
IOException {
+               generator.writeFieldName(key);
+               generator.writeString(value);
+       }
+       
+       @Override
+       protected void writeKeyNull(String key) throws IOException {
+               generator.writeFieldName(key);
+               generator.writeNull();;
+       }
+       
+       @Override
+       protected void startObject(String key) throws IOException {
+               if (key != null) {
+                       generator.writeFieldName(key);
+               }
+               generator.writeStartObject();
+       }
+       
+       @Override
+       protected void closeObject(String key) throws IOException {
+               generator.writeEndObject();
+       }
+       
+       @Override
+       protected String generateJson() throws IOException {
+               this.generator.flush();
+               return this.out.toString();
+       }
 }

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java 
Thu Mar 26 02:56:20 2015
@@ -18,15 +18,13 @@
 package org.apache.nutch.tools;
 
 import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
 import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
-import org.mortbay.log.Log;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * This class provides methods to map crawled data on JSON using Jettinson 
APIs. 
@@ -34,135 +32,57 @@ import org.slf4j.LoggerFactory;
  */
 public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
        
-       private static final Logger LOG = 
LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+       private Deque<JSONObject> stack;
 
        public CommonCrawlFormatJettinson(String url, byte[] content,
-                       Metadata metadata, Configuration conf) {
-               super(url, content, metadata, conf);
+                       Metadata metadata, Configuration conf, String 
keyPrefix) throws IOException {
+               super(url, content, metadata, conf, keyPrefix);
+               
+               stack = new ArrayDeque<JSONObject>();
        }
        
        @Override
-       protected String getJsonDataAll() throws IOException {
+       protected void writeKeyValue(String key, String value) throws 
IOException {
+               try {
+                       stack.getFirst().put(key, value);
+               } catch (JSONException jsone) {
+                       throw new IOException(jsone.getMessage());
+               }
+       }
+       
+       @Override
+       protected void writeKeyNull(String key) throws IOException {
+               try {
+                       stack.getFirst().put(key, JSONObject.NULL);
+               } catch (JSONException jsone) {
+                       throw new IOException(jsone.getMessage());
+               }
+       }
+       
+       @Override
+       protected void startObject(String key) throws IOException {
                JSONObject object = new JSONObject();
-
+               stack.push(object);
+       }
+       
+       @Override
+       protected void closeObject(String key) throws IOException {
                try {
-                       // url
-                       object.put("url", url);
-
-                       // timestamp
-                       object.put("timestamp", 
metadata.get(Metadata.LAST_MODIFIED));
-
-                       // request
-                       JSONObject requestObject = new JSONObject();
-                       requestObject.put("method", "GET"); 
-                       JSONObject clientObject = new JSONObject();
-                       clientObject.put("hostname", getHostName());
-                       clientObject.put("address", getHostAddress());
-                       clientObject.put("software", 
conf.get("http.agent.version", ""));
-                       clientObject.put("robots", "CLASSIC");
-                       JSONObject contactObject = new JSONObject();
-                       contactObject.put("name", conf.get("http.agent.name", 
""));
-                       contactObject.put("email", conf.get("http.agent.email", 
""));
-                       clientObject.put("contact", contactObject);
-                       requestObject.put("client", clientObject);
-                       JSONObject reqHeadersObject = new JSONObject();
-                       reqHeadersObject.put("Accept", conf.get("http.accept", 
""));
-                       reqHeadersObject.put("Accept-Encoding", ""); // TODO
-                       reqHeadersObject.put("Accept-Language", 
conf.get("http.accept.language", ""));
-                       reqHeadersObject.put("User-Agent", 
conf.get("http.robots.agents", ""));
-                       requestObject.put("headers", reqHeadersObject);
-                       requestObject.put("body", JSONObject.NULL);
-                       object.put("request", requestObject);
-
-                       // response
-                       JSONObject responseObject = new JSONObject();
-                       responseObject.put("status", 
ifNullString(metadata.get("status")));
-                       JSONObject serverObject = new JSONObject();
-                       serverObject.put("hostname", URLUtil.getHost(url));
-                       serverObject.put("address", 
ifNullString(metadata.get("_ip_")));
-                       responseObject.put("client", serverObject);
-                       JSONObject respHeadersObject = new JSONObject();
-                       for (String name : metadata.names()) {
-                               respHeadersObject.put(name, 
ifNullString(metadata.get(name)));
+                       if (stack.size() > 1) {
+                               JSONObject object = stack.pop();
+                               stack.getFirst().put(key, object);
                        }
-                       responseObject.put("headers", respHeadersObject);
-                       responseObject.put("body", new String(content));
-                       object.put("response", responseObject);
-
-                       // key
-                       object.put("key", url); 
-
-                       // imported
-                       object.put("imported", ""); // TODO
-
-                       return object.toString(2); // INDENTED OUTPUT
-
                } catch (JSONException jsone) {
-                       LOG.warn("Error in processing file " + url + ": " + 
jsone.getMessage());
-                       throw new IOException("Error in generating JSON using 
Jettinson:" + jsone.getMessage()); 
+                       throw new IOException(jsone.getMessage());
                }
        }
-
+       
        @Override
-       protected String getJsonDataSet() throws IOException {
-               JSONObject object = new JSONObject();
-
+       protected String generateJson() throws IOException {
                try {
-                       // url
-                       object.put("url", url);
-
-                       // timestamp
-                       object.put("timestamp", 
metadata.get(Metadata.LAST_MODIFIED));
-
-                       // request
-                       JSONObject requestObject = new JSONObject();
-                       requestObject.put("method", "GET"); 
-                       JSONObject clientObject = new JSONObject();
-                       clientObject.put("hostname", getHostName());
-                       clientObject.put("address", getHostAddress());
-                       clientObject.put("software", 
conf.get("http.agent.version", ""));
-                       clientObject.put("robots", "CLASSIC"); 
-                       JSONObject contactObject = new JSONObject();
-                       contactObject.put("name", conf.get("http.agent.name", 
""));
-                       contactObject.put("email", conf.get("http.agent.email", 
""));
-                       clientObject.put("contact", contactObject);
-                       requestObject.put("client", clientObject);
-                       JSONObject reqHeadersObject = new JSONObject();
-                       reqHeadersObject.put("Accept", conf.get("http.accept", 
""));
-                       reqHeadersObject.put("Accept-Encoding", ""); // TODO
-                       reqHeadersObject.put("Accept-Language", 
conf.get("http.accept.language", ""));
-                       reqHeadersObject.put("User-Agent", 
conf.get("http.robots.agents", "")); 
-                       requestObject.put("headers", reqHeadersObject);
-                       requestObject.put("body", JSONObject.NULL);
-                       object.put("request", requestObject);
-
-                       // response
-                       JSONObject responseObject = new JSONObject();
-                       responseObject.put("status", 
ifNullString(metadata.get("status")));
-                       JSONObject serverObject = new JSONObject();
-                       serverObject.put("hostname", URLUtil.getHost(url)); 
-                       serverObject.put("address", 
ifNullString(metadata.get("_ip_")));
-                       responseObject.put("client", serverObject);
-                       JSONObject respHeadersObject = new JSONObject();
-                       respHeadersObject.put("Content-Encoding", 
ifNullString(metadata.get("Content-Encoding")));
-                       respHeadersObject.put("Content-Type", 
ifNullString(metadata.get("Content-Type")));
-                       respHeadersObject.put("Date", 
ifNullString(metadata.get("Date")));
-                       respHeadersObject.put("Server", 
ifNullString(metadata.get("Server")));
-                       responseObject.put("headers", respHeadersObject);
-                       responseObject.put("body", new String(content)); 
-                       object.put("response", responseObject);
-
-                       // key
-                       object.put("key", url);
-
-                       // imported
-                       object.put("imported", ""); // TODO
-
-                       return object.toString(2); // INDENTED OUTPUT
-
+                       return stack.getFirst().toString(2);
                } catch (JSONException jsone) {
-                       LOG.warn("Error in processing file " + url + ": " + 
jsone.getMessage());
-                       throw new IOException("Error in generating JSON using 
Jettinson:" + jsone.getMessage()); 
+                       throw new IOException(jsone.getMessage());
                }
        }
 }

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java 
Thu Mar 26 02:56:20 2015
@@ -17,9 +17,10 @@
 
 package org.apache.nutch.tools;
 
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
 
 /**
  * This class provides methods to map crawled data on JSON using a {@link 
StringBuilder} object. 
@@ -27,126 +28,113 @@ import org.apache.nutch.util.URLUtil;
  */
 public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
        
+       private StringBuilder sb;
+       
+       private int tabCount;
+       
        public CommonCrawlFormatSimple(String url, byte[] content, Metadata 
metadata,
-                       Configuration conf) {
-               super(url, content, metadata, conf);
+                       Configuration conf, String keyPrefix) throws 
IOException {
+               super(url, content, metadata, conf, keyPrefix);
+               
+               this.sb = new StringBuilder();
+               this.tabCount = 0;
        }
        
-       @Override
-       protected String getJsonDataAll() {
-               // TODO character escaping
-               StringBuilder sb = new StringBuilder();
-               sb.append("{\n");
-
-               // url
-               sb.append("\t\"url\": \"" + url + "\",\n");
-               
-               // timstamp
-               sb.append("\t\"timstamp\": \"" + 
metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
-                               
-               // request
-               sb.append("\t\"request\": {\n");
-               sb.append("\t\t\"method\": \"GET\",\n");
-               sb.append("\t\t\"client\": {\n");
-               sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
-               sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
-               sb.append("\t\t\t\"software\": \"" + 
conf.get("http.agent.version", "") + "\",\n");
-               sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
-               sb.append("\t\t\t\"contact\": {\n");
-               sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name", 
"") + "\",\n");
-               sb.append("\t\t\t\t\"email\": \"" + 
conf.get("http.agent.email", "") + "\",\n");
-               sb.append("\t\t\t}\n");
-               sb.append("\t\t},\n");
-               sb.append("\t\t\"headers\": {\n");
-               sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "") 
+ "\",\n");
-               sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); //TODO
-               sb.append("\t\t\t\"Accept-Language\": \"" + 
conf.get("http.accept.language", "") + "\",\n");
-               sb.append("\t\t\t\"User-Agent\": \"" + 
conf.get("http.robots.agents", "") + "\",\n");  
-               sb.append("\t},\n");
-
-               // response
-               sb.append("\t\"response\": {\n");
-               sb.append("\t\t\"status\": \"" + 
ifNullString(metadata.get("status")) + "\",\n");
-               sb.append("\t\t\"server\": {\n");
-               sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + 
"\"\n"); 
-               sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") + 
"\"\n");
-               sb.append("\t\t},\n");
-               sb.append("\t\t\"headers\": {\n");      
-               for (String name : metadata.names()) {
-                       sb.append("\t\t\t\"" + name + "\": \"" + 
metadata.get(name)     + "\"\n");
+       protected void writeKeyValue(String key, String value) throws 
IOException {
+               sb.append(printTabs() + "\"" + key + "\": " + quote(value) + 
",\n");
+       }
+       
+       protected void writeKeyNull(String key) throws IOException {
+               sb.append(printTabs() + "\"" + key + "\": null,\n");
+       }
+       
+       protected void startObject(String key) throws IOException {
+               String name = "";
+               if (key != null) {
+                       name = "\"" + key + "\": ";
                }
-               sb.append("\t\t},\n");
-               sb.append("\t\t\"body\": " + new String(content) + "\",\n");
-               sb.append("\t},\n");
-               
-               // key
-               sb.append("\t\"key\": \"" + url + "\",\n");
-               
-               // imported
-               sb.append("\t\"imported\": \"\"\n"); //TODO
-               
-               sb.append("}");
-
+               sb.append(printTabs() + name + "{\n");
+               this.tabCount++;
+       }
+       
+       protected void closeObject(String key) throws IOException {
+               sb.deleteCharAt(sb.length()-2); // delete comma
+               this.tabCount--;
+               sb.append(printTabs() + "},\n");
+       }
+       
+       protected String generateJson() throws IOException {
+               sb.deleteCharAt(sb.length()-1); // delete new line
+               sb.deleteCharAt(sb.length()-1); // delete comma
                return sb.toString();
        }
        
-       @Override
-       protected String getJsonDataSet() {
-               // TODO character escaping
+       private String printTabs() {
                StringBuilder sb = new StringBuilder();
-               sb.append("{\n");
-               
-               // url
-               sb.append("\t\"url\": \"" + url + "\",\n");
-               
-               // timstamp
-               sb.append("\t\"timestamp\": \"" + 
metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
-               
-               // request
-               sb.append("\t\"request\": {\n");
-               sb.append("\t\t\"method\": \"GET\",\n");
-               sb.append("\t\t\"client\": {\n");
-               sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
-               sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
-               sb.append("\t\t\t\"software\": \"" + 
conf.get("http.agent.version", "") + "\",\n");
-               sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
-               sb.append("\t\t\t\"contact\": {\n");
-               sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name", 
"") + "\",\n");
-               sb.append("\t\t\t\t\"email\": \"" + 
conf.get("http.agent.email", "") + "\",\n");
-               sb.append("\t\t\t}\n");
-               sb.append("\t\t},\n");
-               sb.append("\t\t\"headers\": {\n");
-               sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "") 
+ "\",\n");
-               sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); // TODO
-               sb.append("\t\t\t\"Accept-Language\": \"" + 
conf.get("http.accept.language", "") + "\",\n");
-    sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "") 
+ "\",\n");  
-               sb.append("\t},\n");
-               
-               // response
-               sb.append("\t\"response\": {\n");
-               sb.append("\t\t\"status\": \"" + 
ifNullString(metadata.get("status")) + "\",\n");
-               sb.append("\t\t\"server\": {\n");
-    sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n"); 
-               sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") + 
"\"\n");
-               sb.append("\t\t},\n");
-               sb.append("\t\t\"headers\": {\n");
-               sb.append("\t\t\t\"Content-Encoding\": " + 
ifNullString(metadata.get("Content-Encoding")));
-               sb.append("\t\t\t\"Content-Type\": " + 
ifNullString(metadata.get("Content-Type")));
-               sb.append("\t\t\t\"Date\": " + 
ifNullString(metadata.get("Date")));
-               sb.append("\t\t\t\"Server\": " + 
ifNullString(metadata.get("Server")));
-               sb.append("\t\t},\n");
-               sb.append("\t\t\"body\": " + new String(content) + "\",\n");
-               sb.append("\t},\n");
-               
-               // key
-               sb.append("\t\"key\": \"" + url + "\",\n"); 
-               
-               // imported
-               sb.append("\t\"imported\": \"\"\n"); // TODO
-               
-               sb.append("}");
-
+               for (int i=0; i < this.tabCount ;i++) {
+                       sb.append("\t");
+               }
                return sb.toString();
        }
-
+       
+    private static String quote(String string) throws IOException {
+       StringBuilder sb = new StringBuilder();
+       
+        if (string == null || string.length() == 0) {
+            sb.append("\"\"");
+            return sb.toString();
+        }
+
+        char b;
+        char c = 0;
+        String hhhh;
+        int i;
+        int len = string.length();
+
+        sb.append('"');
+        for (i = 0; i < len; i += 1) {
+            b = c;
+            c = string.charAt(i);
+            switch (c) {
+            case '\\':
+            case '"':
+                sb.append('\\');
+                sb.append(c);
+                break;
+            case '/':
+                if (b == '<') {
+                       sb.append('\\');
+                }
+                sb.append(c);
+                break;
+            case '\b':
+               sb.append("\\b");
+                break;
+            case '\t':
+               sb.append("\\t");
+                break;
+            case '\n':
+               sb.append("\\n");
+                break;
+            case '\f':
+               sb.append("\\f");
+                break;
+            case '\r':
+               sb.append("\\r");
+                break;
+            default:
+                if (c < ' ' || (c >= '\u0080' && c < '\u00a0')
+                        || (c >= '\u2000' && c < '\u2100')) {
+                       sb.append("\\u");
+                    hhhh = Integer.toHexString(c);
+                    sb.append("0000", 0, 4 - hhhh.length());
+                    sb.append(hhhh);
+                } else {
+                       sb.append(c);
+                }
+            }
+        }
+        sb.append('"');
+        return sb.toString();
+    }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Thu Mar 26 
02:56:20 2015
@@ -46,21 +46,27 @@ public class DumpFileUtil {
         return sb.toString();
     }
 
-    public static String createTwoLevelsDirectory(String basePath, String md5) 
{
+    public static String createTwoLevelsDirectory(String basePath, String md5, 
boolean makeDir) {
         String firstLevelDirName = new 
StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString();
         String secondLevelDirName = new 
StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString();
 
         String fullDirPath = String.format(DIR_PATTERN, basePath, 
firstLevelDirName, secondLevelDirName);
 
-        try {
-            FileUtils.forceMkdir(new File(fullDirPath));
-        } catch (IOException e) {
-            LOG.error("Failed to create dir: {}", fullDirPath);
-            fullDirPath = null;
+        if (makeDir) {
+               try {
+                   FileUtils.forceMkdir(new File(fullDirPath));
+               } catch (IOException e) {
+                   LOG.error("Failed to create dir: {}", fullDirPath);
+                   fullDirPath = null;
+               }
         }
 
         return fullDirPath;
     }
+    
+    public static String createTwoLevelsDirectory(String basePath, String md5) 
{
+        return createTwoLevelsDirectory(basePath, md5, true);
+    }
 
     public static String createFileName(String md5, String fileBaseName, 
String fileExtension) {
         if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {


Reply via email to