Author: mattmann
Date: Fri Apr  3 14:36:05 2015
New Revision: 1671077

URL: http://svn.apache.org/r1671077
Log:
NUTCH-1975: New configuration for CommonCrawlDataDumper tool contributed by 
Giuseppe Totaro.

Added:
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
    nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr  3 14:36:05 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1975 New configuration for CommonCrawlDataDumper tool (Giuseppe Totaro 
via mattmann)
+
 * NUTCH-1979 CrawlDbReader to implement Tool (markus)
 
 * NUTCH-1970 Pretty print JSON output in config resource (Tyler Pasulich, 
mattmann)

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java 
Fri Apr  3 14:36:05 2015
@@ -20,6 +20,7 @@ package org.apache.nutch.tools;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
+import java.text.ParseException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
@@ -27,6 +28,8 @@ import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.ibm.icu.text.SimpleDateFormat;
+
 /**
  * Abstract class that implements {@see CommonCrawlFormat} interface. 
  *
@@ -44,14 +47,27 @@ public abstract class AbstractCommonCraw
        
        protected String keyPrefix;
        
-       public AbstractCommonCrawlFormat(String url, byte[] content, Metadata 
metadata, Configuration conf, String keyPrefix) throws IOException {
+       protected boolean simpleDateFormat;
+       
+       protected boolean jsonArray;
+       
+       protected boolean reverseKey;
+       
+       protected String reverseKeyValue;
+
+       public AbstractCommonCrawlFormat(String url, byte[] content, Metadata 
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException 
{
                this.url = url;
                this.content = content;
                this.metadata = metadata;
-               this.conf = conf;
-               this.keyPrefix = keyPrefix;
+               this.conf = nutchConf;
+               
+               this.keyPrefix = config.getKeyPrefix();
+               this.simpleDateFormat = config.getSimpleDateFormat();
+               this.jsonArray = config.getJsonArray();
+               this.reverseKey = config.getReverseKey();
+               this.reverseKeyValue = config.getReverseKeyValue();
        }
-       
+
        @Override
        public String getJsonData() throws IOException {
                try {
@@ -76,12 +92,14 @@ public abstract class AbstractCommonCraw
                        writeKeyValue("email", getRequestContactEmail());
                        closeObject("contact");
                        closeObject("client");
-                       startObject("headers");
-                       writeKeyValue("Accept", getRequestAccept());
-                       writeKeyValue("Accept-Encoding", 
getRequestAcceptEncoding());
-                       writeKeyValue("Accept-Language", 
getRequestAcceptLanguage());
-                       writeKeyValue("User-Agent", getRequestUserAgent());
-                       closeObject("headers");
+                       // start request headers
+                       startHeaders("headers", false, true);
+                       writeKeyValueWrapper("Accept", getRequestAccept());
+                       writeKeyValueWrapper("Accept-Encoding", 
getRequestAcceptEncoding());
+                       writeKeyValueWrapper("Accept-Language", 
getRequestAcceptLanguage());
+                       writeKeyValueWrapper("User-Agent", 
getRequestUserAgent());
+                       //closeObject("headers");
+                       closeHeaders("headers", false, true);
                        writeKeyNull("body");
                        closeObject("request");
                        
@@ -92,18 +110,19 @@ public abstract class AbstractCommonCraw
                        writeKeyValue("hostname", getResponseHostName());
                        writeKeyValue("address", getResponseAddress());
                        closeObject("server");
-                       startObject("headers");
-                       writeKeyValue("Content-Encoding", 
getResponseContentEncoding());
-                       writeKeyValue("Content-Type", getResponseContentType());
-                       writeKeyValue("Date", getResponseDate());
-                       writeKeyValue("Server", getResponseServer());
+                       // start response headers
+                       startHeaders("headers", false, true);
+                       writeKeyValueWrapper("Content-Encoding", 
getResponseContentEncoding());
+                       writeKeyValueWrapper("Content-Type", 
getResponseContentType());
+                       writeKeyValueWrapper("Date", getResponseDate());
+                       writeKeyValueWrapper("Server", getResponseServer());
                        for (String name : metadata.names()) {
                                if (name.equalsIgnoreCase("Content-Encoding") 
|| name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || 
name.equalsIgnoreCase("Server")) {
                                        continue;
                                }
-                               writeKeyValue(name, metadata.get(name));
+                               writeKeyValueWrapper(name, metadata.get(name));
                        }
-                       closeObject("headers");
+                       closeHeaders("headers", false, true);
                        writeKeyValue("body", getResponseContent());
                        closeObject("response");
                        
@@ -132,6 +151,12 @@ public abstract class AbstractCommonCraw
        
        protected abstract void writeKeyNull(String key) throws IOException;
        
+       protected abstract void startArray(String key, boolean nested, boolean 
newline) throws IOException;
+       
+       protected abstract void closeArray(String key, boolean nested, boolean 
newline) throws IOException;
+       
+       protected abstract void writeArrayValue(String value) throws 
IOException;
+       
        protected abstract void startObject(String key) throws IOException;
        
        protected abstract void closeObject(String key) throws IOException;
@@ -145,7 +170,18 @@ public abstract class AbstractCommonCraw
        }
        
        protected String getTimestamp() {
-               return metadata.get(ifNullString(Metadata.LAST_MODIFIED));
+               if (this.simpleDateFormat) {
+                       String timestamp = null;
+                       try {
+                               long epoch = new SimpleDateFormat("EEE, d MMM 
yyyy HH:mm:ss 
z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime();
+                               timestamp = String.valueOf(epoch);
+                       } catch (ParseException pe) {
+                               LOG.warn(pe.getMessage());
+                       }
+                       return timestamp;
+               } else {
+                       return 
ifNullString(metadata.get(Metadata.LAST_MODIFIED));
+               }
        }
        
        protected String getMethod() {
@@ -225,7 +261,18 @@ public abstract class AbstractCommonCraw
        }
        
        protected String getResponseDate() {
-               return ifNullString(metadata.get("Date"));
+               if (this.simpleDateFormat) {
+                       String timestamp = null;
+                       try {
+                               long epoch = new SimpleDateFormat("EEE, d MMM 
yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+                               timestamp = String.valueOf(epoch);
+                       } catch (ParseException pe) {
+                               LOG.warn(pe.getMessage());
+                       }
+                       return timestamp;
+               } else {
+                       return ifNullString(metadata.get("Date"));
+               }
        }
        
        protected String getResponseServer() {
@@ -237,14 +284,60 @@ public abstract class AbstractCommonCraw
        }
        
        protected String getKey() {
-               return url;
+               if (this.reverseKey) {
+                       return this.reverseKeyValue;
+               }
+               else {
+                       return url;
+               }
        }
        
        protected String getImported() {
-               return new String(""); // TODO
+               if (this.simpleDateFormat) {
+                       String timestamp = null;
+                       try {
+                               long epoch = new SimpleDateFormat("EEE, d MMM 
yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+                               timestamp = String.valueOf(epoch);
+                       } catch (ParseException pe) {
+                               LOG.warn(pe.getMessage());
+                       }
+                       return timestamp;
+               } else {
+                       return ifNullString(metadata.get("Date"));
+               }
        }
        
        private static String ifNullString(String value) {
                return (value != null) ? value : "";
        }
+       
+       private void startHeaders(String key, boolean nested, boolean newline) 
throws IOException {
+               if (this.jsonArray) {
+                       startArray(key, nested, newline);
+               }
+               else {
+                       startObject(key);
+               }
+       }
+       
+       private void closeHeaders(String key, boolean nested, boolean newline) 
throws IOException {
+               if (this.jsonArray) {
+                       closeArray(key, nested, newline);
+               }
+               else {
+                       closeObject(key);
+               }
+       }
+       
+       private void writeKeyValueWrapper(String key, String value) throws 
IOException {
+               if (this.jsonArray) {
+                       startArray(null, true, false);
+                       writeArrayValue(key);
+                       writeArrayValue(value);
+                       closeArray(null, true, false);
+               }
+               else {
+                       writeKeyValue(key, value);
+               }
+       }
 }

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java?rev=1671077&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java Fri Apr  
3 14:36:05 2015
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+public class CommonCrawlConfig implements Serializable {
+
+       /**
+        * Serial version UID
+        */
+       private static final long serialVersionUID = 5235013733207799661L;
+       
+       // Prefix for key value in the output format
+       private String keyPrefix = "";
+       
+       private boolean simpleDateFormat = false;
+       
+       private boolean jsonArray = false;
+       
+       private boolean reverseKey = false;
+       
+       private String reverseKeyValue = "";
+       
+       /**
+        * Default constructor
+        */
+       public CommonCrawlConfig() {
+               // TODO 
init(this.getClass().getResourceAsStream("CommonCrawlConfig.properties"));
+       }
+       
+       public CommonCrawlConfig(InputStream stream) {
+               init(stream);
+       }
+       
+       private void init(InputStream stream) {
+               if (stream == null) {
+                       return;
+               }
+               Properties properties = new Properties();
+               
+               try {
+                       properties.load(stream);
+               } catch (IOException e) {
+                       // TODO
+               } finally {
+                       try {
+                               stream.close();
+                       } catch (IOException e) {
+                               // TODO
+                       }
+               }
+
+               setKeyPrefix(properties.getProperty("keyPrefix", ""));
+               
setSimpleDateFormat(Boolean.parseBoolean(properties.getProperty("simpleDateFormat",
 "False")));
+               
setJsonArray(Boolean.parseBoolean(properties.getProperty("jsonArray", 
"False")));
+               
setReverseKey(Boolean.parseBoolean(properties.getProperty("reverseKey", 
"False")));
+       }
+       
+       public void setKeyPrefix(String keyPrefix) {
+               this.keyPrefix = keyPrefix;
+       }
+       
+       public void setSimpleDateFormat(boolean simpleDateFormat) {
+               this.simpleDateFormat = simpleDateFormat;
+       }
+       
+       public void setJsonArray(boolean jsonArray) {
+               this.jsonArray = jsonArray;
+       }
+       
+       public void setReverseKey(boolean reverseKey) {
+               this.reverseKey = reverseKey;
+       }
+       
+       public void setReverseKeyValue(String reverseKeyValue) {
+               this.reverseKeyValue = reverseKeyValue;
+       }
+       
+       public String getKeyPrefix() {
+               return this.keyPrefix;
+       }
+       
+       public boolean getSimpleDateFormat() {
+               return this.simpleDateFormat;
+       }
+       
+       public boolean getJsonArray() {
+               return this.jsonArray;
+       }
+       
+       public boolean getReverseKey() {
+               return this.reverseKey;
+       }
+       
+       public String getReverseKeyValue() {
+               return this.reverseKeyValue;
+       }
+}

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Fri 
Apr  3 14:36:05 2015
@@ -25,6 +25,9 @@ import java.io.File;
 import java.io.FileFilter;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Date;
@@ -38,6 +41,7 @@ import org.apache.commons.cli.HelpFormat
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
@@ -52,10 +56,10 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
-
 //Tika imports
 import org.apache.tika.Tika;
 
@@ -65,6 +69,7 @@ import com.fasterxml.jackson.dataformat.
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.ibm.icu.text.DateFormat;
 import com.ibm.icu.text.SimpleDateFormat;
 
 /**
@@ -165,6 +170,8 @@ public class CommonCrawlDataDumper {
 
        private static final Logger LOG = 
LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
        
+       private CommonCrawlConfig config = null;
+       
        // Gzip initialization
        private FileOutputStream fileOutput = null;
        private BufferedOutputStream bufOutput = null;
@@ -218,6 +225,26 @@ public class CommonCrawlDataDumper {
                                .hasArg(true)
                                .withDescription("an optional prefix for key in 
the output format.")
                                .create("keyPrefix");
+               Option simpleDateFormatOpt = OptionBuilder
+                               .withArgName("SimpleDateFormat")
+                               .hasArg(false)
+                               .withDescription("an optional format for 
timestamp in GMT epoch milliseconds.")
+                               .create("SimpleDateFormat");
+               Option epochFilenameOpt = OptionBuilder
+                               .withArgName("epochFilename")
+                               .hasArg(false)
+                               .withDescription("an optional format for output 
filename.")
+                               .create("epochFilename");
+               Option jsonArrayOpt = OptionBuilder
+                               .withArgName("jsonArray")
+                               .hasArg(false)
+                               .withDescription("an optional format for JSON 
output.")
+                               .create("jsonArray");
+               Option reverseKeyOpt = OptionBuilder
+                               .withArgName("reverseKey")
+                               .hasArg(false)
+                               .withDescription("an optional format for key 
value in JSON output.")
+                               .create("reverseKey");
 
                // create the options
                Options options = new Options();
@@ -229,6 +256,11 @@ public class CommonCrawlDataDumper {
                options.addOption(gzipOpt);
                // create keyPrefix option
                options.addOption(keyPrefixOpt);
+               // create simpleDateFormat, epochFilename, jsonArray and reverseKey options
+               options.addOption(simpleDateFormatOpt);
+               options.addOption(epochFilenameOpt);
+               options.addOption(jsonArrayOpt);
+               options.addOption(reverseKeyOpt);
 
                CommandLineParser parser = new GnuParser();
                try {
@@ -243,7 +275,18 @@ public class CommonCrawlDataDumper {
                        File segmentRootDir = new 
File(line.getOptionValue("segment"));
                        String[] mimeTypes = line.getOptionValues("mimetype");
                        boolean gzip = line.hasOption("gzip");
+                       boolean epochFilename = line.hasOption("epochFilename");
+                       
                        String keyPrefix = line.getOptionValue("keyPrefix", "");
+                       boolean simpleDateFormat = 
line.hasOption("SimpleDateFormat");
+                       boolean jsonArray = line.hasOption("jsonArray");
+                       boolean reverseKey = line.hasOption("reverseKey");
+                       
+                       CommonCrawlConfig config = new CommonCrawlConfig();
+                       config.setKeyPrefix(keyPrefix);
+                       config.setSimpleDateFormat(simpleDateFormat);
+                       config.setJsonArray(jsonArray);
+                       config.setReverseKey(reverseKey);
 
                        if (!outputDir.exists()) {
                                LOG.warn("Output directory: [" + 
outputDir.getAbsolutePath() + "]: does not exist, creating it.");
@@ -251,9 +294,9 @@ public class CommonCrawlDataDumper {
                                        throw new Exception("Unable to create: 
[" + outputDir.getAbsolutePath() + "]");
                        }
 
-                       CommonCrawlDataDumper dumper = new 
CommonCrawlDataDumper();
+                       CommonCrawlDataDumper dumper = new 
CommonCrawlDataDumper(config);
                        
-                       dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, 
keyPrefix);
+                       dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, 
epochFilename);
                        
                } catch (Exception e) {
                        LOG.error(CommonCrawlDataDumper.class.getName() + ": " 
+ StringUtils.stringifyException(e));
@@ -263,6 +306,13 @@ public class CommonCrawlDataDumper {
        }
        
        /**
+        * Constructor
+        */
+       public CommonCrawlDataDumper(CommonCrawlConfig config) {
+               this.config = config;
+       }
+       
+       /**
         * Dumps the reverse engineered CBOR content from the provided segment
         * directories if a parent directory contains more than one segment,
         * otherwise a single segment can be passed as an argument. If the 
boolean
@@ -281,8 +331,8 @@ public class CommonCrawlDataDumper {
      *            filtered out.
         * @throws Exception
         */
-       public void dump(File outputDir, File segmentRootDir, boolean gzip,     
String[] mimeTypes, String keyPrefix) throws Exception {
-               if (!gzip) {
+       public void dump(File outputDir, File segmentRootDir, boolean gzip,     
String[] mimeTypes, boolean epochFilename) throws Exception {
+               if (gzip) {
                        LOG.info("Gzipping CBOR data has been skipped");
                }
                // total file counts
@@ -290,8 +340,8 @@ public class CommonCrawlDataDumper {
                // filtered file counters
                Map<String, Integer> filteredCounts = new HashMap<String, 
Integer>();
                
-               Configuration conf = NutchConfiguration.create();
-               FileSystem fs = FileSystem.get(conf);
+               Configuration nutchConfig = NutchConfiguration.create();
+               FileSystem fs = FileSystem.get(nutchConfig);
                File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
                        @Override
                        public boolean accept(File file) {
@@ -311,8 +361,6 @@ public class CommonCrawlDataDumper {
 
                for (File segment : segmentDirs) {
                        LOG.info("Processing segment: [" + 
segment.getAbsolutePath() + "]");
-                       // GIUSEPPE: Never used (also in FileDumper.java)!
-                       //DataOutputStream doutputStream = null;
                        try {
                                String segmentContentPath = 
segment.getAbsolutePath() + File.separator + Content.DIR_NAME + 
"/part-00000/data";
                                Path file = new Path(segmentContentPath);
@@ -321,7 +369,7 @@ public class CommonCrawlDataDumper {
                                        LOG.warn("Skipping segment: [" + 
segmentContentPath     + "]: no data directory present");
                                        continue;
                                }
-                               SequenceFile.Reader reader = new 
SequenceFile.Reader(fs, file, conf);
+                               SequenceFile.Reader reader = new 
SequenceFile.Reader(fs, file, nutchConfig);
 
                                if (!new File(file.toString()).exists()) {
                                        LOG.warn("Skipping segment: [" + 
segmentContentPath     + "]: no data directory present");
@@ -334,21 +382,49 @@ public class CommonCrawlDataDumper {
                                while (reader.next(key)) {
                                        content = new Content();
                                        reader.getCurrentValue(content);
+                                       Metadata metadata = 
content.getMetadata();
                                        String url = key.toString();
                                        String baseName = 
FilenameUtils.getBaseName(url);
                                        String extension = 
FilenameUtils.getExtension(url);
-                                       if (extension == null || 
extension.equals("")) {
+                                       
+                                       if ((extension == null) || 
extension.isEmpty()) {
                                                extension = "html";
                                        }
-
-                                       String md5Ofurl = 
DumpFileUtil.getUrlMD5(url);
-                                       String fullDir = 
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, 
!gzip);
-                                       String filename = 
DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
-                                       String outputFullPath = 
String.format("%s/%s", fullDir, filename);
-
-                                       String [] fullPathLevels = 
fullDir.split(File.separator);
-                                       String firstLevelDirName = 
fullPathLevels[fullPathLevels.length-2]; 
-                                       String secondLevelDirName = 
fullPathLevels[fullPathLevels.length-1];
+                                       
+                                       String outputFullPath = null;
+                                       String outputRelativePath = null;
+                                       String filename = null;
+                                       String timestamp = null;
+                                       String reverseKey = null;
+                                       
+                                       if (epochFilename || 
config.getReverseKey()) {  
+                                               try {
+                                                       long epoch = new 
SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss 
z").parse(getDate(metadata.get("Date"))).getTime();
+                                                       timestamp = 
String.valueOf(epoch);
+                                               } catch (ParseException pe) {
+                                                       
LOG.warn(pe.getMessage());
+                                               }
+                                               
+                                               reverseKey = reverseUrl(url);
+                                               
config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" + 
DigestUtils.shaHex(url) + "_" + timestamp);
+                                       }       
+                                       
+                                       if (epochFilename) {
+                                               outputFullPath = 
DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, 
url, timestamp, extension, !gzip);
+                                               outputRelativePath = 
outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator)-1);
+                                               filename = 
content.getMetadata().get(Metadata.DATE) + "." + extension;
+                                       }
+                                       else {
+                                               String md5Ofurl = 
DumpFileUtil.getUrlMD5(url);
+                                               String fullDir = 
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, 
!gzip);
+                                               filename = 
DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
+                                               outputFullPath = 
String.format("%s/%s", fullDir, filename);
+       
+                                               String [] fullPathLevels = 
fullDir.split(File.separator);
+                                               String firstLevelDirName = 
fullPathLevels[fullPathLevels.length-2]; 
+                                               String secondLevelDirName = 
fullPathLevels[fullPathLevels.length-1];
+                                               outputRelativePath = 
firstLevelDirName + secondLevelDirName;
+                                       }
                                        
                                        // Encode all filetypes if no mimetypes 
have been given
                                        Boolean filter = (mimeTypes == null);
@@ -357,7 +433,7 @@ public class CommonCrawlDataDumper {
                                        try {
                                                String mimeType = new 
Tika().detect(content.getContent());
                                                // Maps file to JSON-based 
structure
-                                               CommonCrawlFormat format = 
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, 
content.getContent(), content.getMetadata(), conf, keyPrefix);
+                                               CommonCrawlFormat format = 
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, 
content.getContent(), metadata, nutchConfig, config);
                                                jsonData = format.getJsonData();
 
                                                collectStats(typeCounts, 
mimeType);
@@ -375,7 +451,6 @@ public class CommonCrawlDataDumper {
                                                byte[] byteData = 
serializeCBORData(jsonData);
                                                
                                                if (!gzip) {
-                                                       //String outputFullPath 
= outputDir + File.separator + filename;
                                                        File outputFile = new 
File(outputFullPath);
                                                        if 
(outputFile.exists()) {
                                                                
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
@@ -392,7 +467,8 @@ public class CommonCrawlDataDumper {
                                                        else {
                                                                
fileList.add(outputFullPath);
                                                                
LOG.info("Compressing: [" + outputFullPath + "]");
-                                                               TarArchiveEntry 
tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + 
secondLevelDirName + File.separator + filename);
+                                                               
//TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + 
File.separator + secondLevelDirName + File.separator + filename);
+                                                               TarArchiveEntry 
tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
                                                                
tarEntry.setSize(byteData.length);
                                                                
tarOutput.putArchiveEntry(tarEntry);
                                                                
tarOutput.write(byteData);
@@ -500,4 +576,41 @@ public class CommonCrawlDataDumper {
                }
                return builder.toString();
        }
+       
+       /**
+        * Returns the given timestamp unchanged when it carries a value; otherwise
+        * formats the current time with the pattern
+        * {@code EEE, d MMM yyyy HH:mm:ss z}.
+        * @param timestamp the timestamp, possibly null or empty
+        * @return {@code timestamp} itself if it is neither null nor empty,
+        *         otherwise the formatted current date
+        */
+       private String getDate(String timestamp) {
+               boolean hasValue = (timestamp != null) && !timestamp.isEmpty();
+               if (hasValue) {
+                       return timestamp;
+               }
+               // No usable timestamp supplied: fall back to "now".
+               return new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").format(new Date());
+       }
+       
+       /**
+        * Builds a path-like key from the host of the given URL by reversing the
+        * order of its dot-separated parts and joining them with "/"
+        * (for example, the host "www.example.com" yields "com/example/www").
+        * @param urlString the URL whose host is reversed
+        * @return the reversed host key, an empty string if the host has no
+        *         parts, or null if {@code urlString} cannot be parsed as a URL
+        */
+       public static String reverseUrl(String urlString) {
+               String[] hostPart;
+               try {
+                       hostPart = new URL(urlString).getHost().replace('.', '/').split("/");
+               } catch (MalformedURLException e) {
+                       LOG.error("Failed to parse URL: {}", urlString);
+                       return null;
+               }
+
+               // split() drops trailing empty strings, so a host made only of dots
+               // yields an empty array; indexing it would throw
+               // ArrayIndexOutOfBoundsException.
+               if (hostPart.length == 0) {
+                       return "";
+               }
+
+               StringBuilder sb = new StringBuilder(hostPart[hostPart.length - 1]);
+               for (int i = hostPart.length - 2; i >= 0; i--) {
+                       sb.append('/').append(hostPart[i]);
+               }
+               return sb.toString();
+       }
 }

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java 
Fri Apr  3 14:36:05 2015
@@ -34,24 +34,24 @@ public class CommonCrawlFormatFactory {
         * @param url the url.
         * @param content the content.
         * @param metadata the metadata.
-        * @param conf the configuration.
+        * @param nutchConf the configuration.
+        * @param config the CommonCrawl output configuration.
         * @return the new {@see CommonCrawlFormat} object.
         * @throws IOException If any I/O error occurs.
         */
-       public static CommonCrawlFormat getCommonCrawlFormat(String formatType, 
String url, byte[] content,
-                       Metadata metadata, Configuration conf, String 
keyPrefix) throws IOException {
+       public static CommonCrawlFormat getCommonCrawlFormat(String formatType, 
String url, byte[] content,     Metadata metadata, Configuration nutchConf, 
CommonCrawlConfig config) throws IOException {
                if (formatType == null) {
                        return null;
                }
                
                if (formatType.equalsIgnoreCase("jackson")) {
-                       return new CommonCrawlFormatJackson(url, content, 
metadata, conf, keyPrefix);
+                       return new CommonCrawlFormatJackson(url, content, 
metadata, nutchConf, config);
                }
                else if (formatType.equalsIgnoreCase("jettinson")) {
-                       return new CommonCrawlFormatJettinson(url, content, 
metadata, conf, keyPrefix);
+                       return new CommonCrawlFormatJettinson(url, content, 
metadata, nutchConf, config);
                }
                else if (formatType.equalsIgnoreCase("simple")) {
-                       return new CommonCrawlFormatSimple(url, content, 
metadata, conf, keyPrefix);
+                       return new CommonCrawlFormatSimple(url, content, 
metadata, nutchConf, config);
                }
                
                return null;

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java 
Fri Apr  3 14:36:05 2015
@@ -32,15 +32,13 @@ import com.fasterxml.jackson.core.JsonGe
  */
 public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
        
-       //private static final Logger LOG = 
LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
-       
        private ByteArrayOutputStream out;
        
        private JsonGenerator generator;
 
-       public CommonCrawlFormatJackson(String url, byte[] content,
-                       Metadata metadata, Configuration conf, String 
keyPrefix) throws IOException {
-               super(url, content, metadata, conf, keyPrefix);
+       
+       public CommonCrawlFormatJackson(String url, byte[] content, Metadata 
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException 
{
+               super(url, content, metadata, nutchConf, config);
                
                JsonFactory factory = new JsonFactory();
                this.out = new ByteArrayOutputStream();
@@ -58,7 +56,25 @@ public class CommonCrawlFormatJackson ex
        @Override
        protected void writeKeyNull(String key) throws IOException {
                generator.writeFieldName(key);
-               generator.writeNull();;
+               generator.writeNull();
+       }
+       
+       /**
+        * Opens a JSON array on the generator, first emitting {@code key} as the
+        * field name when one is given. The {@code nested} and {@code newline}
+        * flags are not used by this implementation.
+        */
+       @Override
+       protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+               if (key == null) {
+                       // Anonymous array (e.g. an element of another array).
+                       generator.writeStartArray();
+                       return;
+               }
+               generator.writeFieldName(key);
+               generator.writeStartArray();
+       }
+       
+       /**
+        * Closes the array currently open on the generator. None of the
+        * arguments is used by this implementation.
+        */
+       @Override
+       protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+               this.generator.writeEndArray();
+       }
+       
+       /**
+        * Writes {@code value} as a string through the generator.
+        */
+       @Override
+       protected void writeArrayValue(String value) throws IOException {
+               this.generator.writeString(value);
        }
        
        @Override

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java 
Fri Apr  3 14:36:05 2015
@@ -23,6 +23,7 @@ import java.util.Deque;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
+import org.codehaus.jettison.json.JSONArray;
 import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
 
@@ -32,19 +33,21 @@ import org.codehaus.jettison.json.JSONOb
  */
 public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
        
-       private Deque<JSONObject> stack;
+       private Deque<JSONObject> stackObjects;
+       
+       private Deque<JSONArray> stackArrays;
 
-       public CommonCrawlFormatJettinson(String url, byte[] content,
-                       Metadata metadata, Configuration conf, String 
keyPrefix) throws IOException {
-               super(url, content, metadata, conf, keyPrefix);
+       public CommonCrawlFormatJettinson(String url, byte[] content, Metadata 
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException 
{
+               super(url, content, metadata, nutchConf, config);
                
-               stack = new ArrayDeque<JSONObject>();
+               stackObjects = new ArrayDeque<JSONObject>();
+               stackArrays = new ArrayDeque<JSONArray>();
        }
        
        @Override
        protected void writeKeyValue(String key, String value) throws 
IOException {
                try {
-                       stack.getFirst().put(key, value);
+                       stackObjects.getFirst().put(key, value);
                } catch (JSONException jsone) {
                        throw new IOException(jsone.getMessage());
                }
@@ -53,24 +56,54 @@ public class CommonCrawlFormatJettinson
        @Override
        protected void writeKeyNull(String key) throws IOException {
                try {
-                       stack.getFirst().put(key, JSONObject.NULL);
+                       stackObjects.getFirst().put(key, JSONObject.NULL);
+               } catch (JSONException jsone) {
+                       throw new IOException(jsone.getMessage());
+               }
+       }
+       
+       /**
+        * Opens a new array by pushing an empty {@code JSONArray} onto the array
+        * stack. The arguments are not used here; the array is attached to its
+        * parent when it is closed.
+        */
+       @Override
+       protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+               stackArrays.push(new JSONArray());
+       }
+       
+       /**
+        * Closes the most recently opened array and attaches it to its parent:
+        * to the enclosing array when {@code nested} is true and such an array is
+        * open, otherwise to the current object under {@code key}.
+        * @param key the field name used when the array is attached to an object
+        * @param nested whether the array is an element of another array
+        * @param newline not used by this implementation
+        * @throws IOException if the underlying JSON library rejects the value
+        */
+       @Override
+       protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+               try {
+                       // The array stack starts empty (unlike the object stack there is
+                       // no root element to preserve), so a first-level array sits at
+                       // depth 1 and must be popped and attached as well.
+                       if (!stackArrays.isEmpty()) {
+                               JSONArray array = stackArrays.pop();
+                               if (nested && !stackArrays.isEmpty()) {
+                                       stackArrays.getFirst().put(array);
+                               }
+                               else {
+                                       stackObjects.getFirst().put(key, array);
+                               }
+                       }
                } catch (JSONException jsone) {
                        throw new IOException(jsone.getMessage());
                }
        }
        
        @Override
+       protected void writeArrayValue(String value) throws IOException {
+               // Appends the value to the innermost open array. The array stack
+               // starts empty, so a first-level array sits at depth 1; any open
+               // array accepts values, and the call is a no-op only when no array
+               // is open.
+               if (!stackArrays.isEmpty()) {
+                       stackArrays.getFirst().put(value);
+               }
+       }
+       
+       @Override
        protected void startObject(String key) throws IOException {
                JSONObject object = new JSONObject();
-               stack.push(object);
+               stackObjects.push(object);
        }
        
        @Override
        protected void closeObject(String key) throws IOException {
                try {
-                       if (stack.size() > 1) {
-                               JSONObject object = stack.pop();
-                               stack.getFirst().put(key, object);
+                       if (stackObjects.size() > 1) {
+                               JSONObject object = stackObjects.pop();
+                               stackObjects.getFirst().put(key, object);
                        }
                } catch (JSONException jsone) {
                        throw new IOException(jsone.getMessage());
@@ -80,7 +113,7 @@ public class CommonCrawlFormatJettinson
        @Override
        protected String generateJson() throws IOException {
                try {
-                       return stack.getFirst().toString(2);
+                       return stackObjects.getFirst().toString(2);
                } catch (JSONException jsone) {
                        throw new IOException(jsone.getMessage());
                }

Modified: 
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java 
Fri Apr  3 14:36:05 2015
@@ -32,22 +32,53 @@ public class CommonCrawlFormatSimple ext
        
        private int tabCount;
        
-       public CommonCrawlFormatSimple(String url, byte[] content, Metadata 
metadata,
-                       Configuration conf, String keyPrefix) throws 
IOException {
-               super(url, content, metadata, conf, keyPrefix);
+       public CommonCrawlFormatSimple(String url, byte[] content, Metadata 
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException 
{
+               super(url, content, metadata, nutchConf, config);
                
                this.sb = new StringBuilder();
                this.tabCount = 0;
        }
        
+       @Override
        protected void writeKeyValue(String key, String value) throws 
IOException {
                sb.append(printTabs() + "\"" + key + "\": " + quote(value) + 
",\n");
        }
        
+       @Override
        protected void writeKeyNull(String key) throws IOException {
                sb.append(printTabs() + "\"" + key + "\": null,\n");
        }
        
+       /**
+        * Appends the opening bracket of an array to the buffer, preceded by the
+        * current indentation and, when {@code key} is not null, by the quoted
+        * field name. When {@code newline} is true a line break follows the
+        * bracket and the indentation level is increased.
+        */
+       @Override
+       protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+               sb.append(printTabs());
+               if (key != null) {
+                       sb.append("\"").append(key).append("\": ");
+               }
+               sb.append("[");
+               if (newline) {
+                       sb.append("\n");
+                       this.tabCount++;
+               }
+       }
+       
+       /**
+        * Appends the closing bracket of an array to the buffer after removing
+        * the separator left behind by the last element. When {@code newline} is
+        * true the indentation level is decreased first and the bracket is
+        * written on its own indented line.
+        * @param key not used by this implementation
+        * @param nested not used by this implementation
+        * @param newline whether the array was opened in multi-line form
+        */
+       @Override
+       protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+               // Drop the separator of the last element: either a bare "," (as
+               // written by writeArrayValue) or the "," of a trailing ",\n".
+               int length = sb.length();
+               if (length >= 1 && sb.charAt(length - 1) == ',') {
+                       sb.deleteCharAt(length - 1); // delete comma
+               }
+               else if (length >= 2 && sb.charAt(length - 2) == ',') {
+                       sb.deleteCharAt(length - 2); // delete comma
+               }
+               if (newline) {
+                       // Undo the indent added by startArray before printing the
+                       // closing bracket, as closeObject does for objects.
+                       this.tabCount--;
+                       sb.append(printTabs());
+               }
+               sb.append("],\n");
+       }
+       
+       /**
+        * Appends {@code value} as an array element followed by a comma.
+        * @param value the element to write
+        * @throws IOException declared for consistency with the other format
+        *         implementations of this method
+        */
+       @Override
+       protected void writeArrayValue(String value) throws IOException {
+               // Use the same quoting helper as writeKeyValue so that quotes or
+               // backslashes inside the value cannot break the generated JSON.
+               // NOTE(review): assumes quote() adds the surrounding double quotes,
+               // as its use in writeKeyValue suggests - confirm.
+               sb.append(quote(value) + ",");
+       }
+       
        protected void startObject(String key) throws IOException {
                String name = "";
                if (key != null) {
@@ -58,7 +89,9 @@ public class CommonCrawlFormatSimple ext
        }
        
        protected void closeObject(String key) throws IOException {
-               sb.deleteCharAt(sb.length()-2); // delete comma
+               if (sb.charAt(sb.length()-2) == ',') {
+                       sb.deleteCharAt(sb.length()-2); // delete comma
+               }
                this.tabCount--;
                sb.append(printTabs() + "},\n");
        }

Modified: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Fri Apr  3 
14:36:05 2015
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.util;
 
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.io.MD5Hash;
@@ -81,4 +82,26 @@ public class DumpFileUtil {
 
         return String.format(FILENAME_PATTERN, md5, fileBaseName, 
fileExtension);
     }
+    
+    /**
+     * Builds the output file path
+     * {@code basePath/reverseKey/<DigestUtils.shaHex(urlString)>/epochScrapeTime.fileExtension},
+     * optionally creating the directory part on disk.
+     * @param basePath the root output directory
+     * @param reverseKey the sub-path placed directly under {@code basePath}
+     * @param urlString the URL whose hex digest names the innermost directory
+     * @param epochScrapeTime the base name of the file
+     * @param fileExtension the file extension, truncated to
+     *        {@code MAX_LENGTH_OF_EXTENSION} characters when longer
+     * @param makeDir whether the directory should be created
+     * @return the full path of the output file; it is returned even when the
+     *         directory could not be created (the failure is logged)
+     */
+    public static String createFileNameFromUrl(String basePath, String reverseKey, String urlString, String epochScrapeTime, String fileExtension, boolean makeDir) {
+               String fullDirPath = basePath + File.separator + reverseKey + File.separator + DigestUtils.shaHex(urlString);
+
+               if (makeDir) {
+                       try {
+                               FileUtils.forceMkdir(new File(fullDirPath));
+                       } catch (IOException e) {
+                               // Keep the intended path: resetting it to null here made the
+                               // concatenation below return a bogus path starting with the
+                               // literal text "null".
+                               LOG.error("Failed to create dir: {}", fullDirPath);
+                       }
+               }
+
+               if (fileExtension.length() > MAX_LENGTH_OF_EXTENSION) {
+                       LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION);
+                       fileExtension = StringUtils.substring(fileExtension, 0, MAX_LENGTH_OF_EXTENSION);
+               }
+
+               return fullDirPath + File.separator + epochScrapeTime + "." + fileExtension;
+    }
 }


Reply via email to