Author: mattmann
Date: Fri Apr 3 14:36:05 2015
New Revision: 1671077
URL: http://svn.apache.org/r1671077
Log:
NUTCH-1975: New configuration for CommonCrawlDataDumper tool contributed by
Giuseppe Totaro.
Added:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr 3 14:36:05 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1975 New configuration for CommonCrawlDataDumper tool (Giuseppe Totaro
via mattmann)
+
* NUTCH-1979 CrawlDbReader to implement Tool (markus)
* NUTCH-1970 Pretty print JSON output in config resource (Tyler Pasulich,
mattmann)
Modified:
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
Fri Apr 3 14:36:05 2015
@@ -20,6 +20,7 @@ package org.apache.nutch.tools;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
+import java.text.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
@@ -27,6 +28,8 @@ import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.ibm.icu.text.SimpleDateFormat;
+
/**
* Abstract class that implements {@see CommonCrawlFormat} interface.
*
@@ -44,14 +47,27 @@ public abstract class AbstractCommonCraw
protected String keyPrefix;
- public AbstractCommonCrawlFormat(String url, byte[] content, Metadata
metadata, Configuration conf, String keyPrefix) throws IOException {
+ protected boolean simpleDateFormat;
+
+ protected boolean jsonArray;
+
+ protected boolean reverseKey;
+
+ protected String reverseKeyValue;
+
+ public AbstractCommonCrawlFormat(String url, byte[] content, Metadata
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException
{
this.url = url;
this.content = content;
this.metadata = metadata;
- this.conf = conf;
- this.keyPrefix = keyPrefix;
+ this.conf = nutchConf;
+
+ this.keyPrefix = config.getKeyPrefix();
+ this.simpleDateFormat = config.getSimpleDateFormat();
+ this.jsonArray = config.getJsonArray();
+ this.reverseKey = config.getReverseKey();
+ this.reverseKeyValue = config.getReverseKeyValue();
}
-
+
@Override
public String getJsonData() throws IOException {
try {
@@ -76,12 +92,14 @@ public abstract class AbstractCommonCraw
writeKeyValue("email", getRequestContactEmail());
closeObject("contact");
closeObject("client");
- startObject("headers");
- writeKeyValue("Accept", getRequestAccept());
- writeKeyValue("Accept-Encoding",
getRequestAcceptEncoding());
- writeKeyValue("Accept-Language",
getRequestAcceptLanguage());
- writeKeyValue("User-Agent", getRequestUserAgent());
- closeObject("headers");
+ // start request headers
+ startHeaders("headers", false, true);
+ writeKeyValueWrapper("Accept", getRequestAccept());
+ writeKeyValueWrapper("Accept-Encoding",
getRequestAcceptEncoding());
+ writeKeyValueWrapper("Accept-Language",
getRequestAcceptLanguage());
+ writeKeyValueWrapper("User-Agent",
getRequestUserAgent());
+ //closeObject("headers");
+ closeHeaders("headers", false, true);
writeKeyNull("body");
closeObject("request");
@@ -92,18 +110,19 @@ public abstract class AbstractCommonCraw
writeKeyValue("hostname", getResponseHostName());
writeKeyValue("address", getResponseAddress());
closeObject("server");
- startObject("headers");
- writeKeyValue("Content-Encoding",
getResponseContentEncoding());
- writeKeyValue("Content-Type", getResponseContentType());
- writeKeyValue("Date", getResponseDate());
- writeKeyValue("Server", getResponseServer());
+ // start response headers
+ startHeaders("headers", false, true);
+ writeKeyValueWrapper("Content-Encoding",
getResponseContentEncoding());
+ writeKeyValueWrapper("Content-Type",
getResponseContentType());
+ writeKeyValueWrapper("Date", getResponseDate());
+ writeKeyValueWrapper("Server", getResponseServer());
for (String name : metadata.names()) {
if (name.equalsIgnoreCase("Content-Encoding")
|| name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") ||
name.equalsIgnoreCase("Server")) {
continue;
}
- writeKeyValue(name, metadata.get(name));
+ writeKeyValueWrapper(name, metadata.get(name));
}
- closeObject("headers");
+ closeHeaders("headers", false, true);
writeKeyValue("body", getResponseContent());
closeObject("response");
@@ -132,6 +151,12 @@ public abstract class AbstractCommonCraw
protected abstract void writeKeyNull(String key) throws IOException;
+ protected abstract void startArray(String key, boolean nested, boolean
newline) throws IOException;
+
+ protected abstract void closeArray(String key, boolean nested, boolean
newline) throws IOException;
+
+ protected abstract void writeArrayValue(String value) throws
IOException;
+
protected abstract void startObject(String key) throws IOException;
protected abstract void closeObject(String key) throws IOException;
@@ -145,7 +170,18 @@ public abstract class AbstractCommonCraw
}
protected String getTimestamp() {
- return metadata.get(ifNullString(Metadata.LAST_MODIFIED));
+ if (this.simpleDateFormat) {
+ String timestamp = null;
+ try {
+ long epoch = new SimpleDateFormat("EEE, d MMM
yyyy HH:mm:ss
z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime();
+ timestamp = String.valueOf(epoch);
+ } catch (ParseException pe) {
+ LOG.warn(pe.getMessage());
+ }
+ return timestamp;
+ } else {
+ return
ifNullString(metadata.get(Metadata.LAST_MODIFIED));
+ }
}
protected String getMethod() {
@@ -225,7 +261,18 @@ public abstract class AbstractCommonCraw
}
protected String getResponseDate() {
- return ifNullString(metadata.get("Date"));
+ if (this.simpleDateFormat) {
+ String timestamp = null;
+ try {
+ long epoch = new SimpleDateFormat("EEE, d MMM
yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+ timestamp = String.valueOf(epoch);
+ } catch (ParseException pe) {
+ LOG.warn(pe.getMessage());
+ }
+ return timestamp;
+ } else {
+ return ifNullString(metadata.get("Date"));
+ }
}
protected String getResponseServer() {
@@ -237,14 +284,60 @@ public abstract class AbstractCommonCraw
}
protected String getKey() {
- return url;
+ if (this.reverseKey) {
+ return this.reverseKeyValue;
+ }
+ else {
+ return url;
+ }
}
protected String getImported() {
- return new String(""); // TODO
+ if (this.simpleDateFormat) {
+ String timestamp = null;
+ try {
+ long epoch = new SimpleDateFormat("EEE, d MMM
yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+ timestamp = String.valueOf(epoch);
+ } catch (ParseException pe) {
+ LOG.warn(pe.getMessage());
+ }
+ return timestamp;
+ } else {
+ return ifNullString(metadata.get("Date"));
+ }
}
private static String ifNullString(String value) {
return (value != null) ? value : "";
}
+
+ /**
+  * Opens a headers section: a JSON array when the jsonArray option is
+  * enabled, a plain JSON object otherwise.
+  *
+  * @param key name of the section being opened
+  * @param nested forwarded to startArray (array mode only)
+  * @param newline forwarded to startArray (array mode only)
+  * @throws IOException if the underlying writer fails
+  */
+ private void startHeaders(String key, boolean nested, boolean newline) throws IOException {
+   if (!this.jsonArray) {
+     startObject(key);
+     return;
+   }
+   startArray(key, nested, newline);
+ }
+
+ /**
+  * Closes a headers section previously opened with startHeaders, using the
+  * array or object variant according to the jsonArray option.
+  *
+  * @param key name of the section being closed
+  * @param nested forwarded to closeArray (array mode only)
+  * @param newline forwarded to closeArray (array mode only)
+  * @throws IOException if the underlying writer fails
+  */
+ private void closeHeaders(String key, boolean nested, boolean newline) throws IOException {
+   if (!this.jsonArray) {
+     closeObject(key);
+     return;
+   }
+   closeArray(key, nested, newline);
+ }
+
+ /**
+  * Writes one header entry. In object mode this is a regular key/value
+  * field; in jsonArray mode the entry becomes a nested, single-line,
+  * two-element array holding the key followed by the value.
+  *
+  * @param key header name
+  * @param value header value
+  * @throws IOException if the underlying writer fails
+  */
+ private void writeKeyValueWrapper(String key, String value) throws IOException {
+   if (!this.jsonArray) {
+     writeKeyValue(key, value);
+     return;
+   }
+   // Array mode: anonymous nested array, no line break inside the pair.
+   startArray(null, true, false);
+   writeArrayValue(key);
+   writeArrayValue(value);
+   closeArray(null, true, false);
+ }
}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java?rev=1671077&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlConfig.java Fri Apr
3 14:36:05 2015
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+/**
+ * Serializable holder for the output options of the CommonCrawl dumper:
+ * a key prefix plus the simpleDateFormat, jsonArray and reverseKey switches
+ * and the per-record reverse key value. Options can be loaded from a
+ * properties stream or set individually through the setters.
+ */
+public class CommonCrawlConfig implements Serializable {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 5235013733207799661L;
+
+ // Prefix for key value in the output format
+ private String keyPrefix = "";
+
+ // When true, AbstractCommonCrawlFormat emits timestamps as epoch
+ // milliseconds instead of the raw header string.
+ private boolean simpleDateFormat = false;
+
+ // When true, headers are written as arrays of [key, value] pairs instead
+ // of a JSON object.
+ private boolean jsonArray = false;
+
+ // When true, the record key is reverseKeyValue rather than the URL.
+ private boolean reverseKey = false;
+
+ // Key used when reverseKey is enabled; not read from the properties
+ // stream, only assigned through setReverseKeyValue.
+ private String reverseKeyValue = "";
+
+ /**
+ * Default constructor
+ */
+ public CommonCrawlConfig() {
+ // NOTE(review): the init(...) text below looks like the mail-wrapped
+ // tail of the TODO comment (i.e. loading CommonCrawlConfig.properties is
+ // disabled and the field defaults above apply) - confirm against the
+ // repository copy.
+ // TODO
init(this.getClass().getResourceAsStream("CommonCrawlConfig.properties"));
+ }
+
+ /**
+ * Creates a configuration from a properties stream.
+ *
+ * @param stream properties to read; a null stream leaves the defaults
+ */
+ public CommonCrawlConfig(InputStream stream) {
+ init(stream);
+ }
+
+ /**
+ * Loads keyPrefix, simpleDateFormat, jsonArray and reverseKey from the
+ * given properties stream and closes the stream afterwards. A null stream
+ * returns immediately. An IOException while loading or closing is
+ * ignored, in which case missing properties fall back to the defaults
+ * passed to getProperty below.
+ */
+ private void init(InputStream stream) {
+ if (stream == null) {
+ return;
+ }
+ Properties properties = new Properties();
+
+ try {
+ properties.load(stream);
+ } catch (IOException e) {
+ // TODO
+ } finally {
+ try {
+ stream.close();
+ } catch (IOException e) {
+ // TODO
+ }
+ }
+
+ // Boolean.parseBoolean is case-insensitive, so the "False" defaults
+ // below evaluate to false.
+ setKeyPrefix(properties.getProperty("keyPrefix", ""));
+
setSimpleDateFormat(Boolean.parseBoolean(properties.getProperty("simpleDateFormat",
"False")));
+
setJsonArray(Boolean.parseBoolean(properties.getProperty("jsonArray",
"False")));
+
setReverseKey(Boolean.parseBoolean(properties.getProperty("reverseKey",
"False")));
+ }
+
+ /** Sets the prefix for key values in the output format. */
+ public void setKeyPrefix(String keyPrefix) {
+ this.keyPrefix = keyPrefix;
+ }
+
+ /** Enables or disables epoch-millisecond timestamps. */
+ public void setSimpleDateFormat(boolean simpleDateFormat) {
+ this.simpleDateFormat = simpleDateFormat;
+ }
+
+ /** Enables or disables array-style header output. */
+ public void setJsonArray(boolean jsonArray) {
+ this.jsonArray = jsonArray;
+ }
+
+ /** Enables or disables use of the reverse key as record key. */
+ public void setReverseKey(boolean reverseKey) {
+ this.reverseKey = reverseKey;
+ }
+
+ /** Sets the key returned for a record when reverseKey is enabled. */
+ public void setReverseKeyValue(String reverseKeyValue) {
+ this.reverseKeyValue = reverseKeyValue;
+ }
+
+ /** @return the prefix for key values in the output format */
+ public String getKeyPrefix() {
+ return this.keyPrefix;
+ }
+
+ /** @return true if timestamps are emitted as epoch milliseconds */
+ public boolean getSimpleDateFormat() {
+ return this.simpleDateFormat;
+ }
+
+ /** @return true if headers are written as arrays */
+ public boolean getJsonArray() {
+ return this.jsonArray;
+ }
+
+ /** @return true if the reverse key replaces the URL as record key */
+ public boolean getReverseKey() {
+ return this.reverseKey;
+ }
+
+ /** @return the key used when reverseKey is enabled */
+ public String getReverseKeyValue() {
+ return this.reverseKeyValue;
+ }
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Fri
Apr 3 14:36:05 2015
@@ -25,6 +25,9 @@ import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
@@ -38,6 +41,7 @@ import org.apache.commons.cli.HelpFormat
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
@@ -52,10 +56,10 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
-
//Tika imports
import org.apache.tika.Tika;
@@ -65,6 +69,7 @@ import com.fasterxml.jackson.dataformat.
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.SimpleDateFormat;
/**
@@ -165,6 +170,8 @@ public class CommonCrawlDataDumper {
private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+ private CommonCrawlConfig config = null;
+
// Gzip initialization
private FileOutputStream fileOutput = null;
private BufferedOutputStream bufOutput = null;
@@ -218,6 +225,26 @@ public class CommonCrawlDataDumper {
.hasArg(true)
.withDescription("an optional prefix for key in
the output format.")
.create("keyPrefix");
+ Option simpleDateFormatOpt = OptionBuilder
+ .withArgName("SimpleDateFormat")
+ .hasArg(false)
+ .withDescription("an optional format for
timestamp in GMT epoch milliseconds.")
+ .create("SimpleDateFormat");
+ Option epochFilenameOpt = OptionBuilder
+ .withArgName("epochFilename")
+ .hasArg(false)
+ .withDescription("an optional format for output
filename.")
+ .create("epochFilename");
+ Option jsonArrayOpt = OptionBuilder
+ .withArgName("jsonArray")
+ .hasArg(false)
+ .withDescription("an optional format for JSON
output.")
+ .create("jsonArray");
+ Option reverseKeyOpt = OptionBuilder
+ .withArgName("reverseKey")
+ .hasArg(false)
+ .withDescription("an optional format for key
value in JSON output.")
+ .create("reverseKey");
// create the options
Options options = new Options();
@@ -229,6 +256,11 @@ public class CommonCrawlDataDumper {
options.addOption(gzipOpt);
// create keyPrefix option
options.addOption(keyPrefixOpt);
+ // create simpleDataFormat option
+ options.addOption(simpleDateFormatOpt);
+ options.addOption(epochFilenameOpt);
+ options.addOption(jsonArrayOpt);
+ options.addOption(reverseKeyOpt);
CommandLineParser parser = new GnuParser();
try {
@@ -243,7 +275,18 @@ public class CommonCrawlDataDumper {
File segmentRootDir = new
File(line.getOptionValue("segment"));
String[] mimeTypes = line.getOptionValues("mimetype");
boolean gzip = line.hasOption("gzip");
+ boolean epochFilename = line.hasOption("epochFilename");
+
String keyPrefix = line.getOptionValue("keyPrefix", "");
+ boolean simpleDateFormat =
line.hasOption("SimpleDateFormat");
+ boolean jsonArray = line.hasOption("jsonArray");
+ boolean reverseKey = line.hasOption("reverseKey");
+
+ CommonCrawlConfig config = new CommonCrawlConfig();
+ config.setKeyPrefix(keyPrefix);
+ config.setSimpleDateFormat(simpleDateFormat);
+ config.setJsonArray(jsonArray);
+ config.setReverseKey(reverseKey);
if (!outputDir.exists()) {
LOG.warn("Output directory: [" +
outputDir.getAbsolutePath() + "]: does not exist, creating it.");
@@ -251,9 +294,9 @@ public class CommonCrawlDataDumper {
throw new Exception("Unable to create:
[" + outputDir.getAbsolutePath() + "]");
}
- CommonCrawlDataDumper dumper = new
CommonCrawlDataDumper();
+ CommonCrawlDataDumper dumper = new
CommonCrawlDataDumper(config);
- dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes,
keyPrefix);
+ dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes,
epochFilename);
} catch (Exception e) {
LOG.error(CommonCrawlDataDumper.class.getName() + ": "
+ StringUtils.stringifyException(e));
@@ -263,6 +306,13 @@ public class CommonCrawlDataDumper {
}
/**
+ * Constructor
+ */
+ public CommonCrawlDataDumper(CommonCrawlConfig config) {
+ this.config = config;
+ }
+
+ /**
* Dumps the reverse engineered CBOR content from the provided segment
* directories if a parent directory contains more than one segment,
* otherwise a single segment can be passed as an argument. If the
boolean
@@ -281,8 +331,8 @@ public class CommonCrawlDataDumper {
* filtered out.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, boolean gzip,
String[] mimeTypes, String keyPrefix) throws Exception {
- if (!gzip) {
+ public void dump(File outputDir, File segmentRootDir, boolean gzip,
String[] mimeTypes, boolean epochFilename) throws Exception {
+ if (gzip) {
LOG.info("Gzipping CBOR data has been skipped");
}
// total file counts
@@ -290,8 +340,8 @@ public class CommonCrawlDataDumper {
// filtered file counters
Map<String, Integer> filteredCounts = new HashMap<String,
Integer>();
- Configuration conf = NutchConfiguration.create();
- FileSystem fs = FileSystem.get(conf);
+ Configuration nutchConfig = NutchConfiguration.create();
+ FileSystem fs = FileSystem.get(nutchConfig);
File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
@Override
public boolean accept(File file) {
@@ -311,8 +361,6 @@ public class CommonCrawlDataDumper {
for (File segment : segmentDirs) {
LOG.info("Processing segment: [" +
segment.getAbsolutePath() + "]");
- // GIUSEPPE: Never used (also in FileDumper.java)!
- //DataOutputStream doutputStream = null;
try {
String segmentContentPath =
segment.getAbsolutePath() + File.separator + Content.DIR_NAME +
"/part-00000/data";
Path file = new Path(segmentContentPath);
@@ -321,7 +369,7 @@ public class CommonCrawlDataDumper {
LOG.warn("Skipping segment: [" +
segmentContentPath + "]: no data directory present");
continue;
}
- SequenceFile.Reader reader = new
SequenceFile.Reader(fs, file, conf);
+ SequenceFile.Reader reader = new
SequenceFile.Reader(fs, file, nutchConfig);
if (!new File(file.toString()).exists()) {
LOG.warn("Skipping segment: [" +
segmentContentPath + "]: no data directory present");
@@ -334,21 +382,49 @@ public class CommonCrawlDataDumper {
while (reader.next(key)) {
content = new Content();
reader.getCurrentValue(content);
+ Metadata metadata =
content.getMetadata();
String url = key.toString();
String baseName =
FilenameUtils.getBaseName(url);
String extension =
FilenameUtils.getExtension(url);
- if (extension == null ||
extension.equals("")) {
+
+ if ((extension == null) ||
extension.isEmpty()) {
extension = "html";
}
-
- String md5Ofurl =
DumpFileUtil.getUrlMD5(url);
- String fullDir =
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl,
!gzip);
- String filename =
DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
- String outputFullPath =
String.format("%s/%s", fullDir, filename);
-
- String [] fullPathLevels =
fullDir.split(File.separator);
- String firstLevelDirName =
fullPathLevels[fullPathLevels.length-2];
- String secondLevelDirName =
fullPathLevels[fullPathLevels.length-1];
+
+ String outputFullPath = null;
+ String outputRelativePath = null;
+ String filename = null;
+ String timestamp = null;
+ String reverseKey = null;
+
+ if (epochFilename ||
config.getReverseKey()) {
+ try {
+ long epoch = new
SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss
z").parse(getDate(metadata.get("Date"))).getTime();
+ timestamp =
String.valueOf(epoch);
+ } catch (ParseException pe) {
+
LOG.warn(pe.getMessage());
+ }
+
+ reverseKey = reverseUrl(url);
+
config.setReverseKeyValue(reverseKey.replace("/", "_") + "_" +
DigestUtils.shaHex(url) + "_" + timestamp);
+ }
+
+ if (epochFilename) {
+ outputFullPath =
DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey,
url, timestamp, extension, !gzip);
+ outputRelativePath =
outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator)-1);
+ filename =
content.getMetadata().get(Metadata.DATE) + "." + extension;
+ }
+ else {
+ String md5Ofurl =
DumpFileUtil.getUrlMD5(url);
+ String fullDir =
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl,
!gzip);
+ filename =
DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
+ outputFullPath =
String.format("%s/%s", fullDir, filename);
+
+ String [] fullPathLevels =
fullDir.split(File.separator);
+ String firstLevelDirName =
fullPathLevels[fullPathLevels.length-2];
+ String secondLevelDirName =
fullPathLevels[fullPathLevels.length-1];
+ outputRelativePath =
firstLevelDirName + secondLevelDirName;
+ }
// Encode all filetypes if no mimetypes
have been given
Boolean filter = (mimeTypes == null);
@@ -357,7 +433,7 @@ public class CommonCrawlDataDumper {
try {
String mimeType = new
Tika().detect(content.getContent());
// Maps file to JSON-based
structure
- CommonCrawlFormat format =
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url,
content.getContent(), content.getMetadata(), conf, keyPrefix);
+ CommonCrawlFormat format =
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url,
content.getContent(), metadata, nutchConfig, config);
jsonData = format.getJsonData();
collectStats(typeCounts,
mimeType);
@@ -375,7 +451,6 @@ public class CommonCrawlDataDumper {
byte[] byteData =
serializeCBORData(jsonData);
if (!gzip) {
- //String outputFullPath
= outputDir + File.separator + filename;
File outputFile = new
File(outputFullPath);
if
(outputFile.exists()) {
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
@@ -392,7 +467,8 @@ public class CommonCrawlDataDumper {
else {
fileList.add(outputFullPath);
LOG.info("Compressing: [" + outputFullPath + "]");
- TarArchiveEntry
tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator +
secondLevelDirName + File.separator + filename);
+
//TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName +
File.separator + secondLevelDirName + File.separator + filename);
+ TarArchiveEntry
tarEntry = new TarArchiveEntry(outputRelativePath + File.separator + filename);
tarEntry.setSize(byteData.length);
tarOutput.putArchiveEntry(tarEntry);
tarOutput.write(byteData);
@@ -500,4 +576,41 @@ public class CommonCrawlDataDumper {
}
return builder.toString();
}
+
+ /**
+  * Returns the given timestamp unchanged when it carries a value; when it
+  * is null or empty, returns the current date formatted with the pattern
+  * "EEE, d MMM yyyy HH:mm:ss z" instead.
+  *
+  * @param timestamp the timestamp, possibly null or empty
+  * @return the given timestamp, or the formatted current date as fallback
+  */
+ private String getDate(String timestamp) {
+   boolean present = (timestamp != null) && !timestamp.isEmpty();
+   if (present) {
+     return timestamp;
+   }
+   DateFormat formatter = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z");
+   return formatter.format(new Date());
+ }
+
+ /**
+  * Builds a reversed-host key for a URL: the host labels are emitted from
+  * last to first, joined with '/' (e.g. "www.example.com" becomes
+  * "com/example/www"). Only the host is used; path and query are ignored.
+  *
+  * @param urlString the URL to reverse
+  * @return the reversed host, an empty string when the host has no labels
+  *         (empty host, or a host consisting only of dots), or null when
+  *         urlString is not a valid URL - callers must check for null
+  */
+ public static String reverseUrl(String urlString) {
+   URL url;
+   try {
+     url = new URL(urlString);
+   } catch (MalformedURLException e) {
+     LOG.error("Failed to parse URL: {}", urlString);
+     return null;
+   }
+
+   String[] hostPart = url.getHost().replace('.', '/').split("/");
+   // String.split drops trailing empty strings, so a host such as "."
+   // yields an empty array; indexing it used to throw
+   // ArrayIndexOutOfBoundsException.
+   if (hostPart.length == 0) {
+     return "";
+   }
+
+   StringBuilder sb = new StringBuilder();
+   for (int i = hostPart.length - 1; i >= 0; i--) {
+     if (i < hostPart.length - 1) {
+       sb.append('/');
+     }
+     sb.append(hostPart[i]);
+   }
+   return sb.toString();
+ }
}
Modified:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
Fri Apr 3 14:36:05 2015
@@ -34,24 +34,24 @@ public class CommonCrawlFormatFactory {
* @param url the url.
* @param content the content.
* @param metadata the metadata.
- * @param conf the configuration.
+ * @param nutchConf the configuration.
+ * @param config the CommonCrawl output configuration.
* @return the new {@see CommonCrawlFormat} object.
* @throws IOException If any I/O error occurs.
*/
- public static CommonCrawlFormat getCommonCrawlFormat(String formatType,
String url, byte[] content,
- Metadata metadata, Configuration conf, String
keyPrefix) throws IOException {
+ public static CommonCrawlFormat getCommonCrawlFormat(String formatType,
String url, byte[] content, Metadata metadata, Configuration nutchConf,
CommonCrawlConfig config) throws IOException {
if (formatType == null) {
return null;
}
if (formatType.equalsIgnoreCase("jackson")) {
- return new CommonCrawlFormatJackson(url, content,
metadata, conf, keyPrefix);
+ return new CommonCrawlFormatJackson(url, content,
metadata, nutchConf, config);
}
else if (formatType.equalsIgnoreCase("jettinson")) {
- return new CommonCrawlFormatJettinson(url, content,
metadata, conf, keyPrefix);
+ return new CommonCrawlFormatJettinson(url, content,
metadata, nutchConf, config);
}
else if (formatType.equalsIgnoreCase("simple")) {
- return new CommonCrawlFormatSimple(url, content,
metadata, conf, keyPrefix);
+ return new CommonCrawlFormatSimple(url, content,
metadata, nutchConf, config);
}
return null;
Modified:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
Fri Apr 3 14:36:05 2015
@@ -32,15 +32,13 @@ import com.fasterxml.jackson.core.JsonGe
*/
public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
- //private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
-
private ByteArrayOutputStream out;
private JsonGenerator generator;
- public CommonCrawlFormatJackson(String url, byte[] content,
- Metadata metadata, Configuration conf, String
keyPrefix) throws IOException {
- super(url, content, metadata, conf, keyPrefix);
+
+ public CommonCrawlFormatJackson(String url, byte[] content, Metadata
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException
{
+ super(url, content, metadata, nutchConf, config);
JsonFactory factory = new JsonFactory();
this.out = new ByteArrayOutputStream();
@@ -58,7 +56,25 @@ public class CommonCrawlFormatJackson ex
@Override
protected void writeKeyNull(String key) throws IOException {
generator.writeFieldName(key);
- generator.writeNull();;
+ generator.writeNull();
+ }
+
+ /**
+  * Starts a JSON array on the Jackson generator, preceded by a field name
+  * when a key is supplied. The nested and newline flags are not used by
+  * this implementation.
+  */
+ @Override
+ protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+   boolean named = (key != null);
+   if (named) {
+     generator.writeFieldName(key);
+   }
+   generator.writeStartArray();
+ }
+
+ /**
+  * Ends the array most recently started on the Jackson generator. The
+  * key, nested and newline arguments are not used by this implementation.
+  */
+ @Override
+ protected void closeArray(String key, boolean nested, boolean newline)
throws IOException {
+ generator.writeEndArray();
+ }
+
+ /**
+  * Writes a single string element through the Jackson generator; intended
+  * to be called between startArray and closeArray.
+  */
+ @Override
+ protected void writeArrayValue(String value) throws IOException {
+ generator.writeString(value);
 }
@Override
Modified:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
Fri Apr 3 14:36:05 2015
@@ -23,6 +23,7 @@ import java.util.Deque;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
+import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
@@ -32,19 +33,21 @@ import org.codehaus.jettison.json.JSONOb
*/
public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
- private Deque<JSONObject> stack;
+ private Deque<JSONObject> stackObjects;
+
+ private Deque<JSONArray> stackArrays;
- public CommonCrawlFormatJettinson(String url, byte[] content,
- Metadata metadata, Configuration conf, String
keyPrefix) throws IOException {
- super(url, content, metadata, conf, keyPrefix);
+ public CommonCrawlFormatJettinson(String url, byte[] content, Metadata
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException
{
+ super(url, content, metadata, nutchConf, config);
- stack = new ArrayDeque<JSONObject>();
+ stackObjects = new ArrayDeque<JSONObject>();
+ stackArrays = new ArrayDeque<JSONArray>();
}
@Override
protected void writeKeyValue(String key, String value) throws
IOException {
try {
- stack.getFirst().put(key, value);
+ stackObjects.getFirst().put(key, value);
} catch (JSONException jsone) {
throw new IOException(jsone.getMessage());
}
@@ -53,24 +56,54 @@ public class CommonCrawlFormatJettinson
@Override
protected void writeKeyNull(String key) throws IOException {
try {
- stack.getFirst().put(key, JSONObject.NULL);
+ stackObjects.getFirst().put(key, JSONObject.NULL);
+ } catch (JSONException jsone) {
+ throw new IOException(jsone.getMessage());
+ }
+ }
+
+ /**
+  * Opens a new array by pushing an empty JSONArray onto the array stack.
+  * The key, nested and newline arguments are not used here; the key is
+  * applied when the array is closed.
+  */
+ @Override
+ protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+   stackArrays.push(new JSONArray());
+ }
+
+ /**
+  * Closes the array on top of the array stack and attaches it to its
+  * parent. A nested array is appended to the enclosing array (and is only
+  * closed when such an enclosing array exists); a non-nested array is
+  * stored under the given key in the object on top of the object stack.
+  *
+  * The previous guard required more than one open array in every case, so
+  * a lone top-level array was never popped or attached: the first
+  * "headers" array was dropped from the output and stayed on the stack.
+  *
+  * @param key field name for a non-nested array
+  * @param nested true if the array belongs to the enclosing array
+  * @param newline not used by this implementation
+  * @throws IOException if the JSON library rejects the value
+  */
+ @Override
+ protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+   try {
+     if (nested) {
+       if (stackArrays.size() > 1) {
+         JSONArray array = stackArrays.pop();
+         stackArrays.getFirst().put(array);
+       }
+     }
+     else if (!stackArrays.isEmpty()) {
+       JSONArray array = stackArrays.pop();
+       stackObjects.getFirst().put(key, array);
+     }
 } catch (JSONException jsone) {
 throw new IOException(jsone.getMessage());
 }
 }
+ /**
+  * Appends a string element to the array on top of the array stack. The
+  * value is only written when more than one array is open, i.e. into a
+  * nested array; otherwise the call does nothing.
+  */
 @Override
+ protected void writeArrayValue(String value) throws IOException {
+   if (stackArrays.size() <= 1) {
+     return;
+   }
+   stackArrays.getFirst().put(value);
+ }
+
+ @Override
protected void startObject(String key) throws IOException {
JSONObject object = new JSONObject();
- stack.push(object);
+ stackObjects.push(object);
}
@Override
protected void closeObject(String key) throws IOException {
try {
- if (stack.size() > 1) {
- JSONObject object = stack.pop();
- stack.getFirst().put(key, object);
+ if (stackObjects.size() > 1) {
+ JSONObject object = stackObjects.pop();
+ stackObjects.getFirst().put(key, object);
}
} catch (JSONException jsone) {
throw new IOException(jsone.getMessage());
@@ -80,7 +113,7 @@ public class CommonCrawlFormatJettinson
@Override
protected String generateJson() throws IOException {
try {
- return stack.getFirst().toString(2);
+ return stackObjects.getFirst().toString(2);
} catch (JSONException jsone) {
throw new IOException(jsone.getMessage());
}
Modified:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Fri Apr 3 14:36:05 2015
@@ -32,22 +32,53 @@ public class CommonCrawlFormatSimple ext
private int tabCount;
- public CommonCrawlFormatSimple(String url, byte[] content, Metadata
metadata,
- Configuration conf, String keyPrefix) throws
IOException {
- super(url, content, metadata, conf, keyPrefix);
+ public CommonCrawlFormatSimple(String url, byte[] content, Metadata
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException
{
+ super(url, content, metadata, nutchConf, config);
this.sb = new StringBuilder();
this.tabCount = 0;
}
+ @Override
protected void writeKeyValue(String key, String value) throws
IOException {
sb.append(printTabs() + "\"" + key + "\": " + quote(value) +
",\n");
}
+ @Override
protected void writeKeyNull(String key) throws IOException {
sb.append(printTabs() + "\"" + key + "\": null,\n");
}
+ /**
+  * Appends the opening of an array to the buffer: the current indentation,
+  * an optional quoted key followed by ": ", and "[". With newline set, a
+  * line break follows the bracket and the indentation level is increased
+  * for the elements.
+  */
+ @Override
+ protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+   sb.append(printTabs());
+   if (key != null) {
+     sb.append("\"").append(key).append("\": ");
+   }
+   sb.append("[");
+   if (newline) {
+     sb.append("\n");
+     this.tabCount++;
+   }
+ }
+
+ /**
+  * Appends the closing of an array to the buffer. The separator comma left
+  * by the last element (either the final character, or the one before a
+  * trailing line break) is removed first. For a multi-line array the
+  * indentation level raised by startArray is lowered again before the
+  * closing bracket is indented, mirroring closeObject.
+  *
+  * The previous version incremented the level here and computed the
+  * indentation before adjusting it, so every multi-line array left the
+  * output two levels too deep. It also read characters without checking
+  * the buffer length.
+  *
+  * @param key not used by this implementation
+  * @param nested not used by this implementation
+  * @param newline true if the array was opened as a multi-line array
+  * @throws IOException declared by the abstract contract
+  */
+ @Override
+ protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+   int length = sb.length();
+   if (length >= 1 && sb.charAt(length - 1) == ',') {
+     sb.deleteCharAt(length - 1); // delete comma
+   }
+   else if (length >= 2 && sb.charAt(length - 2) == ',') {
+     sb.deleteCharAt(length - 2); // delete comma
+   }
+   String indent = "";
+   if (newline) {
+     // undo the increment done by startArray before indenting the bracket
+     this.tabCount--;
+     indent = printTabs();
+   }
+   sb.append(indent + "],\n");
+ }
+
+ /**
+  * Appends one array element to the buffer as a double-quoted string
+  * followed by a comma. The value is written as-is, without escaping.
+  */
+ @Override
+ protected void writeArrayValue(String value) {
+   sb.append("\"").append(value).append("\",");
+ }
+
protected void startObject(String key) throws IOException {
String name = "";
if (key != null) {
@@ -58,7 +89,9 @@ public class CommonCrawlFormatSimple ext
}
protected void closeObject(String key) throws IOException {
- sb.deleteCharAt(sb.length()-2); // delete comma
+ if (sb.charAt(sb.length()-2) == ',') {
+ sb.deleteCharAt(sb.length()-2); // delete comma
+ }
this.tabCount--;
sb.append(printTabs() + "},\n");
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1671077&r1=1671076&r2=1671077&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Fri Apr 3
14:36:05 2015
@@ -17,6 +17,7 @@
package org.apache.nutch.util;
+import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.MD5Hash;
@@ -81,4 +82,26 @@ public class DumpFileUtil {
return String.format(FILENAME_PATTERN, md5, fileBaseName,
fileExtension);
}
+
+ /**
+  * Builds the output path for a dumped record:
+  * basePath/reverseKey/sha-hex-of-url/epochScrapeTime.extension.
+  * The directory part is optionally created, and an extension longer than
+  * MAX_LENGTH_OF_EXTENSION is truncated.
+  *
+  * NOTE(review): when directory creation fails the directory part becomes
+  * null, so the returned string starts with "null" - callers get no other
+  * failure signal.
+  *
+  * @param basePath root output directory
+  * @param reverseKey reversed-host path segment
+  * @param urlString URL whose SHA hex digest names the innermost directory
+  * @param epochScrapeTime base name of the file
+  * @param fileExtension file extension, must not be null
+  * @param makeDir true to create the directory on disk
+  * @return the full output path
+  */
+ public static String createFileNameFromUrl(String basePath, String reverseKey, String urlString, String epochScrapeTime, String fileExtension, boolean makeDir) {
+   String targetDir = basePath + File.separator + reverseKey + File.separator + DigestUtils.shaHex(urlString);
+
+   if (makeDir) {
+     try {
+       FileUtils.forceMkdir(new File(targetDir));
+     } catch (IOException e) {
+       LOG.error("Failed to create dir: {}", targetDir);
+       targetDir = null;
+     }
+   }
+
+   String extension = fileExtension;
+   if (extension.length() > MAX_LENGTH_OF_EXTENSION) {
+     LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION);
+     extension = StringUtils.substring(extension, 0, MAX_LENGTH_OF_EXTENSION);
+   }
+
+   return targetDir + File.separator + epochScrapeTime + "." + extension;
+ }
}