Author: lewismc
Date: Wed Mar 4 18:48:32 2015
New Revision: 1664109
URL: http://svn.apache.org/r1664109
Log:
NUTCH-1949 Dump out the Nutch data into the Common Crawl format
Added:
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/bin/nutch
nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 4 18:48:32 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1949 Dump out the Nutch data into the Common Crawl format (Giuseppe
Totaro via lewismc)
+
* NUTCH-1950 File name too long (Jiaheng Zhang, Chong Li via mattmann)
* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Mar 4 18:48:32 2015
@@ -49,7 +49,8 @@
rev="3.1" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
-
+ <dependency org="org.apache.commons" name="commons-compress"
rev="1.9"
+ conf="*->default" />
<dependency org="org.apache.hadoop" name="hadoop-core"
rev="1.2.0"
conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
@@ -70,6 +71,9 @@
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons"
name="crawler-commons"
rev="0.5" />
+
+ <dependency org="com.fasterxml.jackson.core"
name="jackson-databind" rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" />
<!--Configuration: test -->
Modified: nutch/trunk/src/bin/nutch
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed Mar 4 18:48:32 2015
@@ -71,7 +71,8 @@ if [ $# = 0 ]; then
echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the plugin-based indexer on parsed segments
and linkdb"
echo " dedup deduplicate entries in the crawldb and give them a
special status"
- echo " dump exports cralwed data from segments into files"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common
crawl data format encoded as CBOR"
echo " solrindex run the solr indexer on parsed segments and linkdb
- DEPRECATED use the index command instead"
echo " solrdedup remove duplicates from solr - DEPRECATED use the
dedup command instead"
echo " solrclean remove HTTP 301 and 404 documents from solr -
DEPRECATED use the clean command instead"
@@ -233,6 +234,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; th
CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "dump" ] ; then
CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+ CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
elif [ "$COMMAND" = "solrindex" ] ; then
CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
shift
Added:
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Abstract class that implements the {@link CommonCrawlFormat} interface.
+ *
+ */
+public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+ protected String url;
+
+ protected byte[] content;
+
+ protected Metadata metadata;
+
+ protected Configuration conf;
+
+ public AbstractCommonCrawlFormat(String url, byte[] content, Metadata
metadata, Configuration conf) {
+ this.url = url;
+ this.content = content;
+ this.metadata = metadata;
+ this.conf = conf;
+ }
+
+ @Override
+ public String getJsonData(boolean mapAll) throws IOException {
+ if (mapAll) {
+ return getJsonDataAll();
+ }
+ else {
+ return getJsonDataSet();
+ }
+ }
+
+ protected abstract String getJsonDataSet() throws IOException;
+
+ protected abstract String getJsonDataAll() throws IOException;
+
+ protected String ifNullString(String value) {
+ return (value != null) ? value : "";
+ }
+
+ protected static String getHostName() {
+ String hostName = "";
+ try {
+ hostName = InetAddress.getLocalHost().getHostName();
+ } catch (UnknownHostException uhe) {
+
+ }
+ return hostName;
+ }
+
+ protected static String getHostAddress() {
+ String hostAddress = "";
+ try {
+ hostAddress =
InetAddress.getLocalHost().getHostAddress();
+ } catch (UnknownHostException uhe) {
+
+ }
+ return hostAddress;
+ }
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Wed Mar 4
18:48:32 2015
@@ -53,6 +53,7 @@ public class Benchmark extends Configure
System.exit(res);
}
+ @SuppressWarnings("unused")
private static String getDate() {
return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
.currentTimeMillis()));
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed
Mar 4 18:48:32 2015
@@ -0,0 +1,470 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+//Tika imports
+import org.apache.tika.Tika;
+import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.SimpleDateFormat;
+
+/**
+ * <p>
+ * The Common Crawl Data Dumper tool enables one to reverse generate the raw
+ * content from Nutch segment data directories into a common crawling data
+ * format, consumed by many applications. The data is then serialized as <a
+ * href="http://cbor.io">CBOR</a>
+ * </p>
+ * <p>
+ * Text content will be stored in a structured document format. Below is a
+ * schema for storage of data and metadata related to a crawling request, with
+ * the response body truncated for readability. This document must be encoded
+ * using CBOR and should be compressed with gzip after encoding. The
timestamped
+ * URL key for these records' keys follows the same layout as the media file
+ * directory structure, with underscores in place of directory separators.
+ * </p>
+ * <p>
+ * Thus, the timestamped url key for the record is provided below followed by
an
+ * example record:
+ *
+ * <pre>
+ * {@code
+ * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
+ *
+ * {
+ * "url": "http:\/\/somepage.com\/22\/14560817",
+ * "timestamp": "1411623696000",
+ * "request": {
+ * "method": "GET",
+ * "client": {
+ * "hostname": "crawler01.local",
+ * "address": "74.347.129.200",
+ * "software": "Apache Nutch v1.10",
+ * "robots": "classic",
+ * "contact": {
+ * "name": "Nutch Admin",
+ * "email": "[email protected]"
+ * }
+ * },
+ * "headers": {
+ * "Accept":
"text\/html,application\/xhtml+xml,application\/xml",
+ * "Accept-Encoding": "gzip,deflate,sdch",
+ * "Accept-Language": "en-US,en",
+ * "User-Agent": "Mozilla\/5.0",
+ * "...": "..."
+ * },
+ * "body": null
+ * },
+ * "response": {
+ * "status": "200",
+ * "server": {
+ * "hostname": "somepage.com",
+ * "address": "55.33.51.19",
+ * },
+ * "headers": {
+ * "Content-Encoding": "gzip",
+ * "Content-Type": "text\/html",
+ * "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
+ * "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
+ * "Server": "nginx",
+ * "...": "..."
+ * },
+ * "body": "\r\n <!DOCTYPE html PUBLIC ... \r\n\r\n \r\n
</body>\r\n </html>\r\n \r\n\r\n",
+ * },
+ * "key":
"com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
+ * "imported": "1411623698000"
+ * }
+ * }
+ * </pre>
+ *
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which
fall
+ * into those classifications. An example is as follows:
+ * </p>
+ *
+ * <pre>
+ * {@code
+ * INFO: File Types:
+ * TOTAL Stats: {
+ * {"mimeType":"application/xml","count":19"}
+ * {"mimeType":"image/png","count":47"}
+ * {"mimeType":"image/jpeg","count":141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ * {"mimeType":"text/plain","count":89"}
+ * {"mimeType":"video/quicktime","count":2"}
+ * {"mimeType":"image/gif","count":63"}
+ * {"mimeType":"application/xhtml+xml","count":1670"}
+ * {"mimeType":"application/octet-stream","count":40"}
+ * {"mimeType":"text/html","count":1863"}
+ * }
+ * }
+ * </pre>
+ *
+ */
+public class CommonCrawlDataDumper {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+
+ /**
+ * Main method for invoking this tool
+ *
+ * @param args
+ * 1) output directory (which will be created if it does not
+ * already exist) to host the CBOR data and 2) a directory
+ * containing one or more segments from which we wish to
generate
+ * CBOR data from. Optionally, 3) a list of mimetypes and
the 4)
+ * the gzip option may be provided.
+ * @throws Exception
+ */
+ @SuppressWarnings("static-access")
+ public static void main(String[] args) throws Exception {
+ Option helpOpt = new Option("h", "help", false,
+ "show this help message");
+ // argument options
+ Option outputOpt = OptionBuilder
+ .withArgName("outputDir")
+ .hasArg()
+ .withDescription(
+ "output directory (which will
be created) to host the CBOR data")
+ .create("outputDir");
+ Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
+ .withDescription("the segment(s) to
use").create("segment");
+ // GIUSEPPE: create mimetype and gzip options
+ Option mimeOpt = OptionBuilder
+ .isRequired(false)
+ .withArgName("mimetype")
+ .hasArgs()
+ .withDescription(
+ "an optional list of mimetypes
to dump, excluding all others. Defaults to all.")
+ .create("mimetype");
+ Option gzipOpt = OptionBuilder
+ .isRequired(false)
+ .hasArg(false)
+ .withDescription(
+ "an optional flag indicating
whether to additionally gzip the data")
+ .create("gzip");
+
+ // create the options
+ Options options = new Options();
+ options.addOption(helpOpt);
+ options.addOption(outputOpt);
+ options.addOption(segOpt);
+ // create mimetypes and gzip options
+ options.addOption(mimeOpt);
+ options.addOption(gzipOpt);
+
+ CommandLineParser parser = new GnuParser();
+ try {
+ CommandLine line = parser.parse(options, args);
+ if (line.hasOption("help") ||
!line.hasOption("outputDir") || (!line.hasOption("segment"))) {
+ HelpFormatter formatter = new HelpFormatter();
+
formatter.printHelp(CommonCrawlDataDumper.class.getName(), options, true);
+ return;
+ }
+
+ File outputDir = new
File(line.getOptionValue("outputDir"));
+ File segmentRootDir = new
File(line.getOptionValue("segment"));
+ String[] mimeTypes = line.getOptionValues("mimetype");
+ boolean gzip = line.hasOption("gzip");
+
+ if (!outputDir.exists()) {
+ LOG.warn("Output directory: [" +
outputDir.getAbsolutePath() + "]: does not exist, creating it.");
+ if (!outputDir.mkdirs())
+ throw new Exception("Unable to create:
[" + outputDir.getAbsolutePath() + "]");
+ }
+
+ CommonCrawlDataDumper dumper = new
CommonCrawlDataDumper();
+
+ dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes);
+
+ } catch (Exception e) {
+ LOG.error(CommonCrawlDataDumper.class.getName() + ": "
+ StringUtils.stringifyException(e));
+ e.printStackTrace();
+ return;
+ }
+ }
+
+ /**
+ * Dumps the reverse engineered CBOR content from the provided segment
+ * directories if a parent directory contains more than one segment,
+ * otherwise a single segment can be passed as an argument. If the
boolean
+ * argument is provided then the CBOR is also zipped.
+ *
+ * @param outputDir
+ * the directory you wish to dump the raw content to. This
+ * directory will be created.
+ * @param segmentRootDir
+ * a directory containing one or more segments.
+ * @param gzip
+ * a boolean flag indicating whether the CBOR content should
also
+ * be gzipped.
+ * @param mimetypes
+ * an array of mime types we have to dump, all others will be
+ * filtered out.
+ * @throws Exception
+ */
+ public void dump(File outputDir, File segmentRootDir, boolean gzip,
String[] mimeTypes) throws Exception {
+ if (!gzip) {
+ LOG.info("Gzipping CBOR data has been skipped");
+ }
+ // total file counts
+ Map<String, Integer> typeCounts = new HashMap<String,
Integer>();
+ // filtered file counters
+ Map<String, Integer> filteredCounts = new HashMap<String,
Integer>();
+
+ Configuration conf = NutchConfiguration.create();
+ FileSystem fs = FileSystem.get(conf);
+ File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
+ @Override
+ public boolean accept(File file) {
+ return file.canRead() && file.isDirectory();
+ }
+ });
+
+ if (segmentDirs == null) {
+ LOG.error("No segment directories found in [" +
segmentRootDir.getAbsolutePath() + "]");
+ System.exit(1);
+ }
+
+ // Gzip initialization
+ FileOutputStream fileOutput = null;
+ BufferedOutputStream bufOutput = null;
+ GzipCompressorOutputStream gzipOutput = null;
+ TarArchiveOutputStream tarOutput = null;
+
+ ArrayList<String> fileList = null;
+
+ if (gzip) {
+ String archiveName = new
SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
+ fileOutput = new FileOutputStream(new File(outputDir +
File.separator + archiveName));
+ bufOutput = new BufferedOutputStream(fileOutput);
+ gzipOutput = new GzipCompressorOutputStream(bufOutput);
+ tarOutput = new TarArchiveOutputStream(gzipOutput);
+
+ fileList = new ArrayList<String>();
+ }
+
+ for (File segment : segmentDirs) {
+ LOG.info("Processing segment: [" +
segment.getAbsolutePath() + "]");
+ // GIUSEPPE: Never used (also in FileDumper.java)!
+ //DataOutputStream doutputStream = null;
+ try {
+ String segmentContentPath =
segment.getAbsolutePath() + File.separator + Content.DIR_NAME +
"/part-00000/data";
+ Path file = new Path(segmentContentPath);
+
+ if (!new File(file.toString()).exists()) {
+ LOG.warn("Skipping segment: [" +
segmentContentPath + "]: no data directory present");
+ continue;
+ }
+ SequenceFile.Reader reader = new
SequenceFile.Reader(fs, file, conf);
+
+ if (!new File(file.toString()).exists()) {
+ LOG.warn("Skipping segment: [" +
segmentContentPath + "]: no data directory present");
+ continue;
+ }
+ Writable key = (Writable)
reader.getKeyClass().newInstance();
+
+ Content content = null;
+
+ while (reader.next(key)) {
+ content = new Content();
+ reader.getCurrentValue(content);
+ String url = key.toString();
+ String baseName =
FilenameUtils.getBaseName(url);
+ String extension =
FilenameUtils.getExtension(url);
+ if (extension == null ||
extension.equals("")) {
+ extension = "html";
+ }
+
+ String filename = baseName + "." +
extension;
+
+ // Encode all filetypes if no mimetypes
have been given
+ Boolean filter = (mimeTypes == null);
+
+ String jsonData = "";
+ try {
+ String mimeType = new
Tika().detect(content.getContent());
+ // Maps file to JSON-based
structure
+ CommonCrawlFormat format =
CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url,
content.getContent(), content.getMetadata(), conf);
+ jsonData =
format.getJsonData(false);
+
+ collectStats(typeCounts,
mimeType);
+ // collects statistics for the
given mimetypes
+ if ((mimeType != null) &&
(mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
+
collectStats(filteredCounts, mimeType);
+ filter = true;
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ LOG.warn("Tika is unable to
detect type for: [" + url
+ + "]");
+ }
+
+ if (filter) {
+
+ byte[] byteData =
serializeCBORData(jsonData);
+
+ if (!gzip) {
+ String outputFullPath =
outputDir + File.separator + filename;
+ File outputFile = new
File(outputFullPath);
+ if
(outputFile.exists()) {
+
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
+ }
+ else {
+
LOG.info("Writing: [" + outputFullPath + "]");
+
IOUtils.copy(new ByteArrayInputStream(byteData), new
FileOutputStream(outputFile));
+ }
+ }
+ else {
+ if
(fileList.contains(filename)) {
+
LOG.info("Skipping compressing: [" + filename + "]: file already exists");
+ }
+ else {
+
fileList.add(filename);
+
LOG.info("Compressing: [" + filename + "]");
+ TarArchiveEntry
tarEntry = new TarArchiveEntry(filename);
+
tarEntry.setSize(byteData.length);
+
tarOutput.putArchiveEntry(tarEntry);
+
IOUtils.copy(new ByteArrayInputStream(byteData), tarOutput);
+
tarOutput.closeArchiveEntry();
+ }
+ }
+ }
+ }
+ reader.close();
+ } finally {
+ fs.close();
+ }
+ }
+
+ if (gzip) {
+ tarOutput.finish();
+
+ tarOutput.close();
+ gzipOutput.close();
+ bufOutput.close();
+ fileOutput.close();
+ }
+
+ LOG.info("CommonsCrawlDataDumper File Stats: " +
displayFileTypes(typeCounts, filteredCounts));
+ }
+
+ private byte[] serializeCBORData(String jsonData) {
+ CBORFactory factory = new CBORFactory();
+
+ CBORGenerator generator = null;
+ ByteArrayOutputStream stream = null;
+
+ try {
+ stream = new ByteArrayOutputStream();
+ generator = factory.createGenerator(stream);
+ generator.writeString(jsonData);
+ generator.flush();
+ stream.flush();
+
+ return stream.toByteArray();
+
+ } catch (Exception e) {
+ LOG.warn("CBOR encoding failed: " + e.getMessage());
+ } finally {
+ try {
+ generator.close();
+ stream.close();
+ } catch (IOException e) {
+ // nothing to do
+ }
+ }
+
+ return null;
+ }
+
+ private void collectStats(Map<String, Integer> typeCounts, String
mimeType) {
+ typeCounts.put(mimeType, typeCounts.containsKey(mimeType) ?
typeCounts.get(mimeType) + 1 : 1);
+ }
+
+ private String displayFileTypes(Map<String, Integer> typeCounts,
Map<String, Integer> filteredCounts) {
+ StringBuilder builder = new StringBuilder();
+ // print total stats
+ builder.append("\n TOTAL Stats:\n");
+ builder.append(" {\n");
+ for (String mimeType : typeCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":");
+ builder.append(typeCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("}\n");
+ // filtered types stats
+ if (!filteredCounts.isEmpty()) {
+ builder.append("\n FILTERED Stats:\n");
+ builder.append(" {\n");
+ for (String mimeType : filteredCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":");
+ builder.append(filteredCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("}\n");
+ }
+ return builder.toString();
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Wed Mar
4 18:48:32 2015
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+/**
+ * Interface for all CommonCrawl formatters. It provides the signature for the
+ * method used to get JSON data.
+ *
+ * @author gtotaro
+ *
+ */
+public interface CommonCrawlFormat {
+
+ /**
+ *
+ * @param mapAll If {@code true}, maps all metadata onto the JSON structure.
+ * @return the JSON data
+ */
+ public String getJsonData(boolean mapAll) throws IOException;
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Factory class that creates new {@link CommonCrawlFormat} objects (a.k.a.
formatters) that map crawled files to CommonCrawl format.
+ *
+ */
+public class CommonCrawlFormatFactory {
+
+ /**
+ * Returns a new instance of a {@link CommonCrawlFormat} object
specifying the type of formatter.
+ * @param formatType the type of formatter to be created.
+ * @param url the url.
+ * @param content the content.
+ * @param metadata the metadata.
+ * @param conf the configuration.
+ * @return the new {@link CommonCrawlFormat} object.
+ */
+ public static CommonCrawlFormat getCommonCrawlFormat(String formatType,
String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ if (formatType == null) {
+ return null;
+ }
+
+ if (formatType.equalsIgnoreCase("jackson")) {
+ return new CommonCrawlFormatJackson(url, content,
metadata, conf);
+ }
+ else if (formatType.equalsIgnoreCase("jettinson")) {
+ return new CommonCrawlFormatJettinson(url, content,
metadata, conf);
+ }
+ else if (formatType.equalsIgnoreCase("simple")) {
+ return new CommonCrawlFormatSimple(url, content,
metadata, conf);
+ }
+
+ return null;
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,253 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jackson
Streaming APIs.
+ *
+ */
+public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+
+ public CommonCrawlFormatJackson(String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() throws IOException {
+ JsonFactory factory = new JsonFactory();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ JsonGenerator generator = null;
+
+ try {
+ generator = factory.createGenerator(out);
+ generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+ generator.writeStartObject();
+
+ // url
+ generator.writeFieldName("url");
+ generator.writeString(url);
+
+ // timestamp
+ generator.writeFieldName("timestamp");
+ generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
+
+
+ //request
+ generator.writeFieldName("request");
+ generator.writeStartObject();
+ generator.writeFieldName("method");
+ generator.writeString("GET");
+ generator.writeFieldName("client");
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(getHostName());
+ generator.writeFieldName("address");
+ generator.writeString(getHostAddress());
+ generator.writeFieldName("software");
+ generator.writeString(conf.get("http.agent.version", ""));
+ generator.writeFieldName("robots");
+ generator.writeString("classic");
+ generator.writeFieldName("contact");
+ generator.writeStartObject();
+ generator.writeFieldName("name");
+ generator.writeString(conf.get("http.agent.name", ""));
+ generator.writeFieldName("email");
+ generator.writeString(conf.get("http.agent.email", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Accept");
+ generator.writeString(conf.get("accept", ""));
+ generator.writeFieldName("Accept-Encoding");
+ generator.writeString(""); // TODO
+ generator.writeFieldName("Accept-Language");
+ generator.writeString(conf.get("http.accept.language", ""));
+ generator.writeFieldName("User-Agent");
+ generator.writeString(conf.get("http.robots.agents", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("body");
+ generator.writeNull();
+ generator.writeEndObject();
+
+ //response
+ generator.writeFieldName("response");
+ generator.writeStartObject();
+ generator.writeFieldName("status");
+ generator.writeString(ifNullString(metadata.get("status")));
+ generator.writeFieldName("server");
+
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(URLUtil.getHost(url));
+ generator.writeFieldName("address");
+ generator.writeString(ifNullString(metadata.get("_ip_")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ for (String name : metadata.names()) {
+ generator.writeFieldName(name);
+ generator.writeString(ifNullString(metadata.get(name)));
+ }
+ generator.writeEndObject();
+
+ generator.writeFieldName("body");
+ generator.writeString(new String(content));
+ generator.writeEndObject();
+
+ generator.writeFieldName("key");
+ generator.writeString(url);
+
+ generator.writeFieldName("imported"); // TODO
+ generator.writeString("");
+
+ generator.writeEndObject();
+
+ generator.flush();
+
+ return out.toString();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON using Jackson:" +
ioe.getMessage());
+ }
+ }
+
+ @Override
+ protected String getJsonDataSet() throws IOException {
+ JsonFactory factory = new JsonFactory();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ JsonGenerator generator = null;
+
+ try {
+ generator = factory.createGenerator(out);
+ generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+ generator.writeStartObject();
+
+ // url
+ generator.writeFieldName("url");
+ generator.writeString(url);
+
+ // timestamp
+ generator.writeFieldName("timestamp");
+ generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
+
+ //request
+ generator.writeFieldName("request");
+ generator.writeStartObject();
+ generator.writeFieldName("method");
+ generator.writeString("GET");
+ generator.writeFieldName("client");
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(getHostName());
+ generator.writeFieldName("address");
+ generator.writeString(getHostAddress());
+ generator.writeFieldName("software");
+ generator.writeString(conf.get("http.agent.version", ""));
+ generator.writeFieldName("robots");
+ generator.writeString("CLASSIC");
+ generator.writeFieldName("contact");
+ generator.writeStartObject();
+ generator.writeFieldName("name");
+ generator.writeString(conf.get("http.agent.name", ""));
+ generator.writeFieldName("email");
+ generator.writeString(conf.get("http.agent.email", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Accept");
+ generator.writeString(conf.get("accept", ""));
+ generator.writeFieldName("Accept-Encoding");
+ generator.writeString(""); // TODO
+ generator.writeFieldName("Accept-Language");
+ generator.writeString(conf.get("http.accept.language", ""));
+ generator.writeFieldName("User-Agent");
+ generator.writeString(conf.get("http.robots.agents", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("body");
+ generator.writeNull();
+ generator.writeEndObject();
+
+ //response
+ generator.writeFieldName("response");
+ generator.writeStartObject();
+ generator.writeFieldName("status");
+ generator.writeString(ifNullString(metadata.get("status")));
+ generator.writeFieldName("server");
+
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(URLUtil.getHost(url));
+ generator.writeFieldName("address");
+ generator.writeString(ifNullString(metadata.get("_ip_")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Content-Encoding");
+ generator.writeString(ifNullString(metadata.get("Content-Encoding")));
+ generator.writeFieldName("Content-Type");
+ generator.writeString(ifNullString(metadata.get("Content-Type")));
+ generator.writeFieldName("Date");
+ generator.writeString(ifNullString(metadata.get("Date")));
+ generator.writeFieldName("Server");
+ generator.writeString(ifNullString(metadata.get("Server")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("body");
+ generator.writeString(new String(content));
+ generator.writeEndObject();
+
+ generator.writeFieldName("key");
+ generator.writeString(url);
+
+ generator.writeFieldName("imported"); // TODO
+ generator.writeString("");
+
+ generator.writeEndObject();
+
+ generator.flush();
+
+ return out.toString();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON using Jackson:" +
ioe.getMessage());
+ }
+ }
+}
Added:
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jettinson
APIs.
+ *
+ */
+public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+
+ public CommonCrawlFormatJettinson(String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() throws IOException {
+ JSONObject object = new JSONObject();
+
+ try {
+ // url
+ object.put("url", url);
+
+ // timestamp
+ object.put("timestamp",
metadata.get(Metadata.LAST_MODIFIED));
+
+ // request
+ JSONObject requestObject = new JSONObject();
+ requestObject.put("method", "GET");
+ JSONObject clientObject = new JSONObject();
+ clientObject.put("hostname", getHostName());
+ clientObject.put("address", getHostAddress());
+ clientObject.put("software",
conf.get("http.agent.version", ""));
+ clientObject.put("robots", "CLASSIC");
+ JSONObject contactObject = new JSONObject();
+ contactObject.put("name", conf.get("http.agent.name",
""));
+ contactObject.put("email", conf.get("http.agent.email",
""));
+ clientObject.put("contact", contactObject);
+ requestObject.put("client", clientObject);
+ JSONObject reqHeadersObject = new JSONObject();
+ reqHeadersObject.put("Accept", conf.get("http.accept",
""));
+ reqHeadersObject.put("Accept-Encoding", ""); // TODO
+ reqHeadersObject.put("Accept-Language",
conf.get("http.accept.language", ""));
+ reqHeadersObject.put("User-Agent",
conf.get("http.robots.agents", ""));
+ requestObject.put("headers", reqHeadersObject);
+ requestObject.put("body", JSONObject.NULL);
+ object.put("request", requestObject);
+
+ // response
+ JSONObject responseObject = new JSONObject();
+ responseObject.put("status",
ifNullString(metadata.get("status")));
+ JSONObject serverObject = new JSONObject();
+ serverObject.put("hostname", URLUtil.getHost(url));
+ serverObject.put("address",
ifNullString(metadata.get("_ip_")));
+ responseObject.put("client", serverObject);
+ JSONObject respHeadersObject = new JSONObject();
+ for (String name : metadata.names()) {
+ respHeadersObject.put(name,
ifNullString(metadata.get(name)));
+ }
+ responseObject.put("headers", respHeadersObject);
+ responseObject.put("body", new String(content));
+ object.put("response", responseObject);
+
+ // key
+ object.put("key", url);
+
+ // imported
+ object.put("imported", ""); // TODO
+
+ return object.toString(2); // INDENTED OUTPUT
+
+ } catch (JSONException jsone) {
+ LOG.warn("Error in processing file " + url + ": " +
jsone.getMessage());
+ throw new IOException("Error in generating JSON using
Jettinson:" + jsone.getMessage());
+ }
+ }
+
+ @Override
+ protected String getJsonDataSet() throws IOException {
+ JSONObject object = new JSONObject();
+
+ try {
+ // url
+ object.put("url", url);
+
+ // timestamp
+ object.put("timestamp",
metadata.get(Metadata.LAST_MODIFIED));
+
+ // request
+ JSONObject requestObject = new JSONObject();
+ requestObject.put("method", "GET");
+ JSONObject clientObject = new JSONObject();
+ clientObject.put("hostname", getHostName());
+ clientObject.put("address", getHostAddress());
+ clientObject.put("software",
conf.get("http.agent.version", ""));
+ clientObject.put("robots", "CLASSIC");
+ JSONObject contactObject = new JSONObject();
+ contactObject.put("name", conf.get("http.agent.name",
""));
+ contactObject.put("email", conf.get("http.agent.email",
""));
+ clientObject.put("contact", contactObject);
+ requestObject.put("client", clientObject);
+ JSONObject reqHeadersObject = new JSONObject();
+ reqHeadersObject.put("Accept", conf.get("http.accept",
""));
+ reqHeadersObject.put("Accept-Encoding", ""); // TODO
+ reqHeadersObject.put("Accept-Language",
conf.get("http.accept.language", ""));
+ reqHeadersObject.put("User-Agent",
conf.get("http.robots.agents", ""));
+ requestObject.put("headers", reqHeadersObject);
+ requestObject.put("body", JSONObject.NULL);
+ object.put("request", requestObject);
+
+ // response
+ JSONObject responseObject = new JSONObject();
+ responseObject.put("status",
ifNullString(metadata.get("status")));
+ JSONObject serverObject = new JSONObject();
+ serverObject.put("hostname", URLUtil.getHost(url));
+ serverObject.put("address",
ifNullString(metadata.get("_ip_")));
+ responseObject.put("client", serverObject);
+ JSONObject respHeadersObject = new JSONObject();
+ respHeadersObject.put("Content-Encoding",
ifNullString(metadata.get("Content-Encoding")));
+ respHeadersObject.put("Content-Type",
ifNullString(metadata.get("Content-Type")));
+ respHeadersObject.put("Date",
ifNullString(metadata.get("Date")));
+ respHeadersObject.put("Server",
ifNullString(metadata.get("Server")));
+ responseObject.put("headers", respHeadersObject);
+ responseObject.put("body", new String(content));
+ object.put("response", responseObject);
+
+ // key
+ object.put("key", url);
+
+ // imported
+ object.put("imported", ""); // TODO
+
+ return object.toString(2); // INDENTED OUTPUT
+
+ } catch (JSONException jsone) {
+ LOG.warn("Error in processing file " + url + ": " +
jsone.getMessage());
+ throw new IOException("Error in generating JSON using
Jettinson:" + jsone.getMessage());
+ }
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
(added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Wed Mar 4 18:48:32 2015
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * This class provides methods to map crawled data on JSON using a {@see
StringBuilder} object.
+ *
+ */
+public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
+
+ public CommonCrawlFormatSimple(String url, byte[] content, Metadata
metadata,
+ Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() {
+ // TODO character escaping
+ StringBuilder sb = new StringBuilder();
+ sb.append("{\n");
+
+ // url
+ sb.append("\t\"url\": \"" + url + "\",\n");
+
+ // timstamp
+ sb.append("\t\"timstamp\": \"" +
metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
+
+ // request
+ sb.append("\t\"request\": {\n");
+ sb.append("\t\t\"method\": \"GET\",\n");
+ sb.append("\t\t\"client\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
+ sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
+ sb.append("\t\t\t\"software\": \"" +
conf.get("http.agent.version", "") + "\",\n");
+ sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
+ sb.append("\t\t\t\"contact\": {\n");
+ sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name",
"") + "\",\n");
+ sb.append("\t\t\t\t\"email\": \"" +
conf.get("http.agent.email", "") + "\",\n");
+ sb.append("\t\t\t}\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "")
+ "\",\n");
+ sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); //TODO
+ sb.append("\t\t\t\"Accept-Language\": \"" +
conf.get("http.accept.language", "") + "\",\n");
+ sb.append("\t\t\t\"User-Agent\": \"" +
conf.get("http.robots.agents", "") + "\",\n");
+ sb.append("\t},\n");
+
+ // response
+ sb.append("\t\"response\": {\n");
+ sb.append("\t\t\"status\": \"" +
ifNullString(metadata.get("status")) + "\",\n");
+ sb.append("\t\t\"server\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) +
"\"\n");
+ sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") +
"\"\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ for (String name : metadata.names()) {
+ sb.append("\t\t\t\"" + name + "\": \"" +
metadata.get(name) + "\"\n");
+ }
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"body\": " + new String(content) + "\",\n");
+ sb.append("\t},\n");
+
+ // key
+ sb.append("\t\"key\": \"" + url + "\",\n");
+
+ // imported
+ sb.append("\t\"imported\": \"\"\n"); //TODO
+
+ sb.append("}");
+
+ return sb.toString();
+ }
+
+ @Override
+ protected String getJsonDataSet() {
+ // TODO character escaping
+ StringBuilder sb = new StringBuilder();
+ sb.append("{\n");
+
+ // url
+ sb.append("\t\"url\": \"" + url + "\",\n");
+
+ // timstamp
+ sb.append("\t\"timestamp\": \"" +
metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
+
+ // request
+ sb.append("\t\"request\": {\n");
+ sb.append("\t\t\"method\": \"GET\",\n");
+ sb.append("\t\t\"client\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
+ sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
+ sb.append("\t\t\t\"software\": \"" +
conf.get("http.agent.version", "") + "\",\n");
+ sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
+ sb.append("\t\t\t\"contact\": {\n");
+ sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name",
"") + "\",\n");
+ sb.append("\t\t\t\t\"email\": \"" +
conf.get("http.agent.email", "") + "\",\n");
+ sb.append("\t\t\t}\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "")
+ "\",\n");
+ sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); // TODO
+ sb.append("\t\t\t\"Accept-Language\": \"" +
conf.get("http.accept.language", "") + "\",\n");
+ sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "")
+ "\",\n");
+ sb.append("\t},\n");
+
+ // response
+ sb.append("\t\"response\": {\n");
+ sb.append("\t\t\"status\": \"" +
ifNullString(metadata.get("status")) + "\",\n");
+ sb.append("\t\t\"server\": {\n");
+ sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n");
+ sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") +
"\"\n");
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"headers\": {\n");
+ sb.append("\t\t\t\"Content-Encoding\": " +
ifNullString(metadata.get("Content-Encoding")));
+ sb.append("\t\t\t\"Content-Type\": " +
ifNullString(metadata.get("Content-Type")));
+ sb.append("\t\t\t\"Date\": " +
ifNullString(metadata.get("Date")));
+ sb.append("\t\t\t\"Server\": " +
ifNullString(metadata.get("Server")));
+ sb.append("\t\t},\n");
+ sb.append("\t\t\"body\": " + new String(content) + "\",\n");
+ sb.append("\t},\n");
+
+ // key
+ sb.append("\t\"key\": \"" + url + "\",\n");
+
+ // imported
+ sb.append("\t\"imported\": \"\"\n"); // TODO
+
+ sb.append("}");
+
+ return sb.toString();
+ }
+
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Mar 4
18:48:32 2015
@@ -100,7 +100,7 @@ import org.slf4j.LoggerFactory;
* }
* </pre>
* <p>
- * In the case above the tool would have been run with the <b>-mimeType
+ * In the case above, the tool would have been run with the <b>-mimeType
* image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
* flag and corresponding values activated.
*