Repository: nutch Updated Branches: refs/heads/master 0e03daf11 -> 37458a93e
option to include inlinks in common crawl dump Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6ff1ecbb Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6ff1ecbb Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6ff1ecbb Branch: refs/heads/master Commit: 6ff1ecbbb8527f760a81a316ffa6ae5b823f980a Parents: 8572fd9 Author: Thamme Gowda <[email protected]> Authored: Mon Apr 18 17:57:11 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Mon Apr 18 17:57:11 2016 -0700 ---------------------------------------------------------------------- .../nutch/tools/AbstractCommonCrawlFormat.java | 18 ++++++++ .../nutch/tools/CommonCrawlDataDumper.java | 44 ++++++++++++++++---- .../apache/nutch/tools/CommonCrawlFormat.java | 15 +++++++ 3 files changed, 70 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/6ff1ecbb/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java index 3dc62c0..386ec4a 100644 --- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java +++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java @@ -23,6 +23,7 @@ import java.net.InetAddress; import java.net.URLEncoder; import java.net.UnknownHostException; import java.text.ParseException; +import java.util.List; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.URIUtil; @@ -62,6 +63,8 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat { protected String reverseKeyValue; + protected List<String> inLinks; + public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, 
CommonCrawlConfig config) throws IOException { this.url = url; this.content = content; @@ -158,6 +161,13 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat { // imported writeKeyValue("imported", getImported()); + if (getInLinks() != null){ + startArray("inlinks", false, true); + for (String link : getInLinks()) { + writeArrayValue(link); + } + closeArray("inlinks", false, true); + } closeObject(null); return generateJson(); @@ -289,6 +299,14 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat { return ifNullString(metadata.get("Content-Type")); } + public List<String> getInLinks() { + return inLinks; + } + + public void setInLinks(List<String> inLinks) { + this.inLinks = inLinks; + } + protected String getResponseDate() { if (this.simpleDateFormat) { String timestamp = null; http://git-wip-us.apache.org/repos/asf/nutch/blob/6ff1ecbb/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java index d00df0a..e26e088 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; @@ -58,13 +59,16 @@ import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.Inlink; +import org.apache.nutch.crawl.Inlinks; +import 
org.apache.nutch.crawl.LinkDbReader; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.protocol.Content; -import org.apache.nutch.segment.SegmentReader; import org.apache.nutch.util.DumpFileUtil; import org.apache.nutch.util.NutchConfiguration; //Tika imports @@ -222,13 +226,14 @@ public class CommonCrawlDataDumper extends Configured implements Tool { * @param outputDir the directory you wish to dump the raw content to. This * directory will be created. * @param segmentRootDir a directory containing one or more segments. + * @param linkdb Path to linkdb. * @param gzip a boolean flag indicating whether the CBOR content should also * be gzipped. * @param epochFilename if {@code true}, output files will be names using the epoch time (in milliseconds). * @param extension a file extension to use with output documents. * @throws Exception if any exception occurs. */ - public void dump(File outputDir, File segmentRootDir, boolean gzip, + public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes, boolean epochFilename, String extension, boolean warc) throws Exception { if (gzip) { @@ -257,6 +262,10 @@ public class CommonCrawlDataDumper extends Configured implements Tool { } } + LinkDbReader linkDbReader = null; + if (linkdb != null) { + linkDbReader = new LinkDbReader(fs.getConf(), new Path(linkdb.toString())); + } if (parts == null || parts.size() == 0) { LOG.error( "No segment directories found in [ {}] ", segmentRootDir.getAbsolutePath()); System.exit(1); @@ -346,11 +355,25 @@ public class CommonCrawlDataDumper extends Configured implements Tool { String mimeType = new Tika().detect(content.getContent()); // Maps file to JSON-based structure - //TODO: Make this Jackson Format implementation reusable - try (CommonCrawlFormat format = CommonCrawlFormatFactory - .getCommonCrawlFormat(warc ? 
"WARC" : "JACKSON", nutchConfig, config)) { - jsonData = format.getJsonData(url, content, metadata); + List<String> inUrls = null; + if (linkDbReader != null) { + int max = 5000; //just in case there are too many urls! + Inlinks inlinks = linkDbReader.getInlinks((Text) key); + if (inlinks != null) { + Iterator<Inlink> iterator = inlinks.iterator(); + inUrls = new ArrayList<>(); + while (max >= 0 && iterator.hasNext()){ + inUrls.add(iterator.next().getFromUrl()); + max--; + } } + } + //TODO: Make this Jackson Format implementation reusable + try (CommonCrawlFormat format = CommonCrawlFormatFactory + .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) { + format.setInLinks(inUrls); + jsonData = format.getJsonData(url, content, metadata); + } collectStats(typeCounts, mimeType); // collects statistics for the given mimetypes @@ -587,6 +610,10 @@ public class CommonCrawlDataDumper extends Configured implements Tool { .withType(Number.class) .withDescription("an optional file size in bytes for the WARC file(s)") .create("warcSize"); + Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true) + .withDescription("an optional linkdb parameter to include inlinks in dump files") + .isRequired(false) + .create("linkdb"); // create the options Options options = new Options(); @@ -606,6 +633,7 @@ public class CommonCrawlDataDumper extends Configured implements Tool { options.addOption(reverseKeyOpt); options.addOption(extensionOpt); options.addOption(sizeOpt); + options.addOption(linkDbOpt); CommandLineParser parser = new GnuParser(); try { @@ -635,6 +663,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool { if (line.getParsedOptionValue("warcSize") != null) { warcSize = (Long) line.getParsedOptionValue("warcSize"); } + String linkdbPath = line.getOptionValue("linkdb"); + File linkdb = linkdbPath == null ? 
null : new File(linkdbPath); CommonCrawlConfig config = new CommonCrawlConfig(); config.setKeyPrefix(keyPrefix); @@ -655,7 +685,7 @@ public class CommonCrawlDataDumper extends Configured implements Tool { CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config); - dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, epochFilename, + dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, epochFilename, extension, warc); } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/6ff1ecbb/src/java/org/apache/nutch/tools/CommonCrawlFormat.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java index 87baeb5..ea10e7d 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java @@ -23,6 +23,7 @@ import org.apache.nutch.protocol.Content; import java.io.Closeable; import java.io.IOException; +import java.util.List; /** * Interface for all CommonCrawl formatter. It provides the signature for the @@ -64,6 +65,20 @@ public interface CommonCrawlFormat extends Closeable { public String getJsonData(String url, Content content, Metadata metadata, ParseData parseData) throws IOException; + + /** + * sets inlinks of this document + * @param inlinks list of inlinks + */ + void setInLinks(List<String> inLinks); + + + /** + * gets set of inlinks + * @return gets inlinks of this document + */ + List<String> getInLinks(); + /** * Optional method that could be implemented if the actual format needs some * close procedure.
