[1/6] nutch git commit: option to include inlinks in common crawl dump

mattmann Sat, 07 May 2016 13:40:16 -0700

Repository: nutch
Updated Branches:
  refs/heads/master 0e03daf11 -> 37458a93e



option to include inlinks in common crawl dump

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/6ff1ecbb
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/6ff1ecbb
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/6ff1ecbb

Branch: refs/heads/master
Commit: 6ff1ecbbb8527f760a81a316ffa6ae5b823f980a
Parents: 8572fd9
Author: Thamme Gowda <[email protected]>
Authored: Mon Apr 18 17:57:11 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Mon Apr 18 17:57:11 2016 -0700

----------------------------------------------------------------------
 .../nutch/tools/AbstractCommonCrawlFormat.java  | 18 ++++++++
 .../nutch/tools/CommonCrawlDataDumper.java      | 44 ++++++++++++++++----
 .../apache/nutch/tools/CommonCrawlFormat.java   | 15 +++++++
 3 files changed, 70 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/6ff1ecbb/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java 
b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
index 3dc62c0..386ec4a 100644
--- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
@@ -23,6 +23,7 @@ import java.net.InetAddress;
 import java.net.URLEncoder;
 import java.net.UnknownHostException;
 import java.text.ParseException;
+import java.util.List;
 
 import org.apache.commons.httpclient.URIException;
 import org.apache.commons.httpclient.util.URIUtil;
@@ -62,6 +63,8 @@ public abstract class AbstractCommonCrawlFormat implements 
CommonCrawlFormat {
 
        protected String reverseKeyValue;
 
+       protected List<String> inLinks;
+
        public AbstractCommonCrawlFormat(String url, Content content, Metadata 
metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException 
{
                this.url = url;
                this.content = content;
@@ -158,6 +161,13 @@ public abstract class AbstractCommonCrawlFormat implements 
CommonCrawlFormat {
                        // imported
                        writeKeyValue("imported", getImported());
 
+                       if (getInLinks() != null){
+                               startArray("inlinks", false, true);
+                               for (String link : getInLinks()) {
+                                       writeArrayValue(link);
+                               }
+                               closeArray("inlinks", false, true);
+                       }
                        closeObject(null);
 
                        return generateJson();
@@ -289,6 +299,14 @@ public abstract class AbstractCommonCrawlFormat implements 
CommonCrawlFormat {
                return ifNullString(metadata.get("Content-Type"));
        }
 
+       public List<String> getInLinks() {
+               return inLinks;
+       }
+
+       public void setInLinks(List<String> inLinks) {
+               this.inLinks = inLinks;
+       }
+
        protected String getResponseDate() {
                if (this.simpleDateFormat) {
                        String timestamp = null;

http://git-wip-us.apache.org/repos/asf/nutch/blob/6ff1ecbb/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java 
b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index d00df0a..e26e088 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -32,6 +32,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
@@ -58,13 +59,16 @@ import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDbReader;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.segment.SegmentReader;
 import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
 //Tika imports
@@ -222,13 +226,14 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
    * @param outputDir      the directory you wish to dump the raw content to. 
This
    *                       directory will be created.
    * @param segmentRootDir a directory containing one or more segments.
+   * @param linkdb         Path to linkdb.
    * @param gzip           a boolean flag indicating whether the CBOR content 
should also
    *                       be gzipped.
    * @param epochFilename  if {@code true}, output files will be names using 
the epoch time (in milliseconds).
    * @param extension      a file extension to use with output documents.
    * @throws Exception if any exception occurs.
    */
-  public void dump(File outputDir, File segmentRootDir, boolean gzip,
+  public void dump(File outputDir, File segmentRootDir, File linkdb, boolean 
gzip,
       String[] mimeTypes, boolean epochFilename, String extension, boolean 
warc)
       throws Exception {
     if (gzip) {
@@ -257,6 +262,10 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
       }
     }
 
+    LinkDbReader linkDbReader = null;
+    if (linkdb != null) {
+      linkDbReader = new LinkDbReader(fs.getConf(), new 
Path(linkdb.toString()));
+    }
     if (parts == null || parts.size() == 0) {
       LOG.error( "No segment directories found in [ {}] ", 
segmentRootDir.getAbsolutePath());
       System.exit(1);
@@ -346,11 +355,25 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
             String mimeType = new Tika().detect(content.getContent());
             // Maps file to JSON-based structure
 
-            //TODO: Make this Jackson Format implementation reusable
-            try (CommonCrawlFormat format = CommonCrawlFormatFactory
-                .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, 
config)) {
-              jsonData = format.getJsonData(url, content, metadata);
+          List<String> inUrls = null;
+          if (linkDbReader != null) {
+            int max = 5000;     //just in case there are too many urls!
+            Inlinks inlinks = linkDbReader.getInlinks((Text) key);
+            if (inlinks != null) {
+              Iterator<Inlink> iterator = inlinks.iterator();
+              inUrls = new ArrayList<>();
+              while (max >= 0 && iterator.hasNext()){
+                inUrls.add(iterator.next().getFromUrl());
+                max--;
+              }
             }
+          }
+          //TODO: Make this Jackson Format implementation reusable
+          try (CommonCrawlFormat format = CommonCrawlFormatFactory
+                  .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", 
nutchConfig, config)) {
+            format.setInLinks(inUrls);
+            jsonData = format.getJsonData(url, content, metadata);
+          }
 
             collectStats(typeCounts, mimeType);
             // collects statistics for the given mimetypes
@@ -587,6 +610,10 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
         .withType(Number.class)
         .withDescription("an optional file size in bytes for the WARC file(s)")
         .create("warcSize");
+    Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true)
+        .withDescription("an optional linkdb parameter to include inlinks in 
dump files")
+        .isRequired(false)
+        .create("linkdb");
 
     // create the options
     Options options = new Options();
@@ -606,6 +633,7 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
     options.addOption(reverseKeyOpt);
     options.addOption(extensionOpt);
     options.addOption(sizeOpt);
+    options.addOption(linkDbOpt);
 
     CommandLineParser parser = new GnuParser();
     try {
@@ -635,6 +663,8 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
       if (line.getParsedOptionValue("warcSize") != null) {
         warcSize = (Long) line.getParsedOptionValue("warcSize");
       }
+      String linkdbPath = line.getOptionValue("linkdb");
+      File linkdb = linkdbPath == null ? null : new File(linkdbPath);
 
       CommonCrawlConfig config = new CommonCrawlConfig();
       config.setKeyPrefix(keyPrefix);
@@ -655,7 +685,7 @@ public class CommonCrawlDataDumper extends Configured 
implements Tool {
 
       CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);
 
-      dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, epochFilename,
+      dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, 
epochFilename,
           extension, warc);
 
     } catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/6ff1ecbb/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java 
b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
index 87baeb5..ea10e7d 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
@@ -23,6 +23,7 @@ import org.apache.nutch.protocol.Content;
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.List;
 
 /**
  * Interface for all CommonCrawl formatter. It provides the signature for the
@@ -64,6 +65,20 @@ public interface CommonCrawlFormat extends Closeable {
   public String getJsonData(String url, Content content, Metadata metadata,
       ParseData parseData) throws IOException;
 
+
+  /**
+   * sets inlinks of this document
+   * @param inLinks list of inlinks
+     */
+  void setInLinks(List<String> inLinks);
+
+
+    /**
+     * gets the list of inlinks
+     * @return gets inlinks of this document
+     */
+  List<String> getInLinks();
+
   /**
    * Optional method that could be implemented if the actual format needs some
    * close procedure.

[1/6] nutch git commit: option to include inlinks in common crawl dump

Reply via email to