Author: markus
Date: Mon Feb 22 14:41:37 2016
New Revision: 1731651

URL: http://svn.apache.org/viewvc?rev=1731651&view=rev
Log:
NUTCH-2219 Criteria order to be configurable in DeduplicationJob
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731651&r1=1731650&r2=1731651&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Feb 22 14:41:37 2016
@@ -1,5 +1,7 @@
 Nutch Change Log

+* NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus)
+
 * NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce)

 * NUTCH-2223 Upgrade xercesImpl to 2.11.0 to fix hang on issue in tika mimetype detection (Tien Nguyen Manh via markus)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1731651&r1=1731650&r2=1731651&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Feb 22 14:41:37 2016
@@ -22,6 +22,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Random;
+import java.util.Arrays;

 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -69,6 +70,7 @@ public class DeduplicationJob extends Nu
   private final static Text urlKey = new Text("_URLTEMPKEY_");
   private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
+  private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";

   public static class DBFilter implements
       Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -128,6 +130,13 @@ public class DeduplicationJob extends Nu
   public static class DedupReducer implements
       Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {

+    private String[] compareOrder;
+
+    @Override
+    public void configure(JobConf arg0) {
+      compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+    }
+
     private void writeOutAsDuplicate(CrawlDatum datum,
         OutputCollector<Text, CrawlDatum> output, Reporter reporter)
         throws IOException {
@@ -144,6 +153,7 @@ public class DeduplicationJob extends Nu
         throws IOException {

       CrawlDatum existingDoc = null;
+      outerloop:
       while (values.hasNext()) {
         if (existingDoc == null) {
           existingDoc = new CrawlDatum();
@@ -151,48 +161,56 @@ public class DeduplicationJob extends Nu
           continue;
         }
         CrawlDatum newDoc = values.next();
-        // compare based on score
-        if (existingDoc.getScore() < newDoc.getScore()) {
-          writeOutAsDuplicate(existingDoc, output, reporter);
-          existingDoc = new CrawlDatum();
-          existingDoc.set(newDoc);
-          continue;
-        } else if (existingDoc.getScore() > newDoc.getScore()) {
-          // mark new one as duplicate
-          writeOutAsDuplicate(newDoc, output, reporter);
-          continue;
-        }
-        // same score? delete the one which is oldest
-        if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
-          // mark new one as duplicate
-          writeOutAsDuplicate(newDoc, output, reporter);
-          continue;
-        } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
-          // mark existing one as duplicate
-          writeOutAsDuplicate(existingDoc, output, reporter);
-          existingDoc = new CrawlDatum();
-          existingDoc.set(newDoc);
-          continue;
-        }
-        // same time? keep the one which has the shortest URL
-        String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
-        String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
-        if (urlExisting.length() < urlnewDoc.length()) {
-          // mark new one as duplicate
-          writeOutAsDuplicate(newDoc, output, reporter);
-          continue;
-        } else if (urlExisting.length() > urlnewDoc.length()) {
-          // mark existing one as duplicate
-          writeOutAsDuplicate(existingDoc, output, reporter);
-          existingDoc = new CrawlDatum();
-          existingDoc.set(newDoc);
-          continue;
+
+        for (int i = 0; i < compareOrder.length; i++) {
+          switch (compareOrder[i]) {
+          case "score":
+            // compare based on score
+            if (existingDoc.getScore() < newDoc.getScore()) {
+              writeOutAsDuplicate(existingDoc, output, reporter);
+              existingDoc = new CrawlDatum();
+              existingDoc.set(newDoc);
+              continue outerloop;
+            } else if (existingDoc.getScore() > newDoc.getScore()) {
+              // mark new one as duplicate
+              writeOutAsDuplicate(newDoc, output, reporter);
+              continue outerloop;
+            }
+            break;
+          case "fetchTime":
+            // same score? delete the one which is oldest
+            if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
+              // mark new one as duplicate
+              writeOutAsDuplicate(newDoc, output, reporter);
+              continue outerloop;
+            } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
+              // mark existing one as duplicate
+              writeOutAsDuplicate(existingDoc, output, reporter);
+              existingDoc = new CrawlDatum();
+              existingDoc.set(newDoc);
+              continue outerloop;
+            }
+            break;
+          case "urlLength":
+            // same time? keep the one which has the shortest URL
+            String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
+            String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+            if (urlExisting.length() < urlnewDoc.length()) {
+              // mark new one as duplicate
+              writeOutAsDuplicate(newDoc, output, reporter);
+              continue outerloop;
+            } else if (urlExisting.length() > urlnewDoc.length()) {
+              // mark existing one as duplicate
+              writeOutAsDuplicate(existingDoc, output, reporter);
+              existingDoc = new CrawlDatum();
+              existingDoc.set(newDoc);
+              continue outerloop;
+            }
+            break;
+          }
         }
-      }
-    }
-    @Override
-    public void configure(JobConf arg0) {
+      }
     }

     @Override
@@ -242,16 +260,27 @@ public class DeduplicationJob extends Nu
   public int run(String[] args) throws IOException {
     if (args.length < 1) {
-      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>]");
+      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<urlLength>]");
       return 1;
     }
     String group = "none";
     String crawldb = args[0];
-
+    String compareOrder = "score,fetchTime,urlLength";
+
     for (int i = 1; i < args.length; i++) {
-      if (args[i].equals("-group"))
+      if (args[i].equals("-group"))
         group = args[++i];
+      if (args[i].equals("-compareOrder")) {
+        compareOrder = args[++i];
+
+        if (compareOrder.indexOf("score") == -1 ||
+            compareOrder.indexOf("fetchTime") == -1 ||
+            compareOrder.indexOf("urlLength") == -1) {
+          System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
+          return 1;
+        }
+      }
     }

     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
@@ -266,6 +295,7 @@ public class DeduplicationJob extends Nu
     job.setJobName("Deduplication on " + crawldb);

     job.set(DEDUPLICATION_GROUP_MODE, group);
+    job.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);

     FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
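A brief usage sketch (an illustration, not part of the committed diff): the comparison order defaults to score,fetchTime,urlLength and can be overridden with the new command-line option, following the usage string above, e.g.

    DeduplicationJob <crawldb> -compareOrder urlLength,score,fetchTime

run() stores the chosen order in the new deduplication.compare.order job property, and each DedupReducer reads and splits it in configure(), applying the criteria in the given order and falling through to the next criterion only when the current one is a tie. The value must mention all three of score, fetchTime and urlLength, otherwise run() prints an error and returns 1.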