Author: markus
Date: Mon Feb 22 14:41:37 2016
New Revision: 1731651

URL: http://svn.apache.org/viewvc?rev=1731651&view=rev
Log:
NUTCH-2219 Criteria order to be configurable in DeduplicationJob
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731651&r1=1731650&r2=1731651&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Feb 22 14:41:37 2016
@@ -1,5 +1,7 @@
 Nutch Change Log

+* NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus)
+
 * NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce)

 * NUTCH-2223 Upgrade xercesImpl to 2.11.0 to fix hang on issue in tika mimetype detection (Tien Nguyen Manh via markus)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1731651&r1=1731650&r2=1731651&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Feb 22 14:41:37 2016
@@ -22,6 +22,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Random;
+import java.util.Arrays;

 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -69,6 +70,7 @@ public class DeduplicationJob extends Nu
   private final static Text urlKey = new Text("_URLTEMPKEY_");
   private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
+  private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";

   public static class DBFilter implements
       Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -128,6 +130,13 @@ public class DeduplicationJob extends Nu
   public static class DedupReducer implements
       Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {

+    private String[] compareOrder;
+
+    @Override
+    public void configure(JobConf arg0) {
+      compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+    }
+
     private void writeOutAsDuplicate(CrawlDatum datum,
         OutputCollector<Text, CrawlDatum> output, Reporter reporter)
         throws IOException {
@@ -144,6 +153,7 @@ public class DeduplicationJob extends Nu
         throws IOException {

       CrawlDatum existingDoc = null;
+      outerloop:
       while (values.hasNext()) {
         if (existingDoc == null) {
           existingDoc = new CrawlDatum();
@@ -151,48 +161,56 @@ public class DeduplicationJob extends Nu
           continue;
         }
         CrawlDatum newDoc = values.next();
-        // compare based on score
-        if (existingDoc.getScore() < newDoc.getScore()) {
-          writeOutAsDuplicate(existingDoc, output, reporter);
-          existingDoc = new CrawlDatum();
-          existingDoc.set(newDoc);
-          continue;
-        } else if (existingDoc.getScore() > newDoc.getScore()) {
-          // mark new one as duplicate
-          writeOutAsDuplicate(newDoc, output, reporter);
-          continue;
-        }
-        // same score? delete the one which is oldest
-        if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
-          // mark new one as duplicate
-          writeOutAsDuplicate(newDoc, output, reporter);
-          continue;
-        } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
-          // mark existing one as duplicate
-          writeOutAsDuplicate(existingDoc, output, reporter);
-          existingDoc = new CrawlDatum();
-          existingDoc.set(newDoc);
-          continue;
-        }
-        // same time? keep the one which has the shortest URL
-        String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
-        String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
-        if (urlExisting.length() < urlnewDoc.length()) {
-          // mark new one as duplicate
-          writeOutAsDuplicate(newDoc, output, reporter);
-          continue;
-        } else if (urlExisting.length() > urlnewDoc.length()) {
-          // mark existing one as duplicate
-          writeOutAsDuplicate(existingDoc, output, reporter);
-          existingDoc = new CrawlDatum();
-          existingDoc.set(newDoc);
-          continue;
+
+        for (int i = 0; i < compareOrder.length; i++) {
+          switch (compareOrder[i]) {
+          case "score":
+            // compare based on score
+            if (existingDoc.getScore() < newDoc.getScore()) {
+              writeOutAsDuplicate(existingDoc, output, reporter);
+              existingDoc = new CrawlDatum();
+              existingDoc.set(newDoc);
+              continue outerloop;
+            } else if (existingDoc.getScore() > newDoc.getScore()) {
+              // mark new one as duplicate
+              writeOutAsDuplicate(newDoc, output, reporter);
+              continue outerloop;
+            }
+            break;
+          case "fetchTime":
+            // same score? delete the one which is oldest
+            if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
+              // mark new one as duplicate
+              writeOutAsDuplicate(newDoc, output, reporter);
+              continue outerloop;
+            } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
+              // mark existing one as duplicate
+              writeOutAsDuplicate(existingDoc, output, reporter);
+              existingDoc = new CrawlDatum();
+              existingDoc.set(newDoc);
+              continue outerloop;
+            }
+            break;
+          case "urlLength":
+            // same time? keep the one which has the shortest URL
+            String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
+            String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+            if (urlExisting.length() < urlnewDoc.length()) {
+              // mark new one as duplicate
+              writeOutAsDuplicate(newDoc, output, reporter);
+              continue outerloop;
+            } else if (urlExisting.length() > urlnewDoc.length()) {
+              // mark existing one as duplicate
+              writeOutAsDuplicate(existingDoc, output, reporter);
+              existingDoc = new CrawlDatum();
+              existingDoc.set(newDoc);
+              continue outerloop;
+            }
+            break;
+          }
         }
-      }
-    }
-    @Override
-    public void configure(JobConf arg0) {
+      }
     }

     @Override
@@ -242,16 +260,27 @@ public class DeduplicationJob extends Nu
   public int run(String[] args) throws IOException {
     if (args.length < 1) {
-      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>]");
+      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<urlLength>]");
       return 1;
     }
     String group = "none";
     String crawldb = args[0];
-
+    String compareOrder = "score,fetchTime,urlLength";
+
     for (int i = 1; i < args.length; i++) {
-      if (args[i].equals("-group"))
+      if (args[i].equals("-group"))
         group = args[++i];
+      if (args[i].equals("-compareOrder")) {
+        compareOrder = args[++i];
+
+        if (compareOrder.indexOf("score") == -1 ||
+            compareOrder.indexOf("fetchTime") == -1 ||
+            compareOrder.indexOf("urlLength") == -1) {
+          System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
+          return 1;
+        }
+      }
     }

     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
@@ -266,6 +295,7 @@ public class DeduplicationJob extends Nu
     job.setJobName("Deduplication on " + crawldb);

     job.set(DEDUPLICATION_GROUP_MODE, group);
+    job.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);

     FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
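A brief usage sketch (an illustration, not part of the committed diff): the comparison order defaults to score,fetchTime,urlLength and can be overridden with the new command-line option, following the usage string above, e.g.

    DeduplicationJob <crawldb> -compareOrder urlLength,score,fetchTime

run() stores the chosen order in the new deduplication.compare.order job property, and each DedupReducer reads and splits it in configure(), applying the criteria in the given order and falling through to the next criterion only when the current one is a tie. The value must mention all three of score, fetchTime and urlLength, otherwise run() prints an error and returns 1.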