This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 3958d0c  NUTCH-2683 DeduplicationJob: add option to prefer https:// over http://
new cda251a  Merge pull request #425 from sebastian-nagel/NUTCH-2683-dedup-prefer-https
3958d0c is described below
commit 3958d0c23e32855225fd52403da7c7234eef5ea2
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Jan 7 12:00:34 2019 +0100
NUTCH-2683 DeduplicationJob: add option to prefer https:// over http://
- add optional value "httpsOverHttp" to -compareOrder argument
to prefer https:// over http:// if it comes before the "urlLength"
and neither "score" nor "fetchTime" take precedence
- code improvements: remove nested loop, sort imports, add @Override
statements where applicable
---
.../org/apache/nutch/crawl/DeduplicationJob.java | 162 +++++++++++----------
1 file changed, 85 insertions(+), 77 deletions(-)
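
For reference, a typical invocation enabling the new comparison step (assuming
the standard bin/nutch launcher; the crawldb path is illustrative):

    bin/nutch dedup crawl/crawldb -compareOrder score,fetchTime,httpsOverHttp,urlLength

With this order, score and fetchTime are compared first; only when both tie
does an https:// URL win over its otherwise identical http:// counterpart,
before the shortest-URL rule applies.
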
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 8887b4f..9b01411 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -24,26 +24,24 @@ import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
-
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.CounterGroup;
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -76,14 +74,13 @@ public class DeduplicationJob extends NutchTool implements Tool {
private String groupMode;
+ @Override
public void setup(Mapper<Text, CrawlDatum, BytesWritable,
CrawlDatum>.Context context) {
Configuration arg0 = context.getConfiguration();
groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
}
- public void close() throws IOException {
- }
-
+ @Override
public void map(Text key, CrawlDatum value,
Context context)
throws IOException, InterruptedException {
@@ -118,7 +115,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
// add the URL as a temporary MD
value.getMetaData().put(urlKey, key);
- // reduce on the signature optionall grouped on host or domain or not at all
+ // reduce on the signature optionally grouped on host or domain or not at all
context.write(sig, value);
}
}
@@ -129,9 +126,10 @@ public class DeduplicationJob extends NutchTool implements Tool {
private String[] compareOrder;
+ @Override
public void setup(Reducer<BytesWritable, CrawlDatum, Text,
CrawlDatum>.Context context) {
- Configuration arg0 = context.getConfiguration();
- compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+ Configuration conf = context.getConfiguration();
+ compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
}
private void writeOutAsDuplicate(CrawlDatum datum,
@@ -144,79 +142,90 @@ public class DeduplicationJob extends NutchTool implements Tool {
context.write(key, datum);
}
+ @Override
public void reduce(BytesWritable key, Iterable<CrawlDatum> values,
- Context context)
- throws IOException, InterruptedException {
+ Context context) throws IOException, InterruptedException {
CrawlDatum existingDoc = null;
- outerloop:
for (CrawlDatum newDoc : values) {
if (existingDoc == null) {
existingDoc = new CrawlDatum();
existingDoc.set(newDoc);
continue;
}
-
- for (int i = 0; i < compareOrder.length; i++) {
- switch (compareOrder[i]) {
- case "score":
- // compare based on score
- if (existingDoc.getScore() < newDoc.getScore()) {
- writeOutAsDuplicate(existingDoc, context);
- existingDoc = new CrawlDatum();
- existingDoc.set(newDoc);
- continue outerloop;
- } else if (existingDoc.getScore() > newDoc.getScore()) {
- // mark new one as duplicate
- writeOutAsDuplicate(newDoc, context);
- continue outerloop;
- }
- break;
- case "fetchTime":
- // same score? delete the one which is oldest
- if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
- // mark new one as duplicate
- writeOutAsDuplicate(newDoc, context);
- continue outerloop;
- } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
- // mark existing one as duplicate
- writeOutAsDuplicate(existingDoc, context);
- existingDoc = new CrawlDatum();
- existingDoc.set(newDoc);
- continue outerloop;
- }
- break;
- case "urlLength":
- // same time? keep the one which has the shortest URL
- String urlExisting;
- String urlnewDoc;
- try {
- urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
- urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
- } catch (UnsupportedEncodingException e) {
- LOG.error("Error decoding: " + urlKey);
- throw new IOException("UnsupportedEncodingException for " + urlKey);
- }
- if (urlExisting.length() < urlnewDoc.length()) {
- // mark new one as duplicate
- writeOutAsDuplicate(newDoc, context);
- continue outerloop;
- } else if (urlExisting.length() > urlnewDoc.length()) {
- // mark existing one as duplicate
- writeOutAsDuplicate(existingDoc, context);
- existingDoc = new CrawlDatum();
- existingDoc.set(newDoc);
- continue outerloop;
- }
- break;
+ CrawlDatum duplicate = getDuplicate(existingDoc, newDoc);
+ if (duplicate != null) {
+ writeOutAsDuplicate(duplicate, context);
+ if (duplicate == existingDoc) {
+ // keep new
+ existingDoc.set(newDoc);
}
}
-
}
}
- public void close() throws IOException {
-
+ private CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc)
+ throws IOException {
+ for (int i = 0; i < compareOrder.length; i++) {
+ switch (compareOrder[i]) {
+ case "score":
+ // compare based on score
+ if (existingDoc.getScore() < newDoc.getScore()) {
+ return existingDoc;
+ } else if (existingDoc.getScore() > newDoc.getScore()) {
+ // mark new one as duplicate
+ return newDoc;
+ }
+ break;
+ case "fetchTime":
+ // same score? delete the one which is oldest
+ if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
+ // mark new one as duplicate
+ return newDoc;
+ } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
+ // mark existing one as duplicate
+ return existingDoc;
+ }
+ break;
+ case "httpsOverHttp":
+ // prefer https:// over http:// if URLs are identical except for the
+ // protocol
+ String url1 = existingDoc.getMetaData().get(urlKey).toString();
+ String url2 = newDoc.getMetaData().get(urlKey).toString();
+ if (url1.startsWith("https://") && url2.startsWith("http://")
+ && url1.substring(8).equals(url2.substring(7))) {
+ // existingDoc with https://, mark newDoc as duplicate
+ return newDoc;
+ } else if (url2.startsWith("https://") && url1.startsWith("http://")
+ && url2.substring(8).equals(url1.substring(7))) {
+ // newDoc with https://, mark existingDoc as duplicate
+ return existingDoc;
+ }
+ break;
+ case "urlLength":
+ // same time? keep the one which has the shortest URL
+ String urlExisting;
+ String urlnewDoc;
+ try {
+ urlExisting = URLDecoder.decode(
+ existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
+ urlnewDoc = URLDecoder
+ .decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
+ } catch (UnsupportedEncodingException e) {
+ LOG.error("Error decoding: " + urlKey);
+ throw new IOException("UnsupportedEncodingException for " + urlKey);
+ }
+ if (urlExisting.length() < urlnewDoc.length()) {
+ // mark new one as duplicate
+ return newDoc;
+ } else if (urlExisting.length() > urlnewDoc.length()) {
+ // mark existing one as duplicate
+ return existingDoc;
+ }
+ break;
+ }
+ }
+ return null; // no decision possible
}
}
@@ -224,15 +233,14 @@ public class DeduplicationJob extends NutchTool implements Tool {
public static class StatusUpdateReducer extends
Reducer<Text, CrawlDatum, Text, CrawlDatum> {
+ @Override
public void setup(Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context
context) {
}
- public void close() {
- }
-
private CrawlDatum old = new CrawlDatum();
private CrawlDatum duplicate = new CrawlDatum();
+ @Override
public void reduce(Text key, Iterable<CrawlDatum> values,
Context context)
throws IOException, InterruptedException {
@@ -260,7 +268,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
public int run(String[] args) throws IOException {
if (args.length < 1) {
- System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<urlLength>]");
+ System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
return 1;
}
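
The job can also be driven programmatically with the new compare order. A
minimal sketch, assuming a crawldb at "crawl/crawldb" (the class name and path
are illustrative, not part of this commit):

    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.crawl.DeduplicationJob;
    import org.apache.nutch.util.NutchConfiguration;

    public class DedupHttpsPreferred {
      public static void main(String[] args) throws Exception {
        // "httpsOverHttp" breaks ties left by "score" and "fetchTime" and,
        // listed before "urlLength", takes precedence over the shorter-URL rule.
        int res = ToolRunner.run(NutchConfiguration.create(),
            new DeduplicationJob(),
            new String[] { "crawl/crawldb", "-compareOrder",
                "score,fetchTime,httpsOverHttp,urlLength" });
        System.exit(res);
      }
    }

The position of "httpsOverHttp" matters because getDuplicate() walks
compareOrder left to right and returns on the first criterion that
discriminates between the two records.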