Handling duplicate inlinks

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/a0880491
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/a0880491
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/a0880491

Branch: refs/heads/master
Commit: a08804913c500b331c9296996ce1ba3d4929d3a8
Parents: 6ff1ecb
Author: Thamme Gowda <[email protected]>
Authored: Mon Apr 18 19:20:38 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Mon Apr 18 19:20:38 2016 -0700

----------------------------------------------------------------------
 .../org/apache/nutch/tools/CommonCrawlDataDumper.java | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/a0880491/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index e26e088..83da679 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -33,8 +33,10 @@ import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;

 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -355,13 +357,13 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
       String mimeType = new Tika().detect(content.getContent());
       // Maps file to JSON-based structure

-      List<String> inUrls = null;
+      Set<String> inUrls = null; ///may be there are duplicates, so using set
       if (linkDbReader != null) {
         int max = 5000; //just in case there are too many urls!
         Inlinks inlinks = linkDbReader.getInlinks((Text) key);
         if (inlinks != null) {
           Iterator<Inlink> iterator = inlinks.iterator();
-          inUrls = new ArrayList<>();
+          inUrls = new LinkedHashSet<>();
           while (max >= 0 && iterator.hasNext()){
             inUrls.add(iterator.next().getFromUrl());
             max--;
@@ -371,7 +373,9 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
       //TODO: Make this Jackson Format implementation reusable
       try (CommonCrawlFormat format = CommonCrawlFormatFactory
           .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
-        format.setInLinks(inUrls);
+        if (inUrls != null) {
+          format.setInLinks(new ArrayList<>(inUrls));
+        }
         jsonData = format.getJsonData(url, content, metadata);
       }
