Fix code format issues

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/298cffc4
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/298cffc4
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/298cffc4
Branch: refs/heads/master Commit: 298cffc4927eabf310ff65dd57847010ebb8f7f1 Parents: a088049 Author: Thamme Gowda <[email protected]> Authored: Sat Apr 30 16:48:15 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Sat Apr 30 16:48:15 2016 -0700 ---------------------------------------------------------------------- .../nutch/tools/AbstractCommonCrawlFormat.java | 4 +- .../nutch/tools/CommonCrawlDataDumper.java | 43 ++++++++++---------- .../apache/nutch/tools/CommonCrawlFormat.java | 12 +++--- 3 files changed, 30 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/298cffc4/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java index 386ec4a..d5a0154 100644 --- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java +++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java @@ -79,7 +79,7 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat { } public String getJsonData(String url, Content content, Metadata metadata) - throws IOException { + throws IOException { this.url = url; this.content = content; this.metadata = metadata; @@ -90,7 +90,7 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat { public String getJsonData(String url, Content content, Metadata metadata, ParseData parseData) throws IOException { - // override of this is required in the actual formats + // override of this is required in the actual formats throw new NotImplementedException(); } http://git-wip-us.apache.org/repos/asf/nutch/blob/298cffc4/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java ---------------------------------------------------------------------- diff --git 
a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java index 83da679..5abd393 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java @@ -182,7 +182,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool { private static final Logger LOG = LoggerFactory .getLogger(CommonCrawlDataDumper.class.getName()); - + private static final int MAX_INLINKS = 5000; + private CommonCrawlConfig config = null; // Gzip initialization @@ -253,7 +254,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool { //get all paths List<Path> parts = new ArrayList<>(); RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true); - String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data"; + String partPattern = ".*" + File.separator + Content.DIR_NAME + + File.separator + "part-[0-9]{5}" + File.separator + "data"; while (files.hasNext()) { LocatedFileStatus next = files.next(); if (next.isFile()) { @@ -269,7 +271,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool { linkDbReader = new LinkDbReader(fs.getConf(), new Path(linkdb.toString())); } if (parts == null || parts.size() == 0) { - LOG.error( "No segment directories found in [ {}] ", segmentRootDir.getAbsolutePath()); + LOG.error( "No segment directories found in {} ", + segmentRootDir.getAbsolutePath()); System.exit(1); } LOG.info("Found {} segment parts", parts.size()); @@ -357,27 +360,25 @@ public class CommonCrawlDataDumper extends Configured implements Tool { String mimeType = new Tika().detect(content.getContent()); // Maps file to JSON-based structure - Set<String> inUrls = null; ///may be there are duplicates, so using set - if (linkDbReader != null) { - int max = 5000; //just in case there are too many urls! 
- Inlinks inlinks = linkDbReader.getInlinks((Text) key); - if (inlinks != null) { - Iterator<Inlink> iterator = inlinks.iterator(); - inUrls = new LinkedHashSet<>(); - while (max >= 0 && iterator.hasNext()){ - inUrls.add(iterator.next().getFromUrl()); - max--; + Set<String> inUrls = null; //there may be duplicates, so using set + if (linkDbReader != null) { + Inlinks inlinks = linkDbReader.getInlinks((Text) key); + if (inlinks != null) { + Iterator<Inlink> iterator = inlinks.iterator(); + inUrls = new LinkedHashSet<>(); + while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()){ + inUrls.add(iterator.next().getFromUrl()); + } } } - } - //TODO: Make this Jackson Format implementation reusable - try (CommonCrawlFormat format = CommonCrawlFormatFactory - .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) { - if (inUrls != null) { - format.setInLinks(new ArrayList<>(inUrls)); + //TODO: Make this Jackson Format implementation reusable + try (CommonCrawlFormat format = CommonCrawlFormatFactory + .getCommonCrawlFormat(warc ? 
"WARC" : "JACKSON", nutchConfig, config)) { + if (inUrls != null) { + format.setInLinks(new ArrayList<>(inUrls)); + } + jsonData = format.getJsonData(url, content, metadata); } - jsonData = format.getJsonData(url, content, metadata); - } collectStats(typeCounts, mimeType); // collects statistics for the given mimetypes http://git-wip-us.apache.org/repos/asf/nutch/blob/298cffc4/src/java/org/apache/nutch/tools/CommonCrawlFormat.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java index ea10e7d..0834d95 100644 --- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java +++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java @@ -68,15 +68,15 @@ public interface CommonCrawlFormat extends Closeable { /** * sets inlinks of this document - * @param inlinks list of inlinks - */ + * @param inLinks list of inlinks + */ void setInLinks(List<String> inLinks); - /** - * gets set of inlinks - * @return gets inlinks of this document - */ + /** + * gets set of inlinks + * @return gets inlinks of this document + */ List<String> getInLinks(); /**
