Fix code format issues

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/298cffc4
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/298cffc4
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/298cffc4

Branch: refs/heads/master
Commit: 298cffc4927eabf310ff65dd57847010ebb8f7f1
Parents: a088049
Author: Thamme Gowda <[email protected]>
Authored: Sat Apr 30 16:48:15 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Sat Apr 30 16:48:15 2016 -0700

----------------------------------------------------------------------
 .../nutch/tools/AbstractCommonCrawlFormat.java  |  4 +-
 .../nutch/tools/CommonCrawlDataDumper.java      | 43 ++++++++++----------
 .../apache/nutch/tools/CommonCrawlFormat.java   | 12 +++---
 3 files changed, 30 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/298cffc4/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
index 386ec4a..d5a0154 100644
--- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
@@ -79,7 +79,7 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
        }
 
        public String getJsonData(String url, Content content, Metadata metadata)
-      throws IOException {
+                       throws IOException {
                this.url = url;
                this.content = content;
                this.metadata = metadata;
@@ -90,7 +90,7 @@ public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
        public String getJsonData(String url, Content content, Metadata metadata,
                        ParseData parseData) throws IOException {
 
-    // override of this is required in the actual formats
+               // override of this is required in the actual formats
                throw new NotImplementedException();
        }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/298cffc4/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index 83da679..5abd393 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -182,7 +182,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(CommonCrawlDataDumper.class.getName());
-
+  private static final int MAX_INLINKS = 5000;
+  
   private CommonCrawlConfig config = null;
 
   // Gzip initialization
@@ -253,7 +254,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
     //get all paths
     List<Path> parts = new ArrayList<>();
     RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
-    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}" + File.separator + "data";
+    String partPattern = ".*" + File.separator + Content.DIR_NAME
+        + File.separator + "part-[0-9]{5}" + File.separator + "data";
     while (files.hasNext()) {
       LocatedFileStatus next = files.next();
       if (next.isFile()) {
@@ -269,7 +271,8 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
       linkDbReader = new LinkDbReader(fs.getConf(), new Path(linkdb.toString()));
     }
     if (parts == null || parts.size() == 0) {
-      LOG.error( "No segment directories found in [ {}] ", segmentRootDir.getAbsolutePath());
+      LOG.error( "No segment directories found in {} ",
+          segmentRootDir.getAbsolutePath());
       System.exit(1);
     }
     LOG.info("Found {} segment parts", parts.size());
@@ -357,27 +360,25 @@ public class CommonCrawlDataDumper extends Configured implements Tool {
             String mimeType = new Tika().detect(content.getContent());
             // Maps file to JSON-based structure
 
-          Set<String> inUrls = null; ///may be there are duplicates, so using set
-          if (linkDbReader != null) {
-            int max = 5000;     //just in case there are too many urls!
-            Inlinks inlinks = linkDbReader.getInlinks((Text) key);
-            if (inlinks != null) {
-              Iterator<Inlink> iterator = inlinks.iterator();
-              inUrls = new LinkedHashSet<>();
-              while (max >= 0 && iterator.hasNext()){
-                inUrls.add(iterator.next().getFromUrl());
-                max--;
+            Set<String> inUrls = null; //there may be duplicates, so using set
+            if (linkDbReader != null) {
+              Inlinks inlinks = linkDbReader.getInlinks((Text) key);
+              if (inlinks != null) {
+                Iterator<Inlink> iterator = inlinks.iterator();
+                inUrls = new LinkedHashSet<>();
+                while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()){
+                  inUrls.add(iterator.next().getFromUrl());
+                }
               }
             }
-          }
-          //TODO: Make this Jackson Format implementation reusable
-          try (CommonCrawlFormat format = CommonCrawlFormatFactory
-                  .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
-            if (inUrls != null) {
-              format.setInLinks(new ArrayList<>(inUrls));
+            //TODO: Make this Jackson Format implementation reusable
+            try (CommonCrawlFormat format = CommonCrawlFormatFactory
+                .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
+              if (inUrls != null) {
+                format.setInLinks(new ArrayList<>(inUrls));
+              }
+              jsonData = format.getJsonData(url, content, metadata);
             }
-            jsonData = format.getJsonData(url, content, metadata);
-          }
 
             collectStats(typeCounts, mimeType);
             // collects statistics for the given mimetypes

http://git-wip-us.apache.org/repos/asf/nutch/blob/298cffc4/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
index ea10e7d..0834d95 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
@@ -68,15 +68,15 @@ public interface CommonCrawlFormat extends Closeable {
 
   /**
    * sets inlinks of this document
-   * @param inlinks list of inlinks
-     */
+   * @param inLinks list of inlinks
+   */
   void setInLinks(List<String> inLinks);
 
 
-    /**
-     * gets set of inlinks
-     * @return gets inlinks of this document
-     */
+  /**
+   * gets set of inlinks
+   * @return gets inlinks of this document
+   */
   List<String> getInLinks();
 
   /**

Reply via email to