Author: mattmann
Date: Fri Apr 10 23:30:09 2015
New Revision: 1672796
URL: http://svn.apache.org/r1672796
Log:
- fix for NUTCH-1983 CommonCrawlDumper and FileDumper don't dump correct JSON
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1672796&r1=1672795&r2=1672796&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr 10 23:30:09 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1983 CommonCrawlDumper and FileDumper don't dump correct JSON
(mattmann)
+
* NUTCH-1972 Dockerfile for Nutch 1.x (Michael Joyce via mattmann)
* NUTCH-1771 Indexer fails if a segment is corrupted or incomplete (Diaa,
Chong Li via snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1672796&r1=1672795&r2=1672796&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Fri
Apr 10 23:30:09 2015
@@ -488,7 +488,7 @@ public class CommonCrawlDataDumper {
}
if (!typeCounts.isEmpty()) {
- LOG.info("CommonsCrawlDataDumper File Stats: " +
displayFileTypes(typeCounts, filteredCounts));
+ LOG.info("CommonsCrawlDataDumper File Stats: " +
DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
}
}
@@ -547,35 +547,6 @@ public class CommonCrawlDataDumper {
private void collectStats(Map<String, Integer> typeCounts, String
mimeType) {
typeCounts.put(mimeType, typeCounts.containsKey(mimeType) ?
typeCounts.get(mimeType) + 1 : 1);
}
-
- private String displayFileTypes(Map<String, Integer> typeCounts,
Map<String, Integer> filteredCounts) {
- StringBuilder builder = new StringBuilder();
- // print total stats
- builder.append("\nTOTAL Stats:\n");
- builder.append("{\n");
- for (String mimeType : typeCounts.keySet()) {
- builder.append(" {\"mimeType\":\"");
- builder.append(mimeType);
- builder.append("\",\"count\":");
- builder.append(typeCounts.get(mimeType));
- builder.append("\"}\n");
- }
- builder.append("}\n");
- // filtered types stats
- if (!filteredCounts.isEmpty()) {
- builder.append("\nFILTERED Stats:\n");
- builder.append("{\n");
- for (String mimeType : filteredCounts.keySet()) {
- builder.append(" {\"mimeType\":\"");
- builder.append(mimeType);
- builder.append("\",\"count\":");
- builder.append(filteredCounts.get(mimeType));
- builder.append("\"}\n");
- }
- builder.append("}\n");
- }
- return builder.toString();
- }
/**
* Gets the current date if the given timestamp is empty or null.
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1672796&r1=1672795&r2=1672796&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Fri Apr 10
23:30:09 2015
@@ -80,26 +80,29 @@ import org.slf4j.LoggerFactory;
* <pre>
* {@code
* INFO: File Types:
- * TOTAL Stats: {
- * {"mimeType":"application/xml","count":19"}
- * {"mimeType":"image/png","count":47"}
- * {"mimeType":"image/jpeg","count":141"}
- * {"mimeType":"image/vnd.microsoft.icon","count":4"}
- * {"mimeType":"text/plain","count":89"}
- * {"mimeType":"video/quicktime","count":2"}
- * {"mimeType":"image/gif","count":63"}
- * {"mimeType":"application/xhtml+xml","count":1670"}
- * {"mimeType":"application/octet-stream","count":40"}
- * {"mimeType":"text/html","count":1863"}
- * }
- * FILTER Stats: {
- * {"mimeType":"image/png","count":47"}
- * {"mimeType":"image/jpeg","count":141"}
- * {"mimeType":"image/vnd.microsoft.icon","count":4"}
- * {"mimeType":"video/quicktime","count":2"}
- * {"mimeType":"image/gif","count":63"}
- * }
- * }
+ * TOTAL Stats:
+ * [
+ * {"mimeType":"application/xml","count":"19"}
+ * {"mimeType":"image/png","count":"47"}
+ * {"mimeType":"image/jpeg","count":"141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":"4"}
+ * {"mimeType":"text/plain","count":"89"}
+ * {"mimeType":"video/quicktime","count":"2"}
+ * {"mimeType":"image/gif","count":"63"}
+ * {"mimeType":"application/xhtml+xml","count":"1670"}
+ * {"mimeType":"application/octet-stream","count":"40"}
+ * {"mimeType":"text/html","count":"1863"}
+ * ]
+ *
+ * FILTERED Stats:
+ * [
+ * {"mimeType":"image/png","count":"47"}
+ * {"mimeType":"image/jpeg","count":"141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":"4"}
+ * {"mimeType":"video/quicktime","count":"2"}
+ * {"mimeType":"image/gif","count":"63"}
+ * ]
+ * }
* </pre>
* <p>
* In the case above, the tool would have been run with the <b>-mimeType
@@ -237,7 +240,7 @@ public class FileDumper {
}
}
LOG.info("Dumper File Stats: "
- + displayFileTypes(typeCounts, filteredCounts));
+ + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
}
@@ -314,34 +317,4 @@ public class FileDumper {
typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
}
- private String displayFileTypes(Map<String, Integer> typeCounts,
- Map<String, Integer> filteredCounts) {
- StringBuilder builder = new StringBuilder();
- // print total stats
- builder.append("\n TOTAL Stats:\n");
- builder.append(" {\n");
- for (String mimeType : typeCounts.keySet()) {
- builder.append(" {\"mimeType\":\"");
- builder.append(mimeType);
- builder.append("\",\"count\":");
- builder.append(typeCounts.get(mimeType));
- builder.append("\"}\n");
- }
- builder.append("}\n");
- if (!filteredCounts.isEmpty()) {
- // print dumper stats
- builder.append("\n FILTERED Stats:\n");
- builder.append(" {\n");
- for (String mimeType : filteredCounts.keySet()) {
- builder.append(" {\"mimeType\":\"");
- builder.append(mimeType);
- builder.append("\",\"count\":");
- builder.append(filteredCounts.get(mimeType));
- builder.append("\"}\n");
- }
- builder.append("}\n");
- }
- return builder.toString();
- }
-
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1672796&r1=1672795&r2=1672796&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Fri Apr 10
23:30:09 2015
@@ -26,6 +26,7 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
+import java.util.Map;
public class DumpFileUtil {
private static final Logger LOG =
LoggerFactory.getLogger(DumpFileUtil.class
@@ -104,4 +105,33 @@ public class DumpFileUtil {
return outputFullPath;
}
+
+ public static String displayFileTypes(Map<String, Integer> typeCounts,
Map<String, Integer> filteredCounts) {
+ StringBuilder builder = new StringBuilder();
+ // print total stats
+ builder.append("\nTOTAL Stats:\n");
+ builder.append("[\n");
+ for (String mimeType : typeCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":\"");
+ builder.append(typeCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("]\n");
+ // filtered types stats
+ if (!filteredCounts.isEmpty()) {
+ builder.append("\nFILTERED Stats:\n");
+ builder.append("[\n");
+ for (String mimeType : filteredCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":\"");
+ builder.append(filteredCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("]\n");
+ }
+ return builder.toString();
+ }
}