Author: lewismc
Date: Wed Jul 22 12:51:05 2015
New Revision: 1692268
URL: http://svn.apache.org/r1692268
Log:
NUTCH-2063 Add -mimeStats flag to FileDumper tool
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1692268&r1=1692267&r2=1692268&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 22 12:51:05 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2063 Add -mimeStats flag to FileDumper tool (Mike Joyce via lewismc)
+
* NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is Fetched (lewismc)
* NUTCH-2058 Indexer plugin that allows RegEx replacements on the NutchDocument
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1692268&r1=1692267&r2=1692268&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Jul 22 12:51:05 2015
@@ -126,9 +126,12 @@ public class FileDumper {
* @param mimeTypes
* an array of mime types we have to dump, all others will be
* filtered out.
+ * @param mimeTypeStats
+ * a flag indicating whether mimetype stats should be displayed
+ * instead of dumping files.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes)
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean mimeTypeStats)
throws Exception {
if (mimeTypes == null)
LOG.info("Accepting all mimetypes.");
@@ -206,24 +209,25 @@ public class FileDumper {
}
if (filter) {
- String md5Ofurl = DumpFileUtil.getUrlMD5(url);
- String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
-
- if (!Strings.isNullOrEmpty(fullDir)) {
- String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
- File outputFile = new File(outputFullPath);
-
- if (!outputFile.exists()) {
- LOG.info("Writing: [" + outputFullPath + "]");
- FileOutputStream output = new FileOutputStream(outputFile);
- IOUtils.write(content.getContent(), output);
- fileCount++;
- } else {
- LOG.info("Skipping writing: [" + outputFullPath
- + "]: file already exists");
+ if (!mimeTypeStats) {
+ String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+ String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
+
+ if (!Strings.isNullOrEmpty(fullDir)) {
+ String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
+ File outputFile = new File(outputFullPath);
+
+ if (!outputFile.exists()) {
+ LOG.info("Writing: [" + outputFullPath + "]");
+ FileOutputStream output = new FileOutputStream(outputFile);
+ IOUtils.write(content.getContent(), output);
+ fileCount++;
+ } else {
+ LOG.info("Skipping writing: [" + outputFullPath
+ + "]: file already exists");
+ }
}
}
-
}
}
reader.close();
@@ -240,6 +244,10 @@ public class FileDumper {
LOG.info("Dumper File Stats: "
+ DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
+ if (mimeTypeStats) {
+ System.out.println("Dumper File Stats: "
+ + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
+ }
}
/**
@@ -271,6 +279,12 @@ public class FileDumper {
.withDescription(
"an optional list of mimetypes to dump, excluding all others. Defaults to all.")
.create("mimetype");
+ @SuppressWarnings("static-access")
+ Option mimeStat = OptionBuilder
+ .withArgName("mimeStats")
+ .withDescription(
+ "only display mimetype stats for the segment(s) instead of dumping file.")
+ .create("mimeStats");
// create the options
Options options = new Options();
@@ -278,6 +292,7 @@ public class FileDumper {
options.addOption(outputOpt);
options.addOption(segOpt);
options.addOption(mimeOpt);
+ options.addOption(mimeStat);
CommandLineParser parser = new GnuParser();
try {
@@ -292,17 +307,22 @@ public class FileDumper {
File outputDir = new File(line.getOptionValue("outputDir"));
File segmentRootDir = new File(line.getOptionValue("segment"));
String[] mimeTypes = line.getOptionValues("mimetype");
+ boolean shouldDisplayStats = false;
+ if (line.hasOption("mimeStats"))
+ shouldDisplayStats = true;
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
+ "]: does not exist, creating it.");
- if (!outputDir.mkdirs())
- throw new Exception("Unable to create: ["
+ if (!shouldDisplayStats) {
+ if (!outputDir.mkdirs())
+ throw new Exception("Unable to create: ["
+ outputDir.getAbsolutePath() + "]");
+ }
}
FileDumper dumper = new FileDumper();
- dumper.dump(outputDir, segmentRootDir, mimeTypes);
+ dumper.dump(outputDir, segmentRootDir, mimeTypes, shouldDisplayStats);
} catch (Exception e) {
LOG.error("FileDumper: " + StringUtils.stringifyException(e));
e.printStackTrace();