Author: mattmann
Date: Fri Apr 17 20:58:08 2015
New Revision: 1674401
URL: http://svn.apache.org/r1674401
Log:
Fix for NUTCH-1988: Make nested output directory dump optional. Contributed by
Michael Joyce <[email protected]>; this closes #19.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674401&r1=1674400&r2=1674401&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr 17 20:58:08 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
+
* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of
RobotRules parsing (mattmann, snagel)
* NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via
mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1674401&r1=1674400&r2=1674401&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Fri Apr 17 20:58:08 2015
@@ -128,9 +128,13 @@ public class FileDumper {
* @param mimeTypes
* an array of mime types we have to dump, all others will be
* filtered out.
+ * @param flatDir
+ * a boolean flag specifying whether the output directory should
+ * contain only files, rather than the nested directories that are
+ * otherwise created to prevent naming conflicts.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes)
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes,
boolean flatDir)
throws Exception {
if (mimeTypes == null)
LOG.info("Accepting all mimetypes.");
@@ -209,7 +213,11 @@ public class FileDumper {
if (filter) {
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
- String fullDir =
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
+
+ String fullDir = outputDir.getAbsolutePath();
+ if (!flatDir) {
+ fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir,
md5Ofurl);
+ }
if (!Strings.isNullOrEmpty(fullDir)) {
String outputFullPath = String.format("%s/%s", fullDir,
DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
@@ -273,6 +281,12 @@ public class FileDumper {
.withDescription(
"an optional list of mimetypes to dump, excluding all others.
Defaults to all.")
.create("mimetype");
+ @SuppressWarnings("static-access")
+ Option dirStructureOpt = OptionBuilder
+ .withArgName("flatdir")
+ .withDescription(
+ "optionally specify that the output directory should only contain
files.")
+ .create("flatdir");
// create the options
Options options = new Options();
@@ -280,6 +294,7 @@ public class FileDumper {
options.addOption(outputOpt);
options.addOption(segOpt);
options.addOption(mimeOpt);
+ options.addOption(dirStructureOpt);
CommandLineParser parser = new GnuParser();
try {
@@ -294,6 +309,7 @@ public class FileDumper {
File outputDir = new File(line.getOptionValue("outputDir"));
File segmentRootDir = new File(line.getOptionValue("segment"));
String[] mimeTypes = line.getOptionValues("mimetype");
+ boolean flatDir = line.hasOption("flatdir");
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
@@ -304,7 +320,7 @@ public class FileDumper {
}
FileDumper dumper = new FileDumper();
- dumper.dump(outputDir, segmentRootDir, mimeTypes);
+ dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir);
} catch (Exception e) {
LOG.error("FileDumper: " + StringUtils.stringifyException(e));
e.printStackTrace();