Author: joyce
Date: Tue Nov 17 23:43:10 2015
New Revision: 1714908
URL: http://svn.apache.org/viewvc?rev=1714908&view=rev
Log:
NUTCH-2166 - Add reverse URL format to dump tool
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1714908&r1=1714907&r2=1714908&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Nov 17 23:43:10 2015
@@ -3,6 +3,8 @@ Nutch Change Log
Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch11
+* NUTCH-2166 Add reverse URL format to dump tool (joyce)
+
* NUTCH-2157 Addressing Miredot REST API Warnings (Sujen Shah)
* NUTCH-2165 FileDumper Util hard codes part-# folder name (joyce)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1714908&r1=1714907&r2=1714908&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Tue Nov 17 23:43:10 2015
@@ -48,6 +48,7 @@ import org.apache.hadoop.util.StringUtil
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TableUtil;
//Tika imports
import org.apache.tika.Tika;
@@ -135,7 +136,7 @@ public class FileDumper {
* instead of dumping files.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes,
boolean flatDir, boolean mimeTypeStats)
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes,
boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump)
throws Exception {
if (mimeTypes == null)
LOG.info("Accepting all mimetypes.");
@@ -227,14 +228,50 @@ public class FileDumper {
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
String fullDir = outputDir.getAbsolutePath();
- if (!flatDir) {
+ if (!flatDir && !reverseURLDump) {
fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir,
md5Ofurl);
}
if (!Strings.isNullOrEmpty(fullDir)) {
- String outputFullPath = String.format("%s/%s", fullDir,
DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
- File outputFile = new File(outputFullPath);
+ String outputFullPath;
+
+ if (reverseURLDump) {
+ String[] reversedURL =
TableUtil.reverseUrl(url).split(":");
+ reversedURL[0] = reversedURL[0].replace('.', '/');
+
+ // URLs with content at a folder level and nested below that
+ // run into problems when dumping. For example:
+ //
+ // www.foo.com/bar/
+ // www.foo.com/bar/about.html
+ //
+ // One of these will fail to dump depending on processing order.
+ // To address this, we will use a placeholder when dumping a URL
+ // such as the one ending in '/bar/'
+ String lastDir = reversedURL[reversedURL.length - 1];
+ if (! lastDir.contains(".")) {
+ if (lastDir.charAt(lastDir.length() - 1) != '/') {
+ reversedURL[reversedURL.length - 1] += '/';
+ }
+ reversedURL[reversedURL.length - 1] += "_file";
+ }
+
+ String reversedURLPath = String.join("/", reversedURL);
+ outputFullPath = String.format("%s/%s", fullDir,
reversedURLPath);
+
+ // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
+ String[] splitPath = outputFullPath.split("/");
+ File fullOutputDir = new File(String.join("/",
Arrays.asList(splitPath).subList(0, splitPath.length - 1)));
+
+ if (!fullOutputDir.exists()) {
+ fullOutputDir.mkdirs();
+ }
+ } else {
+ outputFullPath = String.format("%s/%s", fullDir,
DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
+ }
+ File outputFile = new File(outputFullPath);
+
if (!outputFile.exists()) {
LOG.info("Writing: [" + outputFullPath + "]");
@@ -328,6 +365,12 @@ public class FileDumper {
.withDescription(
"optionally specify that the output directory should only contain
files.")
.create("flatdir");
+ @SuppressWarnings("static-access")
+ Option reverseURLOutput = OptionBuilder
+ .withArgName("reverseUrlDirs")
+ .withDescription(
+ "optionally specify to use reverse URL folders for output structure.")
+ .create("reverseUrlDirs");
// create the options
Options options = new Options();
@@ -337,6 +380,7 @@ public class FileDumper {
options.addOption(mimeOpt);
options.addOption(mimeStat);
options.addOption(dirStructureOpt);
+ options.addOption(reverseURLOutput);
CommandLineParser parser = new GnuParser();
try {
@@ -355,6 +399,9 @@ public class FileDumper {
boolean shouldDisplayStats = false;
if (line.hasOption("mimeStats"))
shouldDisplayStats = true;
+ boolean reverseURLDump = false;
+ if (line.hasOption("reverseUrlDirs"))
+ reverseURLDump = true;
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
@@ -367,7 +414,7 @@ public class FileDumper {
}
FileDumper dumper = new FileDumper();
- dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir,
shouldDisplayStats);
+ dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir,
shouldDisplayStats, reverseURLDump);
} catch (Exception e) {
LOG.error("FileDumper: " + StringUtils.stringifyException(e));
e.printStackTrace();