Author: joyce
Date: Tue Nov 17 23:43:10 2015
New Revision: 1714908

URL: http://svn.apache.org/viewvc?rev=1714908&view=rev
Log:
NUTCH-2166 - Add reverse URL format to dump tool
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1714908&r1=1714907&r2=1714908&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Nov 17 23:43:10 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy) Release Report: http://s.apache.org/nutch11 +* NUTCH-2166 Add reverse URL format to dump tool (joyce) + * NUTCH-2157 Addressing Miredot REST API Warnings (Sujen Shah) * NUTCH-2165 FileDumper Util hard codes part-# folder name (joyce) Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1714908&r1=1714907&r2=1714908&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Tue Nov 17 23:43:10 2015 @@ -48,6 +48,7 @@ import org.apache.hadoop.util.StringUtil import org.apache.nutch.protocol.Content; import org.apache.nutch.util.DumpFileUtil; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.TableUtil; //Tika imports import org.apache.tika.Tika; @@ -135,7 +136,7 @@ public class FileDumper { * instead of dumping files. 
* @throws Exception */ - public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats) + public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception { if (mimeTypes == null) LOG.info("Accepting all mimetypes."); @@ -227,14 +228,50 @@ public class FileDumper { String md5Ofurl = DumpFileUtil.getUrlMD5(url); String fullDir = outputDir.getAbsolutePath(); - if (!flatDir) { + if (!flatDir && !reverseURLDump) { fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl); } if (!Strings.isNullOrEmpty(fullDir)) { - String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension)); - File outputFile = new File(outputFullPath); + String outputFullPath; + + if (reverseURLDump) { + String[] reversedURL = TableUtil.reverseUrl(url).split(":"); + reversedURL[0] = reversedURL[0].replace('.', '/'); + + // URLs with content at a folder level and nested below that + // run into problems when dumping. For example: + // + // www.foo.com/bar/ + // www.foo.com/bar/about.html + // + // One of these will fail to dump depending on processing order. + // To address this, we will use a placeholder when dumping a URL + // such as the one ending in '/bar/' + String lastDir = reversedURL[reversedURL.length - 1]; + if (! lastDir.contains(".")) { + if (lastDir.charAt(lastDir.length() - 1) != '/') { + reversedURL[reversedURL.length - 1] += '/'; + } + reversedURL[reversedURL.length - 1] += "_file"; + } + + String reversedURLPath = String.join("/", reversedURL); + outputFullPath = String.format("%s/%s", fullDir, reversedURLPath); + + // We'll drop the trailing file name and create the nested structure if it doesn't already exist. 
+ String[] splitPath = outputFullPath.split("/"); + File fullOutputDir = new File(String.join("/", Arrays.asList(splitPath).subList(0, splitPath.length - 1))); + + if (!fullOutputDir.exists()) { + fullOutputDir.mkdirs(); + } + } else { + outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension)); + } + File outputFile = new File(outputFullPath); + if (!outputFile.exists()) { LOG.info("Writing: [" + outputFullPath + "]"); @@ -328,6 +365,12 @@ public class FileDumper { .withDescription( "optionally specify that the output directory should only contain files.") .create("flatdir"); + @SuppressWarnings("static-access") + Option reverseURLOutput = OptionBuilder + .withArgName("reverseUrlDirs") + .withDescription( + "optionally specify to use reverse URL folders for output structure.") + .create("reverseUrlDirs"); // create the options Options options = new Options(); @@ -337,6 +380,7 @@ public class FileDumper { options.addOption(mimeOpt); options.addOption(mimeStat); options.addOption(dirStructureOpt); + options.addOption(reverseURLOutput); CommandLineParser parser = new GnuParser(); try { @@ -355,6 +399,9 @@ public class FileDumper { boolean shouldDisplayStats = false; if (line.hasOption("mimeStats")) shouldDisplayStats = true; + boolean reverseURLDump = false; + if (line.hasOption("reverseUrlDirs")) + reverseURLDump = true; if (!outputDir.exists()) { LOG.warn("Output directory: [" + outputDir.getAbsolutePath() @@ -367,7 +414,7 @@ public class FileDumper { } FileDumper dumper = new FileDumper(); - dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir, shouldDisplayStats); + dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir, shouldDisplayStats, reverseURLDump); } catch (Exception e) { LOG.error("FileDumper: " + StringUtils.stringifyException(e)); e.printStackTrace();