Author: mattmann
Date: Sat May  9 19:52:33 2015
New Revision: 1678520

URL: http://svn.apache.org/r1678520
Log:
Commit patch for NUTCH-1988 Add support for user-defined file extension to 
CommonCrawlDataDumper contributed by Giuseppe Totaro.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678520&r1=1678519&r2=1678520&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat May  9 19:52:33 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-1988 Add support for user-defined file extension to CommonCrawlDataDumper (totaro via mattmann)
+
 * NUTCH-1873 Solr IndexWriter/Job to report number of docs indexed. (snagel 
via lewismc)
  
 * NUTCH-1934 Refactor Fetcher in trunk (lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1678520&r1=1678519&r2=1678520&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat 
May  9 19:52:33 2015
@@ -245,6 +245,11 @@ public class CommonCrawlDataDumper {
                                .hasArg(false)
                                .withDescription("an optional format for key 
value in JSON output.")
                                .create("reverseKey");
+               Option extensionOpt = OptionBuilder
+                               .withArgName("extension")
+                               .hasArg(true)
+                               .withDescription("an optional file extension 
for output documents.")
+                               .create("extension");
 
                // create the options
                Options options = new Options();
@@ -261,6 +266,7 @@ public class CommonCrawlDataDumper {
                options.addOption(epochFilenameOpt);
                options.addOption(jsonArrayOpt);
                options.addOption(reverseKeyOpt);
+               options.addOption(extensionOpt);
 
                CommandLineParser parser = new GnuParser();
                try {
@@ -281,6 +287,7 @@ public class CommonCrawlDataDumper {
                        boolean simpleDateFormat = 
line.hasOption("SimpleDateFormat");
                        boolean jsonArray = line.hasOption("jsonArray");
                        boolean reverseKey = line.hasOption("reverseKey");
+                       String extension = line.getOptionValue("extension", "");
                        
                        CommonCrawlConfig config = new CommonCrawlConfig();
                        config.setKeyPrefix(keyPrefix);
@@ -296,7 +303,7 @@ public class CommonCrawlDataDumper {
 
                        CommonCrawlDataDumper dumper = new 
CommonCrawlDataDumper(config);
                        
-                       dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, 
epochFilename);
+                       dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, 
epochFilename, extension);
                        
                } catch (Exception e) {
                        LOG.error(CommonCrawlDataDumper.class.getName() + ": " 
+ StringUtils.stringifyException(e));
@@ -329,9 +336,13 @@ public class CommonCrawlDataDumper {
         * @param mimetypes
         *            an array of mime types we have to dump, all others will be
      *            filtered out.
-        * @throws Exception
+     * @param epochFilename
+     *            if {@code true}, output files will be named using the epoch time (in milliseconds).
+     * @param extension
+     *            a file extension to use with output documents.
+        * @throws Exception if any exception occurs.
         */
-       public void dump(File outputDir, File segmentRootDir, boolean gzip,     
String[] mimeTypes, boolean epochFilename) throws Exception {
+       public void dump(File outputDir, File segmentRootDir, boolean gzip,     
String[] mimeTypes, boolean epochFilename, String extension) throws Exception {
                if (gzip) {
                        LOG.info("Gzipping CBOR data has been skipped");
                }
@@ -385,10 +396,13 @@ public class CommonCrawlDataDumper {
                                        Metadata metadata = 
content.getMetadata();
                                        String url = key.toString();
                                        String baseName = 
FilenameUtils.getBaseName(url);
-                                       String extension = 
FilenameUtils.getExtension(url);
+                                       String extensionName = 
FilenameUtils.getExtension(url);
                                        
-                                       if ((extension == null) || 
extension.isEmpty()) {
-                                               extension = "html";
+                                       if (!extension.isEmpty()) {
+                                               extensionName = extension;
+                                       }
+                                       else if ((extensionName == null) || 
extensionName.isEmpty()) {
+                                               extensionName = "html";
                                        }
                                        
                                        String outputFullPath = null;
@@ -410,14 +424,14 @@ public class CommonCrawlDataDumper {
                                        }       
                                        
                                        if (epochFilename) {
-                                               outputFullPath = 
DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, 
url, timestamp, extension, !gzip);
+                                               outputFullPath = 
DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(), reverseKey, 
url, timestamp, extensionName, !gzip);
                                                outputRelativePath = 
outputFullPath.substring(0, outputFullPath.lastIndexOf(File.separator)-1);
-                                               filename = 
content.getMetadata().get(Metadata.DATE) + "." + extension;
+                                               filename = 
content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                                        }
                                        else {
                                                String md5Ofurl = 
DumpFileUtil.getUrlMD5(url);
                                                String fullDir = 
DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, 
!gzip);
-                                               filename = 
DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
+                                               filename = 
DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                                                outputFullPath = 
String.format("%s/%s", fullDir, filename);
        
                                                String [] fullPathLevels = 
fullDir.split(File.separator);


Reply via email to