FileDumper.java

lewismc Tue, 07 Oct 2014 11:22:35 -0700

Author: lewismc
Date: Tue Oct  7 18:21:31 2014
New Revision: 1629941

URL: http://svn.apache.org/r1629941
Log:
Joint commit for NUTCH-1868 Document and improve CLI for FileDumper tool and 
NUTCH-1869 Add a -mimeType flag to FileDumper


Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1629941&r1=1629940&r2=1629941&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct  7 18:21:31 2014
@@ -2,6 +2,10 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1868 Document and improve CLI for FileDumper tool (lewismc)
+
+* NUTCH-1869 Add a -mimeType flag to FileDumper (lewismc)
+
 * NUTCH-1867 CrawlDbReader: use setFloat to pass min score (lewismc, snagel)
 
 * NUTCH-1826, NUTCH-1864 indexchecker fails if solr.server.url not configured 
(lewismc, snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1629941&r1=1629940&r2=1629941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Tue Oct  7 
18:21:31 2014
@@ -23,11 +23,17 @@ import java.io.File;
 import java.io.FileFilter;
 import java.io.FileOutputStream;
 import java.io.ByteArrayInputStream;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.logging.Level;
-import java.util.logging.Logger;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
 //Commons imports
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.FilenameUtils;
@@ -37,160 +43,272 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
-
-//Nutch imports
-import org.apache.nutch.metadata.Metadata;
+import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 
 //Tika imports
 import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+/**
+ * <p>The file dumper tool enables one to reverse generate the raw content
+ * from Nutch segment data directories. </p>
+ * <p>
+ * The tool has a number of immediate uses:
+ * <ol>
+ * <li>one can see what a page looked like at the time it was crawled</li>
+ * <li>one can see different media types acquired as part of the crawl</li>
+ * <li>it enables us to see webpages before we augment them with additional 
metadata,
+ * this can be handy for providing a provenance trail for your crawl data.</li>
+ * </ol>
+ * </p>
+ * <p>Upon successful completion the tool displays a very convenient JSON 
snippet 
+ * detailing the mimetype classifications and the counts of documents which 
+ * fall into those classifications. An example is as follows:</p>
+ * <pre>
+ * {@code
+ * INFO: File Types: 
+ *   TOTAL Stats:    {
+ *     {"mimeType":"application/xml","count":19"}
+ *     {"mimeType":"image/png","count":47"}
+ *     {"mimeType":"image/jpeg","count":141"}
+ *     {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ *     {"mimeType":"text/plain","count":89"}
+ *     {"mimeType":"video/quicktime","count":2"}
+ *     {"mimeType":"image/gif","count":63"}
+ *     {"mimeType":"application/xhtml+xml","count":1670"}
+ *     {"mimeType":"application/octet-stream","count":40"}
+ *     {"mimeType":"text/html","count":1863"}
+ *   }
+ *   FILTERED Stats:    {
+ *     {"mimeType":"image/png","count":47"}
+ *     {"mimeType":"image/jpeg","count":141"}
+ *     {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ *     {"mimeType":"video/quicktime","count":2"}
+ *     {"mimeType":"image/gif","count":63"}
+ *   }
+ * }
+ * </pre>
+ * <p>In the case above the tool would have been run with the <b>-mimetype 
+ * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b> 
+ * flag and corresponding values activated.</p>
+ *
+ */
 public class FileDumper {
 
-    private static final Logger LOG = Logger.getLogger(FileDumper.class
-                                                      .getName());
+  private static final Logger LOG = LoggerFactory.getLogger(FileDumper.class
+      .getName());
 
 
-    public void dump(File outputDir, File segmentRootDir) throws Exception {
-        Map<String, Integer> typeCounts = new HashMap<String, Integer>();
-        Configuration conf = NutchConfiguration.create();
-        FileSystem fs = FileSystem.get(conf);
-        int fileCount = 0;
-        File[] segmentDirs = segmentRootDir
-            .listFiles(new FileFilter() {
-
-                    @Override
-                        public boolean accept(File file) {
-                        return file.canRead() && file.isDirectory();
-                    }
-                });
-
-        for (File segment : segmentDirs) {
-            LOG.log(Level.INFO,
-                    "Processing segment: [" + segment.getAbsolutePath() + "]");
-            DataOutputStream doutputStream = null;
-            try {
-                String segmentPath = segment.getAbsolutePath()
-                    + "/" + Content.DIR_NAME + "/part-00000/data";
-                Path file = new Path(segmentPath);
-                if (!new File(file.toString()).exists()) {
-                    LOG.log(Level.WARNING, "Skipping segment: [" + segmentPath
-                            + "]: no data directory present");
-                    continue;
-                }
-                SequenceFile.Reader reader = new SequenceFile.Reader(fs, file,
-                                                                     conf);
-
-                Writable key = (Writable)reader.getKeyClass().newInstance();
-                Content content = null;
-
-                while (reader.next(key)) {
-                   content = new Content();
-                   reader.getCurrentValue(content);
-                    String url = key.toString();
-                   String baseName = FilenameUtils.getBaseName(url);
-                   String extension = FilenameUtils.getExtension(url);
-                   if (extension == null || (extension != null && 
-                                             extension.equals(""))){
-                           extension = "html";
-                   }
-
-                   String filename = baseName + "." + extension;
-
-                   ByteArrayInputStream bas = null;
-                   try{
-                       bas = new ByteArrayInputStream(content.getContent());
-                       String mimeType = new 
Tika().detect(content.getContent());
-                        collectStats(typeCounts, mimeType);
-                   }
-                   catch(Exception e){
-                       e.printStackTrace();
-                       LOG.log(Level.WARNING, "Unable to detect type for: 
["+url+"]: Message: "+e.getMessage());
-                   }
-                   finally{
-                       if(bas != null){
-                           try{
-                               bas.close();
-                           }
-                           catch(Exception ignore){}
-                           bas = null;
-                       }
-                   }
-
-                    String outputFullPath = outputDir + "/" + filename;
-                    File outputFile = new File(outputFullPath);
-                    if (!outputFile.exists()) {
-                        LOG.log(Level.INFO, "Writing: [" + outputFullPath + 
"]");
-                       FileOutputStream output = new 
FileOutputStream(outputFile);
-                       IOUtils.write(content.getContent(), output);
-                        fileCount++;
-                    } else {
-                        LOG.log(Level.INFO, "Skipping writing: ["
-                                + outputFullPath + "]: file already exists");
-                    }
-                    content = null;
-                }
-                reader.close();
-            }
-             finally {
-                fs.close();
-                if (doutputStream != null){
-                   try{
-                       doutputStream.close();
-                   }
-                   catch (Exception ignore){}
-               }
-            }
+  /**
+   * Dumps the reverse engineered raw content from the provided segment 
directories
+   * if a parent directory contains more than one segment, otherwise a single 
segment
+   * can be passed as an argument. 
+   * @param outputDir the directory you wish to dump the raw content to. This 
directory will be created.
+   * @param segmentRootDir a directory containing one or more segments.
+   * @param mimeTypes an array of mime types we have to dump, all others will 
be filtered out.
+   * @throws Exception
+   */
+  public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) 
throws Exception {
+    //total file counts
+    Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+    //filtered file counts
+    Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
+    Configuration conf = NutchConfiguration.create();
+    FileSystem fs = FileSystem.get(conf);
+    int fileCount = 0;
+    File[] segmentDirs = segmentRootDir
+        .listFiles(new FileFilter() {
+
+          @Override
+          public boolean accept(File file) {
+            return file.canRead() && file.isDirectory();
+          }
+        });
+
+    for (File segment : segmentDirs) {
+      LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
+      DataOutputStream doutputStream = null;
+      try {
+        String segmentPath = segment.getAbsolutePath()
+            + "/" + Content.DIR_NAME + "/part-00000/data";
+        Path file = new Path(segmentPath);
+        if (!new File(file.toString()).exists()) {
+          LOG.warn("Skipping segment: [" + segmentPath
+              + "]: no data directory present");
+          continue;
         }
+        SequenceFile.Reader reader = new SequenceFile.Reader(fs, file,
+            conf);
 
-        LOG.log(Level.INFO, "Processed: [" + fileCount + "] files.");
-        LOG.log(Level.INFO, "File Types: " + displayFileTypes(typeCounts));
+        Writable key = (Writable)reader.getKeyClass().newInstance();
+        Content content = null;
+
+        while (reader.next(key)) {
+          content = new Content();
+          reader.getCurrentValue(content);
+          String url = key.toString();
+          String baseName = FilenameUtils.getBaseName(url);
+          String extension = FilenameUtils.getExtension(url);
+          if (extension == null || (extension != null && 
+              extension.equals(""))){
+            extension = "html";
+          }
+
+          String filename = baseName + "." + extension;
+          ByteArrayInputStream bas = null;
+          Boolean filter = false;
+          try{
+            bas = new ByteArrayInputStream(content.getContent());
+            String mimeType = new Tika().detect(content.getContent());
+            collectStats(typeCounts, mimeType);
+            if (mimeType != null) {
+              if (Arrays.asList(mimeTypes).contains(mimeType)) {
+                collectStats(filteredCounts, mimeType);
+                filter = true;
+              }
+            }
+          }
+          catch(Exception e){
+            e.printStackTrace();
+            LOG.warn("Tika is unable to detect type for: ["+url+"]");
+          }
+          finally{
+            if(bas != null){
+              try{
+                bas.close();
+              }
+              catch(Exception ignore){}
+              bas = null;
+            }
+          }
 
+          if (filter) {
+            String outputFullPath = outputDir + "/" + filename;
+            File outputFile = new File(outputFullPath);
+            if (!outputFile.exists()) {
+              LOG.info("Writing: [" + outputFullPath + "]");
+              FileOutputStream output = new FileOutputStream(outputFile);
+              IOUtils.write(content.getContent(), output);
+              fileCount++;
+            } else {
+              LOG.info("Skipping writing: ["
+                  + outputFullPath + "]: file already exists");
+            }
+            content = null;
+          }
+        }
+        reader.close();
+      }
+      finally {
+        fs.close();
+        if (doutputStream != null){
+          try{
+            doutputStream.close();
+          }
+          catch (Exception ignore){}
+        }
+      }
     }
+    LOG.info("Dumper File Stats: " + displayFileTypes(typeCounts, 
filteredCounts));
 
-    public static void main(String[] args) throws Exception {
-       String usage = "Usage: FileDumper <output directory> <segments dir>\n";
-       if (args.length != 2) {
-           System.err.println(usage);
-           System.exit(1);
-       }
-
-        String outputDir = args[0];
-        String segmentRootDir = args[1];
-       File outputDirFile = new File(outputDir);
-       File segmentRootDirFile = new File(segmentRootDir);
-
-       if (!outputDirFile.exists()) {
-           LOG.log(Level.WARNING, "Output directory: [" + outputDir
-                   + "]: does not exist, creating it.");
-           if(!outputDirFile.mkdirs()) throw new Exception("Unable to create: 
["+outputDir+"]");
-       }
+  }
 
-       FileDumper dumper = new FileDumper();
-       dumper.dump(outputDirFile, segmentRootDirFile);
-    }
+  /**
+   * Main method for invoking this tool
+   * @param args command-line options: <b>-outputDir</b> (output directory, created if missing),
+   * <b>-segment</b> (directory containing one or more segments), optional <b>-mimetype</b> list, <b>-h</b>.
+   * @throws Exception
+   */
+  public static void main(String[] args) throws Exception {
+    //boolean options
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    //argument options
+    @SuppressWarnings("static-access")
+    Option outputOpt = OptionBuilder.withArgName("outputDir")
+    .hasArg().withDescription("output directory (which will be created) to 
host the raw data")
+    .create("outputDir");
+    @SuppressWarnings("static-access")
+    Option segOpt = OptionBuilder.withArgName("segment")
+    .hasArgs().withDescription("the segment(s) to use")
+    .create("segment");
+    @SuppressWarnings("static-access")
+    Option mimeOpt = OptionBuilder.withArgName("mimetype")
+    .hasArgs().withDescription("an optional list of mimetypes to dump, 
excluding all others")
+    .create("mimetype");
+
+    //create the options
+    Options options = new Options();
+    options.addOption(helpOpt);
+    options.addOption(outputOpt);
+    options.addOption(segOpt);
+    options.addOption(mimeOpt);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+      CommandLine line = parser.parse(options, args);
+      if (line.hasOption("help") || !line.hasOption("outputDir")
+          || (!line.hasOption("segment"))) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("FileDumper", options, true);
+        return;
+      }
+
+      File outputDir = new File(line.getOptionValue("outputDir"));
+      File segmentRootDir = new File(line.getOptionValue("segment"));
+      String[] mimeTypes = line.getOptionValues("mimetype");
+
+      if (!outputDir.exists()) {
+        LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
+            + "]: does not exist, creating it.");
+        if(!outputDir.mkdirs()) throw new Exception("Unable to create: 
["+outputDir.getAbsolutePath()+"]");
+      }
 
-    private void collectStats(Map<String, Integer> typeCounts, String 
mimeType) {
-       typeCounts.put(mimeType,
-                      typeCounts.containsKey(mimeType) ? 
typeCounts.get(mimeType) + 1
-                      : 1);
+      FileDumper dumper = new FileDumper();
+      dumper.dump(outputDir, segmentRootDir, mimeTypes);
+    }
+    catch(Exception e) {
+      LOG.error("FileDumper: " + StringUtils.stringifyException(e));
+      return;
     }
+  }
 
-    private String displayFileTypes(Map<String, Integer> typeCounts) {
-       StringBuilder  builder = new StringBuilder();
-       builder.append("{\n");
-       for (String mimeType : typeCounts.keySet()) {
-           builder.append("{\"mimeType\":\"");
-           builder.append(mimeType);
-           builder.append("\",\"count\":");
-           builder.append(typeCounts.get(mimeType));
-           builder.append("\"}\n");
-       }
-       builder.append("}\n");
-       return builder.toString();
+  private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+    typeCounts.put(mimeType,
+        typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1
+            : 1);
+  }
+
+  private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, 
Integer> filteredCounts) {
+    StringBuilder  builder = new StringBuilder();
+    //print total stats
+    builder.append("\n  TOTAL Stats:\n");
+    builder.append("                {\n");
+    for (String mimeType : typeCounts.keySet()) {
+      builder.append("    {\"mimeType\":\"");
+      builder.append(mimeType);
+      builder.append("\",\"count\":");
+      builder.append(typeCounts.get(mimeType));
+      builder.append("\"}\n");
+    }
+    builder.append("}\n");
+    if (!filteredCounts.isEmpty()) {
+      // print dumper stats
+      builder.append("\n  FILTERED Stats:\n");
+      builder.append("                {\n");
+      for (String mimeType : filteredCounts.keySet()) {
+        builder.append("    {\"mimeType\":\"");
+        builder.append(mimeType);
+        builder.append("\",\"count\":");
+        builder.append(filteredCounts.get(mimeType));
+        builder.append("\"}\n");
+      }
+      builder.append("}\n");
     }
+    return builder.toString();
+  }
 
 }

svn commit: r1629941 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/FileDumper.java

Reply via email to