Author: lewismc
Date: Tue Oct 7 18:21:31 2014
New Revision: 1629941
URL: http://svn.apache.org/r1629941
Log:
Joint commit for NUTCH-1868 Document and improve CLI for FileDumper tool and
NUTCH-1869 Add a -mimeType flag to FileDumper
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1629941&r1=1629940&r2=1629941&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Oct 7 18:21:31 2014
@@ -2,6 +2,10 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1868 Document and improve CLI for FileDumper tool (lewismc)
+
+* NUTCH-1869 Add a -mimeType flag to FileDumper (lewismc)
+
* NUTCH-1867 CrawlDbReader: use setFloat to pass min score (lewismc, snagel)
* NUTCH-1826, NUTCH-1864 indexchecker fails if solr.server.url not configured
(lewismc, snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1629941&r1=1629940&r2=1629941&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Tue Oct 7
18:21:31 2014
@@ -23,11 +23,17 @@ import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.ByteArrayInputStream;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
//Commons imports
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
@@ -37,160 +43,272 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
-
-//Nutch imports
-import org.apache.nutch.metadata.Metadata;
+import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
//Tika imports
import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * <p>The file dumper tool enables one to reverse generate the raw content
+ * from Nutch segment data directories. </p>
+ * <p>
+ * The tool has a number of immediate uses:
+ * <ol>
+ * <li>one can see what a page looked like at the time it was crawled</li>
+ * <li>one can see different media types acquired as part of the crawl</li>
+ * <li>it enables us to see webpages before we augment them with additional
metadata,
+ * this can be handy for providing a provenance trail for your crawl data.</li>
+ * </ol>
+ * </p>
+ * <p>Upon successful completion the tool displays a very convenient JSON
snippet
+ * detailing the mimetype classifications and the counts of documents which
+ * fall into those classifications. An example is as follows:</p>
+ * <pre>
+ * {@code
+ * INFO: Dumper File Stats:
+ * TOTAL Stats: {
+ * {"mimeType":"application/xml","count":19"}
+ * {"mimeType":"image/png","count":47"}
+ * {"mimeType":"image/jpeg","count":141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ * {"mimeType":"text/plain","count":89"}
+ * {"mimeType":"video/quicktime","count":2"}
+ * {"mimeType":"image/gif","count":63"}
+ * {"mimeType":"application/xhtml+xml","count":1670"}
+ * {"mimeType":"application/octet-stream","count":40"}
+ * {"mimeType":"text/html","count":1863"}
+ * }
+ * FILTERED Stats: {
+ * {"mimeType":"image/png","count":47"}
+ * {"mimeType":"image/jpeg","count":141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ * {"mimeType":"video/quicktime","count":2"}
+ * {"mimeType":"image/gif","count":63"}
+ * }
+ * }
+ * </pre>
+ * <p>In the case above the tool would have been run with the <b>-mimetype
+ * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
+ * option and its corresponding values supplied.</p>
+ *
+ */
public class FileDumper {
- private static final Logger LOG = Logger.getLogger(FileDumper.class
- .getName());
+ private static final Logger LOG = LoggerFactory.getLogger(FileDumper.class
+ .getName());
- public void dump(File outputDir, File segmentRootDir) throws Exception {
- Map<String, Integer> typeCounts = new HashMap<String, Integer>();
- Configuration conf = NutchConfiguration.create();
- FileSystem fs = FileSystem.get(conf);
- int fileCount = 0;
- File[] segmentDirs = segmentRootDir
- .listFiles(new FileFilter() {
-
- @Override
- public boolean accept(File file) {
- return file.canRead() && file.isDirectory();
- }
- });
-
- for (File segment : segmentDirs) {
- LOG.log(Level.INFO,
- "Processing segment: [" + segment.getAbsolutePath() + "]");
- DataOutputStream doutputStream = null;
- try {
- String segmentPath = segment.getAbsolutePath()
- + "/" + Content.DIR_NAME + "/part-00000/data";
- Path file = new Path(segmentPath);
- if (!new File(file.toString()).exists()) {
- LOG.log(Level.WARNING, "Skipping segment: [" + segmentPath
- + "]: no data directory present");
- continue;
- }
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, file,
- conf);
-
- Writable key = (Writable)reader.getKeyClass().newInstance();
- Content content = null;
-
- while (reader.next(key)) {
- content = new Content();
- reader.getCurrentValue(content);
- String url = key.toString();
- String baseName = FilenameUtils.getBaseName(url);
- String extension = FilenameUtils.getExtension(url);
- if (extension == null || (extension != null &&
- extension.equals(""))){
- extension = "html";
- }
-
- String filename = baseName + "." + extension;
-
- ByteArrayInputStream bas = null;
- try{
- bas = new ByteArrayInputStream(content.getContent());
- String mimeType = new
Tika().detect(content.getContent());
- collectStats(typeCounts, mimeType);
- }
- catch(Exception e){
- e.printStackTrace();
- LOG.log(Level.WARNING, "Unable to detect type for:
["+url+"]: Message: "+e.getMessage());
- }
- finally{
- if(bas != null){
- try{
- bas.close();
- }
- catch(Exception ignore){}
- bas = null;
- }
- }
-
- String outputFullPath = outputDir + "/" + filename;
- File outputFile = new File(outputFullPath);
- if (!outputFile.exists()) {
- LOG.log(Level.INFO, "Writing: [" + outputFullPath +
"]");
- FileOutputStream output = new
FileOutputStream(outputFile);
- IOUtils.write(content.getContent(), output);
- fileCount++;
- } else {
- LOG.log(Level.INFO, "Skipping writing: ["
- + outputFullPath + "]: file already exists");
- }
- content = null;
- }
- reader.close();
- }
- finally {
- fs.close();
- if (doutputStream != null){
- try{
- doutputStream.close();
- }
- catch (Exception ignore){}
- }
- }
+ /**
+ * Dumps the reverse engineered raw content from the provided segment
directories
+ * if a parent directory contains more than one segment, otherwise a single
segment
+ * can be passed as an argument.
+ * @param outputDir the directory you wish to dump the raw content to. This
directory will be created.
+ * @param segmentRootDir a directory containing one or more segments.
+ * @param mimeTypes an array of mime types we have to dump, all others will
be filtered out.
+ * @throws Exception
+ */
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes)
throws Exception {
+ //total file counts
+ Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+ //filtered file counts
+ Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
+ Configuration conf = NutchConfiguration.create();
+ FileSystem fs = FileSystem.get(conf);
+ int fileCount = 0;
+ File[] segmentDirs = segmentRootDir
+ .listFiles(new FileFilter() {
+
+ @Override
+ public boolean accept(File file) {
+ return file.canRead() && file.isDirectory();
+ }
+ });
+
+ for (File segment : segmentDirs) {
+ LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
+ DataOutputStream doutputStream = null;
+ try {
+ String segmentPath = segment.getAbsolutePath()
+ + "/" + Content.DIR_NAME + "/part-00000/data";
+ Path file = new Path(segmentPath);
+ if (!new File(file.toString()).exists()) {
+ LOG.warn("Skipping segment: [" + segmentPath
+ + "]: no data directory present");
+ continue;
}
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, file,
+ conf);
- LOG.log(Level.INFO, "Processed: [" + fileCount + "] files.");
- LOG.log(Level.INFO, "File Types: " + displayFileTypes(typeCounts));
+ Writable key = (Writable)reader.getKeyClass().newInstance();
+ Content content = null;
+
+ while (reader.next(key)) {
+ content = new Content();
+ reader.getCurrentValue(content);
+ String url = key.toString();
+ String baseName = FilenameUtils.getBaseName(url);
+ String extension = FilenameUtils.getExtension(url);
+ if (extension == null || (extension != null &&
+ extension.equals(""))){
+ extension = "html";
+ }
+
+ String filename = baseName + "." + extension;
+ ByteArrayInputStream bas = null;
+ Boolean filter = false;
+ try{
+ bas = new ByteArrayInputStream(content.getContent());
+ String mimeType = new Tika().detect(content.getContent());
+ collectStats(typeCounts, mimeType);
+ if (mimeType != null) {
+ if (Arrays.asList(mimeTypes).contains(mimeType)) {
+ collectStats(filteredCounts, mimeType);
+ filter = true;
+ }
+ }
+ }
+ catch(Exception e){
+ e.printStackTrace();
+ LOG.warn("Tika is unable to detect type for: ["+url+"]");
+ }
+ finally{
+ if(bas != null){
+ try{
+ bas.close();
+ }
+ catch(Exception ignore){}
+ bas = null;
+ }
+ }
+ if (filter) {
+ String outputFullPath = outputDir + "/" + filename;
+ File outputFile = new File(outputFullPath);
+ if (!outputFile.exists()) {
+ LOG.info("Writing: [" + outputFullPath + "]");
+ FileOutputStream output = new FileOutputStream(outputFile);
+ IOUtils.write(content.getContent(), output);
+ fileCount++;
+ } else {
+ LOG.info("Skipping writing: ["
+ + outputFullPath + "]: file already exists");
+ }
+ content = null;
+ }
+ }
+ reader.close();
+ }
+ finally {
+ fs.close();
+ if (doutputStream != null){
+ try{
+ doutputStream.close();
+ }
+ catch (Exception ignore){}
+ }
+ }
}
+ LOG.info("Dumper File Stats: " + displayFileTypes(typeCounts,
filteredCounts));
- public static void main(String[] args) throws Exception {
- String usage = "Usage: FileDumper <output directory> <segments dir>\n";
- if (args.length != 2) {
- System.err.println(usage);
- System.exit(1);
- }
-
- String outputDir = args[0];
- String segmentRootDir = args[1];
- File outputDirFile = new File(outputDir);
- File segmentRootDirFile = new File(segmentRootDir);
-
- if (!outputDirFile.exists()) {
- LOG.log(Level.WARNING, "Output directory: [" + outputDir
- + "]: does not exist, creating it.");
- if(!outputDirFile.mkdirs()) throw new Exception("Unable to create:
["+outputDir+"]");
- }
+ }
- FileDumper dumper = new FileDumper();
- dumper.dump(outputDirFile, segmentRootDirFile);
- }
+ /**
+ * Main method for invoking this tool
+ * @param args command line options: -outputDir, the output directory (which will be created) to host the
+ * raw data; -segment, a directory containing one or more segments; and optionally -mimetype and -help.
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception {
+ //boolean options
+ Option helpOpt = new Option("h", "help", false, "show this help message");
+ //argument options
+ @SuppressWarnings("static-access")
+ Option outputOpt = OptionBuilder.withArgName("outputDir")
+ .hasArg().withDescription("output directory (which will be created) to
host the raw data")
+ .create("outputDir");
+ @SuppressWarnings("static-access")
+ Option segOpt = OptionBuilder.withArgName("segment")
+ .hasArgs().withDescription("the segment(s) to use")
+ .create("segment");
+ @SuppressWarnings("static-access")
+ Option mimeOpt = OptionBuilder.withArgName("mimetype")
+ .hasArgs().withDescription("an optional list of mimetypes to dump,
excluding all others")
+ .create("mimetype");
+
+ //create the options
+ Options options = new Options();
+ options.addOption(helpOpt);
+ options.addOption(outputOpt);
+ options.addOption(segOpt);
+ options.addOption(mimeOpt);
+
+ CommandLineParser parser = new GnuParser();
+ try {
+ CommandLine line = parser.parse(options, args);
+ if (line.hasOption("help") || !line.hasOption("outputDir")
+ || (!line.hasOption("segment"))) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("FileDumper", options, true);
+ return;
+ }
+
+ File outputDir = new File(line.getOptionValue("outputDir"));
+ File segmentRootDir = new File(line.getOptionValue("segment"));
+ String[] mimeTypes = line.getOptionValues("mimetype");
+
+ if (!outputDir.exists()) {
+ LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
+ + "]: does not exist, creating it.");
+ if(!outputDir.mkdirs()) throw new Exception("Unable to create:
["+outputDir.getAbsolutePath()+"]");
+ }
- private void collectStats(Map<String, Integer> typeCounts, String
mimeType) {
- typeCounts.put(mimeType,
- typeCounts.containsKey(mimeType) ?
typeCounts.get(mimeType) + 1
- : 1);
+ FileDumper dumper = new FileDumper();
+ dumper.dump(outputDir, segmentRootDir, mimeTypes);
+ }
+ catch(Exception e) {
+ LOG.error("FileDumper: " + StringUtils.stringifyException(e));
+ return;
}
+ }
- private String displayFileTypes(Map<String, Integer> typeCounts) {
- StringBuilder builder = new StringBuilder();
- builder.append("{\n");
- for (String mimeType : typeCounts.keySet()) {
- builder.append("{\"mimeType\":\"");
- builder.append(mimeType);
- builder.append("\",\"count\":");
- builder.append(typeCounts.get(mimeType));
- builder.append("\"}\n");
- }
- builder.append("}\n");
- return builder.toString();
+ private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+ typeCounts.put(mimeType,
+ typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1
+ : 1);
+ }
+
+ private String displayFileTypes(Map<String, Integer> typeCounts, Map<String,
Integer> filteredCounts) {
+ StringBuilder builder = new StringBuilder();
+ //print total stats
+ builder.append("\n TOTAL Stats:\n");
+ builder.append(" {\n");
+ for (String mimeType : typeCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":");
+ builder.append(typeCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("}\n");
+ if (!filteredCounts.isEmpty()) {
+ // print dumper stats
+ builder.append("\n FILTERED Stats:\n");
+ builder.append(" {\n");
+ for (String mimeType : filteredCounts.keySet()) {
+ builder.append(" {\"mimeType\":\"");
+ builder.append(mimeType);
+ builder.append("\",\"count\":");
+ builder.append(filteredCounts.get(mimeType));
+ builder.append("\"}\n");
+ }
+ builder.append("}\n");
}
+ return builder.toString();
+ }
}