Author: jnioche
Date: Mon Mar 29 12:12:09 2010
New Revision: 928746

URL: http://svn.apache.org/viewvc?rev=928746&view=rev
Log: NUTCH-784 : CrawlDBScanner
Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
Modified:
    lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=928746&r1=928745&r2=928746&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 29 12:12:09 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-784 CrawlDBScanner (jnioche)
+
 * NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche)
 
 * NUTCH-740 Configuration option to override default language for fetched pages (Marcin Okraszewski via jnioche)

Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=928746&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Mon Mar 29 12:12:09 2010
@@ -0,0 +1,165 @@
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * Dumps all the CrawlDB entries whose URL matches a regular expression.
+ * Generates either a text representation of the CrawlDatum-s or binary
+ * objects which can then be used as a new CrawlDB. The dump mechanism of the
+ * crawldb reader is not very useful on large crawldbs, as the output can be
+ * extremely large and the -url option can't help if we don't know which
+ * URL we want to have a look at.
+ *
+ * @author Julien Nioche
+ */
+public class CrawlDBScanner extends Configured implements Tool,
+    Mapper<Text,CrawlDatum,Text,CrawlDatum>,
+    Reducer<Text,CrawlDatum,Text,CrawlDatum> {
+
+  public static final Log LOG = LogFactory.getLog(CrawlDBScanner.class);
+
+  public CrawlDBScanner() {}
+
+  public CrawlDBScanner(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void close() {}
+
+  private String regex = null;
+  private String status = null;
+
+  public void configure(JobConf job) {
+    regex = job.get("CrawlDBScanner.regex");
+    status = job.get("CrawlDBScanner.status");
+  }
+
+  public void map(Text url, CrawlDatum crawlDatum,
+      OutputCollector<Text,CrawlDatum> output, Reporter reporter)
+      throws IOException {
+
+    // skip the entry if a status filter was given and this entry's status
+    // does not match it
+    if (status != null
+        && !status.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum
+            .getStatus()))) return;
+
+    // dump the entry if its URL matches the regular expression
+    if (url.toString().matches(regex)) {
+      output.collect(url, crawlDatum);
+    }
+  }
+
+  public void reduce(Text key, Iterator<CrawlDatum> values,
+      OutputCollector<Text,CrawlDatum> output, Reporter reporter)
+      throws IOException {
+    while (values.hasNext()) {
+      CrawlDatum val = values.next();
+      output.collect(key, val);
+    }
+  }
+
+  private void scan(Path crawlDb, Path outputPath, String regex, String status,
+      boolean text) throws IOException {
+
+    JobConf job = new NutchJob(getConf());
+
+    job.setJobName("Scan : " + crawlDb + " for URLs matching : " + regex);
+
+    job.set("CrawlDBScanner.regex", regex);
+    if (status != null) job.set("CrawlDBScanner.status", status);
+
+    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(CrawlDBScanner.class);
+    job.setReducerClass(CrawlDBScanner.class);
+
+    FileOutputFormat.setOutputPath(job, outputPath);
+
+    // if we want a text dump of the entries in order to check something,
+    // better to use the text format and avoid compression
+    if (text) {
+      job.set("mapred.output.compress", "false");
+      job.setOutputFormat(TextOutputFormat.class);
+    }
+    // otherwise what we actually create is a mini-crawlDB which can then be
+    // used for debugging
+    else {
+      job.setOutputFormat(MapFileOutputFormat.class);
+    }
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(CrawlDatum.class);
+
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+
+    JobClient.runJob(job);
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDBScanner(),
+        args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 3) {
+      System.err
+          .println("Usage: CrawlDBScanner <crawldb> <output> <regex> [-s <status>] [-text]");
+      return -1;
+    }
+
+    boolean text = false;
+
+    Path dbDir = new Path(args[0]);
+    Path output = new Path(args[1]);
+
+    String status = null;
+
+    for (int i = 2; i < args.length; i++) {
+      if (args[i].equals("-text")) {
+        text = true;
+      } else if (args[i].equals("-s")) {
+        i++;
+        status = args[i];
+      }
+    }
+
+    try {
+      scan(dbDir, output, args[2], status, text);
+      return 0;
+    } catch (Exception e) {
+      LOG.fatal("CrawlDBScanner: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+}
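
Not part of the patch itself, but for illustration, a minimal sketch of how the new tool might be driven through ToolRunner instead of the command line. The crawldb path, output path, regular expression and the db_fetched status filter are made-up values for the example, not anything defined by this commit.

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.tools.CrawlDBScanner;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDBScannerExample {
  public static void main(String[] args) throws Exception {
    // Dump, as plain text, the db_fetched entries whose URL matches the
    // regular expression; equivalent to the command-line form
    //   CrawlDBScanner <crawldb> <output> <regex> [-s <status>] [-text]
    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDBScanner(),
        new String[] { "crawl/crawldb", "crawl/crawldb-dump",
            "https?://www\\.example\\.com/.*", "-s", "db_fetched", "-text" });
    System.exit(res);
  }
}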