Author: jnioche
Date: Mon Mar 29 12:12:09 2010
New Revision: 928746
URL: http://svn.apache.org/viewvc?rev=928746&view=rev
Log:
NUTCH-784 : CrawlDBScanner
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
Modified:
lucene/nutch/trunk/CHANGES.txt
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=928746&r1=928745&r2=928746&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 29 12:12:09 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Unreleased Changes
+* NUTCH-784 CrawlDBScanner (jnioche)
+
* NUTCH-762 Generator can generate several segments in one parse of the
crawlDB (jnioche)
* NUTCH-740 Configuration option to override default language for fetched
pages (Marcin Okraszewski via jnioche)
Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java?rev=928746&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/CrawlDBScanner.java Mon Mar 29 12:12:09 2010
@@ -0,0 +1,165 @@
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * Dumps all the entries whose URL matches a given regular expression. Generates
+ * either a text representation of the CrawlDatums or binary objects which can
+ * then be used as a new CrawlDB. The dump mechanism of the crawldb reader is
+ * not very useful on large crawldbs, as the output can be extremely large and
+ * the -url option does not help if we don't know which URL we want to look at.
+ *
+ * @author Julien Nioche
+ */
+
+public class CrawlDBScanner extends Configured implements Tool,
+ Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> {
+
+ public static final Log LOG = LogFactory.getLog(CrawlDBScanner.class);
+
+ public CrawlDBScanner() {}
+
+ public CrawlDBScanner(Configuration conf) {
+ setConf(conf);
+ }
+
+ public void close() {}
+
+ private String regex = null;
+ private String status = null;
+
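+ /** Reads the URL regex and the optional status filter from the job configuration. */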
+ public void configure(JobConf job) {
+ regex = job.get("CrawlDBScanner.regex");
+ status = job.get("CrawlDBScanner.status");
+ }
+
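+ /**
+ * Collects only the entries whose status matches the optional status filter
+ * and whose URL matches the regular expression; everything else is skipped.
+ */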
+ public void map(Text url, CrawlDatum crawlDatum,
+ OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {
+
+ // check status
+ if (status != null
+ && !status.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return;
+
+ // if the URL matches the regexp, dump it
+ if (url.toString().matches(regex)) {
+ output.collect(url, crawlDatum);
+ }
+ }
+
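+ /** Identity reducer: passes every matching CrawlDatum through unchanged. */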
+ public void reduce(Text key, Iterator<CrawlDatum> values,
+ OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {
+ while (values.hasNext()) {
+ CrawlDatum val = values.next();
+ output.collect(key, val);
+ }
+ }
+
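+ /**
+ * Configures and runs the scan job on the crawlDb.
+ *
+ * @param crawlDb path of the crawlDb to scan
+ * @param outputPath directory where the matching entries are written
+ * @param regex regular expression that the URLs must match
+ * @param status optional CrawlDatum status name to filter on, or null
+ * @param text whether to dump as plain text instead of a binary mini-crawlDB
+ */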
+ private void scan(Path crawlDb, Path outputPath, String regex, String status,
+ boolean text) throws IOException {
+
+ JobConf job = new NutchJob(getConf());
+
+ job.setJobName("Scan : " + crawlDb + " for URLS matching : " + regex);
+
+ job.set("CrawlDBScanner.regex", regex);
+ if (status != null) job.set("CrawlDBScanner.status", status);
+
+ FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+ job.setInputFormat(SequenceFileInputFormat.class);
+
+ job.setMapperClass(CrawlDBScanner.class);
+ job.setReducerClass(CrawlDBScanner.class);
+
+ FileOutputFormat.setOutputPath(job, outputPath);
+
+ // if we want a text dump of the entries in order to check something,
+ // it is better to use the text format and avoid compression
+ if (text) {
+ job.set("mapred.output.compress", "false");
+ job.setOutputFormat(TextOutputFormat.class);
+ }
+ // otherwise what we actually create is a mini-crawlDB which can then be
+ // used for debugging
+ else {
+ job.setOutputFormat(MapFileOutputFormat.class);
+ }
+
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(CrawlDatum.class);
+
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+
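+ // submit the job and wait for it to complete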
+ JobClient.runJob(job);
+ }
+
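+ /** Command-line entry point; see {@link #run(String[])} for the expected arguments. */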
+ public static void main(String args[]) throws Exception {
+ int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDBScanner(), args);
+ System.exit(res);
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length < 3) {
+ System.err
+ .println("Usage: CrawlDBScanner <crawldb> <output> <regex> [-s <status>] [-text]");
+ return -1;
+ }
+
+ boolean text = false;
+
+ Path dbDir = new Path(args[0]);
+ Path output = new Path(args[1]);
+
+ String status = null;
+
+ // args[2] is the regex itself; option flags start at index 3
+ for (int i = 3; i < args.length; i++) {
+ if (args[i].equals("-text")) {
+ text = true;
+ } else if (args[i].equals("-s")) {
+ i++;
+ status = args[i];
+ }
+ }
+
+ try {
+ scan(dbDir, output, args[2], status, text);
+ return 0;
+ } catch (Exception e) {
+ LOG.fatal("CrawlDBScanner: " + StringUtils.stringifyException(e));
+ return -1;
+ }
+ }
+
+}
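
For reference, a minimal sketch of invoking the tool programmatically through ToolRunner, equivalent to running "bin/nutch org.apache.nutch.tools.CrawlDBScanner" from the command line; the crawldb path, output directory, regex and status value below are illustrative placeholders, not values taken from this commit:

  // hypothetical example: dump db_fetched entries whose URL contains "apache.org" as text
  Configuration conf = NutchConfiguration.create();
  int res = ToolRunner.run(conf, new CrawlDBScanner(), new String[] {
      "crawl/crawldb", "scan-out", ".*apache\\.org.*", "-s", "db_fetched", "-text" });
  System.exit(res);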