Author: ab
Date: Mon May 8 14:52:09 2006
New Revision: 405181
URL: http://svn.apache.org/viewcvs?rev=405181&view=rev
Log:
Refactor to make it easier to use these classes programmatically.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=405181&r1=405180&r2=405181&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon May 8 14:52:09 2006
@@ -25,6 +25,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
@@ -55,9 +56,28 @@
* @author Andrzej Bialecki
*
*/
-public class CrawlDbReader {
+public class CrawlDbReader implements Closeable {
public static final Logger LOG = LogFormatter.getLogger(CrawlDbReader.class.getName());
+
+ private MapFile.Reader[] readers = null;
+
+ private void openReaders(String crawlDb, Configuration config) throws IOException {
+ if (readers != null) return;
+ FileSystem fs = FileSystem.get(config);
+ readers = MapFileOutputFormat.getReaders(fs, new File(crawlDb, CrawlDatum.DB_DIR_NAME), config);
+ }
+
+ private void closeReaders() {
+ if (readers == null) return;
+ for (int i = 0; i < readers.length; i++) {
+ try {
+ readers[i].close();
+ } catch (Exception e) {
+
+ }
+ }
+ }
public static class CrawlDbStatMapper implements Mapper {
public void configure(JobConf job) {}
@@ -177,6 +197,10 @@
public void close() {}
}
+
+ public void close() {
+ closeReaders();
+ }
public void processStatJob(String crawlDb, Configuration config) throws IOException {
LOG.info("CrawlDb statistics start: " + crawlDb);
@@ -249,16 +273,20 @@
LOG.info("CrawlDb statistics: done");
}
-
- public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
- FileSystem fs = FileSystem.get(config);
+
+ public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException {
UTF8 key = new UTF8(url);
CrawlDatum val = new CrawlDatum();
- MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new File(crawlDb, CrawlDatum.DB_DIR_NAME), config);
- Writable res = MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
+ openReaders(crawlDb, config);
+ CrawlDatum res = (CrawlDatum)MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
+ return res;
+ }
+
+ public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
+ CrawlDatum res = get(crawlDb, url, config);
System.out.println("URL: " + url);
if (res != null) {
- System.out.println(val);
+ System.out.println(res);
} else {
System.out.println("not found");
}
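
With these changes CrawlDbReader can be driven from other Java code: get() opens the MapFile readers on first use and caches them for later lookups, and close() releases them. A minimal usage sketch, assuming NutchConfiguration.create() as the usual way to obtain a Configuration; the crawldb path and URL below are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDbReader;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDbLookupExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    CrawlDbReader reader = new CrawlDbReader();
    try {
      // The first call opens the MapFile readers; subsequent calls reuse them,
      // so repeated lookups against the same crawldb avoid re-opening it.
      CrawlDatum datum = reader.get("crawl/crawldb", "http://example.com/", conf);
      System.out.println(datum != null ? datum.toString() : "not found");
    } finally {
      reader.close(); // releases the cached MapFile.Reader instances
    }
  }
}
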
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=405181&r1=405180&r2=405181&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Mon May 8 14:52:09 2006
@@ -28,6 +28,7 @@
import org.apache.hadoop.util.LogFormatter;
import org.apache.hadoop.mapred.*;
+import org.apache.nutch.net.URLFilters;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -44,15 +45,27 @@
private int maxInlinks;
private boolean ignoreInternalLinks;
- public static class LinkDbMerger extends MapReduceBase implements Reducer {
+ public static class Merger extends MapReduceBase implements Reducer {
private int _maxInlinks;
+ private URLFilters filters = null;
public void configure(JobConf job) {
super.configure(job);
_maxInlinks = job.getInt("db.max.inlinks", 10000);
+ if (job.getBoolean("linkdb.merger.urlfilters", false)) {
+ filters = new URLFilters(job);
+ }
}
public void reduce(WritableComparable key, Iterator values,
OutputCollector output, Reporter reporter) throws IOException {
+ if (filters != null) {
+ try {
+ if (filters.filter(((UTF8)key).toString()) == null)
+ return;
+ } catch (Exception e) {
+ LOG.fine("Can't filter " + key + ": " + e);
+ }
+ }
Inlinks inlinks = null;
while (values.hasNext()) {
if (inlinks == null) {
@@ -65,9 +78,19 @@
output.collect(key, inlinks);
return;
}
- inlinks.add((Inlink)it.next());
+ Inlink in = (Inlink)it.next();
+ if (filters != null) {
+ try {
+ if (filters.filter(in.getFromUrl()) == null)
+ continue;
+ } catch (Exception e) {
+ LOG.fine("Can't filter " + key + ": " + e);
+ }
+ }
+ inlinks.add(in);
}
}
+ if (inlinks.size() == 0) return;
output.collect(key, inlinks);
}
}
@@ -205,7 +228,6 @@
job.setInputValueClass(ParseData.class);
job.setMapperClass(LinkDb.class);
- //job.setCombinerClass(LinkDb.class);
job.setReducerClass(LinkDb.class);
job.setOutputDir(newLinkDb);
@@ -217,7 +239,7 @@
return job;
}
- private static JobConf createMergeJob(Configuration config, File linkDb) {
+ public static JobConf createMergeJob(Configuration config, File linkDb) {
File newLinkDb =
new File("linkdb-merge-" +
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -229,7 +251,7 @@
job.setInputKeyClass(UTF8.class);
job.setInputValueClass(Inlinks.class);
- job.setReducerClass(LinkDbMerger.class);
+ job.setReducerClass(Merger.class);
job.setOutputDir(newLinkDb);
job.setOutputFormat(MapFileOutputFormat.class);
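
Since createMergeJob() is now public and the Merger reducer honors the new linkdb.merger.urlfilters switch, a merge pass can also be set up programmatically. A rough sketch under the File-based mapred API used elsewhere in LinkDb (addInputDir() and JobClient.runJob() are assumed to be available in this Hadoop version; the input directory is a placeholder):

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.util.NutchConfiguration;

public class LinkDbMergeExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    // Only when this flag is true does Merger build URLFilters and drop
    // filtered keys and inlinks while merging.
    conf.set("linkdb.merger.urlfilters", "true");

    JobConf mergeJob = LinkDb.createMergeJob(conf, new File("crawl/linkdb"));
    mergeJob.addInputDir(new File("linkdb-input")); // placeholder input to merge
    JobClient.runJob(mergeJob);
    // The merged output lands in the temporary "linkdb-merge-*" directory
    // chosen by createMergeJob(); installing it over the old linkdb is left
    // to the caller.
  }
}
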