Author: ab
Date: Wed Mar 8 06:10:12 2006
New Revision: 384219
URL: http://svn.apache.org/viewcvs?rev=384219&view=rev
Log:
Don't generate URLs that don't pass URLFilters.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=384219&r1=384218&r2=384219&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Mar 8 06:10:12 2006
@@ -28,6 +28,8 @@
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.*;
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.net.URLFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -45,11 +47,13 @@
private HashMap hostCounts = new HashMap();
private int maxPerHost;
private Partitioner hostPartitioner = new PartitionUrlByHost();
+ private URLFilters filters;
public void configure(JobConf job) {
curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
maxPerHost = job.getInt("generate.max.per.host", -1);
+ filters = new URLFilters(job);
}
public void close() {}
@@ -58,6 +62,14 @@
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter)
throws IOException {
+ UTF8 url = (UTF8)key;
+ // don't generate URLs that don't pass URLFilters
+ try {
+ if (filters.filter(url.toString()) == null)
+ return;
+ } catch (URLFilterException e) {
+ LOG.warning("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+ }
CrawlDatum crawlDatum = (CrawlDatum)value;
if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)