Author: ab
Date: Wed Mar 8 06:10:12 2006
New Revision: 384219

URL: http://svn.apache.org/viewcvs?rev=384219&view=rev
Log:
Don't generate URLs that don't pass URLFilters.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=384219&r1=384218&r2=384219&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Mar 8 06:10:12 2006
@@ -28,6 +28,8 @@
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.mapred.lib.*;
 
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
@@ -45,11 +47,13 @@
     private HashMap hostCounts = new HashMap();
     private int maxPerHost;
     private Partitioner hostPartitioner = new PartitionUrlByHost();
+    private URLFilters filters;
 
     public void configure(JobConf job) {
       curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
       limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
       maxPerHost = job.getInt("generate.max.per.host", -1);
+      filters = new URLFilters(job);
     }
 
     public void close() {}
@@ -58,6 +62,14 @@
     public void map(WritableComparable key, Writable value,
                     OutputCollector output, Reporter reporter)
       throws IOException {
+      UTF8 url = (UTF8)key;
+      // don't generate URLs that don't pass URLFilters
+      try {
+        if (filters.filter(url.toString()) == null)
+          return;
+      } catch (URLFilterException e) {
+        LOG.warning("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+      }
       CrawlDatum crawlDatum = (CrawlDatum)value;
 
       if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)