Author: ab
Date: Fri Mar 14 07:33:53 2008
New Revision: 637114

URL: http://svn.apache.org/viewvc?rev=637114&view=rev
Log:
NUTCH-612 URL filtering was disabled when invoking Generator from Crawl.

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637114&r1=637113&r2=637114&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Fri Mar 14 07:33:53 2008 @@ -220,6 +220,9 @@ 79. NUTCH-613 - Empty summaries and cached pages (kubes via ab) +80. NUTCH-612 - URL filtering was disabled in Generator when invoked + from Crawl (Susam Pal via ab) + Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=637114&r1=637113&r2=637114&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 14 07:33:53 2008 @@ -117,7 +117,7 @@ int i; for (i = 0; i < depth; i++) { // generate new segment Path segment = generator.generate(crawlDb, segments, -1, topN, System - .currentTimeMillis(), false, false); + .currentTimeMillis()); if (segment == null) { LOG.info("Stopping at depth=" + i + " - no more URLs to fetch."); break; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=637114&r1=637113&r2=637114&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Mar 14 07:33:53 2008 @@ -371,11 +371,28 @@ setConf(conf); } - /** 
Generate fetchlists in a segment. */ - public Path generate(Path dbDir, Path segments) - throws IOException { - return generate(dbDir, segments, -1, Long.MAX_VALUE, System - .currentTimeMillis(), true, false); + /** + * Generate fetchlists in a segment. Whether to filter URLs or not is + * read from the crawl.generate.filter property in the configuration + * files. If the property is not found, the URLs are filtered. + * + * @param dbDir Crawl database directory + * @param segments Segments directory + * @param numLists Number of reduce tasks + * @param topN Number of top URLs to be selected + * @param curTime Current time in milliseconds + * + * @return Path to generated segment or null if no entries were + * selected + * + * @throws IOException When an I/O error occurs + */ + public Path generate(Path dbDir, Path segments, int numLists, + long topN, long curTime) throws IOException { + + JobConf job = new NutchJob(getConf()); + boolean filter = job.getBoolean(CRAWL_GENERATE_FILTER, true); + return generate(dbDir, segments, numLists, topN, curTime, filter, false); } /**