Author: ab
Date: Fri Mar 14 07:33:53 2008
New Revision: 637114

URL: http://svn.apache.org/viewvc?rev=637114&view=rev
Log:
NUTCH-612 URL filtering was disabled when invoking Generator from Crawl.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637114&r1=637113&r2=637114&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Mar 14 07:33:53 2008
@@ -220,6 +220,9 @@
 
 79. NUTCH-613 - Empty summaries and cached pages (kubes via ab)
 
+80. NUTCH-612 - URL filtering was disabled in Generator when invoked
+    from Crawl (Susam Pal via ab)
+
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=637114&r1=637113&r2=637114&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 14 07:33:53 2008
@@ -117,7 +117,7 @@
     int i;
     for (i = 0; i < depth; i++) {             // generate new segment
       Path segment = generator.generate(crawlDb, segments, -1, topN, System
-          .currentTimeMillis(), false, false);
+          .currentTimeMillis());
       if (segment == null) {
         LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
         break;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=637114&r1=637113&r2=637114&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Mar 14 07:33:53 2008
@@ -371,11 +371,28 @@
     setConf(conf);
   }
   
-  /** Generate fetchlists in a segment. */
-  public Path generate(Path dbDir, Path segments)
-    throws IOException {
-    return generate(dbDir, segments, -1, Long.MAX_VALUE, System
-        .currentTimeMillis(), true, false);
+  /**
+   * Generate fetchlists in a segment. Whether to filter URLs or not is
+   * read from the crawl.generate.filter property in the configuration
+   * files. If the property is not found, the URLs are filtered.
+   *
+   * @param dbDir     Crawl database directory
+   * @param segments  Segments directory
+   * @param numLists  Number of reduce tasks
+   * @param topN      Number of top URLs to be selected
+   * @param curTime   Current time in milliseconds
+   *
+   * @return Path to generated segment or null if no entries were
+   *         selected
+   *
+   * @throws IOException When an I/O error occurs
+   */
+  public Path generate(Path dbDir, Path segments, int numLists,
+                       long topN, long curTime) throws IOException {
+
+    JobConf job = new NutchJob(getConf());
+    boolean filter = job.getBoolean(CRAWL_GENERATE_FILTER, true);
+    return generate(dbDir, segments, numLists, topN, curTime, filter, false);
   }
 
   /**


Reply via email to