Knowing which new URLs are being added to the database is more important
than knowing which ones are being fetched.

When changing the regex-urlfilter or regex-normalize files, this gives
instantaneous feedback, whereas the fetcher may still be retrieving old junk
for some time after the edits take place.
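
For example, with an edit along these lines to regex-urlfilter.txt (the
patterns below are just an illustration, not a suggested config), the
new-entry log lines reflect the change on the very next updatedb, long
before the old fetch lists have been worked off:

  # skip image and stylesheet resources
  -\.(gif|jpg|png|css)$
  # accept everything else
  +.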

-- 
Rod Taylor <[EMAIL PROTECTED]>
*** ./src/java/org/apache/nutch/crawl/CrawlDbReducer.java.orig	2005-11-16 13:53:07.000000000 -0500
--- ./src/java/org/apache/nutch/crawl/CrawlDbReducer.java	2005-11-16 15:16:58.000000000 -0500
***************
*** 18,23 ****
--- 18,24 ----
  
  import java.net.URL;
  import java.util.Iterator;
+ import java.util.logging.*;
  import java.io.IOException;
  
  import org.apache.nutch.io.*;
***************
*** 28,33 ****
--- 29,37 ----
  public class CrawlDbReducer implements Reducer {
    private int retryMax;
  
+   public static final Logger LOG =
+     LogFormatter.getLogger("org.apache.nutch.crawl.CrawlDb");
+ 
    public void configure(JobConf job) {
      retryMax = job.getInt("db.fetch.retry.max", 3);
    }
***************
*** 72,77 ****
--- 76,82 ----
        if (old != null) {                          // if old exists
          result = old;                             // use it
        } else {
+         LOG.info("New Entry: " + key);
          result = highest;                         // use new entry
          result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          result.setScore(1.0f);                    // initial score is 1.0f
