Being able to see which new URLs are being added to the database is more useful
than seeing which ones are being fetched. When you change the regex-urlfilter or
regex-normalize files, this gives immediate feedback, whereas the URLs being
fetched may still include old junk for some time after the edits take place.
--
Rod Taylor <[EMAIL PROTECTED]>
*** ./src/java/org/apache/nutch/crawl/CrawlDbReducer.java.orig 2005-11-16 13:53:07.000000000 -0500
--- ./src/java/org/apache/nutch/crawl/CrawlDbReducer.java 2005-11-16 15:16:58.000000000 -0500
***************
*** 18,23 ****
--- 18,24 ----
import java.net.URL;
import java.util.Iterator;
+ import java.util.logging.*;
import java.io.IOException;
import org.apache.nutch.io.*;
***************
*** 28,33 ****
--- 29,37 ----
public class CrawlDbReducer implements Reducer {
private int retryMax;
+ public static final Logger LOG =
+ LogFormatter.getLogger("org.apache.nutch.crawl.CrawlDb");
+
public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
}
***************
*** 72,77 ****
--- 76,82 ----
if (old != null) { // if old exists
result = old; // use it
} else {
+ LOG.info("New Entry: "+ key);
result = highest; // use new entry
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
result.setScore(1.0f); // initial score is 1.0f
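
For anyone reading the patch out of context, here is a minimal, self-contained
sketch of the idea it adds, assuming the same "keep the old entry if it exists,
otherwise log the key as new" merge logic shown in the hunk above. It uses the
plain java.util.logging.Logger.getLogger call rather than Nutch's LogFormatter
helper, and the class and method names (NewEntryLogDemo, merge) are made up for
illustration only:

import java.util.logging.Logger;

// Sketch of the pattern the patch adds to CrawlDbReducer: when no existing
// CrawlDb record is found for a key, log the key as a new entry.
public class NewEntryLogDemo {
    private static final Logger LOG =
        Logger.getLogger("org.apache.nutch.crawl.CrawlDb");

    // "key" stands in for the URL, "old" for the existing CrawlDb record,
    // "highest" for the best newly fetched/injected record.
    static String merge(String key, String old, String highest) {
        if (old != null) {                      // URL already in the database: keep it
            return old;
        } else {
            LOG.info("New Entry: " + key);      // URL seen for the first time
            return highest;
        }
    }

    public static void main(String[] args) {
        merge("http://example.com/new-page", null, "fetched-datum");            // logs "New Entry"
        merge("http://example.com/known-page", "existing-datum", "fetched-datum"); // silent
    }
}

Running it prints a single "New Entry: http://example.com/new-page" line for the
unknown URL and nothing for the known one, which is the signal you would watch
in the update logs while tuning the regex-urlfilter / regex-normalize rules.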