Author: ab
Date: Mon Apr 3 06:35:34 2006
New Revision: 391044
URL: http://svn.apache.org/viewcvs?rev=391044view=rev
Log:
Make sure we use new values for score, metadata, fetch interval
and fetch time.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL:
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=391044r1=391043r2=391044view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon
Apr 3 06:35:34 2006
@@ -25,6 +25,7 @@
/** Merge new page entries with existing entries. */
public class CrawlDbReducer implements Reducer {
private int retryMax;
+ private CrawlDatum result = new CrawlDatum();
public void configure(JobConf job) {
retryMax = job.getInt(db.fetch.retry.max, 3);
@@ -61,36 +62,45 @@
}
}
-CrawlDatum result = null;
+// initialize with the latest version
+result.set(highest);
+if (old != null) {
+ // copy metadata from old, if exists
+ if (old.getMetaData() != null) {
+result.getMetaData().putAll(old.getMetaData());
+// overlay with new, if any
+if (highest.getMetaData() != null)
+ result.getMetaData().putAll(highest.getMetaData());
+ }
+ // set the most recent valid value of modifiedTime
+ if (old.getModifiedTime() 0 highest.getModifiedTime() == 0) {
+result.setModifiedTime(old.getModifiedTime());
+ }
+}
switch (highest.getStatus()) {// determine new status
case CrawlDatum.STATUS_DB_UNFETCHED: // no new entry
case CrawlDatum.STATUS_DB_FETCHED:
case CrawlDatum.STATUS_DB_GONE:
- result = old; // use old
+ result.set(old);// use old
break;
case CrawlDatum.STATUS_LINKED:// highest was link
if (old != null) { // if old exists
-result = old; // use it
+result.set(old); // use it
} else {
-result = highest; // use new entry
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
-result.setScore(1.0f);// initial score is 1.0f
}
- result.setSignature(null); // reset the signature
break;
case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
- result = highest; // use new entry
- if (highest.getSignature() == null) highest.setSignature(signature);
+ if (highest.getSignature() == null) result.setSignature(signature);
result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
result.setNextFetchTime();
break;
case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
- result = highest; // use new entry
if (old != null)
result.setSignature(old.getSignature()); // use old signature
if (highest.getRetriesSinceFetch() retryMax) {
@@ -101,7 +111,6 @@
break;
case CrawlDatum.STATUS_FETCH_GONE:// permanent failure
- result = highest; // use new entry
if (old != null)
result.setSignature(old.getSignature()); // use old signature
result.setStatus(CrawlDatum.STATUS_DB_GONE);
@@ -111,10 +120,8 @@
throw new RuntimeException(Unknown status: +highest.getStatus());
}
-if (result != null) {
- result.setScore(result.getScore() + scoreIncrement);
- output.collect(key, result);
-}
+result.setScore(result.getScore() + scoreIncrement);
+output.collect(key, result);
}
}