svn commit: r391044 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

2006-04-03 Thread ab
Author: ab
Date: Mon Apr  3 06:35:34 2006
New Revision: 391044

URL: http://svn.apache.org/viewcvs?rev=391044&view=rev
Log:
Make sure we use new values for score, metadata, fetch interval
and fetch time.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=391044&r1=391043&r2=391044&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon 
Apr  3 06:35:34 2006
@@ -25,6 +25,7 @@
 /** Merge new page entries with existing entries. */
 public class CrawlDbReducer implements Reducer {
   private int retryMax;
+  private CrawlDatum result = new CrawlDatum();
 
   public void configure(JobConf job) {
 retryMax = job.getInt("db.fetch.retry.max", 3);
@@ -61,36 +62,45 @@
   }
 }
 
-CrawlDatum result = null;
+// initialize with the latest version
+result.set(highest);
+if (old != null) {
+  // copy metadata from old, if exists
+  if (old.getMetaData() != null) {
+result.getMetaData().putAll(old.getMetaData());
+// overlay with new, if any
+if (highest.getMetaData() != null)
+  result.getMetaData().putAll(highest.getMetaData());
+  }
+  // set the most recent valid value of modifiedTime
+  if (old.getModifiedTime() > 0 && highest.getModifiedTime() == 0) {
+result.setModifiedTime(old.getModifiedTime());
+  }
+}
 
 switch (highest.getStatus()) {// determine new status
 
 case CrawlDatum.STATUS_DB_UNFETCHED:  // no new entry
 case CrawlDatum.STATUS_DB_FETCHED:
 case CrawlDatum.STATUS_DB_GONE:
-  result = old;   // use old
+  result.set(old);// use old
   break;
 
 case CrawlDatum.STATUS_LINKED:// highest was link
   if (old != null) {  // if old exists
-result = old; // use it
+result.set(old);  // use it
   } else {
-result = highest; // use new entry
 result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
-result.setScore(1.0f);// initial score is 1.0f
   }
-  result.setSignature(null);  // reset the signature
   break;
   
 case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
-  result = highest;   // use new entry
-  if (highest.getSignature() == null) highest.setSignature(signature);
+  if (highest.getSignature() == null) result.setSignature(signature);
   result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
   result.setNextFetchTime();
   break;
 
 case CrawlDatum.STATUS_FETCH_RETRY:   // temporary failure
-  result = highest;   // use new entry
   if (old != null)
 result.setSignature(old.getSignature());  // use old signature
   if (highest.getRetriesSinceFetch() < retryMax) {
@@ -101,7 +111,6 @@
   break;
 
 case CrawlDatum.STATUS_FETCH_GONE:// permanent failure
-  result = highest;   // use new entry
   if (old != null)
 result.setSignature(old.getSignature());  // use old signature
   result.setStatus(CrawlDatum.STATUS_DB_GONE);
@@ -111,10 +120,8 @@
   throw new RuntimeException("Unknown status: "+highest.getStatus());
 }
 
-if (result != null) {
-  result.setScore(result.getScore() + scoreIncrement);
-  output.collect(key, result);
-}
+result.setScore(result.getScore() + scoreIncrement);
+output.collect(key, result);
   }
 
 }




svn commit: r391055 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

2006-04-03 Thread ab
Author: ab
Date: Mon Apr  3 07:36:19 2006
New Revision: 391055

URL: http://svn.apache.org/viewcvs?rev=391055&view=rev
Log:
Forgot to properly initialize the score.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=391055&r1=391054&r2=391055&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon 
Apr  3 07:36:19 2006
@@ -91,6 +91,7 @@
 result.set(old);  // use it
   } else {
 result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+result.setScore(1.0f);
   }
   break;
   




svn commit: r391150 - /lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java

2006-04-03 Thread jerome
Author: jerome
Date: Mon Apr  3 13:57:46 2006
New Revision: 391150

URL: http://svn.apache.org/viewcvs?rev=391150&view=rev
Log:
No longer dump the parse-mspowerpoint unit test results to a file for visual checks.

Modified:

lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java

Modified: 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=391150&r1=391149&r2=391150&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
 Mon Apr  3 13:57:46 2006
@@ -70,7 +70,7 @@
   /**
* Wether dumping the extracted data to file for visual checks.
*/
-  private final static boolean DUMP_TO_FILE = true;
+  private final static boolean DUMP_TO_FILE = false;
 
   private final File testFile;