Author: lewismc
Date: Mon Nov  4 19:11:16 2013
New Revision: 1538723

URL: http://svn.apache.org/r1538723
Log:
NUTCH-1651 modifiedTime and prevmodifiedTime never set

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538723&r1=1538722&r2=1538723&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Nov  4 19:11:16 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1651 modifiedTime and prevmodifiedTime never set (Talat UYARER via lewismc)
+
 * NUTCH-1360 Suport the storing of IP address connected to when web crawling (ferdy, lewismc, Yasin Kılınç)
 
 * NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb and is generated over and over again to 2.x (Talat UYARER via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1538723&r1=1538722&r2=1538723&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Mon Nov  4 19:11:16 2013
@@ -27,6 +27,7 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.fetcher.FetcherJob;
+import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.scoring.ScoreDatum;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -128,7 +129,14 @@ extends GoraReducer<UrlWithScore, NutchW
         long prevFetchTime = page.getPrevFetchTime();
         long modifiedTime = page.getModifiedTime();
         long prevModifiedTime = page.getPrevModifiedTime();
-
+        Utf8 lastModified = page.getFromHeaders(new Utf8("Last-Modified"));
+        if ( lastModified != null ){
+          try {
+            modifiedTime = HttpDateFormat.toLong(lastModified.toString());
+            prevModifiedTime = page.getModifiedTime();
+          } catch (Exception e) {
+          }
+        }
         schedule.setFetchSchedule(url, page, prevFetchTime, prevModifiedTime,
             fetchTime, modifiedTime, modified);
         if (maxInterval < page.getFetchInterval())

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1538723&r1=1538722&r2=1538723&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Mon Nov  4 19:11:16 2013
@@ -61,6 +61,8 @@ public class DbUpdaterJob extends NutchT
     FIELDS.add(WebPage.Field.MODIFIED_TIME);
     FIELDS.add(WebPage.Field.FETCH_INTERVAL);
     FIELDS.add(WebPage.Field.PREV_FETCH_TIME);
+    FIELDS.add(WebPage.Field.PREV_MODIFIED_TIME);
+    FIELDS.add(WebPage.Field.HEADERS);
   }
 
   public static final Utf8 DISTANCE = new Utf8("dist");


Reply via email to