Author: lewismc
Date: Mon Nov 4 19:11:16 2013
New Revision: 1538723
URL: http://svn.apache.org/r1538723
Log:
NUTCH-1651 modifiedTime and prevmodifiedTime never set
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538723&r1=1538722&r2=1538723&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Nov 4 19:11:16 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1651 modifiedTime and prevmodifiedTime never set (Talat UYARER via lewismc)
+
* NUTCH-1360 Support the storing of IP address connected to when web crawling
(ferdy, lewismc, Yasin Kılınç)
* NUTCH-1588 Port NUTCH-1245 URL gone with 404 after db.fetch.interval.max
stays db_unfetched in CrawlDb and is generated over and over again to 2.x
(Talat UYARER via lewismc)
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1538723&r1=1538722&r2=1538723&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Mon Nov 4 19:11:16 2013
@@ -27,6 +27,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.fetcher.FetcherJob;
+import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.scoring.ScoreDatum;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
@@ -128,7 +129,14 @@ extends GoraReducer<UrlWithScore, NutchW
long prevFetchTime = page.getPrevFetchTime();
long modifiedTime = page.getModifiedTime();
long prevModifiedTime = page.getPrevModifiedTime();
-
+ Utf8 lastModified = page.getFromHeaders(new Utf8("Last-Modified"));
+ if ( lastModified != null ){
+ try {
+ modifiedTime = HttpDateFormat.toLong(lastModified.toString());
+ prevModifiedTime = page.getModifiedTime();
+ } catch (Exception e) {
+ }
+ }
schedule.setFetchSchedule(url, page, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, modified);
if (maxInterval < page.getFetchInterval())
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1538723&r1=1538722&r2=1538723&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Mon Nov 4 19:11:16 2013
@@ -61,6 +61,8 @@ public class DbUpdaterJob extends NutchT
FIELDS.add(WebPage.Field.MODIFIED_TIME);
FIELDS.add(WebPage.Field.FETCH_INTERVAL);
FIELDS.add(WebPage.Field.PREV_FETCH_TIME);
+ FIELDS.add(WebPage.Field.PREV_MODIFIED_TIME);
+ FIELDS.add(WebPage.Field.HEADERS);
}
public static final Utf8 DISTANCE = new Utf8("dist");