Repository: nutch
Updated Branches:
  refs/heads/master 78e99092c -> 70622c3e1


NUTCH-2164 NUTCH-2242 Inconsistent 'Modified Time' in crawl db / lastModified not always set
 - set modified time (time of last successful fetch) by DefaultFetchSchedule and AdaptiveFetchSchedule,
   but only if the document is actually modified
 - update unit tests to check whether modification time is properly set
 - set modified time (sent by responding server in HTTP header) in ProtocolOutput:
   FetchSchedule implementations can access the HTTP modified time from CrawlDatum's
   metadata (PROTO_STATUS_KEY = "_pst_")


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/70622c3e
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/70622c3e
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/70622c3e

Branch: refs/heads/master
Commit: 70622c3e18cee879f5a38d895f68dd0be69461e1
Parents: 78e9909
Author: Sebastian Nagel <[email protected]>
Authored: Fri Mar 11 22:55:24 2016 +0100
Committer: Sebastian Nagel <[email protected]>
Committed: Tue Aug 23 09:29:41 2016 +0200

----------------------------------------------------------------------
 .../apache/nutch/crawl/AdaptiveFetchSchedule.java    |  1 +
 .../org/apache/nutch/crawl/DefaultFetchSchedule.java |  4 ++++
 .../org/apache/nutch/protocol/ProtocolOutput.java    | 14 ++++++++++++++
 .../org/apache/nutch/crawl/TestCrawlDbStates.java    | 15 +++++++--------
 4 files changed, 26 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index 08cad34..a4119ff 100755
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -116,6 +116,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
       switch (state) {
       case FetchSchedule.STATUS_MODIFIED:
         interval *= (1.0f - DEC_RATE);
+        modifiedTime = fetchTime;
         break;
       case FetchSchedule.STATUS_NOTMODIFIED:
         interval *= (1.0f + INC_RATE);

http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java b/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
index 4a60a1c..d979e84 100755
--- a/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
@@ -39,6 +39,10 @@ public class DefaultFetchSchedule extends AbstractFetchSchedule {
       datum.setFetchInterval(defaultInterval);
     }
     datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000);
+    if (modifiedTime <= 0 || state == FetchSchedule.STATUS_MODIFIED) {
+      // Set modifiedTime to fetchTime on first successful fetch
+      modifiedTime = fetchTime;
+    }
     datum.setModifiedTime(modifiedTime);
     return datum;
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/java/org/apache/nutch/protocol/ProtocolOutput.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/ProtocolOutput.java b/src/java/org/apache/nutch/protocol/ProtocolOutput.java
index c7f0c2c..f743b3f 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolOutput.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolOutput.java
@@ -17,6 +17,11 @@
 
 package org.apache.nutch.protocol;
 
+import java.text.ParseException;
+
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+
 /**
  * Simple aggregate to pass from protocol plugins both content and protocol
  * status.
@@ -35,6 +40,15 @@ public class ProtocolOutput {
   public ProtocolOutput(Content content) {
     this.content = content;
     this.status = ProtocolStatus.STATUS_SUCCESS;
+    String lastModifiedDate = 
content.getMetadata().get(Response.LAST_MODIFIED);
+    if (lastModifiedDate != null) {
+      try {
+        long lastModified = HttpDateFormat.toLong(lastModifiedDate);
+        status.setLastModified(lastModified);
+      } catch (ParseException e) {
+        // last-modified still unset
+      }
+    }
   }
 
   public Content getContent() {

http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
index c54559b..7f1c9cf 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -300,6 +300,8 @@ public class TestCrawlDbStates {
     * time the document was fetched first (at all or after it has been changed)
      */
     protected long firstFetchTime;
+    /** elapsed duration */
+    protected long elapsedDuration = 0;
     /** state in CrawlDb before the last fetch */
     protected byte previousDbState;
     /** signature in CrawlDb of previous fetch */
@@ -386,12 +388,7 @@ public class TestCrawlDbStates {
 
     // test modified time
     private boolean checkModifiedTime(CrawlDatum result, long modifiedTime) {
-      if (result.getModifiedTime() == 0) {
-        LOG.error("modified time not set (TODO: not set by 
DefaultFetchSchedule)");
-        // TODO: return false (but DefaultFetchSchedule does not set modified
-        // time, see NUTCH-933)
-        return true;
-      } else if (modifiedTime == result.getModifiedTime()) {
+      if (modifiedTime == result.getModifiedTime()) {
         return true;
       }
       LOG.error("wrong modified time: " + new Date(result.getModifiedTime())
@@ -403,13 +400,15 @@ public class TestCrawlDbStates {
     protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
       lastFetchTime = currFetchTime;
       currFetchTime = currentTime;
+      if (lastFetchTime > 0)
+        elapsedDuration += (currFetchTime - lastFetchTime);
       previousDbState = datum.getStatus();
       lastSignature = datum.getSignature();
       datum = super.fetch(datum, currentTime);
       if (firstFetchTime == 0) {
         firstFetchTime = currFetchTime;
-      } else if ((currFetchTime - firstFetchTime) > (duration / 2)) {
-        // simulate a modification after "one year"
+      } else if (elapsedDuration < (duration / 2)) {
+        // simulate frequent modifications in the first "year"
         changeContent();
         firstFetchTime = currFetchTime;
       }

Reply via email to