Repository: nutch Updated Branches: refs/heads/master 78e99092c -> 70622c3e1
NUTCH-2164 NUTCH-2242 Inconsistent 'Modified Time' in crawl db / lastModified not always set - set modified time (time of last successful fetch) by DefaultFetchSchedule and AdaptiveFetchSchedule but only if the document is actually modified - update unit tests to check whether modification time is properly set - set modified time (sent by responding server in HTTP header) in ProtocolOutput: FetchSchedule implementations can access the HTTP modified time from CrawlDatum's metadata (PROTO_STATUS_KEY = "_pst_") Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/70622c3e Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/70622c3e Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/70622c3e Branch: refs/heads/master Commit: 70622c3e18cee879f5a38d895f68dd0be69461e1 Parents: 78e9909 Author: Sebastian Nagel <[email protected]> Authored: Fri Mar 11 22:55:24 2016 +0100 Committer: Sebastian Nagel <[email protected]> Committed: Tue Aug 23 09:29:41 2016 +0200 ---------------------------------------------------------------------- .../apache/nutch/crawl/AdaptiveFetchSchedule.java | 1 + .../org/apache/nutch/crawl/DefaultFetchSchedule.java | 4 ++++ .../org/apache/nutch/protocol/ProtocolOutput.java | 14 ++++++++++++++ .../org/apache/nutch/crawl/TestCrawlDbStates.java | 15 +++++++-------- 4 files changed, 26 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 08cad34..a4119ff 100755 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -116,6 +116,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { switch (state) { case FetchSchedule.STATUS_MODIFIED: interval *= (1.0f - DEC_RATE); + modifiedTime = fetchTime; break; case FetchSchedule.STATUS_NOTMODIFIED: interval *= (1.0f + INC_RATE); http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java b/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java index 4a60a1c..d979e84 100755 --- a/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java @@ -39,6 +39,10 @@ public class DefaultFetchSchedule extends AbstractFetchSchedule { datum.setFetchInterval(defaultInterval); } datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000); + if (modifiedTime <= 0 || state == FetchSchedule.STATUS_MODIFIED) { + // Set modifiedTime to fetchTime on first successful fetch + modifiedTime = fetchTime; + } datum.setModifiedTime(modifiedTime); return datum; } http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/java/org/apache/nutch/protocol/ProtocolOutput.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/protocol/ProtocolOutput.java b/src/java/org/apache/nutch/protocol/ProtocolOutput.java index c7f0c2c..f743b3f 100644 --- a/src/java/org/apache/nutch/protocol/ProtocolOutput.java +++ b/src/java/org/apache/nutch/protocol/ProtocolOutput.java @@ -17,6 +17,11 @@ package org.apache.nutch.protocol; +import java.text.ParseException; + +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; + /** * Simple aggregate to pass from protocol plugins both content and protocol * status. @@ -35,6 +40,15 @@ public class ProtocolOutput { public ProtocolOutput(Content content) { this.content = content; this.status = ProtocolStatus.STATUS_SUCCESS; + String lastModifiedDate = content.getMetadata().get(Response.LAST_MODIFIED); + if (lastModifiedDate != null) { + try { + long lastModified = HttpDateFormat.toLong(lastModifiedDate); + status.setLastModified(lastModified); + } catch (ParseException e) { + // last-modified still unset + } + } } public Content getContent() { http://git-wip-us.apache.org/repos/asf/nutch/blob/70622c3e/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java index c54559b..7f1c9cf 100644 --- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java +++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java @@ -300,6 +300,8 @@ public class TestCrawlDbStates { * time the document was fetched first (at all or after it has been changed) */ protected long firstFetchTime; + /** elapsed duration */ + protected long elapsedDuration = 0; /** state in CrawlDb before the last fetch */ protected byte previousDbState; /** signature in CrawlDb of previous fetch */ @@ -386,12 +388,7 @@ public class TestCrawlDbStates { // test modified time private boolean checkModifiedTime(CrawlDatum result, long modifiedTime) { - if (result.getModifiedTime() == 0) { - LOG.error("modified time not set (TODO: not set by DefaultFetchSchedule)"); - // TODO: return false (but DefaultFetchSchedule does not set modified - // time, see NUTCH-933) - return true; - } else if (modifiedTime == result.getModifiedTime()) { + if (modifiedTime == result.getModifiedTime()) { return true; } LOG.error("wrong modified time: " + new Date(result.getModifiedTime()) @@ -403,13 +400,15 @@ public class TestCrawlDbStates { protected CrawlDatum fetch(CrawlDatum datum, long currentTime) { lastFetchTime = currFetchTime; currFetchTime = currentTime; + if (lastFetchTime > 0) + elapsedDuration += (currFetchTime - lastFetchTime); previousDbState = datum.getStatus(); lastSignature = datum.getSignature(); datum = super.fetch(datum, currentTime); if (firstFetchTime == 0) { firstFetchTime = currFetchTime; - } else if ((currFetchTime - firstFetchTime) > (duration / 2)) { - // simulate a modification after "one year" + } else if (elapsedDuration < (duration / 2)) { + // simulate frequent modifications in the first "year" changeContent(); firstFetchTime = currFetchTime; }
