Nutch doesn't do a good job of storing or testing the Last-Modified
time of pages it has crawled. I made the following changes, which seem
to help a lot:
snowbird:~/src/nutch/trunk> svn diff
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===
--- src/java/org/apache/nutch/fetcher/Fetcher.java (revision 817382)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java (working copy)
@@ -21,6 +21,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
+import java.text.ParseException;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
@@ -42,6 +43,7 @@
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
+import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.scoring.ScoringFilters;
@@ -742,6 +744,23 @@
datum.setStatus(status);
datum.setFetchTime(System.currentTimeMillis());
+ LOG.debug("metadata = " + (content != null ?
content.getMetadata() : "content-null"));
+ LOG.debug("modified? = " + ((content != null &&
content.getMetadata() != null) ?
content.getMetadata().get("Last-Modified") : "content-null"));
+ if (content != null && content.getMetadata() != null &&
content.getMetadata().get("Last-Modified") != null)
+ {
+ String lastModifiedStr = content.getMetadata().get("Last-Modified");
+
+ try
+ {
+ long lastModifiedDate = HttpDateFormat.toLong(lastModifiedStr);
+ LOG.debug("last modified = " + lastModifiedStr + " = "
+ lastModifiedDate);
+ datum.setModifiedTime(lastModifiedDate);
+ }
+ catch (ParseException e)
+ {
+ LOG.error("unable to parse " + lastModifiedStr, e);
+ }
+ }
if (pstatus != null)
datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
ParseResult parseResult = null;
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java
===
--- src/java/org/apache/nutch/indexer/IndexerMapReduce.java (revision 817382)
+++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java (working copy)
@@ -84,8 +84,10 @@
if (CrawlDatum.hasDbStatus(datum))
dbDatum = datum;
else if (CrawlDatum.hasFetchStatus(datum)) {
- // don't index unmodified (empty) pages
- if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)
+ /*
+ * Where did this person get the idea that unmodified pages
are empty?
+ // don't index unmodified (empty) pages
+ if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) */
fetchDatum = datum;
} else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) {
@@ -108,7 +110,7 @@
}
if (!parseData.getStatus().isSuccess() ||
-fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
+(fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS &&
fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)) {
return;
}
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (revision 817382)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (working copy)
@@ -124,11 +124,14 @@
reqStr.append("\r\n");
}
- reqStr.append("\r\n");
if (datum.getModifiedTime() > 0) {
-reqStr.append("If-Modified-Since: " +
HttpDateFormat.toString(datum.getModifiedTime()));
+ String httpDate =
+ HttpDateFormat.toString(datum.getModifiedTime());
+ Http.LOG.debug("modified time: " + httpDate);
+reqStr.append("If-Modified-Since: " + httpDate);
reqStr.append("\r\n");
}
+ reqStr.append("\r\n");
byte[] reqBytes= reqStr.toString().getBytes();
On Wed, Oct 14, 2009 at 9:40 AM, sprabhu_PN
wrote:
>
> "We are looking at picking up updates in a recrawl - How do I get the
> fetcher to read the recently built segment, get to the url and decide
> whether to get the content based on whether the url has been updated since?
> "
>
> Shreekanth Prabhu
> --
> View this message in context:
> http://www.nabble.com/Recrawling--Nutch-tp25891294p25891294.html
> Sent from the Nutch - User mailing list archive at Nabble.com.
>
>
--
http://www.linkedin.com/in/paultomblin