Author: mattmann
Date: Sun Oct 18 19:32:22 2015
New Revision: 1709306
URL: http://svn.apache.org/viewvc?rev=1709306&view=rev
Log:
Fix for NUTCH-2129 - Add protocol status tracking to crawl datum. Contributed by
Michael Joyce <[email protected]>. This closes #68.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1709306&r1=1709305&r2=1709306&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Oct 18 19:32:22 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2129 Add protocol status tracking to crawl datum (Michael Joyce via mattmann)
+
* NUTCH-2142 Nutch File Dump - FileNotFoundException (Invalid Argument) Error
(Karanjeet Singh via mattmann)
* NUTCH-2136 Implement a different version of Naive Bayes Parse Filter
(Asitang Mishra)
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1709306&r1=1709305&r2=1709306&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Sun Oct 18
19:32:22 2015
@@ -41,6 +41,8 @@ public interface Nutch {
public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
GENERATE_TIME_KEY);
+ public static final Text PROTOCOL_STATUS_CODE_KEY = new
Text("nutch.protocol.code");
+
public static final String PROTO_STATUS_KEY = "_pst_";
public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
Modified:
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1709306&r1=1709305&r2=1709306&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
(original)
+++
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Sun Oct 18 19:32:22 2015
@@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -265,6 +266,9 @@ public abstract class HttpBase implement
}
int code = response.getCode();
+ datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+ new Text(Integer.toString(code)));
+
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),
(content == null ? EMPTY_CONTENT : content),
Modified:
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1709306&r1=1709305&r2=1709306&view=diff
==============================================================================
---
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
(original)
+++
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Sun Oct 18 19:32:22 2015
@@ -29,6 +29,7 @@ import org.apache.nutch.net.protocols.Re
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
@@ -129,6 +130,9 @@ public class Ftp implements Protocol {
response = new FtpResponse(u, datum, this, getConf()); // make a
request
int code = response.getCode();
+ datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+ new Text(Integer.toString(code)));
+
if (code == 200) { // got a good response
return new ProtocolOutput(response.toContent()); // return it