Author: lewismc
Date: Tue Mar 26 19:22:45 2013
New Revision: 1461276
URL: http://svn.apache.org/r1461276
Log:
NUTCH-1419 parsechecker and indexchecker to report protocol status
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1461276&r1=1461275&r2=1461276&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Mar 26 19:22:45 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + lewismc)
+
* NUTCH-1047 Pluggable indexing backends (jnioche)
* NUTCH-1536 Ant build file has hardcoded conf dir location (zm via lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461276&r1=1461275&r2=1461276&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Mar 26 19:22:45 2013
@@ -39,6 +39,7 @@ import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
@@ -81,8 +82,14 @@ public class IndexingFiltersChecker exte
Protocol protocol = factory.getProtocol(url);
CrawlDatum datum = new CrawlDatum();
- Content content = protocol.getProtocolOutput(new Text(url), datum)
- .getContent();
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
+
+ if (!output.getStatus().isSuccess()) {
+ System.out.println("Fetch failed with protocol status: " + output.getStatus());
+ return 0;
+ }
+
+ Content content = output.getContent();
if (content == null) {
System.out.println("No content for " + url);
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461276&r1=1461275&r2=1461276&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Tue Mar 26 19:22:45 2013
@@ -28,6 +28,7 @@ import org.apache.nutch.crawl.SignatureF
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.StringUtil;
@@ -80,11 +81,17 @@ public class ParserChecker implements To
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
- Content content = protocol.getProtocolOutput(new Text(url),
- new CrawlDatum()).getContent();
+ ProtocolOutput output = protocol.getProtocolOutput(new Text(url), new CrawlDatum());
+
+ if (!output.getStatus().isSuccess()) {
+ System.err.println("Fetch failed with protocol status: " + output.getStatus());
+ return (-1);
+ }
+
+ Content content = output.getContent();
if (content == null) {
- System.err.println("Can't fetch URL successfully");
+ LOG.error("No content for " + url);
return (-1);
}