Author: lewismc
Date: Tue Mar 26 19:19:36 2013
New Revision: 1461274
URL: http://svn.apache.org/r1461274
Log:
NUTCH-1419 parsechecker and indexchecker to report protocol status
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Mar 26 19:19:36 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel
via lewismc)
+
* NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc)
* NUTCH-1532 Replace 'segment' mapping field with batchId (Feng via lewismc)
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Tue Mar 26 19:19:36 2013
@@ -85,7 +85,7 @@ public class IndexingFiltersChecker exte
page.setStatus(CrawlStatus.STATUS_FETCHED);
page.setFetchTime(System.currentTimeMillis());
} else {
- System.out.println("Fetch failed with protocol status: "
+ LOG.error("Fetch failed with protocol status: "
+ ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+ ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
return -1;
@@ -93,7 +93,7 @@ public class IndexingFiltersChecker exte
Content content = protocolOutput.getContent();
if (content == null) {
- System.out.println("No content for " + url);
+ LOG.warn("No content for " + url);
return 0;
}
@@ -111,7 +111,7 @@ public class IndexingFiltersChecker exte
(new ParseUtil(conf)).process(url, page);
if (!ParseStatusUtils.isSuccess(page.getParseStatus())) {
- System.err.println("Problem with parse - check log");
+ LOG.warn("Problem with parse - check log");
return (-1);
}
@@ -124,7 +124,7 @@ public class IndexingFiltersChecker exte
}
if (doc == null) {
- System.out.println("Document discarded by indexing filter");
+ LOG.info("Document discarded by indexing filter");
return 0;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Tue
Mar 26 19:19:36 2013
@@ -31,6 +31,8 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatusUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;
@@ -83,7 +85,21 @@ public class ParserChecker implements To
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
WebPage page = new WebPage();
- Content content = protocol.getProtocolOutput(url, page).getContent();
+
+ ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
+
+ if(!protocolOutput.getStatus().isSuccess()) {
+ LOG.error("Fetch failed with protocol status: "
+ + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+ + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
+ return (-1);
+ }
+ Content content = protocolOutput.getContent();
+
+ if (content == null) {
+ LOG.error("No content for " + url);
+ return (-1);
+ }
page.setBaseUrl(new org.apache.avro.util.Utf8(url));
page.setContent(ByteBuffer.wrap(content.getContent()));
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
Tue Mar 26 19:19:36 2013
@@ -37,6 +37,7 @@ import org.apache.gora.persistency.impl.
import org.apache.gora.persistency.impl.StateManagerImpl;
import org.apache.gora.persistency.StatefulHashMap;
import org.apache.gora.persistency.ListGenericArray;
+import org.apache.nutch.protocol.ProtocolStatusUtils;
@SuppressWarnings("all")
public class ProtocolStatus extends PersistentBase {
@@ -109,4 +110,12 @@ public class ProtocolStatus extends Pers
public void setLastModified(long value) {
put(2, value);
}
+
+ /**
+ * Reports whether this status carries the success code.
+ * @return {@code true} if {@code code} equals {@link ProtocolStatusUtils#SUCCESS}, {@code false} otherwise.
+ */
+ public boolean isSuccess() {
+ return code == ProtocolStatusUtils.SUCCESS;
+ }
}