Author: lewismc
Date: Tue Mar 26 19:19:36 2013
New Revision: 1461274

URL: http://svn.apache.org/r1461274
Log:
NUTCH-1419 parsechecker and indexchecker to report protocol status

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue Mar 26 19:19:36 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel via lewismc)
+
 * NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc)
 
 * NUTCH-1532 Replace 'segment' mapping field with batchId (Feng via lewismc)

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Tue Mar 26 19:19:36 2013
@@ -85,7 +85,7 @@ public class IndexingFiltersChecker exte
       page.setStatus(CrawlStatus.STATUS_FETCHED);
       page.setFetchTime(System.currentTimeMillis());
     } else {
-      System.out.println("Fetch failed with protocol status: "
+      LOG.error("Fetch failed with protocol status: "
           + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
           + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
       return -1;
@@ -93,7 +93,7 @@ public class IndexingFiltersChecker exte
     
     Content content = protocolOutput.getContent();
     if (content == null) {
-      System.out.println("No content for " + url);
+      LOG.warn("No content for " + url);
       return 0;
     }
 
@@ -111,7 +111,7 @@ public class IndexingFiltersChecker exte
 
     (new ParseUtil(conf)).process(url, page);
     if (!ParseStatusUtils.isSuccess(page.getParseStatus())) {
-      System.err.println("Problem with parse - check log");
+      LOG.warn("Problem with parse - check log");
       return (-1);
     }
 
@@ -124,7 +124,7 @@ public class IndexingFiltersChecker exte
     }
 
     if (doc == null) {
-      System.out.println("Document discarded by indexing filter");
+      LOG.info("Document discarded by indexing filter");
       return 0;
     }
     

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Tue Mar 26 19:19:36 2013
@@ -31,6 +31,8 @@ import org.apache.hadoop.util.ToolRunner
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatusUtils;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.NutchConfiguration;
@@ -83,7 +85,21 @@ public class ParserChecker implements To
     ProtocolFactory factory = new ProtocolFactory(conf);
     Protocol protocol = factory.getProtocol(url);
     WebPage page = new WebPage();
-    Content content = protocol.getProtocolOutput(url, page).getContent();
+    
+    ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
+    
+    if(!protocolOutput.getStatus().isSuccess()) {
+      LOG.error("Fetch failed with protocol status: "
+          + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
+          + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
+      return (-1);
+    }
+    Content content = protocolOutput.getContent();
+    
+    if (content == null) {
+      LOG.error("No content for " + url);
+      return (-1);
+    }
     page.setBaseUrl(new org.apache.avro.util.Utf8(url));
     page.setContent(ByteBuffer.wrap(content.getContent()));
 

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java?rev=1461274&r1=1461273&r2=1461274&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java Tue Mar 26 19:19:36 2013
@@ -37,6 +37,7 @@ import org.apache.gora.persistency.impl.
 import org.apache.gora.persistency.impl.StateManagerImpl;
 import org.apache.gora.persistency.StatefulHashMap;
 import org.apache.gora.persistency.ListGenericArray;
+import org.apache.nutch.protocol.ProtocolStatusUtils;
 
 @SuppressWarnings("all")
 public class ProtocolStatus extends PersistentBase {
@@ -109,4 +110,12 @@ public class ProtocolStatus extends Pers
   public void setLastModified(long value) {
     put(2, value);
   }
+  
+  /**
+   * Checks whether this {@link ProtocolStatus} carries the success code.
+   * @return true if the code equals {@link ProtocolStatusUtils#SUCCESS}.
+   */
+  public boolean isSuccess() {
+    return code == ProtocolStatusUtils.SUCCESS; 
+  }
 }


Reply via email to