Author: snagel
Date: Wed Mar 27 21:31:42 2013
New Revision: 1461854

URL: http://svn.apache.org/r1461854
Log:
parsechecker and indexchecker to report truncated content

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 27 21:31:42 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
+
 * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + lewismc)
 
 * NUTCH-1047 Pluggable indexing backends (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Mar 27 21:31:42 2013
@@ -35,6 +35,7 @@ import org.apache.nutch.indexer.NutchDoc
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -105,6 +106,10 @@ public class IndexingFiltersChecker exte
     // store the guessed content type in the crawldatum
     datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
 
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Mar 27 21:31:42 2013
@@ -106,6 +106,10 @@ public class ParserChecker implements To
       return (-1);
     }
 
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
     // Calculate the signature


Reply via email to