Author: snagel
Date: Wed Mar 27 21:31:42 2013
New Revision: 1461854

URL: http://svn.apache.org/r1461854
Log:
parsechecker and indexchecker to report truncated content

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 27 21:31:42 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
+
 * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + lewismc)
 
 * NUTCH-1047 Pluggable indexing backends (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Mar 27 21:31:42 2013
@@ -35,6 +35,7 @@ import org.apache.nutch.indexer.NutchDoc
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -105,6 +106,10 @@ public class IndexingFiltersChecker exte
     // store the guessed content type in the crawldatum
     datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
 
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
     if (LOG.isInfoEnabled()) {
       LOG.info("parsing: " + url);
       LOG.info("contentType: " + contentType);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Mar 27 21:31:42 2013
@@ -106,6 +106,10 @@ public class ParserChecker implements To
       return (-1);
     }
 
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
     // Calculate the signature