Author: snagel
Date: Wed Mar 27 21:31:42 2013
New Revision: 1461854
URL: http://svn.apache.org/r1461854
Log:
parsechecker and indexchecker to report truncated content
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 27 21:31:42 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)
+
* NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + lewismc)
* NUTCH-1047 Pluggable indexing backends (jnioche)
Modified:
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Mar 27 21:31:42 2013
@@ -35,6 +35,7 @@ import org.apache.nutch.indexer.NutchDoc
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -105,6 +106,10 @@ public class IndexingFiltersChecker exte
// store the guessed content type in the crawldatum
datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
+ if (ParseSegment.isTruncated(content)) {
+ LOG.warn("Content is truncated, parse may fail!");
+ }
+
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1461854&r1=1461853&r2=1461854&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Mar 27 21:31:42 2013
@@ -106,6 +106,10 @@ public class ParserChecker implements To
return (-1);
}
+ if (ParseSegment.isTruncated(content)) {
+ LOG.warn("Content is truncated, parse may fail!");
+ }
+
ParseResult parseResult = new ParseUtil(conf).parse(content);
// Calculate the signature