This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new d2c3e96d8 NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed documents - fall back to UTF-8 when stringifying the content of unparsed documents
d2c3e96d8 is described below

commit d2c3e96d88818d8107f320c49e007329b020e090
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Mon Oct 9 10:21:01 2023 +0200

    NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed documents
    - fall back to UTF-8 when stringifying the content of unparsed documents
---
 src/java/org/apache/nutch/segment/SegmentReader.java | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java
index 14546af54..ee5c266fd 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -163,13 +163,16 @@ public class SegmentReader extends Configured implements Tool {
       dump.append("\nRecno:: ").append(recNo++).append("\n");
       dump.append("URL:: " + key.toString() + "\n");
       Content content = null;
-      Charset charset = null;
+      // fall-back encoding for content of unparsed documents
+      Charset charset = StandardCharsets.UTF_8;
       for (NutchWritable val : values) {
         Writable value = val.get(); // unwrap
         if (value instanceof CrawlDatum) {
           dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString());
         } else if (value instanceof Content) {
           if (recodeContent) {
+            // output recoded content later when charset is extracted from HTML
+            // metadata hold in ParseData
             content = (Content) value;
           } else {
             dump.append("\nContent::\n").append(((Content) value).toString());