This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new d2c3e96d8 NUTCH-3012 SegmentReader when dumping with option -recode:
NPE on unparsed documents - fall back to UTF-8 when stringifying the content of
unparsed documents
d2c3e96d8 is described below
commit d2c3e96d88818d8107f320c49e007329b020e090
Author: Sebastian Nagel <[email protected]>
AuthorDate: Mon Oct 9 10:21:01 2023 +0200
NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed
documents
- fall back to UTF-8 when stringifying the content of unparsed documents
---
src/java/org/apache/nutch/segment/SegmentReader.java | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java
index 14546af54..ee5c266fd 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -163,13 +163,16 @@ public class SegmentReader extends Configured implements Tool {
dump.append("\nRecno:: ").append(recNo++).append("\n");
dump.append("URL:: " + key.toString() + "\n");
Content content = null;
- Charset charset = null;
+ // fall-back encoding for content of unparsed documents
+ Charset charset = StandardCharsets.UTF_8;
for (NutchWritable val : values) {
Writable value = val.get(); // unwrap
if (value instanceof CrawlDatum) {
dump.append("\nCrawlDatum::\n").append(((CrawlDatum)
value).toString());
} else if (value instanceof Content) {
if (recodeContent) {
+ // output recoded content later, once the charset has been extracted from
+ // the HTML metadata held in ParseData
content = (Content) value;
} else {
dump.append("\nContent::\n").append(((Content) value).toString());