This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new d2c3e96d8 NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed documents - fall back to UTF-8 when stringifying the content of unparsed documents
d2c3e96d8 is described below

commit d2c3e96d88818d8107f320c49e007329b020e090
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Mon Oct 9 10:21:01 2023 +0200

    NUTCH-3012 SegmentReader when dumping with option -recode: NPE on unparsed documents
    - fall back to UTF-8 when stringifying the content of unparsed documents
---
 src/java/org/apache/nutch/segment/SegmentReader.java | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java
index 14546af54..ee5c266fd 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -163,13 +163,16 @@ public class SegmentReader extends Configured implements Tool {
       dump.append("\nRecno:: ").append(recNo++).append("\n");
       dump.append("URL:: " + key.toString() + "\n");
       Content content = null;
-      Charset charset = null;
+      // fall-back encoding for content of unparsed documents
+      Charset charset = StandardCharsets.UTF_8;
       for (NutchWritable val : values) {
         Writable value = val.get(); // unwrap
         if (value instanceof CrawlDatum) {
           dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString());
         } else if (value instanceof Content) {
           if (recodeContent) {
+            // output recoded content later when charset is extracted from HTML
+            // metadata hold in ParseData
             content = (Content) value;
           } else {
             dump.append("\nContent::\n").append(((Content) value).toString());

Reply via email to