Author: talat
Date: Sat May  3 13:47:44 2014
New Revision: 1592207

URL: http://svn.apache.org/r1592207
Log:
NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set in HTMLParser (talat)


Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1592207&r1=1592206&r2=1592207&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat May  3 13:47:44 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set in HTMLParser (talat)
+
 * NUTCH-1725 CleaningJob's reducer does not commit deleted docs. (ilhamikalkan via talat)
 
 * NUTCH-1728 indexer-solr plugin is not delete docs from Solr (ilhamikalkan via talat)

Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1592207&r1=1592206&r2=1592207&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Sat May  3 13:47:44 2014
@@ -179,7 +179,6 @@ public class HtmlParser implements Parse
     String text = "";
     String title = "";
     Outlink[] outlinks = new Outlink[0];
-    Metadata metadata = new Metadata();
 
     // parse the content
     DocumentFragment root;
@@ -193,8 +192,8 @@ public class HtmlParser implements Parse
       detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
       String encoding = detector.guessEncoding(page, defaultCharEncoding);
 
-      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
-      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
+      page.putToMetadata(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), ByteBuffer.wrap(Bytes.toBytes(encoding)));
+      page.putToMetadata(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), ByteBuffer.wrap(Bytes.toBytes(encoding)));
 
       input.setEncoding(encoding);
       if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }


Reply via email to