Author: talat
Date: Sat May 3 13:47:44 2014
New Revision: 1592207
URL: http://svn.apache.org/r1592207
Log:
NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set in
HTMLParser (talat)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1592207&r1=1592206&r2=1592207&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat May 3 13:47:44 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1657 ORIGINAL_CHAR_ENCODING and CHAR_ENCODING_FOR_CONVERSION never set in HTMLParser (talat)
+
* NUTCH-1725 CleaningJob's reducer does not commit deleted docs. (ilhamikalkan via talat)
* NUTCH-1728 indexer-solr plugin is not delete docs from Solr (ilhamikalkan via talat)
Modified:
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1592207&r1=1592206&r2=1592207&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Sat May 3 13:47:44 2014
@@ -179,7 +179,6 @@ public class HtmlParser implements Parse
String text = "";
String title = "";
Outlink[] outlinks = new Outlink[0];
- Metadata metadata = new Metadata();
// parse the content
DocumentFragment root;
@@ -193,8 +192,8 @@ public class HtmlParser implements Parse
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(page, defaultCharEncoding);
- metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
- metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
+ page.putToMetadata(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), ByteBuffer.wrap(Bytes.toBytes(encoding)));
+ page.putToMetadata(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), ByteBuffer.wrap(Bytes.toBytes(encoding)));
input.setEncoding(encoding);
if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }