Author: pkosiorowski
Date: Thu Mar 9 13:17:03 2006
New Revision: 384618
URL: http://svn.apache.org/viewcvs?rev=384618&view=rev
Log:
NUTCH-91 - empty encoding causes exception. (Michael Nebel)
Modified:
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Modified:
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=384618&r1=384617&r2=384618&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Thu Mar 9 13:17:03 2006
@@ -120,7 +120,7 @@
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
String contentType = content.getMetadata().get(Response.CONTENT_TYPE);
String encoding = StringUtil.parseCharacterEncoding(contentType);
- if (encoding!=null) {
+ if ((encoding != null) && !("".equals(encoding))) {
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
@@ -129,7 +129,7 @@
}
// sniff out 'charset' value from the beginning of a document
- if (encoding == null) {
+ if ((encoding == null) || ("".equals(encoding))) {
encoding = sniffCharacterEncoding(contentInOctets);
if (encoding!=null) {
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);