Author: jukka
Date: Mon Nov 30 01:40:14 2009
New Revision: 885308
URL: http://svn.apache.org/viewvc?rev=885308&view=rev
Log:
TIKA-334: HtmlParser should use CharsetDetector whenever no charset is
specified via meta http-equiv tag
Patch by Ken Krugler.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=885308&r1=885307&r2=885308&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Mon Nov 30 01:40:14 2009
@@ -25,30 +25,78 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
- * HTML parser. Uses CyberNeko to turn the input document to HTML SAX events,
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
* and post-processes the events to produce XHTML and metadata expected by
* Tika clients.
*/
public class HtmlParser implements Parser {
+ // Use the widest, most common charset as our default.
+ private static final String DEFAULT_CHARSET = "windows-1252";
+
+    /**
+     * Chooses the character encoding for the given HTML stream and records
+     * it in the metadata under {@link Metadata#CONTENT_ENCODING}.
+     * <p>
+     * Any encoding already present in the metadata is passed to the
+     * {@link CharsetDetector} through <code>setDeclaredEncoding</code>. The
+     * first detected match whose charset is supported by this JVM wins; if
+     * that match reports a language, it is stored under both
+     * {@link Metadata#CONTENT_LANGUAGE} and {@link Metadata#LANGUAGE}.
+     * When no detected match is usable, the fallbacks are, in order: the
+     * declared encoding (only if supported), {@link #DEFAULT_CHARSET}
+     * (only if supported), and finally the platform default charset.
+     * <p>
+     * NOTE(review): <code>CharsetDetector.setText(InputStream)</code>
+     * presumably reads ahead and rewinds via mark/reset; confirm that the
+     * stream handed in here supports that, since the caller parses the
+     * same stream afterwards.
+     *
+     * @param stream the document stream to sample
+     * @param metadata document metadata; read for a declared encoding and
+     *                 updated with the chosen encoding (and language, if any)
+     * @return the name of the chosen, JVM-supported charset; never
+     *         <code>null</code>
+     * @throws IOException propagated from the detector's use of the stream
+     */
+    // TODO: Move this into core, along with CharsetDetector
+    private String getEncoding(InputStream stream, Metadata metadata)
+            throws IOException {
+        // TODO: Check for <meta tag in stream. If that exists and is
+        // supported, then set that in metadata and return.
+
+        // TODO: check for charset in metadata's content_type when no
+        // content encoding has been declared
+        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+
+        CharsetDetector detector = new CharsetDetector();
+        if (incomingCharset != null) {
+            detector.setDeclaredEncoding(incomingCharset);
+        }
+        detector.setText(stream);
+
+        // Track the choice locally instead of re-reading the metadata, so
+        // that an unsupported declared encoding can never leak through.
+        String encoding = null;
+        for (CharsetMatch match : detector.detectAll()) {
+            if (isSupportedCharset(match.getName())) {
+                encoding = match.getName();
+
+                // Is the encoding language-specific (KOI8-R, SJIS, etc.)?
+                String language = match.getLanguage();
+                if (language != null) {
+                    metadata.set(Metadata.CONTENT_LANGUAGE, language);
+                    metadata.set(Metadata.LANGUAGE, language);
+                }
+
+                break;
+            }
+        }
+
+        if (encoding == null) {
+            if (isSupportedCharset(incomingCharset)) {
+                encoding = incomingCharset;
+            } else if (Charset.isSupported(DEFAULT_CHARSET)) {
+                encoding = DEFAULT_CHARSET;
+            } else {
+                encoding = Charset.defaultCharset().name();
+            }
+        }
+
+        metadata.set(Metadata.CONTENT_ENCODING, encoding);
+        return encoding;
+    }
+
+    /**
+     * Null- and syntax-safe wrapper around
+     * {@link Charset#isSupported(String)}.
+     *
+     * @param name charset name to check, may be <code>null</code>
+     * @return <code>true</code> only if the name is legal and the charset
+     *         is available in this JVM
+     */
+    private static boolean isSupportedCharset(String name) {
+        if (name == null) {
+            return false;
+        }
+        try {
+            return Charset.isSupported(name);
+        } catch (IllegalArgumentException ignored) {
+            // Charset.isSupported rejects syntactically illegal names with
+            // IllegalCharsetNameException (an IllegalArgumentException);
+            // such a name is simply "not supported" for our purposes.
+            return false;
+        }
+    }
+
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ throws IOException, SAXException, TikaException {
// Protect the stream from being closed by CyberNeko
+ // TODO: Is this still needed, given our use of TagSoup?
stream = new CloseShieldInputStream(stream);
// Prepare the input source using the encoding hint if available
InputSource source = new InputSource(stream);
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding != null && Charset.isSupported(encoding)) {
- source.setEncoding(encoding);
- }
+ source.setEncoding(getEncoding(stream, metadata));
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
@@ -63,7 +111,7 @@
*/
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
+ throws IOException, SAXException, TikaException {
parse(stream, handler, metadata, new ParseContext());
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=885308&r1=885307&r2=885308&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Mon Nov 30 01:40:14 2009
@@ -219,4 +219,17 @@
assertFalse(content.contains("ab"));
}
+ /**
+ * Test case for TIKA-334
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
+ */
+ public void testDetectOfCharset() throws Exception {
+ String test = "<html><title>\u017d</title><body></body></html>";
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("\u017d", metadata.get(Metadata.TITLE));
+ }
+
}