Author: jukka
Date: Sun Dec 13 00:23:29 2009
New Revision: 890010
URL: http://svn.apache.org/viewvc?rev=890010&view=rev
Log:
TIKA-334: HtmlParser should use CharsetDetector whenever no charset is
specified via meta http-equiv tag.

Note that the BufferedInputStream wrapper needs to be applied *outside* the
getEncoding() method so that the mark()/reset() operations affect the stream
instance that's later passed to TagSoup for parsing.

Also rearranged some comments.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890010&r1=890009&r2=890010&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 00:23:29 2009
@@ -49,13 +49,13 @@
"(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
+ "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\"");
- // TODO: Move this into core, along with CharsetDetector
+ /**
+ * TIKA-332: Check for meta http-equiv tag with charset info in
+ * HTML content.
+ * <p>
+ * TODO: Move this into core, along with CharsetDetector
+ */
private String getEncoding(InputStream stream, Metadata metadata) throws
IOException {
- // TIKA-332: Check for meta http-equiv tag with charset info in HTML content
- if (!stream.markSupported()) {
- stream = new BufferedInputStream(stream);
- }
-
stream.mark(META_TAG_BUFFER_SIZE);
char[] buffer = new char[META_TAG_BUFFER_SIZE];
InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
@@ -118,7 +118,12 @@
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ throws IOException, SAXException, TikaException {
+ // The getEncoding() method depends on the mark feature
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+
// Protect the stream from being closed by CyberNeko
// TODO: Is this still needed, given our use of TagSoup?
stream = new CloseShieldInputStream(stream);