Author: jukka
Date: Sun Dec 13 01:09:18 2009
New Revision: 890014
URL: http://svn.apache.org/viewvc?rev=890014&view=rev
Log:
TIKA-341: Use charset in CONTENT_TYPE metadata when detecting the character
encoding
Patch by Ken Krugler
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890014&r1=890013&r2=890014&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Sun Dec 13 01:09:18 2009
@@ -49,6 +49,9 @@
"(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
+ "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\"");
+ private static final Pattern CONTENT_TYPE_PATTERN =
+ Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
+
/**
* TIKA-332: Check for meta http-equiv tag with charset info in
* HTML content.
@@ -78,13 +81,26 @@
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
if (incomingCharset == null) {
- // TODO: check for charset in metadata's content_type
+ // TIKA-341: Use charset in content-type
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ if (contentType != null) {
+ Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType);
+ if (m.find()) {
+ String charset = m.group(1).trim();
+ if (Charset.isSupported(charset)) {
+ incomingCharset = charset;
+ }
+ }
+ }
}
if (incomingCharset != null) {
detector.setDeclaredEncoding(incomingCharset);
}
+ // TIKA-341 without enabling input filtering (stripping of tags) the
+ // short HTML tests don't work well.
+ detector.enableInputFilter(true);
detector.setText(stream);
for (CharsetMatch match : detector.detectAll()) {
if (Charset.isSupported(match.getName())) {
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=890014&r1=890013&r2=890014&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Sun Dec 13 01:09:18 2009
@@ -24,6 +24,8 @@
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
@@ -60,11 +62,12 @@
*/
public class TXTParser implements Parser {
+ private static final Pattern CONTENT_TYPE_PATTERN =
Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
+
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- metadata.set(Metadata.CONTENT_TYPE, "text/plain");
// CharsetDetector expects a stream to support marks
if (!stream.markSupported()) {
@@ -74,10 +77,19 @@
// Detect the content encoding (the stream is reset to the beginning)
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+ if (incomingCharset == null) {
+ // TIKA-341: Use charset in content-type
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ if (contentType != null) {
+ Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType);
+ if (m.find()) {
+ incomingCharset = m.group(1).trim();
+ }
+ }
+ }
+
if (incomingCharset != null) {
detector.setDeclaredEncoding(incomingCharset);
- } else {
- // TODO: try to extract charset from CONTENT_TYPE in metadata
}
detector.setText(stream);
@@ -103,6 +115,10 @@
+ " hint is available in document metadata");
}
+ // TIKA-341: Only stomp on content-type after we're done trying to
+ // use it to guess at the charset.
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+
try {
Reader reader =
new BufferedReader(new InputStreamReader(stream, encoding));
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890014&r1=890013&r2=890014&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Sun Dec 13 01:09:18 2009
@@ -250,4 +250,27 @@
assertEquals("\u017d", metadata.get(Metadata.TITLE));
}
+ /**
+ * Test case for TIKA-341
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+ */
+ public void testUsingCharsetInContentTypeHeader() throws Exception {
+ final String test =
+ "<html><head><title>the name is \u00e1ndre</title></head>"
+ + "<body></body></html>";
+
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
}
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=890014&r1=890013&r2=890014&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
Sun Dec 13 01:09:18 2009
@@ -122,6 +122,31 @@
assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
}
+ /**
+ * Test case for TIKA-341: using charset in content-type
+ *
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+ */
+ public void testUsingCharsetInContentTypeHeader() throws Exception {
+ // Could be UTF-8 or ISO 8859-1 or ...
+ // u00e1 is latin small letter a with acute
+ final String test2 = "the name is \u00e1ndre";
+
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
private void assertExtractText(String msg, String expected, byte[] input)
throws Exception {