jukka
Sat, 12 Dec 2009 17:09:43 -0800
Author: jukka Date: Sun Dec 13 01:09:18 2009 New Revision: 890014 URL: http://svn.apache.org/viewvc?rev=890014&view=rev Log: TIKA-341: Use charset in CONTENT_TYPE metadata when detecting the character encoding Patch by Ken Krugler Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890014&r1=890013&r2=890014&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 01:09:18 2009 @@ -49,6 +49,9 @@ "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+" + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\""); + private static final Pattern CONTENT_TYPE_PATTERN = + Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)"); + /** * TIKA-332: Check for meta http-equiv tag with charset info in * HTML content. 
@@ -78,13 +81,26 @@ CharsetDetector detector = new CharsetDetector(); String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); if (incomingCharset == null) { - // TODO: check for charset in metadata's content_type + // TIKA-341: Use charset in content-type + String contentType = metadata.get(Metadata.CONTENT_TYPE); + if (contentType != null) { + Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType); + if (m.find()) { + String charset = m.group(1).trim(); + if (Charset.isSupported(charset)) { + incomingCharset = charset; + } + } + } } if (incomingCharset != null) { detector.setDeclaredEncoding(incomingCharset); } + // TIKA-341 without enabling input filtering (stripping of tags) the + // short HTML tests don't work well. + detector.enableInputFilter(true); detector.setText(stream); for (CharsetMatch match : detector.detectAll()) { if (Charset.isSupported(match.getName())) { Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=890014&r1=890013&r2=890014&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Dec 13 01:09:18 2009 @@ -24,6 +24,8 @@ import java.io.Reader; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.DublinCore; @@ -60,11 +62,12 @@ */ public class TXTParser implements Parser { + private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)"); + public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext 
context) throws IOException, SAXException, TikaException { - metadata.set(Metadata.CONTENT_TYPE, "text/plain"); // CharsetDetector expects a stream to support marks if (!stream.markSupported()) { @@ -74,10 +77,19 @@ // Detect the content encoding (the stream is reset to the beginning) CharsetDetector detector = new CharsetDetector(); String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); + if (incomingCharset == null) { + // TIKA-341: Use charset in content-type + String contentType = metadata.get(Metadata.CONTENT_TYPE); + if (contentType != null) { + Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType); + if (m.find()) { + incomingCharset = m.group(1).trim(); + } + } + } + if (incomingCharset != null) { detector.setDeclaredEncoding(incomingCharset); - } else { - // TODO: try to extract charset from CONTENT_TYPE in metadata } detector.setText(stream); @@ -103,6 +115,10 @@ + " hint is available in document metadata"); } + // TIKA-341: Only stomp on content-type after we're done trying to + // use it to guess at the charset. 
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain"); + try { Reader reader = new BufferedReader(new InputStreamReader(stream, encoding)); Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890014&r1=890013&r2=890014&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun Dec 13 01:09:18 2009 @@ -250,4 +250,27 @@ assertEquals("\u017d", metadata.get(Metadata.TITLE)); } + /** + * Test case for TIKA-341 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> + */ + public void testUsingCharsetInContentTypeHeader() throws Exception { + final String test = + "<html><head><title>the name is \u00e1ndre</title></head>" + + "<body></body></html>"; + + Metadata metadata = new Metadata(); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + } Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=890014&r1=890013&r2=890014&view=diff 
============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Sun Dec 13 01:09:18 2009 @@ -122,6 +122,31 @@ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } + /** + * Test case for TIKA-341: using charset in content-type + * + * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> + */ + public void testUsingCharsetInContentTypeHeader() throws Exception { + // Could be UTF-8 or ISO 8859-1 or ... + // u00e1 is latin small letter a with acute + final String test2 = "the name is \u00e1ndre"; + + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); + parser.parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } private void assertExtractText(String msg, String expected, byte[] input) throws Exception {