jukka
Tue, 15 Dec 2009 16:02:53 -0800
Author: jukka
Date: Wed Dec 16 00:02:28 2009
New Revision: 891075

URL: http://svn.apache.org/viewvc?rev=891075&view=rev
Log:
TIKA-350: HtmlParser's content-type handling code needs to be more flexible

Patch by Ken Krugler

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891075&r1=891074&r2=891075&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:02:28 2009
@@ -50,8 +50,9 @@
                     "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
                     "([^'\\\"]+)['\\\"]\\s*/>");

-    private static final Pattern CONTENT_TYPE_PATTERN =
-        Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
+    // TIKA-350: handle charset as first element in content-type
+    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
+                    "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)");

     /**
      * TIKA-332: Check for meta http-equiv tag with charset info in
@@ -86,7 +87,8 @@
             }
         }

-        // No charset in a meta http-equiv tag, so detect from actual content bytes.
+        // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
+        // hint, or the passed content-type hint.
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
         if (incomingCharset == null) {

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=891075&r1=891074&r2=891075&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Dec 16 00:02:28 2009
@@ -336,4 +336,27 @@
         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
     }

+    /**
+     * Test case for TIKA-350
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
+     */
+    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
+        final String test =
+            "<html><head><title>the name is \u00e1ndre</title></head>" +
+            "<body></body></html>";
+
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new BodyContentHandler(), metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
 }