jukka
Tue, 15 Dec 2009 15:56:28 -0800
Author: jukka Date: Tue Dec 15 23:56:03 2009 New Revision: 891074 URL: http://svn.apache.org/viewvc?rev=891074&view=rev Log: TIKA-349: HtmlParser's http-equiv code needs to be more flexible Patch by Ken Krugler Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891074&r1=891073&r2=891074&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original) +++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Dec 15 23:56:03 2009 @@ -45,10 +45,11 @@ // Use the widest, most common charset as our default. private static final String DEFAULT_CHARSET = "windows-1252"; private static final int META_TAG_BUFFER_SIZE = 4096; - private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile( - "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+" - + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\""); - + private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile( + "(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*" + + "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" + + "([^'\\\"]+)['\\\"]\\s*/>"); + private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)"); @@ -67,12 +68,20 @@ if (bufferSize != -1) { String metaString = new String(buffer, 0, bufferSize); - Matcher m = HTTP_EQUIV_CHARSET_PATTERN.matcher(metaString); + Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString); if (m.find()) { - String charset = m.group(1); - if (Charset.isSupported(charset)) { - metadata.set(Metadata.CONTENT_ENCODING, charset); - return charset; + // TIKA-349: flexible handling of attributes + // We have one or more x or x=y attributes, separated by ';' + String[] attrs = m.group(1).split(";"); + for (String attr : attrs) { + String[] keyValue = attr.trim().split("="); + if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) { + String charset = keyValue[1]; + if (Charset.isSupported(charset)) { + metadata.set(Metadata.CONTENT_ENCODING, charset); + return charset; + } + } } } } Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=891074&r1=891073&r2=891074&view=diff ============================================================================== --- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Dec 15 23:56:03 2009 @@ -252,7 +252,7 @@ /** * Test case for TIKA-341 - * @see <a href="https://issues.apache.org/jira/browse/TIKA-XXX">TIKA-XXX</a> + * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> */ public void testUsingCharsetInContentTypeHeader() throws Exception { final String test = @@ -307,4 +307,33 @@ assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE)); } + /** + * Test case for TIKA-349 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a> + */ + public void testHttpEquivCharsetFunkyAttributes() throws Exception { + String test1 = + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html; charset=ISO-8859-1; charset=iso-8859-1\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; + Metadata metadata = new Metadata(); + new HtmlParser().parse ( + new ByteArrayInputStream(test1.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + + // Some HTML pages have errors like ';;' versus '; ' as separator + String test2 = + "<html><head><meta http-equiv=\"content-type\"" + + " content=\"text/html;;charset=ISO-8859-1\" />" + + "<title>the name is \u00e1ndre</title>" + + "</head><body></body></html>"; + metadata = new Metadata(); + new HtmlParser().parse ( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + }