Author: jukka
Date: Wed Dec 16 00:02:28 2009
New Revision: 891075
URL: http://svn.apache.org/viewvc?rev=891075&view=rev
Log:
TIKA-350: HtmlParser's content-type handling code needs to be more flexible
Patch by Ken Krugler
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891075&r1=891074&r2=891075&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:02:28 2009
@@ -50,8 +50,9 @@
"Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
"([^'\\\"]+)['\\\"]\\s*/>");
- private static final Pattern CONTENT_TYPE_PATTERN =
- Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
+ // TIKA-350: handle charset as first element in content-type
+ private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
+ "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)");
/**
* TIKA-332: Check for meta http-equiv tag with charset info in
@@ -86,7 +87,8 @@
}
}
- // No charset in a meta http-equiv tag, so detect from actual content bytes.
+ // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
+ // hint, or the passed content-type hint.
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
if (incomingCharset == null) {
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=891075&r1=891074&r2=891075&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Dec 16 00:02:28 2009
@@ -336,4 +336,27 @@
assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
}
+ /**
+ * Test case for TIKA-350
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
+ */
+ public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
+ final String test =
+ "<html><head><title>the name is \u00e1ndre</title></head>"
+ + "<body></body></html>";
+
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
}