This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 164c9286fc0933051e86ce0a209250aa51bee3bf Author: tballison <talli...@mitre.org> AuthorDate: Wed Mar 7 13:47:38 2018 -0500 TIKA-2592 -- ignore charsets not supported by IANA in html meta-headers via Andreas Meier. --- CHANGES.txt | 4 + .../tika/parser/html/HtmlEncodingDetector.java | 41 +++++++ .../html/StandardCharsets_unsupported_by_IANA.txt | 125 +++++++++++++++++++++ .../apache/tika/parser/html/HtmlParserTest.java | 10 ++ .../test-documents/testHTML_charset_utf16le.html | Bin 0 -> 380 bytes .../test-documents/testHTML_charset_utf8.html | 8 ++ 6 files changed, 188 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index d553961..73d3d68 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,9 @@ Release 1.18 - ??? + * Ignore non-IANA supported charsets in HTML meta-headers + during charset detection in HTMLEncodingDetector + via Andreas Meier (TIKA-2592) + * Add detection and parsing of zstd (if user provides com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index 559ec4d..e383f80 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -16,10 +16,17 @@ */ package org.apache.tika.parser.html; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.ByteBuffer; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -39,6 +46,37 @@ import org.apache.tika.utils.CharsetUtils; */ public class HtmlEncodingDetector implements EncodingDetector { + /** + * HTML can include non-iana supported charsets that Java + * recognizes, e.g. "unicode". This can lead to incorrect detection/mojibake. + * Ignore charsets in html meta-headers that are not supported by IANA. + * See: TIKA-2592 + */ + private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA; + static { + Set<String> unsupported = new HashSet<>(); + try (BufferedReader reader = + new BufferedReader( + new InputStreamReader( + HtmlEncodingDetector.class + .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"), + StandardCharsets.UTF_8))) { + String line = reader.readLine(); + while (line != null) { + if (line.startsWith("#")) { + continue; + } + line = line.trim(); + if (line.length() > 0) { + unsupported.add(line.toLowerCase(Locale.US)); + } + line = reader.readLine(); + } + } catch (IOException e) { + throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path"); + } + CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported); + } // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K) private static final int DEFAULT_MARK_LIMIT = 8192; @@ -112,6 +150,9 @@ public class HtmlEncodingDetector implements EncodingDetector { //that is valid while (charsetMatcher.find()) { String candCharset = charsetMatcher.group(1); + if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) { + continue; + } if (CharsetUtils.isSupported(candCharset)) { try { return CharsetUtils.forName(candCharset); diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt b/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt new file mode 100644 index 0000000..05f76ce --- /dev/null +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt @@ -0,0 +1,125 @@ +646 +737 +775 +813 +819 +858 +874 +8859_1 +8859_13 +8859_15 +8859_2 +8859_4 +8859_5 +8859_7 +8859_9 +912 +914 +915 +920 +923 +ansi-1251 +ascii +ascii7 +cesu8 +cp1250 +cp1251 +cp1252 +cp1253 +cp1254 +cp1257 +cp5346 +cp5347 +cp5348 +cp5349 +cp5350 +cp5353 +cp737 +cp813 +cp858 +cp874 +cp912 +cp914 +cp915 +cp920 +cp923 +csibm862 +csisolatin0 +csisolatin9 +cspcp855 +default +ibm-437 +ibm-737 +ibm-775 +ibm-813 +ibm-819 +ibm-850 +ibm-852 +ibm-855 +ibm-857 +ibm-862 +ibm-866 +ibm-874 +ibm-912 +ibm-914 +ibm-915 +ibm-920 +ibm-923 +ibm737 +ibm813 +ibm874 +ibm912 +ibm914 +ibm915 +ibm920 +ibm923 +iso8859-1 +iso8859-13 +iso8859-15 +iso8859-2 +iso8859-4 +iso8859-5 +iso8859-7 +iso8859-9 +iso8859_1 +iso8859_13 +iso8859_15 +iso8859_15_fdis +iso8859_2 +iso8859_4 +iso8859_5 +iso8859_7 +iso8859_9 +iso_8859-13 +iso_8859_1 +koi8 +koi8_r +koi8_u +l9 +latin0 +latin9 +sun_eu_greek +unicode +unicode-1-1-utf-8 +unicodebig +unicodebigunmarked +unicodelittle +unicodelittleunmarked +utf-32be-bom +utf-32le-bom +utf16 +utf32 +utf8 +utf_16 +utf_16be +utf_16le +utf_32 +utf_32be +utf_32be_bom +utf_32le +utf_32le_bom +windows-437 +x-utf-16be +x-utf-16le +x-utf-32be +x-utf-32le \ No newline at end of file diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 6f2eb1f..ab8e314 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1385,4 +1385,14 @@ public class HtmlParserTest extends TikaTest { } } } + + @Test + public void testCharsetsNotSupportedByIANA() throws Exception { + assertContains("This is a sample text", + getXML("testHTML_charset_utf8.html").xml); + + assertContains("This is a sample text", + getXML("testHTML_charset_utf16le.html").xml); + + } } diff --git a/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html new file mode 100644 index 0000000..26cb535 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html differ diff --git a/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html new file mode 100644 index 0000000..1f61f02 --- /dev/null +++ b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html @@ -0,0 +1,8 @@ +<html> + <head> + <title>Title</title> + <meta http-equiv="Content-Type" content="text/html; charset=unicode"> + <style></style> + </head> + <body>This is a sample text</body> +</html> \ No newline at end of file -- To stop receiving notification emails like this one, please contact talli...@apache.org.