Author: kkrugler
Date: Thu Jul 8 17:53:52 2010
New Revision: 961850
URL: http://svn.apache.org/viewvc?rev=961850&view=rev
Log:
TIKA-459: Improve handling for invalid charset names.
- Added CharsetUtils, tests
- Use new CharsetUtils.clean & isSupported from inside HtmlParser.
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
(with props)
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java?rev=961850&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
(added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
Thu Jul 8 17:53:52 2010
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Static helpers for checking and normalizing charset names, which are
+ * frequently malformed when they come from document metadata.
+ */
+public class CharsetUtils {
+
+    /**
+     * Skips leading spaces and double quotes, then captures everything up to
+     * the next space, '&gt;', ',', ';' or double quote as group 1.
+     */
+    private static final Pattern CHARSET_NAME_PATTERN =
+            Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");
+
+    /** Any name ending in "8859-N" (case-insensitive); group 1 is N. */
+    private static final Pattern ISO_NAME_PATTERN =
+            Pattern.compile("(?i).*8859-([\\d]+)");
+
+    /** "cp-N" (case-insensitive); group 1 is N. */
+    private static final Pattern CP_NAME_PATTERN =
+            Pattern.compile("(?i)cp-([\\d]+)");
+
+    /** "winN" or "win-N" (case-insensitive); group 2 is N. */
+    private static final Pattern WIN_NAME_PATTERN =
+            Pattern.compile("(?i)win(|-)([\\d]+)");
+
+    // Common invalid charset names that we can't fix using pattern
+    // matching + heuristics. Keys are lower case; a null value means the
+    // name should be treated as "no charset".
+    private static final Map<String, String> CHARSET_ALIASES = createAliases();
+
+    /**
+     * Builds the alias table. A plain HashMap is used instead of an
+     * anonymous subclass with an instance initializer, so no serialization
+     * warning has to be suppressed.
+     */
+    private static Map<String, String> createAliases() {
+        Map<String, String> aliases = new HashMap<String, String>();
+        aliases.put("none", null);
+        aliases.put("no", null);
+
+        aliases.put("iso-8851-1", "iso-8859-1");
+
+        aliases.put("windows", "windows-1252");
+
+        aliases.put("koi8r", "KOI8-R");
+        return aliases;
+    }
+
+    /**
+     * Safely return whether {@code charsetName} is supported, without
+     * throwing exceptions.
+     *
+     * @param charsetName Name of charset (can be null)
+     * @return true if the character set is supported; false if it is null,
+     *         syntactically illegal, or unknown to this JVM
+     */
+    public static boolean isSupported(String charsetName) {
+        if (charsetName == null) {
+            return false;
+        }
+
+        try {
+            return Charset.isSupported(charsetName);
+        } catch (RuntimeException e) {
+            // Illegal names are reported as IllegalCharsetNameException (an
+            // IllegalArgumentException). Anything else unexpected is also
+            // treated as "not supported" so that this method never throws.
+            return false;
+        }
+    }
+
+    /**
+     * Handle various common charset name errors, and return something
+     * that will be considered valid (and is normalized).
+     *
+     * @param charsetName name of charset to process (can be null)
+     * @return canonical name of the matching charset, or null if the input
+     *         is null, contains no usable name, or cannot be mapped to a
+     *         charset supported by this JVM
+     */
+    public static String clean(String charsetName) {
+        if (charsetName == null) {
+            return null;
+        }
+
+        // Get rid of cruft around names, like <>, trailing commas, etc.
+        Matcher m = CHARSET_NAME_PATTERN.matcher(charsetName);
+        if (!m.matches()) {
+            return null;
+        }
+
+        String result = normalize(m.group(1));
+        if (result == null) {
+            // Aliased to "no charset" (e.g. "none").
+            return null;
+        }
+
+        try {
+            return Charset.forName(result).name();
+        } catch (RuntimeException e) {
+            // Illegal or unsupported name; callers only need to know that
+            // no usable charset could be derived.
+            return null;
+        }
+    }
+
+    /**
+     * Maps a bare (already de-crufted) name through the alias table and the
+     * name heuristics. Returns the input unchanged if nothing applies, or
+     * null if the alias table maps it to "no charset".
+     */
+    private static String normalize(String name) {
+        // Use a fixed locale: with the default locale, e.g. Turkish would
+        // lower-case "ISO" to "\u0131so" and miss the alias table.
+        String key = name.toLowerCase(Locale.ENGLISH);
+        if (CHARSET_ALIASES.containsKey(key)) {
+            // Handle common erroneous charset names.
+            return CHARSET_ALIASES.get(key);
+        }
+
+        // Handle "iso 8859-x" error
+        Matcher m = ISO_NAME_PATTERN.matcher(name);
+        if (m.matches()) {
+            return "iso-8859-" + m.group(1);
+        }
+
+        // Handle "cp-xxx" error
+        m = CP_NAME_PATTERN.matcher(name);
+        if (m.matches()) {
+            return "cp" + m.group(1);
+        }
+
+        // Handle "winxxx" and "win-xxx" errors
+        m = WIN_NAME_PATTERN.matcher(name);
+        if (m.matches()) {
+            return "windows-" + m.group(2);
+        }
+
+        return name;
+    }
+
+}
Propchange:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java?rev=961850&view=auto
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
(added)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
Thu Jul 8 17:53:52 2010
@@ -0,0 +1,47 @@
+package org.apache.tika.utils;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link CharsetUtils}.
+ */
+public class CharsetUtilsTest extends TestCase {
+
+    /** Asserts that cleaning <code>raw</code> yields <code>expected</code>. */
+    private static void assertCleaned(String expected, String raw) {
+        assertEquals(expected, CharsetUtils.clean(raw));
+    }
+
+    /** Malformed, null and empty names are all reported as unsupported. */
+    public void testInvalidCharset() {
+        String[] malformedNames = {
+            " utf-8",
+            "my charset name",
+            "charset1; charset2",
+            null,
+            ""
+        };
+        for (String name : malformedNames) {
+            assertFalse(CharsetUtils.isSupported(name));
+        }
+    }
+
+    /** A well-formed name is supported only if a charset of that name exists. */
+    public void testValidCharset() {
+        assertTrue(CharsetUtils.isSupported("UTF-8"));
+        assertFalse(CharsetUtils.isSupported("bogus"));
+    }
+
+    /** Surrounding whitespace, quotes and trailing alternatives are stripped. */
+    public void testCleaningCharsetName() {
+        assertCleaned("UTF-8", "utf-8");
+        assertCleaned(null, "");
+        assertCleaned(null, null);
+        assertCleaned("US-ASCII", " us-ascii ");
+        assertCleaned("UTF-8", "\"utf-8\"");
+        assertCleaned("ISO-8859-1", "ISO-8859-1, latin1");
+    }
+
+    /** Known aliases and near-miss spellings are mapped to real charsets. */
+    public void testFunkyNames() {
+        // Placeholders that mean "no charset".
+        assertCleaned(null, "none");
+        assertCleaned(null, "no");
+
+        // Stray markup character after the name.
+        assertCleaned("UTF-8", "utf-8>");
+
+        // ISO-8859 family: typo alias and missing "iso-" prefix.
+        assertCleaned("ISO-8859-1", "iso-8851-1");
+        assertCleaned("ISO-8859-15", "8859-15");
+
+        // Windows code pages in their various spellings.
+        assertCleaned("windows-1251", "cp-1251");
+        assertCleaned("windows-1251", "win1251");
+        assertCleaned("windows-1251", "WIN-1251");
+        assertCleaned("windows-1251", "win-1251");
+        assertCleaned("windows-1252", "Windows");
+
+        assertCleaned("KOI8-R", "koi8r");
+    }
+
+}
Propchange:
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=961850&r1=961849&r2=961850&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Thu Jul 8 17:53:52 2010
@@ -37,6 +37,7 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.utils.CharsetUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -91,21 +92,18 @@ public class HtmlParser implements Parse
for (String attr : attrs) {
String[] keyValue = attr.trim().split("=");
if ((keyValue.length == 2) &&
keyValue[0].equalsIgnoreCase("charset")) {
- String charset = keyValue[1];
- try {
- if (Charset.isSupported(charset)) {
- metadata.set(Metadata.CONTENT_ENCODING,
charset);
- return charset;
- }
- } catch (IllegalCharsetNameException e){
- // Ignore malformed charset names
+ // TIKA-459: improve charset handling.
+ String charset = CharsetUtils.clean(keyValue[1]);
+ if (CharsetUtils.isSupported(charset)) {
+ metadata.set(Metadata.CONTENT_ENCODING, charset);
+ return charset;
}
}
}
}
}
- // No charset in a meta http-equiv tag, see if it's in the passed
content-encoding
+ // No (valid) charset in a meta http-equiv tag, see if it's in the
passed content-encoding
// hint, or the passed content-type hint.
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);