HtmlParser.java

kkrugler Thu, 08 Jul 2010 10:54:51 -0700

Author: kkrugler
Date: Thu Jul  8 17:53:52 2010
New Revision: 961850

URL: http://svn.apache.org/viewvc?rev=961850&view=rev
Log:
TIKA-459: Improve handling for invalid charset names.


- Added CharsetUtils, tests
- Use new CharsetUtils.clean & isSupported from inside HtmlParser.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java   (with props)
    tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java?rev=961850&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java Thu Jul  8 17:53:52 2010
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+@SuppressWarnings("serial")
+public class CharsetUtils {
+    private static final Pattern CHARSET_NAME_PATTERN = Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");
+    private static final Pattern ISO_NAME_PATTERN = Pattern.compile("(?i).*8859-([\\d]+)");
+    private static final Pattern CP_NAME_PATTERN = Pattern.compile("(?i)cp-([\\d]+)");
+    private static final Pattern WIN_NAME_PATTERN = Pattern.compile("(?i)win(|-)([\\d]+)");
+    
+    // List of common invalid charset names that we can't fix using
+    // pattern matching + heuristic
+    private static final Map<String, String> CHARSET_ALIASES = new HashMap<String, String>() {{
+        put("none", null);
+        put("no", null);
+        
+        put("iso-8851-1", "iso-8859-1");
+        
+        put("windows", "windows-1252");
+        
+        put("koi8r", "KOI8-R");
+    }};
+    
+    /**
+     * Safely return whether <charsetName> is supported, without throwing exceptions
+     * 
+     * @param charsetName Name of charset (can be null)
+     * @return true if the character set is supported
+     */
+    public static boolean isSupported(String charsetName) {
+        try {
+            return Charset.isSupported(charsetName);
+        } catch (IllegalCharsetNameException e) {
+            return false;
+        } catch (IllegalArgumentException e) {
+            // null, for example
+            return false;
+        } catch (Exception e) {
+            // Unexpected exception, what to do?
+            return false;
+        }
+    }
+    
+    /**
+     * Handle various common charset name errors, and return something
+     * that will be considered valid (and is normalized)
+     * 
+     * @param charsetName name of charset to process
+     * @return potentially remapped/cleaned up version of charset name
+     */
+    public static String clean(String charsetName) {
+        if (charsetName == null) {
+            return null;
+        }
+        
+        // Get rid of cruft around names, like <>, trailing commas, etc.
+        Matcher m = CHARSET_NAME_PATTERN.matcher(charsetName);
+        if (!m.matches()) {
+            return null;
+        }
+
+        String result = m.group(1);
+        if (CHARSET_ALIASES.containsKey(result.toLowerCase())) {
+            // Handle common erroneous charset names.
+            result = CHARSET_ALIASES.get(result.toLowerCase());
+        } else if (ISO_NAME_PATTERN.matcher(result).matches()) {
+            // Handle "iso 8859-x" error
+            m = ISO_NAME_PATTERN.matcher(result);
+            m.matches();
+            result = "iso-8859-" + m.group(1);
+        } else if (CP_NAME_PATTERN.matcher(result).matches()) {
+            // Handle "cp-xxx" error
+            m = CP_NAME_PATTERN.matcher(result);
+            m.matches();
+            result = "cp" + m.group(1);
+        } else if (WIN_NAME_PATTERN.matcher(result).matches()) {
+            // Handle "winxxx" and "win-xxx" errors
+            m = WIN_NAME_PATTERN.matcher(result);
+            m.matches();
+            result = "windows-" + m.group(2);
+        }
+        
+        try {
+            Charset cs = Charset.forName(result);
+            return cs.name();
+        } catch (Exception e) {
+            return null;
+        }
+    }
+
+}

Propchange: 
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java?rev=961850&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java (added)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java Thu Jul  8 17:53:52 2010
@@ -0,0 +1,47 @@
+package org.apache.tika.utils;
+
+import junit.framework.TestCase;
+
+public class CharsetUtilsTest extends TestCase {
+
+    public void testInvalidCharset() {
+        assertFalse(CharsetUtils.isSupported(" utf-8"));
+        assertFalse(CharsetUtils.isSupported("my charset name"));
+        assertFalse(CharsetUtils.isSupported("charset1; charset2"));
+        assertFalse(CharsetUtils.isSupported(null));
+        assertFalse(CharsetUtils.isSupported(""));
+    }
+    
+    public void testValidCharset() {
+        assertTrue(CharsetUtils.isSupported("UTF-8"));
+        assertFalse(CharsetUtils.isSupported("bogus"));
+    }
+    
+    public void testCleaningCharsetName() {
+        assertEquals("UTF-8", CharsetUtils.clean("utf-8"));
+        assertEquals(null, CharsetUtils.clean(""));
+        assertEquals(null, CharsetUtils.clean(null));
+        assertEquals("US-ASCII", CharsetUtils.clean(" us-ascii  "));
+        assertEquals("UTF-8", CharsetUtils.clean("\"utf-8\""));
+        assertEquals("ISO-8859-1", CharsetUtils.clean("ISO-8859-1, latin1"));
+    }
+    
+    public void testFunkyNames() {
+        assertEquals(null, CharsetUtils.clean("none"));
+        assertEquals(null, CharsetUtils.clean("no"));
+        
+        assertEquals("UTF-8", CharsetUtils.clean("utf-8>"));
+        
+        assertEquals("ISO-8859-1", CharsetUtils.clean("iso-8851-1"));
+        assertEquals("ISO-8859-15", CharsetUtils.clean("8859-15"));
+        
+        assertEquals("windows-1251", CharsetUtils.clean("cp-1251"));
+        assertEquals("windows-1251", CharsetUtils.clean("win1251"));
+        assertEquals("windows-1251", CharsetUtils.clean("WIN-1251"));
+        assertEquals("windows-1251", CharsetUtils.clean("win-1251"));
+        assertEquals("windows-1252", CharsetUtils.clean("Windows"));
+        
+        assertEquals("KOI8-R", CharsetUtils.clean("koi8r"));
+    }
+
+}

Propchange: 
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
tika/trunk/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=961850&r1=961849&r2=961850&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Jul  8 17:53:52 2010
@@ -37,6 +37,7 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.utils.CharsetUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -91,21 +92,18 @@ public class HtmlParser implements Parse
                 for (String attr : attrs) {
                     String[] keyValue = attr.trim().split("=");
                     if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
-                       String charset = keyValue[1];
-                       try {
-                               if (Charset.isSupported(charset)) {
-                                       metadata.set(Metadata.CONTENT_ENCODING, charset);
-                                       return charset;
-                               }
-                       } catch (IllegalCharsetNameException e){
-                               // Ignore malformed charset names
+                        // TIKA-459: improve charset handling.
+                       String charset = CharsetUtils.clean(keyValue[1]);
+                       if (CharsetUtils.isSupported(charset)) {
+                           metadata.set(Metadata.CONTENT_ENCODING, charset);
+                           return charset;
                        }
                     }
                 }
             }
         }
 
-        // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
+        // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
         // hint, or the passed content-type hint.
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);

svn commit: r961850 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Reply via email to