Author: jukka
Date: Wed Dec 16 00:10:24 2009
New Revision: 891078

URL: http://svn.apache.org/viewvc?rev=891078&view=rev
Log:
TIKA-351: MediaType.parse should be more forgiving of broken input

Patch by Ken Krugler.

Added:
    
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java
      - copied unchanged from r890491, 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MediaTypeTest.java
Removed:
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MediaTypeTest.java
Modified:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=891078&r1=891077&r2=891078&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java 
(original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java 
Wed Dec 16 00:10:24 2009
@@ -16,12 +16,12 @@
  */
 package org.apache.tika.mime;
 
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.TreeMap;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
@@ -38,6 +38,16 @@
     private static final Pattern SPECIAL_OR_WHITESPACE =
         Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
 
+    // TIKA-350: handle charset as first element in content-type
+    // See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
+    private static final String VALID_MIMETYPE_CHARS = 
"[^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]";
+    private static final String MIME_TYPE_PATTERN_STRING = "(" + 
VALID_MIMETYPE_CHARS + "+)"
+                    + "\\s*/\\s*" + "(" + VALID_MIMETYPE_CHARS + "+)";
+    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
+                    "(?is)\\s*" + MIME_TYPE_PATTERN_STRING + "\\s*($|;.*)");
+    private static final Pattern CONTENT_TYPE_CHARSET_FIRST_PATTERN = 
Pattern.compile(
+                    "(?i)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + 
MIME_TYPE_PATTERN_STRING);
+
     public static final MediaType OCTET_STREAM =
         new MediaType("application", "octet-stream", NO_PARAMETERS);
 
@@ -49,46 +59,49 @@
 
     /**
      * Parses the given string to a media type. The string is expected to be of
-     * the form "type/subtype(; parameter=...)*" as defined in RFC 2045.
+     * the form "type/subtype(; parameter=...)*" as defined in RFC 2045, though
+     * we also handle "charset=xxx; type/subtype" for broken web servers.
      * 
      * @param string
      *            media type string to be parsed
      * @return parsed media type, or <code>null</code> if parsing fails
      */
     public static MediaType parse(String string) {
-        int colon = string.indexOf(';');
-        if (colon != -1 && colon != string.length()-1) {
-            String primarySubString = string.substring(0, colon);
-            String parameters = string
-                    .substring(colon + 1, string.length());
-
-            MediaType type = parseNoParams(primarySubString);
-            String[] paramBases = parameters.split(";");
-            for (int i = 0; i < paramBases.length; i++) {
-                String[] paramToks = paramBases[i].split("=");
-                String paramName = paramToks[0].trim();
-                String paramValue = paramToks[1].trim();
-                type.parameters.put(paramName, paramValue);
+        String type;
+        String subtype;
+        String params;
+        
+        Matcher m = CONTENT_TYPE_PATTERN.matcher(string);
+        if (m.matches()) {
+            type = m.group(1);
+            subtype = m.group(2);
+            params = m.group(3);
+        } else {
+            m = CONTENT_TYPE_CHARSET_FIRST_PATTERN.matcher(string);
+            if (m.matches()) {
+                params = m.group(1);
+                type = m.group(2);
+                subtype = m.group(3);
+            } else {
+                return null;
             }
-
-            return type;
-
-        } else
-            return parseNoParams(string);
-
-    }
-
-    private static MediaType parseNoParams(String string) {
-        int slash = string.indexOf('/');
-        if (slash != -1) {
-            String type = string.substring(0, slash).trim();
-            String subtype = string.substring(slash + 1).trim();
-            if (type.length() > 0 && subtype.length() > 0) {
-                return new MediaType(type, subtype);
+        }
+        
+        MediaType result = new MediaType(type, subtype);
+        String[] paramPieces = params.split(";");
+        for (String paramPiece : paramPieces) {
+            String[] keyValue = paramPiece.split("=");
+            if (keyValue.length != 2) {
+                continue;
+            }
+            
+            String key = keyValue[0].trim();
+            if (key.length() > 0) {
+                result.parameters.put(key, keyValue[1].trim());
             }
         }
-
-        return null;
+        
+        return result;
     }
 
     private final String type;


Reply via email to