Author: jukka
Date: Wed Dec 16 00:10:24 2009
New Revision: 891078
URL: http://svn.apache.org/viewvc?rev=891078&view=rev
Log:
TIKA-351: MediaType.parse should be more forgiving of broken input
Patch by Ken Krugler.
Added:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java
- copied unchanged from r890491,
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MediaTypeTest.java
Removed:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MediaTypeTest.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=891078&r1=891077&r2=891078&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
Wed Dec 16 00:10:24 2009
@@ -16,12 +16,12 @@
*/
package org.apache.tika.mime;
-import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@@ -38,6 +38,16 @@
private static final Pattern SPECIAL_OR_WHITESPACE =
Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
+ // TIKA-350: handle charset as first element in content-type
+ // See http://www.ietf.org/rfc/rfc2045.txt (section 5.1, "token" and "tspecials") for valid mime-type characters.
+ private static final String VALID_MIMETYPE_CHARS =
"[^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]";
+ private static final String MIME_TYPE_PATTERN_STRING = "(" +
VALID_MIMETYPE_CHARS + "+)"
+ + "\\s*/\\s*" + "(" + VALID_MIMETYPE_CHARS + "+)";
+ private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
+ "(?is)\\s*" + MIME_TYPE_PATTERN_STRING + "\\s*($|;.*)");
+ private static final Pattern CONTENT_TYPE_CHARSET_FIRST_PATTERN =
Pattern.compile(
+ "(?i)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" +
MIME_TYPE_PATTERN_STRING);
+
public static final MediaType OCTET_STREAM =
new MediaType("application", "octet-stream", NO_PARAMETERS);
@@ -49,46 +59,49 @@
/**
* Parses the given string to a media type. The string is expected to be of
- * the form "type/subtype(; parameter=...)*" as defined in RFC 2045.
+ * the form "type/subtype(; parameter=...)*" as defined in RFC 2045. The
+ * non-standard form "charset=xxx; type/subtype" sent by broken web servers is also accepted.
*
* @param string
* media type string to be parsed
* @return parsed media type, or <code>null</code> if parsing fails
*/
public static MediaType parse(String string) {
- int colon = string.indexOf(';');
- if (colon != -1 && colon != string.length()-1) {
- String primarySubString = string.substring(0, colon);
- String parameters = string
- .substring(colon + 1, string.length());
-
- MediaType type = parseNoParams(primarySubString);
- String[] paramBases = parameters.split(";");
- for (int i = 0; i < paramBases.length; i++) {
- String[] paramToks = paramBases[i].split("=");
- String paramName = paramToks[0].trim();
- String paramValue = paramToks[1].trim();
- type.parameters.put(paramName, paramValue);
+ String type;
+ String subtype;
+ String params;
+
+ Matcher m = CONTENT_TYPE_PATTERN.matcher(string);
+ if (m.matches()) {
+ type = m.group(1);
+ subtype = m.group(2);
+ params = m.group(3);
+ } else {
+ m = CONTENT_TYPE_CHARSET_FIRST_PATTERN.matcher(string);
+ if (m.matches()) {
+ params = m.group(1);
+ type = m.group(2);
+ subtype = m.group(3);
+ } else {
+ return null;
}
-
- return type;
-
- } else
- return parseNoParams(string);
-
- }
-
- private static MediaType parseNoParams(String string) {
- int slash = string.indexOf('/');
- if (slash != -1) {
- String type = string.substring(0, slash).trim();
- String subtype = string.substring(slash + 1).trim();
- if (type.length() > 0 && subtype.length() > 0) {
- return new MediaType(type, subtype);
+ }
+
+ MediaType result = new MediaType(type, subtype);
+ String[] paramPieces = params.split(";");
+ for (String paramPiece : paramPieces) {
+ String[] keyValue = paramPiece.split("=");
+ if (keyValue.length != 2) {
+ continue;
+ }
+
+ String key = keyValue[0].trim();
+ if (key.length() > 0) {
+ result.parameters.put(key, keyValue[1].trim());
}
}
-
- return null;
+
+ return result;
}
private final String type;