Author: jukka
Date: Fri Nov 11 07:51:12 2011
New Revision: 1200754
URL: http://svn.apache.org/viewvc?rev=1200754&view=rev
Log:
TIKA-780: Optimize loading of the media type registry
Various different optimizations
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Fri Nov 11 07:51:12 2011
@@ -16,6 +16,7 @@
*/
package org.apache.tika.detect;
+import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
@@ -30,6 +31,141 @@ import org.apache.tika.mime.MediaType;
*/
public class MagicDetector implements Detector {
+ public static MagicDetector parse(
+ MediaType mediaType,
+ String type, String offset, String value, String mask) {
+ int start = 0;
+ int end = 0;
+ if (offset != null) {
+ int colon = offset.indexOf(':');
+ if (colon == -1) {
+ start = Integer.parseInt(offset);
+ end = start;
+ } else {
+ start = Integer.parseInt(offset.substring(0, colon));
+ end = Integer.parseInt(offset.substring(colon + 1));
+ }
+ }
+
+ byte[] patternBytes = decodeValue(value, type);
+ byte[] maskBytes = null;
+ if (mask != null) {
+ maskBytes = decodeValue(mask, type);
+ }
+
+ return new MagicDetector(
+ mediaType, patternBytes, maskBytes, start, end);
+ }
+
+ private static byte[] decodeValue(String value, String type) {
+ // Preliminary check
+ if ((value == null) || (type == null)) {
+ return null;
+ }
+
+ byte[] decoded = null;
+ String tmpVal = null;
+ int radix = 8;
+
+ // hex
+ if (value.startsWith("0x")) {
+ tmpVal = value.substring(2);
+ radix = 16;
+ } else {
+ tmpVal = value;
+ radix = 8;
+ }
+
+ if (type.equals("string") || type.equals("unicodeLE")
+ || type.equals("unicodeBE")) {
+ decoded = decodeString(value, type);
+ } else if (type.equals("byte")) {
+ decoded = tmpVal.getBytes();
+ } else if (type.equals("host16") || type.equals("little16")) {
+ int i = Integer.parseInt(tmpVal, radix);
+ decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+ } else if (type.equals("big16")) {
+ int i = Integer.parseInt(tmpVal, radix);
+ decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+ } else if (type.equals("host32") || type.equals("little32")) {
+ long i = Long.parseLong(tmpVal, radix);
+ decoded = new byte[] {
+ (byte) ((i & 0x000000FF)),
+ (byte) ((i & 0x0000FF00) >> 8),
+ (byte) ((i & 0x00FF0000) >> 16),
+ (byte) ((i & 0xFF000000) >> 24) };
+ } else if (type.equals("big32")) {
+ long i = Long.parseLong(tmpVal, radix);
+ decoded = new byte[] {
+ (byte) ((i & 0xFF000000) >> 24),
+ (byte) ((i & 0x00FF0000) >> 16),
+ (byte) ((i & 0x0000FF00) >> 8),
+ (byte) ((i & 0x000000FF)) };
+ }
+ return decoded;
+ }
+
+ private static byte[] decodeString(String value, String type) {
+ if (value.startsWith("0x")) {
+ byte[] vals = new byte[(value.length() - 2) / 2];
+ for (int i = 0; i < vals.length; i++) {
+ vals[i] = (byte)
+ Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
+ }
+ return vals;
+ }
+
+ CharArrayWriter decoded = new CharArrayWriter();
+
+ for (int i = 0; i < value.length(); i++) {
+ if (value.charAt(i) == '\\') {
+ if (value.charAt(i + 1) == '\\') {
+ decoded.write('\\');
+ i++;
+ } else if (value.charAt(i + 1) == 'x') {
+ decoded.write(Integer.parseInt(
+ value.substring(i + 2, i + 4), 16));
+ i += 3;
+ } else {
+ int j = i + 1;
+ while ((j < i + 4) && (j < value.length())
+ && (Character.isDigit(value.charAt(j)))) {
+ j++;
+ }
+ decoded.write(Short.decode(
+ "0" + value.substring(i + 1, j)).byteValue());
+ i = j - 1;
+ }
+ } else {
+ decoded.write(value.charAt(i));
+ }
+ }
+
+ // Now turn the chars into bytes
+ char[] chars = decoded.toCharArray();
+ byte[] bytes;
+ if ("unicodeLE".equals(type)) {
+ bytes = new byte[chars.length * 2];
+ for (int i = 0; i < chars.length; i++) {
+ bytes[i * 2] = (byte) (chars[i] & 0xff);
+ bytes[i * 2 + 1] = (byte) (chars[i] >> 8);
+ }
+ } else if ("unicodeBE".equals(type)) {
+ bytes = new byte[chars.length * 2];
+ for(int i = 0; i < chars.length; i++) {
+ bytes[i * 2] = (byte) (chars[i] >> 8);
+ bytes[i * 2 + 1] = (byte) (chars[i] & 0xff);
+ }
+ } else {
+ // Copy with truncation
+ bytes = new byte[chars.length];
+ for(int i = 0; i < bytes.length; i++) {
+ bytes[i] = (byte) chars[i];
+ }
+ }
+ return bytes;
+ }
+
/**
* The matching media type. Returned by the
* {@link #detect(InputStream, Metadata)} method if a match is found.
@@ -69,8 +205,6 @@ public class MagicDetector implements De
* starts at this offset.
*/
private final int offsetRangeEnd;
-
- private final String asString;
/**
* Creates a detector for input documents that have the exact given byte
@@ -136,13 +270,6 @@ public class MagicDetector implements De
this.offsetRangeBegin = offsetRangeBegin;
this.offsetRangeEnd = offsetRangeEnd;
-
- // Build the string representation. Needs to be unique, as
- // these get compared. Compute now as may get compared a lot!
- this.asString = "Magic Detection for " + type.toString() +
- " looking for " + pattern.length +
- " bytes = " + this.pattern +
- " mask = " + this.mask;
}
/**
@@ -205,12 +332,20 @@ public class MagicDetector implements De
}
}
+ public int getLength() {
+ return length;
+ }
+
/**
* Returns a string representation of the Detection Rule.
* Should sort nicely by type and details, as we sometimes
* compare these.
*/
public String toString() {
- return asString;
+ // Needs to be unique, as these get compared.
+ return "Magic Detection for " + type +
+ " looking for " + pattern.length +
+ " bytes = " + this.pattern +
+ " mask = " + this.mask;
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java Fri Nov 11 07:51:12 2011
@@ -26,30 +26,27 @@ class Magic implements Clause, Comparabl
private final MimeType type;
- private int priority = 50;
+ private final int priority;
- private Clause clause = null;
+ private final Clause clause;
- Magic(MimeType type) {
+ private final String string;
+
+ Magic(MimeType type, int priority, Clause clause) {
this.type = type;
+ this.priority = priority;
+ this.clause = clause;
+ this.string = "[" + priority + "/" + clause + "]";
}
MimeType getType() {
return type;
}
- void setPriority(int priority) {
- this.priority = priority;
- }
-
int getPriority() {
return priority;
}
- void setClause(Clause clause) {
- this.clause = clause;
- }
-
public boolean eval(byte[] data) {
return clause.eval(data);
}
@@ -59,9 +56,7 @@ class Magic implements Clause, Comparabl
}
public String toString() {
- StringBuffer buf = new StringBuffer();
- buf.append("[").append(priority).append("/").append(clause).append("]");
- return buf.toString();
+ return string;
}
public int compareTo(Magic o) {
@@ -73,7 +68,7 @@ class Magic implements Clause, Comparabl
diff = o.type.compareTo(type);
}
if (diff == 0) {
- diff = o.toString().compareTo(toString());
+ diff = o.string.compareTo(string);
}
return diff;
}
@@ -81,23 +76,13 @@ class Magic implements Clause, Comparabl
public boolean equals(Object o) {
if (o instanceof Magic) {
Magic that = (Magic) o;
-
- if (this.size() != that.size()) {
- return false;
- }
-
- if (!this.type.equals(that.type)) {
- return false;
- }
-
- return this.toString().equals(that.toString());
+ return type.equals(that.type) && string.equals(that.string);
}
-
return false;
}
public int hashCode() {
- return size() ^ type.hashCode() ^ toString().hashCode();
+ return type.hashCode() ^ string.hashCode();
}
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java Fri Nov 11 07:51:12 2011
@@ -27,18 +27,38 @@ import org.apache.tika.metadata.Metadata
*/
class MagicMatch implements Clause {
- private final MagicDetector detector;
+ private final MediaType mediaType;
- private final int length;
+ private final String type;
- MagicMatch(MagicDetector detector, int length) throws MimeTypeException {
- this.detector = detector;
- this.length = length;
+ private final String offset;
+
+ private final String value;
+
+ private final String mask;
+
+ private MagicDetector detector = null;
+
+ MagicMatch(
+ MediaType mediaType,
+ String type, String offset, String value, String mask) {
+ this.mediaType = mediaType;
+ this.type = type;
+ this.offset = offset;
+ this.value = value;
+ this.mask = mask;
+ }
+
+ private synchronized MagicDetector getDetector() {
+ if (detector == null) {
+ detector = MagicDetector.parse(mediaType, type, offset, value, mask);
+ }
+ return detector;
}
public boolean eval(byte[] data) {
try {
- return detector.detect(
+ return getDetector().detect(
new ByteArrayInputStream(data), new Metadata())
!= MediaType.OCTET_STREAM;
} catch (IOException e) {
@@ -48,11 +68,12 @@ class MagicMatch implements Clause {
}
public int size() {
- return length;
+ return getDetector().getLength();
}
public String toString() {
- return detector.toString();
+ return mediaType.toString()
+ + " " + type + " " + offset + " " + value + " " + mask;
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java Fri Nov 11 07:51:12 2011
@@ -45,15 +45,20 @@ public final class MediaType implements
private static final Pattern SPECIAL_OR_WHITESPACE =
Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
+ /**
+ * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
+ */
+ private static final String VALID_CHARS =
+ "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)";
+
+ private static final Pattern TYPE_PATTERN = Pattern.compile(
+ "(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS
+ + "\\s*($|;.*)");
+
// TIKA-350: handle charset as first element in content-type
- // See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
- private static final String VALID_MIMETYPE_CHARS = "[^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]";
- private static final String MIME_TYPE_PATTERN_STRING = "(" + VALID_MIMETYPE_CHARS + "+)"
- + "\\s*/\\s*" + "(" + VALID_MIMETYPE_CHARS + "+)";
- private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
- "(?is)\\s*" + MIME_TYPE_PATTERN_STRING + "\\s*($|;.*)");
- private static final Pattern CONTENT_TYPE_CHARSET_FIRST_PATTERN = Pattern.compile(
- "(?i)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + MIME_TYPE_PATTERN_STRING);
+ private static final Pattern CHARSET_FIRST_PATTERN = Pattern.compile(
+ "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*"
+ + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*");
public static final MediaType OCTET_STREAM = application("octet-stream");
@@ -97,39 +102,78 @@ public final class MediaType implements
return null;
}
- String type;
- String subtype;
- String params;
-
- Matcher m = CONTENT_TYPE_PATTERN.matcher(string);
- if (m.matches()) {
- type = m.group(1);
- subtype = m.group(2);
- params = m.group(3);
- } else {
- m = CONTENT_TYPE_CHARSET_FIRST_PATTERN.matcher(string);
- if (m.matches()) {
- params = m.group(1);
- type = m.group(2);
- subtype = m.group(3);
- } else {
- return null;
+ int slash = string.indexOf('/');
+ if (slash == -1) {
+ return null;
+ }
+
+ // Optimization for the common case
+ String type = string.substring(0, slash);
+ String subtype = string.substring(slash + 1);
+ if (isValidName(type) && isValidName(subtype)) {
+ return new MediaType(type, subtype);
+ }
+
+ Matcher matcher;
+ matcher = TYPE_PATTERN.matcher(string);
+ if (matcher.matches()) {
+ return new MediaType(
+ matcher.group(1), matcher.group(2),
+ parseParameters(matcher.group(3)));
+ }
+ matcher = CHARSET_FIRST_PATTERN.matcher(string);
+ if (matcher.matches()) {
+ return new MediaType(
+ matcher.group(2), matcher.group(3),
+ parseParameters(matcher.group(1)));
+ }
+
+ return null;
+ }
+
+ private static boolean isValidName(String name) {
+ for (int i = 0; i < name.length(); i++) {
+ char c = name.charAt(i);
+ if (c != '-' && c != '+' && c != '.' && c != '_'
+ && !('0' <= c && c <= '9')
+ && !('A' <= c && c <= 'Z')
+ && !('a' <= c && c <= 'z')) {
+ return false;
}
}
+ return name.length() > 0;
+ }
+
+ private static Map<String, String> parseParameters(String string) {
+ if (string.length() == 0) {
+ return NO_PARAMETERS;
+ }
Map<String, String> parameters = new HashMap<String, String>();
- for (String paramPiece : params.split(";")) {
- String[] keyValue = paramPiece.split("=", 2);
- String key = keyValue[0].trim();
+ while (string.length() > 0) {
+ String key = string;
+ String value = "";
+
+ int semicolon = string.indexOf(';');
+ if (semicolon != -1) {
+ key = string.substring(0, semicolon);
+ string = string.substring(semicolon + 1);
+ } else {
+ string = "";
+ }
+
+ int equals = key.indexOf('=');
+ if (equals != -1) {
+ value = key.substring(equals + 1);
+ key = key.substring(0, equals);
+ }
+
+ key = key.trim();
if (key.length() > 0) {
- if (keyValue.length > 1) {
- parameters.put(key, keyValue[1].trim());
- } else {
- parameters.put(key, "");
- }
+ parameters.put(key, value.trim());
}
}
- return new MediaType(type, subtype, parameters);
+ return parameters;
}
private final String type;
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Fri Nov 11 07:51:12 2011
@@ -80,10 +80,10 @@ public final class MimeType implements C
private String description = "";
/** The magics associated to this Mime-Type */
- private final ArrayList<Magic> magics = new ArrayList<Magic>();
+ private List<Magic> magics = null;
/** The root-XML associated to this Mime-Type */
- private final ArrayList<RootXML> rootXML = new ArrayList<RootXML>();
+ private List<RootXML> rootXML = null;
/** The minimum length of data to provides for magic analyzis */
private int minLength = 0;
@@ -92,7 +92,7 @@ public final class MimeType implements C
* All known file extensions of this type, in order of preference
* (best first).
*/
- private final List<String> extensions = new ArrayList<String>();
+ private List<String> extensions = null;
/**
* Creates a media type with the give name and containing media type
@@ -156,34 +156,42 @@ public final class MimeType implements C
* @param localName
*/
void addRootXML(String namespaceURI, String localName) {
+ if (rootXML == null) {
+ rootXML = new ArrayList<RootXML>();
+ }
rootXML.add(new RootXML(this, namespaceURI, localName));
}
boolean matchesXML(String namespaceURI, String localName) {
- for (RootXML xml : rootXML) {
- if (xml.matches(namespaceURI, localName)) {
- return true;
+ if (rootXML != null) {
+ for (RootXML xml : rootXML) {
+ if (xml.matches(namespaceURI, localName)) {
+ return true;
+ }
}
}
return false;
}
boolean hasRootXML() {
- return (rootXML.size() > 0);
+ return rootXML != null;
}
- RootXML[] getRootXMLs() {
- return rootXML.toArray(new RootXML[rootXML.size()]);
- }
-
- Magic[] getMagics() {
- return magics.toArray(new Magic[magics.size()]);
+ List<Magic> getMagics() {
+ if (magics != null) {
+ return magics;
+ } else {
+ return Collections.emptyList();
+ }
}
void addMagic(Magic magic) {
if (magic == null) {
return;
}
+ if (magics == null) {
+ magics = new ArrayList<Magic>();
+ }
magics.add(magic);
}
@@ -192,11 +200,11 @@ public final class MimeType implements C
}
public boolean hasMagic() {
- return (magics.size() > 0);
+ return magics != null;
}
public boolean matchesMagic(byte[] data) {
- for (int i = 0; i < magics.size(); i++) {
+ for (int i = 0; magics != null && i < magics.size(); i++) {
Magic magic = magics.get(i);
if (magic.eval(data)) {
return true;
@@ -330,7 +338,7 @@ public final class MimeType implements C
* @return preferred file extension or empty string
*/
public String getExtension() {
- if (extensions.isEmpty()) {
+ if (extensions == null) {
return "";
} else {
return extensions.get(0);
@@ -344,7 +352,11 @@ public final class MimeType implements C
* @return known extensions in order of preference (best first)
*/
public List<String> getExtensions() {
- return Collections.unmodifiableList(extensions);
+ if (extensions != null) {
+ return Collections.unmodifiableList(extensions);
+ } else {
+ return Collections.emptyList();
+ }
}
/**
@@ -353,6 +365,11 @@ public final class MimeType implements C
* @param extension file extension
*/
void addExtension(String extension) {
+ if (extensions == null) {
+ extensions = Collections.singletonList(extension);
+ } else if (extensions.size() == 1) {
+ extensions = new ArrayList<String>(extensions);
+ }
if (!extensions.contains(extension)) {
extensions.add(extension);
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Nov 11 07:51:12 2011
@@ -24,12 +24,12 @@ import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
import java.util.Locale;
import java.util.Map;
-import java.util.SortedSet;
-import java.util.TreeSet;
import javax.xml.namespace.QName;
@@ -102,11 +102,11 @@ public final class MimeTypes implements
/** The patterns matcher */
private Patterns patterns = new Patterns(registry);
- /** List of all registered magics */
- private SortedSet<Magic> magics = new TreeSet<Magic>();
+ /** Sorted list of all registered magics */
+ private final List<Magic> magics = new ArrayList<Magic>();
- /** List of all registered rootXML */
- private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
+ /** Sorted list of all registered rootXML */
+ private final List<MimeType> xmls = new ArrayList<MimeType>();
public MimeTypes() {
rootMimeType = new MimeType(MediaType.OCTET_STREAM);
@@ -362,7 +362,7 @@ public final class MimeTypes implements
// Update the magics index...
if (type.hasMagic()) {
- magics.addAll(Arrays.asList(type.getMagics()));
+ magics.addAll(type.getMagics());
}
// Update the xml (xmlRoot) index...
@@ -372,6 +372,21 @@ public final class MimeTypes implements
}
/**
+ * Called after all configured types have been loaded.
+ * Initializes the magics and xmls sets.
+ */
+ void init() {
+ for (MimeType type : types.values()) {
+ magics.addAll(type.getMagics());
+ if (type.hasRootXML()) {
+ xmls.add(type);
+ }
+ }
+ Collections.sort(magics);
+ Collections.sort(xmls);
+ }
+
+ /**
* Automatically detects the MIME type of a document based on magic
* markers in the stream prefix and any given metadata hints.
* <p>
@@ -441,23 +456,29 @@ public final class MimeTypes implements
return type;
}
-
+
+ private static MimeTypes DEFAULT_TYPES = null;
+
/**
* Get the default MimeTypes. This includes all the build in
- * mimetypes, and any custom override ones present.
+ * media types, and any custom override ones present.
*
- * @return MimeTypes
- * @throws MimeTypeException
- * @throws IOException
+ * @return MimeTypes default type registry
*/
- public static MimeTypes getDefaultMimeTypes() {
- try {
- return MimeTypesFactory.create("tika-mimetypes.xml", "custom-mimetypes.xml");
- } catch (MimeTypeException e) {
- throw new RuntimeException("Unable to read default mimetypes", e);
- } catch (IOException e) {
- throw new RuntimeException("Unable to read default mimetypes", e);
+ public static synchronized MimeTypes getDefaultMimeTypes() {
+ if (DEFAULT_TYPES == null) {
+ try {
+ DEFAULT_TYPES = MimeTypesFactory.create(
+ "tika-mimetypes.xml", "custom-mimetypes.xml");
+ } catch (MimeTypeException e) {
+ throw new RuntimeException(
+ "Unable to parse the default media type registry", e);
+ } catch (IOException e) {
+ throw new RuntimeException(
+ "Unable to read the default media type registry", e);
+ }
}
+ return DEFAULT_TYPES;
}
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java Fri Nov 11 07:51:12 2011
@@ -46,6 +46,7 @@ public class MimeTypesFactory {
public static MimeTypes create(Document document) throws MimeTypeException
{
MimeTypes mimeTypes = new MimeTypes();
new MimeTypesReader(mimeTypes).read(document);
+ mimeTypes.init();
return mimeTypes;
}
@@ -62,6 +63,7 @@ public class MimeTypesFactory {
for(InputStream inputStream : inputStreams) {
reader.read(inputStream);
}
+ mimeTypes.init();
return mimeTypes;
}
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Fri Nov 11 07:51:12 2011
@@ -16,17 +16,16 @@
*/
package org.apache.tika.mime;
-import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
-import org.apache.tika.detect.MagicDetector;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -184,17 +183,20 @@ final class MimeTypesReader implements M
}
for (Clause clause : readMatches(element, mimeType.getType())) {
- Magic magic = new Magic(mimeType);
- magic.setPriority(priority);
- magic.setClause(clause);
+ Magic magic = new Magic(mimeType, priority, clause);
mimeType.addMagic(magic);
}
}
private List<Clause> readMatches(Element element, MediaType mediaType)
throws MimeTypeException {
- List<Clause> clauses = new ArrayList<Clause>();
NodeList nodes = element.getChildNodes();
- for (int i = 0; i < nodes.getLength(); i++) {
+ int n = nodes.getLength();
+ if (n == 0) {
+ return Collections.emptyList();
+ }
+
+ List<Clause> clauses = new ArrayList<Clause>();
+ for (int i = 0; i < n; i++) {
Node node = nodes.item(i);
if (node.getNodeType() == Node.ELEMENT_NODE) {
Element nodeElement = (Element) node;
@@ -208,9 +210,22 @@ final class MimeTypesReader implements M
/** Read Element named match. */
private Clause readMatch(Element element, MediaType mediaType) throws
MimeTypeException {
+ Clause clause = getMagicClause(element, mediaType);
+
+ List<Clause> subClauses = readMatches(element, mediaType);
+ if (subClauses.size() == 0) {
+ return clause;
+ } else if (subClauses.size() == 1) {
+ return new AndClause(clause, subClauses.get(0));
+ } else {
+ return new AndClause(clause, new OrClause(subClauses));
+ }
+ }
+
+ private Clause getMagicClause(Element element, MediaType mediaType)
+ throws MimeTypeException {
String type = "string";
- int start = 0;
- int end = 0;
+ String offset = null;
String value = null;
String mask = null;
@@ -218,15 +233,7 @@ final class MimeTypesReader implements M
for (int i = 0; i < attrs.getLength(); i++) {
Attr attr = (Attr) attrs.item(i);
if (attr.getName().equals(MATCH_OFFSET_ATTR)) {
- String offset = attr.getValue();
- int colon = offset.indexOf(':');
- if (colon == -1) {
- start = Integer.parseInt(offset);
- end = start;
- } else {
- start = Integer.parseInt(offset.substring(0, colon));
- end = Integer.parseInt(offset.substring(colon + 1));
- }
+ offset = attr.getValue();
} else if (attr.getName().equals(MATCH_TYPE_ATTR)) {
type = attr.getValue();
} else if (attr.getName().equals(MATCH_VALUE_ATTR)) {
@@ -236,151 +243,7 @@ final class MimeTypesReader implements M
}
}
- if (value == null) {
- throw new MimeTypeException("Missing magic byte pattern");
- } else if (start < 0 || end < start) {
- throw new MimeTypeException(
- "Invalid offset range: [" + start + "," + end + "]");
- }
-
- byte[] patternBytes = decodeValue(type, value);
- int length = patternBytes.length;
- byte[] maskBytes = null;
- if (mask != null) {
- maskBytes = decodeValue(type, mask);
- length = Math.max(patternBytes.length, maskBytes.length);
- }
-
- MagicDetector detector = new MagicDetector(
- mediaType, patternBytes, maskBytes, start, end);
- Clause clause = new MagicMatch(detector, length);
-
- List<Clause> subClauses = readMatches(element, mediaType);
- if (subClauses.size() == 0) {
- return clause;
- } else if (subClauses.size() == 1) {
- return new AndClause(clause, subClauses.get(0));
- } else {
- return new AndClause(clause, new OrClause(subClauses));
- }
- }
-
- private byte[] decodeValue(String type, String value)
- throws MimeTypeException {
- // Preliminary check
- if ((value == null) || (type == null)) {
- return null;
- }
-
- byte[] decoded = null;
- String tmpVal = null;
- int radix = 8;
-
- // hex
- if (value.startsWith("0x")) {
- tmpVal = value.substring(2);
- radix = 16;
- } else {
- tmpVal = value;
- radix = 8;
- }
-
- if (type.equals("string") || type.equals("unicodeLE") || type.equals("unicodeBE")) {
- decoded = decodeString(value, type);
-
- } else if (type.equals("byte")) {
- decoded = tmpVal.getBytes();
-
- } else if (type.equals("host16") || type.equals("little16")) {
- int i = Integer.parseInt(tmpVal, radix);
- decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
-
- } else if (type.equals("big16")) {
- int i = Integer.parseInt(tmpVal, radix);
- decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
-
- } else if (type.equals("host32") || type.equals("little32")) {
- long i = Long.parseLong(tmpVal, radix);
- decoded = new byte[] { (byte) ((i & 0x000000FF)),
- (byte) ((i & 0x0000FF00) >> 8),
- (byte) ((i & 0x00FF0000) >> 16),
- (byte) ((i & 0xFF000000) >> 24) };
-
- } else if (type.equals("big32")) {
- long i = Long.parseLong(tmpVal, radix);
- decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
- (byte) ((i & 0x00FF0000) >> 16),
- (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
- }
- return decoded;
- }
-
- private byte[] decodeString(String value, String type) throws MimeTypeException {
- if (value.startsWith("0x")) {
- byte[] vals = new byte[(value.length() - 2) / 2];
- for (int i = 0; i < vals.length; i++) {
- vals[i] = (byte)
- Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
- }
- return vals;
- }
-
- try {
- CharArrayWriter decoded = new CharArrayWriter();
-
- for (int i = 0; i < value.length(); i++) {
- if (value.charAt(i) == '\\') {
- if (value.charAt(i + 1) == '\\') {
- decoded.write('\\');
- i++;
- } else if (value.charAt(i + 1) == 'x') {
- decoded.write(Integer.parseInt(
- value.substring(i + 2, i + 4), 16));
- i += 3;
- } else {
- int j = i + 1;
- while ((j < i + 4) && (j < value.length())
- && (Character.isDigit(value.charAt(j)))) {
- j++;
- }
- decoded.write(Short.decode(
- "0" + value.substring(i + 1, j)).byteValue());
- i = j - 1;
- }
- } else {
- decoded.write(value.charAt(i));
- }
- }
-
- // Now turn the chars into bytes
- char[] chars = decoded.toCharArray();
- byte[] bytes;
- if("unicodeLE".equals(type)) {
- bytes = new byte[chars.length*2];
- for(int i=0; i<chars.length; i++) {
- bytes[i*2] = (byte)(chars[i] & 0xff);
- bytes[i*2+1] = (byte)(chars[i] >> 8);
- }
- }
- else if("unicodeBE".equals(type)) {
- bytes = new byte[chars.length*2];
- for(int i=0; i<chars.length; i++) {
- bytes[i*2] = (byte)(chars[i] >> 8);
- bytes[i*2+1] = (byte)(chars[i] & 0xff);
- }
- }
- else {
- // Copy with truncation
- bytes = new byte[chars.length];
- for(int i=0; i<bytes.length; i++) {
- bytes[i] = (byte)chars[i];
- }
- }
-
- return bytes;
- } catch (NumberFormatException e) {
- throw new MimeTypeException("Invalid string value: " + value, e);
- }
+ return new MagicMatch(mediaType, type, offset, value, mask);
}
/** Read Element named root-XML. */
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java Fri Nov 11 07:51:12 2011
@@ -43,8 +43,7 @@ import org.apache.tika.metadata.Metadata
public class MimeTypesReaderTest extends TestCase {
private MimeTypes mimeTypes;
- private SortedSet<Magic> magics;
- private SortedSet<MimeType> xmls;
+ private List<Magic> magics;
@Override
@SuppressWarnings("unchecked")
@@ -54,11 +53,7 @@ public class MimeTypesReaderTest extends
Field magicsField = mimeTypes.getClass().getDeclaredField("magics");
magicsField.setAccessible(true);
- magics = (SortedSet<Magic>)magicsField.get(mimeTypes);
-
- Field xmlsField = mimeTypes.getClass().getDeclaredField("xmls");
- xmlsField.setAccessible(true);
- xmls = (SortedSet<MimeType>)xmlsField.get(mimeTypes);
+ magics = (List<Magic>)magicsField.get(mimeTypes);
}
public void testHtmlMatches() throws Exception {
@@ -68,8 +63,8 @@ public class MimeTypesReaderTest extends
MimeType html = mimeTypes.forName("text/html");
assertTrue(html.hasMagic());
assertTrue(
- "There should be at least "+minMatches+" HTML matches, found " + html.getMagics().length,
- html.getMagics().length >= minMatches
+ "There should be at least "+minMatches+" HTML matches, found " + html.getMagics().size(),
+ html.getMagics().size() >= minMatches
);
// Check on the overall magics
@@ -93,8 +88,8 @@ public class MimeTypesReaderTest extends
MimeType excel = mimeTypes.forName("application/vnd.ms-excel");
assertTrue(excel.hasMagic());
assertTrue(
- "There should be at least "+minMatches+" Excel matches, found " + excel.getMagics().length,
- excel.getMagics().length >= minMatches
+ "There should be at least "+minMatches+" Excel matches, found " + excel.getMagics().size(),
+ excel.getMagics().size() >= minMatches
);
// Check on the overall magics