Author: jukka
Date: Fri Nov 11 07:51:12 2011
New Revision: 1200754

URL: http://svn.apache.org/viewvc?rev=1200754&view=rev
Log:
TIKA-780: Optimize loading of the media type registry

Various different optimizations

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java Fri Nov 11 07:51:12 2011
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.detect;
 
+import java.io.CharArrayWriter;
 import java.io.IOException;
 import java.io.InputStream;
 
@@ -30,6 +31,141 @@ import org.apache.tika.mime.MediaType;
  */
 public class MagicDetector implements Detector {
 
+    public static MagicDetector parse(
+            MediaType mediaType,
+            String type, String offset, String value, String mask) {
+        int start = 0;
+        int end = 0;
+        if (offset != null) {
+            int colon = offset.indexOf(':');
+            if (colon == -1) {
+                start = Integer.parseInt(offset);
+                end = start;
+            } else {
+                start = Integer.parseInt(offset.substring(0, colon));
+                end = Integer.parseInt(offset.substring(colon + 1));
+            }
+        }
+
+        byte[] patternBytes = decodeValue(value, type);
+        byte[] maskBytes = null;
+        if (mask != null) {
+            maskBytes = decodeValue(mask, type);
+        }
+
+        return new MagicDetector(
+                mediaType, patternBytes, maskBytes, start, end);
+    }
+
+    private static byte[] decodeValue(String value, String type) {
+        // Preliminary check
+        if ((value == null) || (type == null)) {
+            return null;
+        }
+
+        byte[] decoded = null;
+        String tmpVal = null;
+        int radix = 8;
+
+        // hex
+        if (value.startsWith("0x")) {
+            tmpVal = value.substring(2);
+            radix = 16;
+        } else {
+            tmpVal = value;
+            radix = 8;
+        }
+
+        if (type.equals("string") || type.equals("unicodeLE")
+                || type.equals("unicodeBE")) {
+            decoded = decodeString(value, type);
+        } else if (type.equals("byte")) {
+            decoded = tmpVal.getBytes();
+        } else if (type.equals("host16") || type.equals("little16")) {
+            int i = Integer.parseInt(tmpVal, radix);
+            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+        } else if (type.equals("big16")) {
+            int i = Integer.parseInt(tmpVal, radix);
+            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+        } else if (type.equals("host32") || type.equals("little32")) {
+            long i = Long.parseLong(tmpVal, radix);
+            decoded = new byte[] {
+                    (byte) ((i & 0x000000FF)),
+                    (byte) ((i & 0x0000FF00) >> 8),
+                    (byte) ((i & 0x00FF0000) >> 16),
+                    (byte) ((i & 0xFF000000) >> 24) };
+        } else if (type.equals("big32")) {
+            long i = Long.parseLong(tmpVal, radix);
+            decoded = new byte[] {
+                    (byte) ((i & 0xFF000000) >> 24),
+                    (byte) ((i & 0x00FF0000) >> 16),
+                    (byte) ((i & 0x0000FF00) >> 8),
+                    (byte) ((i & 0x000000FF)) };
+        }
+        return decoded;
+    }
+
+    private static byte[] decodeString(String value, String type) {
+        if (value.startsWith("0x")) {
+            byte[] vals = new byte[(value.length() - 2) / 2];
+            for (int i = 0; i < vals.length; i++) {
+                vals[i] = (byte)
+                Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
+            }
+            return vals;
+        }
+
+        CharArrayWriter decoded = new CharArrayWriter();
+
+        for (int i = 0; i < value.length(); i++) {
+            if (value.charAt(i) == '\\') {
+                if (value.charAt(i + 1) == '\\') {
+                    decoded.write('\\');
+                    i++;
+                } else if (value.charAt(i + 1) == 'x') {
+                    decoded.write(Integer.parseInt(
+                            value.substring(i + 2, i + 4), 16));
+                    i += 3;
+                } else {
+                    int j = i + 1;
+                    while ((j < i + 4) && (j < value.length())
+                            && (Character.isDigit(value.charAt(j)))) {
+                        j++;
+                    }
+                    decoded.write(Short.decode(
+                            "0" + value.substring(i + 1, j)).byteValue());
+                    i = j - 1;
+                }
+            } else {
+                decoded.write(value.charAt(i));
+            }
+        }
+
+        // Now turn the chars into bytes
+        char[] chars = decoded.toCharArray();
+        byte[] bytes;
+        if ("unicodeLE".equals(type)) {
+            bytes = new byte[chars.length * 2];
+            for (int i = 0; i < chars.length; i++) {
+                bytes[i * 2] = (byte) (chars[i] & 0xff);
+                bytes[i * 2 + 1] = (byte) (chars[i] >> 8);
+            }
+        } else if ("unicodeBE".equals(type)) {
+            bytes = new byte[chars.length * 2];
+            for(int i = 0; i < chars.length; i++) {
+                bytes[i * 2] = (byte) (chars[i] >> 8);
+                bytes[i * 2 + 1] = (byte) (chars[i] & 0xff);
+            }
+        } else {
+            // Copy with truncation
+            bytes = new byte[chars.length];
+            for(int i = 0; i < bytes.length; i++) {
+                bytes[i] = (byte) chars[i];
+            }
+        }
+        return bytes;
+    }
+
     /**
      * The matching media type. Returned by the
      * {@link #detect(InputStream, Metadata)} method if a match is found.
@@ -69,8 +205,6 @@ public class MagicDetector implements De
      * starts at this offset.
      */
     private final int offsetRangeEnd;
-    
-    private final String asString;
 
     /**
      * Creates a detector for input documents that have the exact given byte
@@ -136,13 +270,6 @@ public class MagicDetector implements De
 
         this.offsetRangeBegin = offsetRangeBegin;
         this.offsetRangeEnd = offsetRangeEnd;
-        
-        // Build the string representation. Needs to be unique, as
-        //  these get compared. Compute now as may get compared a lot!
-        this.asString = "Magic Detection for " + type.toString() +
-          " looking for " + pattern.length + 
-          " bytes = " + this.pattern + 
-          " mask = " + this.mask;
     }
 
     /**
@@ -205,12 +332,20 @@ public class MagicDetector implements De
         }
     }
 
+    public int getLength() {
+        return length;
+    }
+
     /**
      * Returns a string representation of the Detection Rule.
      * Should sort nicely by type and details, as we sometimes
      *  compare these.
      */
     public String toString() {
-       return asString;
+        // Needs to be unique, as these get compared.
+        return "Magic Detection for " + type +
+                " looking for " + pattern.length + 
+                " bytes = " + this.pattern + 
+                " mask = " + this.mask;
     }
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Magic.java Fri Nov 11 07:51:12 2011
@@ -26,30 +26,27 @@ class Magic implements Clause, Comparabl
 
     private final MimeType type;
 
-    private int priority = 50;
+    private final int priority;
 
-    private Clause clause = null;
+    private final Clause clause;
 
-    Magic(MimeType type) {
+    private final String string;
+
+    Magic(MimeType type, int priority, Clause clause) {
         this.type = type;
+        this.priority = priority;
+        this.clause = clause;
+        this.string = "[" + priority + "/" + clause + "]";
     }
 
     MimeType getType() {
         return type;
     }
 
-    void setPriority(int priority) {
-        this.priority = priority;
-    }
-
     int getPriority() {
         return priority;
     }
 
-    void setClause(Clause clause) {
-        this.clause = clause;
-    }
-
     public boolean eval(byte[] data) {
         return clause.eval(data);
     }
@@ -59,9 +56,7 @@ class Magic implements Clause, Comparabl
     }
 
     public String toString() {
-        StringBuffer buf = new StringBuffer();
-        buf.append("[").append(priority).append("/").append(clause).append("]");
-        return buf.toString();
+        return string;
     }
 
     public int compareTo(Magic o) {
@@ -73,7 +68,7 @@ class Magic implements Clause, Comparabl
             diff = o.type.compareTo(type);
         }
         if (diff == 0) {
-            diff = o.toString().compareTo(toString());
+            diff = o.string.compareTo(string);
         }
         return diff;
     }
@@ -81,23 +76,13 @@ class Magic implements Clause, Comparabl
     public boolean equals(Object o) {
         if (o instanceof Magic) {
             Magic that = (Magic) o;
-
-            if (this.size() != that.size()) {
-                return false;
-            }
-
-            if (!this.type.equals(that.type)) {
-                return false;
-            }
-
-            return this.toString().equals(that.toString());
+            return type.equals(that.type) && string.equals(that.string);
         }
-
         return false;
     }
 
     public int hashCode() {
-        return size() ^ type.hashCode() ^ toString().hashCode();
+        return type.hashCode() ^ string.hashCode();
     }
 
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java Fri Nov 11 07:51:12 2011
@@ -27,18 +27,38 @@ import org.apache.tika.metadata.Metadata
  */
 class MagicMatch implements Clause {
 
-    private final MagicDetector detector;
+    private final MediaType mediaType;
 
-    private final int length;
+    private final String type;
 
-    MagicMatch(MagicDetector detector, int length) throws MimeTypeException {
-        this.detector = detector;
-        this.length = length;
+    private final String offset;
+
+    private final String value;
+
+    private final String mask;
+
+    private MagicDetector detector = null;
+
+    MagicMatch(
+            MediaType mediaType,
+            String type, String offset, String value, String mask) {
+        this.mediaType = mediaType;
+        this.type = type;
+        this.offset = offset;
+        this.value = value;
+        this.mask = mask;
+    }
+
+    private synchronized MagicDetector getDetector() {
+        if (detector == null) {
+            detector = MagicDetector.parse(mediaType, type, offset, value, mask);
+        }
+        return detector;
     }
 
     public boolean eval(byte[] data) {
         try {
-            return detector.detect(
+            return getDetector().detect(
                     new ByteArrayInputStream(data), new Metadata())
                     != MediaType.OCTET_STREAM;
         } catch (IOException e) {
@@ -48,11 +68,12 @@ class MagicMatch implements Clause {
     }
 
     public int size() {
-        return length;
+        return getDetector().getLength();
     }
 
     public String toString() {
-        return detector.toString();
+        return mediaType.toString()
+                + " " + type + " " + offset + " " +  value + " " + mask;
     }
 
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java Fri Nov 11 07:51:12 2011
@@ -45,15 +45,20 @@ public final class MediaType implements 
     private static final Pattern SPECIAL_OR_WHITESPACE =
         Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
 
+    /**
+     * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
+     */
+    private static final String VALID_CHARS =
+            "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)";
+
+    private static final Pattern TYPE_PATTERN = Pattern.compile(
+                    "(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS
+                    + "\\s*($|;.*)");
+
     // TIKA-350: handle charset as first element in content-type
-    // See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
-    private static final String VALID_MIMETYPE_CHARS = "[^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]";
-    private static final String MIME_TYPE_PATTERN_STRING = "(" + VALID_MIMETYPE_CHARS + "+)"
-                    + "\\s*/\\s*" + "(" + VALID_MIMETYPE_CHARS + "+)";
-    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
-                    "(?is)\\s*" + MIME_TYPE_PATTERN_STRING + "\\s*($|;.*)");
-    private static final Pattern CONTENT_TYPE_CHARSET_FIRST_PATTERN = Pattern.compile(
-                    "(?i)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + MIME_TYPE_PATTERN_STRING);
+    private static final Pattern CHARSET_FIRST_PATTERN = Pattern.compile(
+            "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*"
+            + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*");
 
     public static final MediaType OCTET_STREAM = application("octet-stream");
 
@@ -97,39 +102,78 @@ public final class MediaType implements 
             return null;
         }
 
-        String type;
-        String subtype;
-        String params;
-        
-        Matcher m = CONTENT_TYPE_PATTERN.matcher(string);
-        if (m.matches()) {
-            type = m.group(1);
-            subtype = m.group(2);
-            params = m.group(3);
-        } else {
-            m = CONTENT_TYPE_CHARSET_FIRST_PATTERN.matcher(string);
-            if (m.matches()) {
-                params = m.group(1);
-                type = m.group(2);
-                subtype = m.group(3);
-            } else {
-                return null;
+        int slash = string.indexOf('/');
+        if (slash == -1) {
+            return null;
+        }
+
+        // Optimization for the common case
+        String type = string.substring(0, slash);
+        String subtype = string.substring(slash + 1);
+        if (isValidName(type) && isValidName(subtype)) {
+            return new MediaType(type, subtype);
+        }
+
+        Matcher matcher;
+        matcher = TYPE_PATTERN.matcher(string);
+        if (matcher.matches()) {
+            return new MediaType(
+                    matcher.group(1), matcher.group(2),
+                    parseParameters(matcher.group(3)));
+        }
+        matcher = CHARSET_FIRST_PATTERN.matcher(string);
+        if (matcher.matches()) {
+            return new MediaType(
+                    matcher.group(2), matcher.group(3),
+                    parseParameters(matcher.group(1)));
+        }
+
+        return null;
+    }
+
+    private static boolean isValidName(String name) {
+        for (int i = 0; i < name.length(); i++) {
+            char c = name.charAt(i);
+            if (c != '-' && c != '+' && c != '.' && c != '_'
+                    && !('0' <= c && c <= '9')
+                    && !('A' <= c && c <= 'Z')
+                    && !('a' <= c && c <= 'z')) {
+                return false;
             }
         }
+        return name.length() > 0;
+    }
+
+    private static Map<String, String> parseParameters(String string) {
+        if (string.length() == 0) {
+            return NO_PARAMETERS;
+        }
 
         Map<String, String> parameters = new HashMap<String, String>();
-        for (String paramPiece : params.split(";")) {
-            String[] keyValue = paramPiece.split("=", 2);
-            String key = keyValue[0].trim();
+        while (string.length() > 0) {
+            String key = string;
+            String value = "";
+
+            int semicolon = string.indexOf(';');
+            if (semicolon != -1) {
+                key = string.substring(0, semicolon);
+                string = string.substring(semicolon + 1);
+            } else {
+                string = "";
+            }
+
+            int equals = key.indexOf('=');
+            if (equals != -1) {
+                value = key.substring(equals + 1);
+                key = key.substring(0, equals);
+            }
+
+            key = key.trim();
             if (key.length() > 0) {
-                if (keyValue.length > 1) {
-                    parameters.put(key, keyValue[1].trim());
-                } else {
-                    parameters.put(key, "");
-                }
+                parameters.put(key, value.trim());
             }
         }
-        return new MediaType(type, subtype, parameters);
+        return parameters;
     }
 
     private final String type;

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Fri Nov 11 07:51:12 2011
@@ -80,10 +80,10 @@ public final class MimeType implements C
     private String description = "";
 
     /** The magics associated to this Mime-Type */
-    private final ArrayList<Magic> magics = new ArrayList<Magic>();
+    private List<Magic> magics = null;
 
     /** The root-XML associated to this Mime-Type */
-    private final ArrayList<RootXML> rootXML = new ArrayList<RootXML>();
+    private List<RootXML> rootXML = null;
 
     /** The minimum length of data to provides for magic analyzis */
     private int minLength = 0;
@@ -92,7 +92,7 @@ public final class MimeType implements C
      * All known file extensions of this type, in order of preference
      * (best first).
      */
-    private final List<String> extensions = new ArrayList<String>();
+    private List<String> extensions = null;
 
     /**
      * Creates a media type with the give name and containing media type
@@ -156,34 +156,42 @@ public final class MimeType implements C
      * @param localName
      */
     void addRootXML(String namespaceURI, String localName) {
+        if (rootXML == null) {
+            rootXML = new ArrayList<RootXML>();
+        }
         rootXML.add(new RootXML(this, namespaceURI, localName));
     }
 
     boolean matchesXML(String namespaceURI, String localName) {
-        for (RootXML xml : rootXML) {
-            if (xml.matches(namespaceURI, localName)) {
-                return true;
+        if (rootXML != null) {
+            for (RootXML xml : rootXML) {
+                if (xml.matches(namespaceURI, localName)) {
+                    return true;
+                }
             }
         }
         return false;
     }
 
     boolean hasRootXML() {
-        return (rootXML.size() > 0);
+        return rootXML != null;
     }
 
-    RootXML[] getRootXMLs() {
-        return rootXML.toArray(new RootXML[rootXML.size()]);
-    }
-
-    Magic[] getMagics() {
-        return magics.toArray(new Magic[magics.size()]);
+    List<Magic> getMagics() {
+        if (magics != null) {
+            return magics;
+        } else {
+            return Collections.emptyList();
+        }
     }
 
     void addMagic(Magic magic) {
         if (magic == null) {
             return;
         }
+        if (magics == null) {
+            magics = new ArrayList<Magic>();
+        }
         magics.add(magic);
     }
 
@@ -192,11 +200,11 @@ public final class MimeType implements C
     }
 
     public boolean hasMagic() {
-        return (magics.size() > 0);
+        return magics != null;
     }
 
     public boolean matchesMagic(byte[] data) {
-        for (int i = 0; i < magics.size(); i++) {
+        for (int i = 0; magics != null && i < magics.size(); i++) {
             Magic magic = magics.get(i);
             if (magic.eval(data)) {
                 return true;
@@ -330,7 +338,7 @@ public final class MimeType implements C
      * @return preferred file extension or empty string
      */
     public String getExtension() {
-        if (extensions.isEmpty()) {
+        if (extensions == null) {
             return "";
         } else {
             return extensions.get(0);
@@ -344,7 +352,11 @@ public final class MimeType implements C
      * @return known extensions in order of preference (best first)
      */
     public List<String> getExtensions() {
-        return Collections.unmodifiableList(extensions);
+        if (extensions != null) {
+            return Collections.unmodifiableList(extensions);
+        } else {
+            return Collections.emptyList();
+        }
     }
 
     /**
@@ -353,6 +365,11 @@ public final class MimeType implements C
      * @param extension file extension
      */
     void addExtension(String extension) {
+        if (extensions == null) {
+            extensions = Collections.singletonList(extension);
+        } else if (extensions.size() == 1) {
+            extensions = new ArrayList<String>(extensions);
+        }
         if (!extensions.contains(extension)) {
             extensions.add(extension);
         }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Nov 11 07:51:12 2011
@@ -24,12 +24,12 @@ import java.io.InputStream;
 import java.io.Serializable;
 import java.net.URI;
 import java.net.URISyntaxException;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.SortedSet;
-import java.util.TreeSet;
 
 import javax.xml.namespace.QName;
 
@@ -102,11 +102,11 @@ public final class MimeTypes implements 
     /** The patterns matcher */
     private Patterns patterns = new Patterns(registry);
 
-    /** List of all registered magics */
-    private SortedSet<Magic> magics = new TreeSet<Magic>();
+    /** Sorted list of all registered magics */
+    private final List<Magic> magics = new ArrayList<Magic>();
 
-    /** List of all registered rootXML */
-    private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
+    /** Sorted list of all registered rootXML */
+    private final List<MimeType> xmls = new ArrayList<MimeType>();
 
     public MimeTypes() {
         rootMimeType = new MimeType(MediaType.OCTET_STREAM);
@@ -362,7 +362,7 @@ public final class MimeTypes implements 
 
         // Update the magics index...
         if (type.hasMagic()) {
-            magics.addAll(Arrays.asList(type.getMagics()));
+            magics.addAll(type.getMagics());
         }
 
         // Update the xml (xmlRoot) index...
@@ -372,6 +372,21 @@ public final class MimeTypes implements 
     }
 
     /**
+     * Called after all configured types have been loaded.
+     * Initializes the magics and xmls sets.
+     */
+    void init() {
+        for (MimeType type : types.values()) {
+            magics.addAll(type.getMagics());
+            if (type.hasRootXML()) {
+                xmls.add(type);
+            }
+        }
+        Collections.sort(magics);
+        Collections.sort(xmls);
+    }
+
+    /**
      * Automatically detects the MIME type of a document based on magic
      * markers in the stream prefix and any given metadata hints.
      * <p>
@@ -441,23 +456,29 @@ public final class MimeTypes implements 
 
         return type;
     }
-    
+
+    private static MimeTypes DEFAULT_TYPES = null;
+
     /**
      * Get the default MimeTypes. This includes all the build in
-     *  mimetypes, and any custom override ones present. 
+     * media types, and any custom override ones present.
      * 
-     * @return MimeTypes
-     * @throws MimeTypeException
-     * @throws IOException
+     * @return MimeTypes default type registry
      */
-    public static MimeTypes getDefaultMimeTypes() {
-        try {
-            return MimeTypesFactory.create("tika-mimetypes.xml", "custom-mimetypes.xml");
-        } catch (MimeTypeException e) {
-            throw new RuntimeException("Unable to read default mimetypes", e);
-        } catch (IOException e) {
-            throw new RuntimeException("Unable to read default mimetypes", e);
+    public static synchronized MimeTypes getDefaultMimeTypes() {
+        if (DEFAULT_TYPES == null) {
+            try {
+                DEFAULT_TYPES = MimeTypesFactory.create(
+                        "tika-mimetypes.xml", "custom-mimetypes.xml");
+            } catch (MimeTypeException e) {
+                throw new RuntimeException(
+                        "Unable to parse the default media type registry", e);
+            } catch (IOException e) {
+                throw new RuntimeException(
+                        "Unable to read the default media type registry", e);
+            }
         }
+        return DEFAULT_TYPES;
     }
 
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java Fri Nov 11 07:51:12 2011
@@ -46,6 +46,7 @@ public class MimeTypesFactory {
     public static MimeTypes create(Document document) throws MimeTypeException {
         MimeTypes mimeTypes = new MimeTypes();
         new MimeTypesReader(mimeTypes).read(document);
+        mimeTypes.init();
         return mimeTypes;
     }
 
@@ -62,6 +63,7 @@ public class MimeTypesFactory {
         for(InputStream inputStream : inputStreams) {
            reader.read(inputStream);
         }
+        mimeTypes.init();
         return mimeTypes;
     }
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Fri Nov 11 07:51:12 2011
@@ -16,17 +16,16 @@
  */
 package org.apache.tika.mime;
 
-import java.io.CharArrayWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 
-import org.apache.tika.detect.MagicDetector;
 import org.w3c.dom.Attr;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -184,17 +183,20 @@ final class MimeTypesReader implements M
         }
 
         for (Clause clause : readMatches(element, mimeType.getType())) {
-            Magic magic = new Magic(mimeType);
-            magic.setPriority(priority);
-            magic.setClause(clause);
+            Magic magic = new Magic(mimeType, priority, clause);
             mimeType.addMagic(magic);
         }
     }
 
     private List<Clause> readMatches(Element element, MediaType mediaType) throws MimeTypeException {
-        List<Clause> clauses = new ArrayList<Clause>();
         NodeList nodes = element.getChildNodes();
-        for (int i = 0; i < nodes.getLength(); i++) {
+        int n = nodes.getLength();
+        if (n == 0) {
+            return Collections.emptyList();
+        }
+
+        List<Clause> clauses = new ArrayList<Clause>();
+        for (int i = 0; i < n; i++) {
             Node node = nodes.item(i);
             if (node.getNodeType() == Node.ELEMENT_NODE) {
                 Element nodeElement = (Element) node;
@@ -208,9 +210,22 @@ final class MimeTypesReader implements M
 
     /** Read Element named match. */
     private Clause readMatch(Element element, MediaType mediaType) throws MimeTypeException {
+        Clause clause = getMagicClause(element, mediaType);
+
+        List<Clause> subClauses = readMatches(element, mediaType);
+        if (subClauses.size() == 0) {
+            return clause;
+        } else if (subClauses.size() == 1) {
+            return new AndClause(clause, subClauses.get(0));
+        } else {
+            return new AndClause(clause, new OrClause(subClauses));
+        }
+    }
+
+    private Clause getMagicClause(Element element, MediaType mediaType)
+            throws MimeTypeException {
         String type = "string";
-        int start = 0;
-        int end = 0;
+        String offset = null;
         String value = null;
         String mask = null;
 
@@ -218,15 +233,7 @@ final class MimeTypesReader implements M
         for (int i = 0; i < attrs.getLength(); i++) {
             Attr attr = (Attr) attrs.item(i);
             if (attr.getName().equals(MATCH_OFFSET_ATTR)) {
-                String offset = attr.getValue();
-                int colon = offset.indexOf(':');
-                if (colon == -1) {
-                    start = Integer.parseInt(offset);
-                    end = start;
-                } else {
-                    start = Integer.parseInt(offset.substring(0, colon));
-                    end = Integer.parseInt(offset.substring(colon + 1));
-                }
+                offset = attr.getValue();
             } else if (attr.getName().equals(MATCH_TYPE_ATTR)) {
                 type = attr.getValue();
             } else if (attr.getName().equals(MATCH_VALUE_ATTR)) {
@@ -236,151 +243,7 @@ final class MimeTypesReader implements M
             }
         }
 
-        if (value == null) {
-            throw new MimeTypeException("Missing magic byte pattern");
-        } else if (start < 0 || end < start) {
-            throw new MimeTypeException(
-                    "Invalid offset range: [" + start + "," + end + "]");
-        }
-
-        byte[] patternBytes = decodeValue(type, value);
-        int length = patternBytes.length;
-        byte[] maskBytes = null;
-        if (mask != null) {
-            maskBytes = decodeValue(type, mask);
-            length = Math.max(patternBytes.length, maskBytes.length);
-        }
-
-        MagicDetector detector = new MagicDetector(
-                mediaType, patternBytes, maskBytes, start, end);
-        Clause clause = new MagicMatch(detector, length);
-
-        List<Clause> subClauses = readMatches(element, mediaType);
-        if (subClauses.size() == 0) {
-            return clause;
-        } else if (subClauses.size() == 1) {
-            return new AndClause(clause, subClauses.get(0));
-        } else {
-            return new AndClause(clause, new OrClause(subClauses));
-        }
-    }
-
-    private byte[] decodeValue(String type, String value)
-            throws MimeTypeException {
-        // Preliminary check
-        if ((value == null) || (type == null)) {
-            return null;
-        }
-
-        byte[] decoded = null;
-        String tmpVal = null;
-        int radix = 8;
-
-        // hex
-        if (value.startsWith("0x")) {
-            tmpVal = value.substring(2);
-            radix = 16;
-        } else {
-            tmpVal = value;
-            radix = 8;
-        }
-
-        if (type.equals("string") || type.equals("unicodeLE") || type.equals("unicodeBE")) {
-            decoded = decodeString(value, type);
-            
-        } else if (type.equals("byte")) {
-            decoded = tmpVal.getBytes();
-
-        } else if (type.equals("host16") || type.equals("little16")) {
-            int i = Integer.parseInt(tmpVal, radix);
-            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
-
-        } else if (type.equals("big16")) {
-            int i = Integer.parseInt(tmpVal, radix);
-            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
-
-        } else if (type.equals("host32") || type.equals("little32")) {
-            long i = Long.parseLong(tmpVal, radix);
-            decoded = new byte[] { (byte) ((i & 0x000000FF)),
-                    (byte) ((i & 0x0000FF00) >> 8),
-                    (byte) ((i & 0x00FF0000) >> 16),
-                    (byte) ((i & 0xFF000000) >> 24) };
-
-        } else if (type.equals("big32")) {
-            long i = Long.parseLong(tmpVal, radix);
-            decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
-                    (byte) ((i & 0x00FF0000) >> 16),
-                    (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
-        }
-        return decoded;
-    }
-
-    private byte[] decodeString(String value, String type) throws MimeTypeException {
-        if (value.startsWith("0x")) {
-            byte[] vals = new byte[(value.length() - 2) / 2];
-            for (int i = 0; i < vals.length; i++) {
-                vals[i] = (byte)
-                Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
-            }
-            return vals;
-        }
-
-        try {
-            CharArrayWriter decoded = new CharArrayWriter();
-
-            for (int i = 0; i < value.length(); i++) {
-                if (value.charAt(i) == '\\') {
-                    if (value.charAt(i + 1) == '\\') {
-                        decoded.write('\\');
-                        i++;
-                    } else if (value.charAt(i + 1) == 'x') {
-                        decoded.write(Integer.parseInt(
-                                value.substring(i + 2, i + 4), 16));
-                        i += 3;
-                    } else {
-                        int j = i + 1;
-                        while ((j < i + 4) && (j < value.length())
-                                && (Character.isDigit(value.charAt(j)))) {
-                            j++;
-                        }
-                        decoded.write(Short.decode(
-                                "0" + value.substring(i + 1, j)).byteValue());
-                        i = j - 1;
-                    }
-                } else {
-                    decoded.write(value.charAt(i));
-                }
-            }
-            
-            // Now turn the chars into bytes
-            char[] chars = decoded.toCharArray();
-            byte[] bytes;
-            if("unicodeLE".equals(type)) {
-               bytes = new byte[chars.length*2];
-               for(int i=0; i<chars.length; i++) {
-                  bytes[i*2] = (byte)(chars[i] & 0xff);
-                  bytes[i*2+1] = (byte)(chars[i] >> 8);
-               }
-            }
-            else if("unicodeBE".equals(type)) {
-               bytes = new byte[chars.length*2];
-               for(int i=0; i<chars.length; i++) {
-                  bytes[i*2] = (byte)(chars[i] >> 8);
-                  bytes[i*2+1] = (byte)(chars[i] & 0xff);
-               }
-            }
-            else {
-               // Copy with truncation
-               bytes = new byte[chars.length];
-               for(int i=0; i<bytes.length; i++) {
-                  bytes[i] = (byte)chars[i];
-               }
-            }
-            
-            return bytes;
-        } catch (NumberFormatException e) {
-            throw new MimeTypeException("Invalid string value: " + value, e);
-        }
+        return new MagicMatch(mediaType, type, offset, value, mask);
     }
 
     /** Read Element named root-XML. */

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java?rev=1200754&r1=1200753&r2=1200754&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java Fri Nov 11 07:51:12 2011
@@ -43,8 +43,7 @@ import org.apache.tika.metadata.Metadata
 public class MimeTypesReaderTest extends TestCase {
 
     private MimeTypes mimeTypes;
-    private SortedSet<Magic> magics;
-    private SortedSet<MimeType> xmls;
+    private List<Magic> magics;
 
     @Override
     @SuppressWarnings("unchecked")
@@ -54,11 +53,7 @@ public class MimeTypesReaderTest extends
 
         Field magicsField = mimeTypes.getClass().getDeclaredField("magics");
         magicsField.setAccessible(true);
-        magics = (SortedSet<Magic>)magicsField.get(mimeTypes);
-
-        Field xmlsField = mimeTypes.getClass().getDeclaredField("xmls");
-        xmlsField.setAccessible(true);
-        xmls = (SortedSet<MimeType>)xmlsField.get(mimeTypes);
+        magics = (List<Magic>)magicsField.get(mimeTypes);
     }
 
     public void testHtmlMatches() throws Exception {
@@ -68,8 +63,8 @@ public class MimeTypesReaderTest extends
        MimeType html = mimeTypes.forName("text/html");
        assertTrue(html.hasMagic());
        assertTrue(
-             "There should be at least "+minMatches+" HTML matches, found " + html.getMagics().length,
-             html.getMagics().length >= minMatches
+             "There should be at least "+minMatches+" HTML matches, found " + html.getMagics().size(),
+             html.getMagics().size() >= minMatches
        );
 
        // Check on the overall magics
@@ -93,8 +88,8 @@ public class MimeTypesReaderTest extends
        MimeType excel = mimeTypes.forName("application/vnd.ms-excel");
        assertTrue(excel.hasMagic());
        assertTrue(
-             "There should be at least "+minMatches+" Excel matches, found " + excel.getMagics().length,
-             excel.getMagics().length >= minMatches
+             "There should be at least "+minMatches+" Excel matches, found " + excel.getMagics().size(),
+             excel.getMagics().size() >= minMatches
        );
 
        // Check on the overall magics


Reply via email to