Author: jukka
Date: Mon Nov 16 15:50:07 2009
New Revision: 880815

URL: http://svn.apache.org/viewvc?rev=880815&view=rev
Log:
TIKA-321: Optimize type detection speed

Use the new XmlRootExtractor instead of the old regexp patterns for detecting 
different types of XML. This is notably faster than before as we need only a 
single pass over the initial bytes of the document.

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=880815&r1=880814&r2=880815&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Mon Nov 16 15:50:07 2009
@@ -21,7 +21,6 @@
 import java.util.Collections;
 import java.util.SortedSet;
 import java.util.TreeSet;
-import java.util.regex.Pattern;
 
 /**
  * Internet media type.
@@ -251,18 +250,6 @@
         rootXML.add(new RootXML(this, namespaceURI, localName));
     }
 
-    boolean matchesXML(byte[] data) {
-        RootXML xml = null;
-        String content = new String(data);
-        for (int i = 0; i < rootXML.size(); i++) {
-            xml = rootXML.get(i);            
-            if (xml.matches(content)) {
-                return true;
-            }
-        }
-        return false;
-    }
-
     boolean matchesXML(String namespaceURI, String localName) {
         for (RootXML xml : rootXML) {
             if (xml.matches(namespaceURI, localName)) {
@@ -310,7 +297,7 @@
     }
 
     public boolean matches(byte[] data) {
-        return matchesXML(data) || matchesMagic(data);
+        return matchesMagic(data);
     }
 
     /**
@@ -319,44 +306,20 @@
      */
     class RootXML {
 
-        private final static int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE
-                | Pattern.DOTALL | Pattern.MULTILINE;
-
         private MimeType type = null;
 
         private String namespaceURI = null;
 
         private String localName = null;
 
-        private Pattern pattern = null;
-
         RootXML(MimeType type, String namespaceURI, String localName) {
-            this.type = type;
-            this.namespaceURI = namespaceURI;
-            this.localName = localName;
             if (isEmpty(namespaceURI) && isEmpty(localName)) {
                 throw new IllegalArgumentException(
-                        "Both namespaceURI and localName cannot be null");
-            }
-            String regex = null;
-            if (isEmpty(namespaceURI)) {
-                regex = ".*<" + localName + "[^<>]*.*";
-            } else if (isEmpty(localName)) {
-                regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI
-                        + "[\"\']?[^<>]*>.*";
-            } else {
-                regex = ".*<" + localName + "[^<>]*\\p{Space}xmlns=[\"\']?"
-                        + namespaceURI + "[\"\']?[^<>]*>.*";
+                        "Both namespaceURI and localName cannot be empty");
             }
-            this.pattern = Pattern.compile(regex, PATTERN_FLAGS);
-        }
-
-        boolean matches(byte[] data) {
-            return matches(new String(data));
-        }
-
-        boolean matches(String data) {
-            return pattern.matcher(data).matches();
+            this.type = type;
+            this.namespaceURI = namespaceURI;
+            this.localName = localName;
         }
 
         boolean matches(String namespaceURI, String localName) {
@@ -396,9 +359,7 @@
         }
 
         public String toString() {
-            return new StringBuffer().append(type.getName()).append(", ")
-                    .append(namespaceURI).append(", ").append(localName)
-                    .toString();
+            return type + ", " + namespaceURI + ", " + localName;
         }
     }
 

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=880815&r1=880814&r2=880815&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Mon Nov 16 15:50:07 2009
@@ -215,14 +215,6 @@
             throw new IllegalArgumentException("Data is missing");
         }
 
-        // First, check for XML descriptions (level by level)
-        // Problem: Regexp matching doesn't work for all XML encodings
-        for (MimeType type : xmls) {
-            if (type.matchesXML(data)) {
-                return type;
-            }
-        }
-
         // Then, check for magic bytes
         MimeType result = null;
         for (Magic magic : magics) {
@@ -232,8 +224,10 @@
             }
         }
         if (result != null) {
-            // When detecting generic XML, parse XML to determine the root element
-            if ("application/xml".equals(result.getName())) {
+            // When detecting generic XML (or possibly XHTML),
+            // extract the root element and match it against known types
+            if ("application/xml".equals(result.getName())
+                    || "text/html".equals(result.getName())) {
                 QName rootElement = xmlRootExtractor.extractRootElement(data);
                 if (rootElement != null) {
                     for (MimeType type : xmls) {
@@ -487,8 +481,9 @@
      * @see #getMimeType(String, byte[])
      */
     public int getMinLength() {
-        return 1024;
-        // return minLength;
+        // This needs to be reasonably large to be able to correctly detect
+        // things like XML root elements after initial comment and DTDs
+        return 4 * 1024;
     }
 
     /**

Modified: 
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=880815&r1=880814&r2=880815&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 16 15:50:07 2009
@@ -366,9 +366,7 @@
   <mime-type type="application/qsig"/>
 
   <mime-type type="application/rdf+xml">
-    <root-XML localName="rdf:RDF"/>
-    <root-XML localName="RDF"
-              namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+    <root-XML localName="RDF"/>
     <sub-class-of type="application/xml"/>
     <acronym>RDF/XML</acronym>
     <comment>XML syntax for RDF graphs</comment>


Reply via email to