jukka
Mon, 16 Nov 2009 07:50:34 -0800
Author: jukka Date: Mon Nov 16 15:50:07 2009 New Revision: 880815 URL: http://svn.apache.org/viewvc?rev=880815&view=rev Log: TIKA-321: Optimize type detection speed Use the new XmlRootExtractor instead of the old regexp patterns for detecting different types of XML. This is notably faster than before as we need only a single pass over the initial bytes of the document. Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=880815&r1=880814&r2=880815&view=diff ============================================================================== --- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original) +++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Mon Nov 16 15:50:07 2009 @@ -21,7 +21,6 @@ import java.util.Collections; import java.util.SortedSet; import java.util.TreeSet; -import java.util.regex.Pattern; /** * Internet media type. @@ -251,18 +250,6 @@ rootXML.add(new RootXML(this, namespaceURI, localName)); } - boolean matchesXML(byte[] data) { - RootXML xml = null; - String content = new String(data); - for (int i = 0; i < rootXML.size(); i++) { - xml = rootXML.get(i); - if (xml.matches(content)) { - return true; - } - } - return false; - } - boolean matchesXML(String namespaceURI, String localName) { for (RootXML xml : rootXML) { if (xml.matches(namespaceURI, localName)) { @@ -310,7 +297,7 @@ } public boolean matches(byte[] data) { - return matchesXML(data) || matchesMagic(data); + return matchesMagic(data); } /** @@ -319,44 +306,20 @@ */ class RootXML { - private final static int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE - | Pattern.DOTALL | Pattern.MULTILINE; - private MimeType type = null; private String namespaceURI = null; private String localName = null; - private Pattern pattern = null; - RootXML(MimeType type, String namespaceURI, String localName) { - this.type = type; - this.namespaceURI = namespaceURI; - this.localName = localName; if (isEmpty(namespaceURI) && isEmpty(localName)) { throw new IllegalArgumentException( - "Both namespaceURI and localName cannot be null"); - } - String regex = null; - if (isEmpty(namespaceURI)) { - regex = ".*<" + localName + "[^<>]*.*"; - } else if (isEmpty(localName)) { - regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI - + "[\"\']?[^<>]*>.*"; - } else { - regex = ".*<" + localName + "[^<>]*\\p{Space}xmlns=[\"\']?" - + namespaceURI + "[\"\']?[^<>]*>.*"; + "Both namespaceURI and localName cannot be empty"); } - this.pattern = Pattern.compile(regex, PATTERN_FLAGS); - } - - boolean matches(byte[] data) { - return matches(new String(data)); - } - - boolean matches(String data) { - return pattern.matcher(data).matches(); + this.type = type; + this.namespaceURI = namespaceURI; + this.localName = localName; } boolean matches(String namespaceURI, String localName) { @@ -396,9 +359,7 @@ } public String toString() { - return new StringBuffer().append(type.getName()).append(", ") - .append(namespaceURI).append(", ").append(localName) - .toString(); + return type + ", " + namespaceURI + ", " + localName; } } Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=880815&r1=880814&r2=880815&view=diff ============================================================================== --- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original) +++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Mon Nov 16 15:50:07 2009 @@ -215,14 +215,6 @@ throw new IllegalArgumentException("Data is missing"); } - // First, check for XML descriptions (level by level) - // Problem: Regexp matching doesn't work for all XML encodings - for (MimeType type : xmls) { - if (type.matchesXML(data)) { - return type; - } - } - // Then, check for magic bytes MimeType result = null; for (Magic magic : magics) { @@ -232,8 +224,10 @@ } } if (result != null) { - // When detecting generic XML, parse XML to determine the root element - if ("application/xml".equals(result.getName())) { + // When detecting generic XML (or possibly XHTML), + // extract the root element and match it against known types + if ("application/xml".equals(result.getName()) + || "text/html".equals(result.getName())) { QName rootElement = xmlRootExtractor.extractRootElement(data); if (rootElement != null) { for (MimeType type : xmls) { @@ -487,8 +481,9 @@ * @see #getMimeType(String, byte[]) */ public int getMinLength() { - return 1024; - // return minLength; + // This needs to be reasonably large to be able to correctly detect + // things like XML root elements after initial comment and DTDs + return 4 * 1024; } /** Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=880815&r1=880814&r2=880815&view=diff ============================================================================== --- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original) +++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 16 15:50:07 2009 @@ -366,9 +366,7 @@ <mime-type type="application/qsig"/> <mime-type type="application/rdf+xml"> - <root-XML localName="rdf:RDF"/> - <root-XML localName="RDF" - namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/> + <root-XML localName="RDF"/> <sub-class-of type="application/xml"/> <acronym>RDF/XML</acronym> <comment>XML syntax for RDF graphs</comment>