Author: jukka
Date: Mon Nov 16 15:50:07 2009
New Revision: 880815
URL: http://svn.apache.org/viewvc?rev=880815&view=rev
Log:
TIKA-321: Optimize type detection speed
Use the new XmlRootExtractor instead of the old regexp patterns for detecting
different types of XML. This is notably faster than before as we need only a
single pass over the initial bytes of the document.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=880815&r1=880814&r2=880815&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Mon Nov 16 15:50:07 2009
@@ -21,7 +21,6 @@
import java.util.Collections;
import java.util.SortedSet;
import java.util.TreeSet;
-import java.util.regex.Pattern;
/**
* Internet media type.
@@ -251,18 +250,6 @@
rootXML.add(new RootXML(this, namespaceURI, localName));
}
- boolean matchesXML(byte[] data) {
- RootXML xml = null;
- String content = new String(data);
- for (int i = 0; i < rootXML.size(); i++) {
- xml = rootXML.get(i);
- if (xml.matches(content)) {
- return true;
- }
- }
- return false;
- }
-
boolean matchesXML(String namespaceURI, String localName) {
for (RootXML xml : rootXML) {
if (xml.matches(namespaceURI, localName)) {
@@ -310,7 +297,7 @@
}
public boolean matches(byte[] data) {
- return matchesXML(data) || matchesMagic(data);
+ return matchesMagic(data);
}
/**
@@ -319,44 +306,20 @@
*/
class RootXML {
- private final static int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE
- | Pattern.DOTALL | Pattern.MULTILINE;
-
private MimeType type = null;
private String namespaceURI = null;
private String localName = null;
- private Pattern pattern = null;
-
RootXML(MimeType type, String namespaceURI, String localName) {
- this.type = type;
- this.namespaceURI = namespaceURI;
- this.localName = localName;
if (isEmpty(namespaceURI) && isEmpty(localName)) {
throw new IllegalArgumentException(
- "Both namespaceURI and localName cannot be null");
- }
- String regex = null;
- if (isEmpty(namespaceURI)) {
- regex = ".*<" + localName + "[^<>]*.*";
- } else if (isEmpty(localName)) {
- regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI
- + "[\"\']?[^<>]*>.*";
- } else {
- regex = ".*<" + localName + "[^<>]*\\p{Space}xmlns=[\"\']?"
- + namespaceURI + "[\"\']?[^<>]*>.*";
+ "Both namespaceURI and localName cannot be empty");
}
- this.pattern = Pattern.compile(regex, PATTERN_FLAGS);
- }
-
- boolean matches(byte[] data) {
- return matches(new String(data));
- }
-
- boolean matches(String data) {
- return pattern.matcher(data).matches();
+ this.type = type;
+ this.namespaceURI = namespaceURI;
+ this.localName = localName;
}
boolean matches(String namespaceURI, String localName) {
@@ -396,9 +359,7 @@
}
public String toString() {
- return new StringBuffer().append(type.getName()).append(", ")
- .append(namespaceURI).append(", ").append(localName)
- .toString();
+ return type + ", " + namespaceURI + ", " + localName;
}
}
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=880815&r1=880814&r2=880815&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Mon Nov 16 15:50:07 2009
@@ -215,14 +215,6 @@
throw new IllegalArgumentException("Data is missing");
}
- // First, check for XML descriptions (level by level)
- // Problem: Regexp matching doesn't work for all XML encodings
- for (MimeType type : xmls) {
- if (type.matchesXML(data)) {
- return type;
- }
- }
-
// Then, check for magic bytes
MimeType result = null;
for (Magic magic : magics) {
@@ -232,8 +224,10 @@
}
}
if (result != null) {
- // When detecting generic XML, parse XML to determine the root element
- if ("application/xml".equals(result.getName())) {
+ // When detecting generic XML (or possibly XHTML),
+ // extract the root element and match it against known types
+ if ("application/xml".equals(result.getName())
+ || "text/html".equals(result.getName())) {
QName rootElement = xmlRootExtractor.extractRootElement(data);
if (rootElement != null) {
for (MimeType type : xmls) {
@@ -487,8 +481,9 @@
* @see #getMimeType(String, byte[])
*/
public int getMinLength() {
- return 1024;
- // return minLength;
+ // This needs to be reasonably large to be able to correctly detect
+ // things like XML root elements after initial comment and DTDs
+ return 4 * 1024;
}
/**
Modified:
lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=880815&r1=880814&r2=880815&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 16 15:50:07 2009
@@ -366,9 +366,7 @@
<mime-type type="application/qsig"/>
<mime-type type="application/rdf+xml">
- <root-XML localName="rdf:RDF"/>
- <root-XML localName="RDF"
- namespaceURI="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>
+ <root-XML localName="RDF"/>
<sub-class-of type="application/xml"/>
<acronym>RDF/XML</acronym>
<comment>XML syntax for RDF graphs</comment>