Author: jukka
Date: Tue Jun 2 03:21:17 2009
New Revision: 780925
URL: http://svn.apache.org/viewvc?rev=780925&view=rev
Log:
TIKA-225: [PATCH] Various bugfixes for MIME detection
Improve XmlRootExtractor performance by avoiding repeated parser and factory
instantiation and potential online lookups for DTD references.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java?rev=780925&r1=780924&r2=780925&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
Tue Jun 2 03:21:17 2009
@@ -16,49 +16,59 @@
*/
package org.apache.tika.detect;
-import java.io.InputStream;
+import java.io.ByteArrayInputStream;
+import javax.xml.XMLConstants;
import javax.xml.namespace.QName;
+import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
+import org.apache.tika.sax.OfflineContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
- * Utility class that uses a {@link SAXParser} to determine the namespace URI and local name of
- * the root element of an XML file.
+ * Utility class that uses a {@link SAXParser} to determine
+ * the namespace URI and local name of the root element of an XML file.
*
* @since Apache Tika 0.4
*/
public class XmlRootExtractor {
- public static QName extractRootElement(byte[] data) {
- SAXParserFactory parserFactory = SAXParserFactory.newInstance();
- parserFactory.setNamespaceAware(true);
- parserFactory.setValidating(false);
+ private final SAXParser parser;
+ public XmlRootExtractor() throws SAXException, ParserConfigurationException {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+
+ factory.setNamespaceAware(true);
+ factory.setValidating(false);
+ factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+
+ this.parser = factory.newSAXParser();
+ }
+
+ public QName extractRootElement(byte[] data) {
ExtractorHandler handler = new ExtractorHandler();
try {
- SAXParser parser = parserFactory.newSAXParser();
- InputStream in = new java.io.ByteArrayInputStream(data);
- parser.parse(in, handler);
- } catch (Exception e) {
- //ignore
+ parser.parse(
+ new ByteArrayInputStream(data),
+ new OfflineContentHandler(handler));
+ } catch (Exception ignore) {
}
return handler.rootElement;
}
private static class ExtractorHandler extends DefaultHandler {
- private QName rootElement;
+ private QName rootElement = null;
- /** @inheritDoc */
@Override
- public void startElement(String uri, String localName, String name, Attributes attributes)
+ public void startElement(
+ String uri, String local, String name, Attributes attributes)
throws SAXException {
- this.rootElement = new QName(uri, localName);
+ this.rootElement = new QName(uri, local);
throw new SAXException("Aborting: root element received");
}
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=780925&r1=780924&r2=780925&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
Tue Jun 2 03:21:17 2009
@@ -115,6 +115,8 @@
/** List of all registered rootXML */
private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
+ private final XmlRootExtractor xmlRootExtractor;
+
public MimeTypes() {
root = new MimeType(this, OCTET_STREAM);
text = new MimeType(this, PLAIN_TEXT);
@@ -126,6 +128,13 @@
types.put(root.getName(), root);
types.put(text.getName(), text);
+
+ try {
+ xmlRootExtractor = new XmlRootExtractor();
+ } catch (Exception e) {
+ throw new IllegalStateException(
+ "Unable to create a XmlRootExtractor", e);
+ }
}
/**
@@ -207,7 +216,7 @@
if (result != null) {
// When detecting generic XML, parse XML to determine the root element
if ("application/xml".equals(result.getName())) {
- QName rootElement = XmlRootExtractor.extractRootElement(data);
+ QName rootElement = xmlRootExtractor.extractRootElement(data);
if (rootElement != null) {
for (MimeType type : xmls) {
if (type.matchesXML(