Author: jukka
Date: Tue Jun  2 03:21:17 2009
New Revision: 780925

URL: http://svn.apache.org/viewvc?rev=780925&view=rev
Log:
TIKA-225: [PATCH] Various bugfixes for MIME detection

Improve XmlRootExtractor performance by avoiding repeated parser and factory 
instantiation and potential online lookups for DTD references.

Modified:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java?rev=780925&r1=780924&r2=780925&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
 Tue Jun  2 03:21:17 2009
@@ -16,49 +16,59 @@
  */
 package org.apache.tika.detect;
 
-import java.io.InputStream;
+import java.io.ByteArrayInputStream;
 
+import javax.xml.XMLConstants;
 import javax.xml.namespace.QName;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 
+import org.apache.tika.sax.OfflineContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * Utility class that uses a {@link SAXParser} to determine the namespace URI and local name of
- * the root element of an XML file.
+ * Utility class that uses a {@link SAXParser} to determine
+ * the namespace URI and local name of the root element of an XML file.
  *
  * @since Apache Tika 0.4
  */
 public class XmlRootExtractor {
 
-    public static QName extractRootElement(byte[] data) {
-        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
-        parserFactory.setNamespaceAware(true);
-        parserFactory.setValidating(false);
+    private final SAXParser parser;
 
+    public XmlRootExtractor() throws SAXException, ParserConfigurationException {
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+
+        factory.setNamespaceAware(true);
+        factory.setValidating(false);
+        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+
+        this.parser = factory.newSAXParser();
+    }
+
+    public QName extractRootElement(byte[] data) {
         ExtractorHandler handler = new ExtractorHandler();
         try {
-            SAXParser parser = parserFactory.newSAXParser();
-            InputStream in = new java.io.ByteArrayInputStream(data);
-            parser.parse(in, handler);
-        } catch (Exception e) {
-            //ignore
+            parser.parse(
+                    new ByteArrayInputStream(data),
+                    new OfflineContentHandler(handler));
+        } catch (Exception ignore) {
         }
         return handler.rootElement;
     }
 
     private static class ExtractorHandler extends DefaultHandler {
 
-        private QName rootElement;
+        private QName rootElement = null;
 
-        /** @inheritDoc */
         @Override
-        public void startElement(String uri, String localName, String name, Attributes attributes)
+        public void startElement(
+                String uri, String local, String name, Attributes attributes)
                 throws SAXException {
-            this.rootElement = new QName(uri, localName);
+            this.rootElement = new QName(uri, local);
             throw new SAXException("Aborting: root element received");
         }
 

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=780925&r1=780924&r2=780925&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java 
(original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java 
Tue Jun  2 03:21:17 2009
@@ -115,6 +115,8 @@
     /** List of all registered rootXML */
     private SortedSet<MimeType> xmls = new TreeSet<MimeType>();
 
+    private final XmlRootExtractor xmlRootExtractor;
+
     public MimeTypes() {
         root = new MimeType(this, OCTET_STREAM);
         text = new MimeType(this, PLAIN_TEXT);
@@ -126,6 +128,13 @@
 
         types.put(root.getName(), root);
         types.put(text.getName(), text);
+
+        try {
+            xmlRootExtractor = new XmlRootExtractor();
+        } catch (Exception e) {
+            throw new IllegalStateException(
+                    "Unable to create a XmlRootExtractor", e);
+        }
     }
 
     /**
@@ -207,7 +216,7 @@
         if (result != null) {
             // When detecting generic XML, parse XML to determine the root element
             if ("application/xml".equals(result.getName())) {
-                QName rootElement = XmlRootExtractor.extractRootElement(data);
+                QName rootElement = xmlRootExtractor.extractRootElement(data);
                 if (rootElement != null) {
                     for (MimeType type : xmls) {
                         if (type.matchesXML(


Reply via email to