Author: jukka
Date: Fri Feb 17 18:23:39 2012
New Revision: 1245692

URL: http://svn.apache.org/viewvc?rev=1245692&view=rev
Log:
TIKA-866: Invalid configuration file causes OutOfMemoryException

Restore ability to use DefaultParser in a configuration file and instead break 
the default configuration loop in getDefaultRegistry().

Also make the <parser> and <detector> elements default to the standard settings 
if a configuration file doesn't explicitly specify any entries (use EmptyParser 
or EmptyDetector to disable that behavior).

Added:
    
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java   
(with props)
    
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml
      - copied, changed from r1245600, 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
    
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
    
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Fri Feb 17 18:23:39 2012
@@ -56,6 +56,20 @@ import org.xml.sax.SAXException;
  */
 public class TikaConfig {
 
+    private static MimeTypes getDefaultMimeTypes() {
+        return MimeTypes.getDefaultMimeTypes();
+    }
+
+    private static Detector getDefaultDetector(
+            MimeTypes types, ClassLoader loader) {
+        return new DefaultDetector(types, loader);
+    }
+
+    private static CompositeParser getDefaultParser(
+            MimeTypes types, ClassLoader loader) {
+        return new DefaultParser(types.getMediaTypeRegistry(), loader);
+    }
+
     private final CompositeParser parser;
     private final Detector detector;
 
@@ -115,9 +129,9 @@ public class TikaConfig {
      */
     public TikaConfig(ClassLoader loader)
             throws MimeTypeException, IOException {
-        this.mimeTypes = MimeTypes.getDefaultMimeTypes();
-        this.detector = new DefaultDetector(mimeTypes, loader);
-        this.parser = new DefaultParser(mimeTypes.getMediaTypeRegistry(), 
loader);
+        this.mimeTypes = getDefaultMimeTypes();
+        this.detector = getDefaultDetector(mimeTypes, loader);
+        this.parser = getDefaultParser(mimeTypes, loader);
     }
 
     /**
@@ -138,43 +152,53 @@ public class TikaConfig {
      * @throws TikaException if problem with MimeTypes or parsing XML config
      */
     public TikaConfig() throws TikaException, IOException {
+        ClassLoader loader = ServiceLoader.getContextClassLoader();
+
         String config = System.getProperty("tika.config");
         if (config == null) {
             config = System.getenv("TIKA_CONFIG");
         }
+
         if (config == null) {
-            this.mimeTypes = MimeTypes.getDefaultMimeTypes();
-            this.parser = new DefaultParser(mimeTypes.getMediaTypeRegistry());
-            this.detector = new DefaultDetector(mimeTypes);
+            this.mimeTypes = getDefaultMimeTypes();
+            this.parser = getDefaultParser(mimeTypes, loader);
+            this.detector = getDefaultDetector(mimeTypes, loader);
         } else {
-            ClassLoader loader = ServiceLoader.getContextClassLoader();
-            InputStream stream;
+            // Locate the given configuration file
+            InputStream stream = null;
             File file = new File(config);
             if (file.isFile()) {
                 stream = new FileInputStream(file);
-            } else {
-                stream = loader.getResourceAsStream(config);
             }
-            if (stream != null) {
+            if (stream == null) {
                 try {
-                    Element element =
-                        getBuilder().parse(stream).getDocumentElement();
-                    this.mimeTypes = typesFromDomElement(element);
-                    this.parser =
-                        parserFromDomElement(element, mimeTypes, loader);
-                    this.detector =
-                       detectorFromDomElement(element, mimeTypes, loader);
-                } catch (SAXException e) {
-                    throw new TikaException(
-                            "Specified Tika configuration has syntax errors: "
-                            + config, e);
-                } finally {
-                    stream.close();
+                    stream = new URL(config).openStream();
+                } catch (IOException ignore) {
                 }
-            } else {
+            }
+            if (stream == null) {
+                stream = loader.getResourceAsStream(config);
+            }
+            if (stream == null) {
                 throw new TikaException(
                         "Specified Tika configuration not found: " + config);
             }
+
+            try {
+                Element element =
+                        getBuilder().parse(stream).getDocumentElement();
+                this.mimeTypes = typesFromDomElement(element);
+                this.parser =
+                        parserFromDomElement(element, mimeTypes, loader);
+                this.detector =
+                        detectorFromDomElement(element, mimeTypes, loader);
+            } catch (SAXException e) {
+                throw new TikaException(
+                        "Specified Tika configuration has syntax errors: "
+                                + config, e);
+            } finally {
+                stream.close();
+            }
         }
     }
 
@@ -271,7 +295,7 @@ public class TikaConfig {
         if (mtr != null && mtr.hasAttribute("resource")) {
             return MimeTypesFactory.create(mtr.getAttribute("resource"));
         } else {
-            return MimeTypes.getDefaultMimeTypes();
+            return getDefaultMimeTypes();
         }
     }
 
@@ -287,12 +311,12 @@ public class TikaConfig {
             try {
                 Class<?> parserClass = Class.forName(name, true, loader);
                 // https://issues.apache.org/jira/browse/TIKA-866
-                if (DefaultParser.class.isAssignableFrom(parserClass)
-                        || 
AutoDetectParser.class.isAssignableFrom(parserClass)) {
+                if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
                     throw new TikaException(
-                            "Composite parsers not supported in <parser>"
-                            + " configuration elements: " + name);
+                            "AutoDetectParser not supported in a <parser>"
+                            + " configuration element: " + name);
                 }
+
                 Object instance = parserClass.newInstance();
                 if (!(instance instanceof Parser)) {
                     throw new TikaException(
@@ -328,7 +352,12 @@ public class TikaConfig {
                         "Unable to instantiate a parser class: " + name, e);
             }
         }
-        return new CompositeParser(mimeTypes.getMediaTypeRegistry(), parsers);
+        if (parsers.isEmpty()) {
+            return getDefaultParser(mimeTypes, loader);
+        } else {
+            MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+            return new CompositeParser(registry, parsers);
+        }
     }
 
     private static Detector detectorFromDomElement(
@@ -360,7 +389,11 @@ public class TikaConfig {
                        "Unable to instantiate a detector class: " + name, e);
            }
        }
-       
-       return new CompositeDetector(mimeTypes.getMediaTypeRegistry(), 
detectors);
+       if (detectors.isEmpty()) {
+           return getDefaultDetector(mimeTypes, loader);
+       } else {
+           MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+           return new CompositeDetector(registry, detectors);
+       }
     }
 }

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java?rev=1245692&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java 
(added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java 
Fri Feb 17 18:23:39 2012
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Dummy detector that returns application/octet-stream for all documents.
+ */
+public class EmptyDetector implements Detector {
+
+    /**
+     * Singleton instance of this class.
+     */
+    public static final EmptyDetector INSTANCE = new EmptyDetector();
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        return MediaType.OCTET_STREAM;
+    }
+
+}

Propchange: 
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java 
Fri Feb 17 18:23:39 2012
@@ -22,8 +22,6 @@ import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
-import org.apache.tika.config.TikaConfig;
-
 /**
  * Registry of known Internet media types.
  */
@@ -39,7 +37,7 @@ public class MediaTypeRegistry implement
      * @return default media type registry
      */
     public static MediaTypeRegistry getDefaultRegistry() {
-        return TikaConfig.getDefaultConfig().getMediaTypeRegistry();
+        return MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
     }
 
     /**

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
Fri Feb 17 18:23:39 2012
@@ -16,32 +16,46 @@
  */
 package org.apache.tika.config;
 
-import java.io.InputStream;
+import java.net.URL;
 
 import junit.framework.TestCase;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.DefaultParser;
 
 public class TikaConfigTest extends TestCase {
 
     /**
-     * Make sure that a configuration file can't reference to composite
-     * parser classes like {@link DefaultParser} in the &lt;parser&gt;
-     * configuration elements.
+     * Make sure that a configuration file can't reference the
+     * {@link AutoDetectParser} class in a &lt;parser&gt; configuration element.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
      */
     public void testInvalidParser() throws Exception {
-        InputStream xml = TikaConfigTest.class.getResourceAsStream(
-                "TIKA-866-invalid.xml");
+        URL url = TikaConfigTest.class.getResource("TIKA-866-invalid.xml");
+        System.setProperty("tika.config", url.toExternalForm());
         try {
-            new TikaConfig(xml);
-            fail("Composite parser class was allowed in <parser>");
+            new TikaConfig();
+            fail("AutoDetectParser allowed in a <parser> element");
         } catch (TikaException expected) {
-            // OK
-        } finally {
-            xml.close();
+        }
+    }
+
+    /**
+     * Make sure that a configuration file can reference also a composite
+     * parser class like {@link DefaultParser} in a &lt;parser&gt;
+     * configuration element.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
+     */
+    public void testCompositeParser() throws Exception {
+        URL url = TikaConfigTest.class.getResource("TIKA-866-composite.xml");
+        System.setProperty("tika.config", url.toExternalForm());
+        try {
+            new TikaConfig();
+        } catch (TikaException e) {
+            fail("Unexpected TikaException: " + e);
         }
     }
 
@@ -52,13 +66,12 @@ public class TikaConfigTest extends Test
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
      */
     public void testValidParser() throws Exception {
-        InputStream xml = TikaConfigTest.class.getResourceAsStream(
-                "TIKA-866-valid.xml");
+        URL url = TikaConfigTest.class.getResource("TIKA-866-valid.xml");
+        System.setProperty("tika.config", url.toExternalForm());
         try {
-            new TikaConfig(xml);
-            // OK
-        } finally {
-            xml.close();
+            new TikaConfig();
+        } catch (TikaException e) {
+            fail("Unexpected TikaException: " + e);
         }
     }
 

Copied: 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml
 (from r1245600, 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml)
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml?p2=tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml&p1=tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml&r1=1245600&r2=1245692&rev=1245692&view=diff
==============================================================================
    (empty)

Modified: 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
 (original)
+++ 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
 Fri Feb 17 18:23:39 2012
@@ -17,6 +17,6 @@
 -->
 <properties>
   <parsers>
-    <parser class="org.apache.tika.parser.DefaultParser"/>
+    <parser class="org.apache.tika.parser.AutoDetectParser"/>
   </parsers>
 </properties>


Reply via email to