Author: jukka
Date: Fri Feb 17 18:23:39 2012
New Revision: 1245692
URL: http://svn.apache.org/viewvc?rev=1245692&view=rev
Log:
TIKA-866: Invalid configuration file causes OutOfMemoryException
Restore ability to use DefaultParser in a configuration file and instead break
the default configuration loop in getDefaultRegistry().
Also make the <parser> and <detector> elements default to the standard settings
if a configuration file doesn't explicitly specify any entries (use EmptyParser
or EmptyDetector to disable that behavior).
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
(with props)
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml
- copied, changed from r1245600,
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Fri Feb 17 18:23:39 2012
@@ -56,6 +56,20 @@ import org.xml.sax.SAXException;
*/
public class TikaConfig {
+ private static MimeTypes getDefaultMimeTypes() {
+ return MimeTypes.getDefaultMimeTypes();
+ }
+
+ private static Detector getDefaultDetector(
+ MimeTypes types, ClassLoader loader) {
+ return new DefaultDetector(types, loader);
+ }
+
+ private static CompositeParser getDefaultParser(
+ MimeTypes types, ClassLoader loader) {
+ return new DefaultParser(types.getMediaTypeRegistry(), loader);
+ }
+
private final CompositeParser parser;
private final Detector detector;
@@ -115,9 +129,9 @@ public class TikaConfig {
*/
public TikaConfig(ClassLoader loader)
throws MimeTypeException, IOException {
- this.mimeTypes = MimeTypes.getDefaultMimeTypes();
- this.detector = new DefaultDetector(mimeTypes, loader);
- this.parser = new DefaultParser(mimeTypes.getMediaTypeRegistry(),
loader);
+ this.mimeTypes = getDefaultMimeTypes();
+ this.detector = getDefaultDetector(mimeTypes, loader);
+ this.parser = getDefaultParser(mimeTypes, loader);
}
/**
@@ -138,43 +152,53 @@ public class TikaConfig {
* @throws TikaException if problem with MimeTypes or parsing XML config
*/
public TikaConfig() throws TikaException, IOException {
+ ClassLoader loader = ServiceLoader.getContextClassLoader();
+
String config = System.getProperty("tika.config");
if (config == null) {
config = System.getenv("TIKA_CONFIG");
}
+
if (config == null) {
- this.mimeTypes = MimeTypes.getDefaultMimeTypes();
- this.parser = new DefaultParser(mimeTypes.getMediaTypeRegistry());
- this.detector = new DefaultDetector(mimeTypes);
+ this.mimeTypes = getDefaultMimeTypes();
+ this.parser = getDefaultParser(mimeTypes, loader);
+ this.detector = getDefaultDetector(mimeTypes, loader);
} else {
- ClassLoader loader = ServiceLoader.getContextClassLoader();
- InputStream stream;
+ // Locate the given configuration file
+ InputStream stream = null;
File file = new File(config);
if (file.isFile()) {
stream = new FileInputStream(file);
- } else {
- stream = loader.getResourceAsStream(config);
}
- if (stream != null) {
+ if (stream == null) {
try {
- Element element =
- getBuilder().parse(stream).getDocumentElement();
- this.mimeTypes = typesFromDomElement(element);
- this.parser =
- parserFromDomElement(element, mimeTypes, loader);
- this.detector =
- detectorFromDomElement(element, mimeTypes, loader);
- } catch (SAXException e) {
- throw new TikaException(
- "Specified Tika configuration has syntax errors: "
- + config, e);
- } finally {
- stream.close();
+ stream = new URL(config).openStream();
+ } catch (IOException ignore) {
}
- } else {
+ }
+ if (stream == null) {
+ stream = loader.getResourceAsStream(config);
+ }
+ if (stream == null) {
throw new TikaException(
"Specified Tika configuration not found: " + config);
}
+
+ try {
+ Element element =
+ getBuilder().parse(stream).getDocumentElement();
+ this.mimeTypes = typesFromDomElement(element);
+ this.parser =
+ parserFromDomElement(element, mimeTypes, loader);
+ this.detector =
+ detectorFromDomElement(element, mimeTypes, loader);
+ } catch (SAXException e) {
+ throw new TikaException(
+ "Specified Tika configuration has syntax errors: "
+ + config, e);
+ } finally {
+ stream.close();
+ }
}
}
@@ -271,7 +295,7 @@ public class TikaConfig {
if (mtr != null && mtr.hasAttribute("resource")) {
return MimeTypesFactory.create(mtr.getAttribute("resource"));
} else {
- return MimeTypes.getDefaultMimeTypes();
+ return getDefaultMimeTypes();
}
}
@@ -287,12 +311,12 @@ public class TikaConfig {
try {
Class<?> parserClass = Class.forName(name, true, loader);
// https://issues.apache.org/jira/browse/TIKA-866
- if (DefaultParser.class.isAssignableFrom(parserClass)
- ||
AutoDetectParser.class.isAssignableFrom(parserClass)) {
+ if (AutoDetectParser.class.isAssignableFrom(parserClass)) {
throw new TikaException(
- "Composite parsers not supported in <parser>"
- + " configuration elements: " + name);
+ "AutoDetectParser not supported in a <parser>"
+ + " configuration element: " + name);
}
+
Object instance = parserClass.newInstance();
if (!(instance instanceof Parser)) {
throw new TikaException(
@@ -328,7 +352,12 @@ public class TikaConfig {
"Unable to instantiate a parser class: " + name, e);
}
}
- return new CompositeParser(mimeTypes.getMediaTypeRegistry(), parsers);
+ if (parsers.isEmpty()) {
+ return getDefaultParser(mimeTypes, loader);
+ } else {
+ MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+ return new CompositeParser(registry, parsers);
+ }
}
private static Detector detectorFromDomElement(
@@ -360,7 +389,11 @@ public class TikaConfig {
"Unable to instantiate a detector class: " + name, e);
}
}
-
- return new CompositeDetector(mimeTypes.getMediaTypeRegistry(),
detectors);
+ if (detectors.isEmpty()) {
+ return getDefaultDetector(mimeTypes, loader);
+ } else {
+ MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
+ return new CompositeDetector(registry, detectors);
+ }
}
}
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java?rev=1245692&view=auto
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
(added)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
Fri Feb 17 18:23:39 2012
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Dummy detector that returns application/octet-stream for all documents.
+ */
+public class EmptyDetector implements Detector {
+
+ /**
+ * Singleton instance of this class.
+ */
+ public static final EmptyDetector INSTANCE = new EmptyDetector();
+
+ public MediaType detect(InputStream input, Metadata metadata)
+ throws IOException {
+ return MediaType.OCTET_STREAM;
+ }
+
+}
Propchange:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
Fri Feb 17 18:23:39 2012
@@ -22,8 +22,6 @@ import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
-import org.apache.tika.config.TikaConfig;
-
/**
* Registry of known Internet media types.
*/
@@ -39,7 +37,7 @@ public class MediaTypeRegistry implement
* @return default media type registry
*/
public static MediaTypeRegistry getDefaultRegistry() {
- return TikaConfig.getDefaultConfig().getMediaTypeRegistry();
+ return MimeTypes.getDefaultMimeTypes().getMediaTypeRegistry();
}
/**
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
Fri Feb 17 18:23:39 2012
@@ -16,32 +16,46 @@
*/
package org.apache.tika.config;
-import java.io.InputStream;
+import java.net.URL;
import junit.framework.TestCase;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
public class TikaConfigTest extends TestCase {
/**
- * Make sure that a configuration file can't reference to composite
- * parser classes like {@link DefaultParser} in the <parser>
- * configuration elements.
+ * Make sure that a configuration file can't reference the
+ * {@link AutoDetectParser} class in a <parser> configuration element.
*
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
public void testInvalidParser() throws Exception {
- InputStream xml = TikaConfigTest.class.getResourceAsStream(
- "TIKA-866-invalid.xml");
+ URL url = TikaConfigTest.class.getResource("TIKA-866-invalid.xml");
+ System.setProperty("tika.config", url.toExternalForm());
try {
- new TikaConfig(xml);
- fail("Composite parser class was allowed in <parser>");
+ new TikaConfig();
+ fail("AutoDetectParser allowed in a <parser> element");
} catch (TikaException expected) {
- // OK
- } finally {
- xml.close();
+ }
+ }
+
+ /**
+ * Make sure that a configuration file can also reference a composite
+ * parser class like {@link DefaultParser} in a <parser>
+ * configuration element.
+ *
+ * @see <a
href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
+ */
+ public void testCompositeParser() throws Exception {
+ URL url = TikaConfigTest.class.getResource("TIKA-866-composite.xml");
+ System.setProperty("tika.config", url.toExternalForm());
+ try {
+ new TikaConfig();
+ } catch (TikaException e) {
+ fail("Unexpected TikaException: " + e);
}
}
@@ -52,13 +66,12 @@ public class TikaConfigTest extends Test
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
public void testValidParser() throws Exception {
- InputStream xml = TikaConfigTest.class.getResourceAsStream(
- "TIKA-866-valid.xml");
+ URL url = TikaConfigTest.class.getResource("TIKA-866-valid.xml");
+ System.setProperty("tika.config", url.toExternalForm());
try {
- new TikaConfig(xml);
- // OK
- } finally {
- xml.close();
+ new TikaConfig();
+ } catch (TikaException e) {
+ fail("Unexpected TikaException: " + e);
}
}
Copied:
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml
(from r1245600,
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml)
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml?p2=tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-composite.xml&p1=tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml&r1=1245600&r2=1245692&rev=1245692&view=diff
==============================================================================
(empty)
Modified:
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml?rev=1245692&r1=1245691&r2=1245692&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
(original)
+++
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-866-invalid.xml
Fri Feb 17 18:23:39 2012
@@ -17,6 +17,6 @@
-->
<properties>
<parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.AutoDetectParser"/>
</parsers>
</properties>