Author: nick
Date: Fri Dec 19 06:21:03 2014
New Revision: 1646626
URL: http://svn.apache.org/r1646626
Log:
TIKA-1445 - Allow you to exclude certain mimetypes from a parser that would
otherwise handle them, in your Tika Config xml
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Fri Dec 19 06:21:03 2014
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -344,22 +345,18 @@ public class TikaConfig {
}
Parser parser = parserClass.newInstance();
- NodeList mimes = node.getElementsByTagName("mime");
- if (mimes.getLength() > 0) {
- Set<MediaType> types = new HashSet<MediaType>();
- for (int j = 0; j < mimes.getLength(); j++) {
- String mime = getText(mimes.item(j));
- MediaType type = MediaType.parse(mime);
- if (type != null) {
- types.add(type);
- } else {
- throw new TikaException(
- "Invalid media type name: " + mime);
- }
- }
- parser = ParserDecorator.withTypes(parser, types);
+ // Is there an explicit list of mime types for this to handle?
+ Set<MediaType> parserTypes =
mediaTypesListFromDomElement(node, "mime");
+ if (! parserTypes.isEmpty()) {
+ parser = ParserDecorator.withTypes(parser, parserTypes);
+ }
+ // Is there an explicit list of mime types this shouldn't
handle?
+ Set<MediaType> parserExclTypes =
mediaTypesListFromDomElement(node, "mime-exclude");
+ if (! parserExclTypes.isEmpty()) {
+ parser = ParserDecorator.withoutTypes(parser,
parserExclTypes);
}
+ // All done with setup
parsers.add(parser);
} catch (ClassNotFoundException e) {
throw new TikaException(
@@ -379,6 +376,26 @@ public class TikaConfig {
return new CompositeParser(registry, parsers);
}
}
+ private static Set<MediaType> mediaTypesListFromDomElement(
+ Element node, String tag)
+ throws TikaException, IOException {
+ NodeList mimes = node.getElementsByTagName(tag);
+ if (mimes.getLength() > 0) {
+ Set<MediaType> types = new HashSet<MediaType>();
+ for (int j = 0; j < mimes.getLength(); j++) {
+ String mime = getText(mimes.item(j));
+ MediaType type = MediaType.parse(mime);
+ if (type != null) {
+ types.add(type);
+ } else {
+ throw new TikaException(
+ "Invalid media type name: " + mime);
+ }
+ }
+ return types;
+ }
+ return Collections.emptySet();
+ }
private static Detector detectorFromDomElement(
Element element, MimeTypes mimeTypes, ServiceLoader loader)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
Fri Dec 19 06:21:03 2014
@@ -140,6 +140,15 @@ public class CompositeParser extends Abs
}
/**
+ * Returns all parsers registered with the Composite Parser,
+ * including ones which may not currently be active.
+ * This won't include the Fallback Parser, if defined
+ */
+ public List<Parser> getAllComponentParsers() {
+ return Collections.unmodifiableList(parsers);
+ }
+
+ /**
* Returns the component parsers.
*
* @return component parsers, keyed by media type
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
Fri Dec 19 06:21:03 2014
@@ -18,6 +18,7 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
+import java.util.HashSet;
import java.util.Set;
import org.apache.tika.exception.TikaException;
@@ -55,6 +56,31 @@ public class ParserDecorator extends Abs
}
};
}
+
+ /**
+ * Decorates the given parser so that it never claims to support
+ * parsing of the given media types, but will work for all others.
+ *
+ * @param parser the parser to be decorated
+ * @param excludeTypes excluded/ignored media types
+ * @return the decorated parser
+ */
+ public static final Parser withoutTypes(
+ Parser parser, final Set<MediaType> excludeTypes) {
+ return new ParserDecorator(parser) {
+ private static final long serialVersionUID = 7979614774021768609L;
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // Get our own, writable copy of the types the parser supports
+ Set<MediaType> parserTypes =
+ new
HashSet<MediaType>(super.getSupportedTypes(context));
+ // Remove anything on our excludes list
+ parserTypes.removeAll(excludeTypes);
+ // Return whatever is left
+ return parserTypes;
+ }
+ };
+ }
/**
* The decorated parser instance.
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
Fri Dec 19 06:21:03 2014
@@ -23,7 +23,12 @@ import java.util.Map;
import org.apache.tika.ResourceLoggingClassLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
@@ -40,7 +45,7 @@ public class TikaConfigTest {
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
@Test
- public void testInvalidParser() throws Exception {
+ public void withInvalidParser() throws Exception {
URL url = TikaConfigTest.class.getResource("TIKA-866-invalid.xml");
System.setProperty("tika.config", url.toExternalForm());
try {
@@ -59,7 +64,8 @@ public class TikaConfigTest {
*
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
- public void testCompositeParser() throws Exception {
+ @Test
+ public void asCompositeParser() throws Exception {
URL url = TikaConfigTest.class.getResource("TIKA-866-composite.xml");
System.setProperty("tika.config", url.toExternalForm());
try {
@@ -77,7 +83,8 @@ public class TikaConfigTest {
*
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
- public void testValidParser() throws Exception {
+ @Test
+ public void onlyValidParser() throws Exception {
URL url = TikaConfigTest.class.getResource("TIKA-866-valid.xml");
System.setProperty("tika.config", url.toExternalForm());
try {
@@ -94,7 +101,8 @@ public class TikaConfigTest {
* that should be used when loading the mimetypes and when
* discovering services
*/
- public void testClassLoaderUsedEverywhere() throws Exception {
+ @Test
+ public void ensureClassLoaderUsedEverywhere() throws Exception {
ResourceLoggingClassLoader customLoader =
new ResourceLoggingClassLoader(getClass().getClassLoader());
TikaConfig config;
@@ -127,4 +135,46 @@ public class TikaConfigTest {
// - Custom Mimetypes
assertNotNull(resources.get("org/apache/tika/mime/custom-mimetypes.xml"));
}
+
+ /**
+ * TIKA-1445 It should be possible to exclude DefaultParser from
+ * certain types, so another parser explicitly listed will take them
+ */
+ @Test
+ public void defaultParserWithExcludes() throws Exception {
+ URL url =
TikaConfigTest.class.getResource("TIKA-1445-default-except.xml");
+ System.setProperty("tika.config", url.toExternalForm());
+ try {
+ TikaConfig config = new TikaConfig();
+
+ CompositeParser cp = (CompositeParser)config.getParser();
+ List<Parser> parsers = cp.getAllComponentParsers();
+ Parser p;
+
+ // Will be the three parsers defined in the xml
+ assertEquals(3, parsers.size());
+
+ // Should have a wrapped DefaultParser, not the main DefaultParser,
+ // as it is excluded from handling certain media types
+ p = parsers.get(0);
+ assertTrue(p.toString(), p instanceof ParserDecorator);
+ assertEquals(DefaultParser.class,
((ParserDecorator)p).getWrappedParser().getClass());
+
+ // Should have two others which claim media types that they
+ // wouldn't otherwise handle
+ p = parsers.get(1);
+ assertTrue(p.toString(), p instanceof ParserDecorator);
+ assertEquals(EmptyParser.class,
((ParserDecorator)p).getWrappedParser().getClass());
+ assertEquals("hello/world",
p.getSupportedTypes(null).iterator().next().toString());
+
+ p = parsers.get(2);
+ assertTrue(p.toString(), p instanceof ParserDecorator);
+ assertEquals(ErrorParser.class,
((ParserDecorator)p).getWrappedParser().getClass());
+ assertEquals("fail/world",
p.getSupportedTypes(null).iterator().next().toString());
+ } catch (TikaException e) {
+ fail("Unexpected TikaException: " + e);
+ } finally {
+ System.clearProperty("tika.config");
+ }
+ }
}
\ No newline at end of file
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml?rev=1646626&view=auto
==============================================================================
---
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
(added)
+++
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
Fri Dec 19 06:21:03 2014
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <mime-exclude>hello/world</mime-exclude>
+ <mime-exclude>fail/world</mime-exclude>
+ </parser>
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>hello/world</mime>
+ </parser>
+ <parser class="org.apache.tika.parser.ErrorParser">
+ <mime>fail/world</mime>
+ </parser>
+ </parsers>
+</properties>