Author: nick
Date: Fri Dec 19 06:21:03 2014
New Revision: 1646626

URL: http://svn.apache.org/r1646626
Log:
TIKA-1445 - Allow you to exclude certain mimetypes from a parser that would 
otherwise handle them, in your Tika Config xml

Added:
    tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Fri Dec 19 06:21:03 2014
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -344,22 +345,18 @@ public class TikaConfig {
                 }
                 Parser parser = parserClass.newInstance();
 
-                NodeList mimes = node.getElementsByTagName("mime");
-                if (mimes.getLength() > 0) {
-                    Set<MediaType> types = new HashSet<MediaType>();
-                    for (int j = 0; j < mimes.getLength(); j++) {
-                        String mime = getText(mimes.item(j));
-                        MediaType type = MediaType.parse(mime);
-                        if (type != null) {
-                            types.add(type);
-                        } else {
-                            throw new TikaException(
-                                    "Invalid media type name: " + mime);
-                        }
-                    }
-                    parser = ParserDecorator.withTypes(parser, types);
+                // Is there an explicit list of mime types for this to handle?
+                Set<MediaType> parserTypes = 
mediaTypesListFromDomElement(node, "mime");
+                if (! parserTypes.isEmpty()) {
+                    parser = ParserDecorator.withTypes(parser, parserTypes);
+                }
+                // Is there an explicit list of mime types this shouldn't handle?
+                Set<MediaType> parserExclTypes = 
mediaTypesListFromDomElement(node, "mime-exclude");
+                if (! parserExclTypes.isEmpty()) {
+                    parser = ParserDecorator.withoutTypes(parser, 
parserExclTypes);
                 }
 
+                // All done with setup
                 parsers.add(parser);
             } catch (ClassNotFoundException e) {
                 throw new TikaException(
@@ -379,6 +376,26 @@ public class TikaConfig {
             return new CompositeParser(registry, parsers);
         }
     }
+    private static Set<MediaType> mediaTypesListFromDomElement(
+            Element node, String tag) 
+            throws TikaException, IOException {
+        NodeList mimes = node.getElementsByTagName(tag);
+        if (mimes.getLength() > 0) {
+            Set<MediaType> types = new HashSet<MediaType>();
+            for (int j = 0; j < mimes.getLength(); j++) {
+                String mime = getText(mimes.item(j));
+                MediaType type = MediaType.parse(mime);
+                if (type != null) {
+                    types.add(type);
+                } else {
+                    throw new TikaException(
+                            "Invalid media type name: " + mime);
+                }
+            }
+            return types;
+        }
+        return Collections.emptySet();
+    }
 
     private static Detector detectorFromDomElement(
           Element element, MimeTypes mimeTypes, ServiceLoader loader)

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java 
Fri Dec 19 06:21:03 2014
@@ -140,6 +140,15 @@ public class CompositeParser extends Abs
     }
 
     /**
+     * Returns all parsers registered with the Composite Parser,
+     *  including ones which may not currently be active.
+     * This won't include the Fallback Parser, if defined
+     */
+    public List<Parser> getAllComponentParsers() {
+        return Collections.unmodifiableList(parsers);
+    }
+    
+    /**
      * Returns the component parsers.
      *
      * @return component parsers, keyed by media type

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java 
(original)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java 
Fri Dec 19 06:21:03 2014
@@ -18,6 +18,7 @@ package org.apache.tika.parser;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
@@ -55,6 +56,31 @@ public class ParserDecorator extends Abs
             }
         };
     }
+
+    /**
+     * Decorates the given parser so that it never claims to support
+     * parsing of the given media types, but will work for all others.
+     *
+     * @param parser the parser to be decorated
+     * @param excludeTypes excluded/ignored media types
+     * @return the decorated parser
+     */
+    public static final Parser withoutTypes(
+            Parser parser, final Set<MediaType> excludeTypes) {
+        return new ParserDecorator(parser) {
+            private static final long serialVersionUID = 7979614774021768609L;
+            @Override
+            public Set<MediaType> getSupportedTypes(ParseContext context) {
+                // Get our own, writable copy of the types the parser supports
+                Set<MediaType> parserTypes = 
+                        new 
HashSet<MediaType>(super.getSupportedTypes(context));
+                // Remove anything on our excludes list
+                parserTypes.removeAll(excludeTypes);
+                // Return whatever is left
+                return parserTypes;
+            }
+        };
+    }
 
     /**
      * The decorated parser instance.

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1646626&r1=1646625&r2=1646626&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
Fri Dec 19 06:21:03 2014
@@ -23,7 +23,12 @@ import java.util.Map;
 import org.apache.tika.ResourceLoggingClassLoader;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
@@ -40,7 +45,7 @@ public class TikaConfigTest {
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
      */
     @Test
-    public void testInvalidParser() throws Exception {
+    public void withInvalidParser() throws Exception {
         URL url = TikaConfigTest.class.getResource("TIKA-866-invalid.xml");
         System.setProperty("tika.config", url.toExternalForm());
         try {
@@ -59,7 +64,8 @@ public class TikaConfigTest {
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
      */
-    public void testCompositeParser() throws Exception {
+    @Test
+    public void asCompositeParser() throws Exception {
         URL url = TikaConfigTest.class.getResource("TIKA-866-composite.xml");
         System.setProperty("tika.config", url.toExternalForm());
         try {
@@ -77,7 +83,8 @@ public class TikaConfigTest {
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
      */
-    public void testValidParser() throws Exception {
+    @Test
+    public void onlyValidParser() throws Exception {
         URL url = TikaConfigTest.class.getResource("TIKA-866-valid.xml");
         System.setProperty("tika.config", url.toExternalForm());
         try {
@@ -94,7 +101,8 @@ public class TikaConfigTest {
      * that should be used when loading the mimetypes and when
      * discovering services
      */
-    public void testClassLoaderUsedEverywhere() throws Exception {
+    @Test
+    public void ensureClassLoaderUsedEverywhere() throws Exception {
         ResourceLoggingClassLoader customLoader = 
                 new ResourceLoggingClassLoader(getClass().getClassLoader());
         TikaConfig config;
@@ -127,4 +135,46 @@ public class TikaConfigTest {
         // - Custom Mimetypes
         
assertNotNull(resources.get("org/apache/tika/mime/custom-mimetypes.xml"));
     }
+    
+    /**
+     * TIKA-1445 It should be possible to exclude DefaultParser from
+     *  certain types, so another parser explicitly listed will take them
+     */
+    @Test
+    public void defaultParserWithExcludes() throws Exception {
+        URL url = 
TikaConfigTest.class.getResource("TIKA-1445-default-except.xml");
+        System.setProperty("tika.config", url.toExternalForm());
+        try {
+            TikaConfig config = new TikaConfig();
+            
+            CompositeParser cp = (CompositeParser)config.getParser();
+            List<Parser> parsers = cp.getAllComponentParsers();
+            Parser p;
+            
+            // Will be the three parsers defined in the xml
+            assertEquals(3, parsers.size());
+            
+            // Should have a wrapped DefaultParser, not the main DefaultParser,
+            //  as it is excluded from handling certain classes
+            p = parsers.get(0);
+            assertTrue(p.toString(), p instanceof ParserDecorator);
+            assertEquals(DefaultParser.class, 
((ParserDecorator)p).getWrappedParser().getClass());
+            
+            // Should have two others which claim things, which they wouldn't
+            //  otherwise handle
+            p = parsers.get(1);
+            assertTrue(p.toString(), p instanceof ParserDecorator);
+            assertEquals(EmptyParser.class, 
((ParserDecorator)p).getWrappedParser().getClass());
+            assertEquals("hello/world", 
p.getSupportedTypes(null).iterator().next().toString());
+            
+            p = parsers.get(2);
+            assertTrue(p.toString(), p instanceof ParserDecorator);
+            assertEquals(ErrorParser.class, 
((ParserDecorator)p).getWrappedParser().getClass());
+            assertEquals("fail/world", 
p.getSupportedTypes(null).iterator().next().toString());
+        } catch (TikaException e) {
+            fail("Unexpected TikaException: " + e);
+        } finally {
+            System.clearProperty("tika.config");
+        }
+    }
 }
\ No newline at end of file

Added: 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml?rev=1646626&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
 (added)
+++ 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1445-default-except.xml
 Fri Dec 19 06:21:03 2014
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <mime-exclude>hello/world</mime-exclude>
+      <mime-exclude>fail/world</mime-exclude>
+    </parser>
+    <parser class="org.apache.tika.parser.EmptyParser">
+      <mime>hello/world</mime>
+    </parser>
+    <parser class="org.apache.tika.parser.ErrorParser">
+      <mime>fail/world</mime>
+    </parser>
+  </parsers>
+</properties>


Reply via email to