Author: nick
Date: Mon Jun  8 14:41:48 2015
New Revision: 1684199

URL: http://svn.apache.org/r1684199
Log:
TIKA-1653 Re-do the XML parsing in the Tika Config, so that a parser tag with 
another inside it doesn't get accidentally duplicated at the top level

Added:
    
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
Mon Jun  8 14:41:48 2015
@@ -332,10 +332,25 @@ public class TikaConfig {
             Element element, MimeTypes mimeTypes, ServiceLoader loader)
             throws TikaException, IOException {
         List<Parser> parsers = new ArrayList<Parser>();
-        NodeList nodes = element.getElementsByTagName("parser");
-        for (int i = 0; i < nodes.getLength(); i++) {
-            Element node = (Element) nodes.item(i);
-            parsers.add(parserFromParserDomElement(node, mimeTypes, loader));
+        
+        // Should be only zero or one <parsers> tag
+        NodeList nodes = element.getElementsByTagName("parsers");
+        if (nodes.getLength() > 1) {
+            throw new TikaException("Properties may not contain multiple 
Parsers entries");
+        }
+        else if (nodes.getLength() == 1) {
+            // Find only the direct child parser objects
+            Node parsersE = nodes.item(0);
+            nodes = parsersE.getChildNodes();
+            for (int i = 0; i < nodes.getLength(); i++) {
+                Node node = nodes.item(i);
+                if (node instanceof Element) {
+                    Element nodeE = (Element)node;
+                    if ("parser".equals(nodeE.getTagName())) {
+                        parsers.add(parserFromParserDomElement(nodeE, 
mimeTypes, loader));
+                    }
+                }
+            }
         }
         
         if (parsers.isEmpty()) {
@@ -444,21 +459,26 @@ public class TikaConfig {
     private static Set<MediaType> mediaTypesListFromDomElement(
             Element node, String tag) 
             throws TikaException, IOException {
-        NodeList mimes = node.getElementsByTagName(tag);
-        if (mimes.getLength() > 0) {
-            Set<MediaType> types = new HashSet<MediaType>();
-            for (int j = 0; j < mimes.getLength(); j++) {
-                String mime = getText(mimes.item(j));
-                MediaType type = MediaType.parse(mime);
-                if (type != null) {
-                    types.add(type);
-                } else {
-                    throw new TikaException(
-                            "Invalid media type name: " + mime);
+        Set<MediaType> types = null;
+        NodeList children = node.getChildNodes();
+        for (int i=0; i<children.getLength(); i++) {
+            Node cNode = children.item(i);
+            if (cNode instanceof Element) {
+                Element cElement = (Element)cNode;
+                if (tag.equals(cElement.getTagName())) {
+                    String mime = getText(cElement);
+                    MediaType type = MediaType.parse(mime);
+                    if (type != null) {
+                        if (types == null) types = new HashSet<MediaType>();
+                        types.add(type);
+                    } else {
+                        throw new TikaException(
+                                "Invalid media type name: " + mime);
+                    }
                 }
             }
-            return types;
         }
+        if (types != null) return types;
         return Collections.emptySet();
     }
 

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
Mon Jun  8 14:41:48 2015
@@ -177,4 +177,39 @@ public class TikaConfigTest {
             System.clearProperty("tika.config");
         }
     }
+    
+    /**
+     * TIKA-1653 If one parser has child parsers, those child parsers shouldn't
+     *  show up at the top level as well
+     */
+    @Test
+    public void parserWithChildParsers() throws Exception {
+        URL url = TikaConfigTest.class.getResource("TIKA-1653-norepeat.xml");
+        System.setProperty("tika.config", url.toExternalForm());
+        try {
+            TikaConfig config = new TikaConfig();
+            
+            CompositeParser cp = (CompositeParser)config.getParser();
+            List<Parser> parsers = cp.getAllComponentParsers();
+            Parser p;
+            
+            // Just 2 top level parsers
+            assertEquals(2, parsers.size());
+            
+            // Should have a CompositeParser with 2 child ones,
+            //  and a wrapped empty parser
+            p = parsers.get(0);
+            assertTrue(p.toString(), p instanceof CompositeParser);
+            assertEquals(2, 
((CompositeParser)p).getAllComponentParsers().size());
+            
+            p = parsers.get(1);
+            assertTrue(p.toString(), p instanceof ParserDecorator);
+            assertEquals(EmptyParser.class, 
((ParserDecorator)p).getWrappedParser().getClass());
+            assertEquals("hello/world", 
p.getSupportedTypes(null).iterator().next().toString());
+        } catch (TikaException e) {
+            fail("Unexpected TikaException: " + e);
+        } finally {
+            System.clearProperty("tika.config");
+        }
+    }
 }
\ No newline at end of file

Added: 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml?rev=1684199&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
 (added)
+++ 
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
 Mon Jun  8 14:41:48 2015
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.CompositeParser">
+       <parser class="org.apache.tika.parser.EmptyParser">
+          <mime>hello/world1</mime>
+       </parser>
+       <parser class="org.apache.tika.parser.EmptyParser">
+          <mime>hello/world2</mime>
+       </parser>
+    </parser>
+    <parser class="org.apache.tika.parser.EmptyParser">
+      <mime>hello/world</mime>
+    </parser>
+  </parsers>
+</properties>

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
 Mon Jun  8 14:41:48 2015
@@ -19,18 +19,35 @@ package org.apache.tika.parser.ctakes;
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * CTAKESParser decorates {@see AutoDetectParser} and leverages on {@see
- * CTAKESContentHandler} to extract biomedical information from clinical text 
using Apache cTAKES.
- * 
+ * CTAKESParser decorates a {@link Parser} and leverages on 
+ * {@link CTAKESContentHandler} to extract biomedical information from 
+ * clinical text using Apache cTAKES.
+ * <p>It is normally called by supplying an instance to 
+ *  {@link AutoDetectParser}, such as:
+ * <code>AutoDetectParser parser = new AutoDetectParser(new 
CTAKESParser());</code>
+ * <p>It can also be used by giving a Tika Config file similar to:
+ * <code>
+ *  &lt;properties&gt;
+ *    &lt;parsers&gt;
+ *      &lt;parser class="org.apache.tika.parser.ctakes.CTAKESParser"&gt;
+ *         &lt;parser class="org.apache.tika.parser.DefaultParser"/&gt;
+ *      &lt;/parser&gt;
+ *    &lt;/parsers&gt;
+ *  &lt;/properties&gt;
+ * </code>
+ * <p>Because this is a Parser Decorator, and not a normal Parser in
+ *  its own right, it isn't normally selected via the Parser Service Loader.
  */
 public class CTAKESParser extends ParserDecorator {
     /**
@@ -39,10 +56,22 @@ public class CTAKESParser extends Parser
     private static final long serialVersionUID = -2313482748027097961L;
 
     /**
-     * Default constructor.
+     * Wraps the default Parser
      */
     public CTAKESParser() {
-        super(new AutoDetectParser());
+        this(TikaConfig.getDefaultConfig());
+    }
+    /**
+     * Wraps the default Parser for this Config
+     */
+    public CTAKESParser(TikaConfig config) {
+        this(config.getParser());
+    }
+    /**
+     * Wraps the specified Parser
+     */
+    public CTAKESParser(Parser parser) {
+        super(parser);
     }
 
     @Override
@@ -55,4 +84,9 @@ public class CTAKESParser extends Parser
                 metadata, config);
         super.parse(stream, ctakesHandler, metadata, context);
     }
+    
+    @Override
+    public String getDecorationName() {
+        return "CTakes";
+    }            
 }


Reply via email to