Author: nick Date: Mon Jun 8 14:41:48 2015 New Revision: 1684199 URL: http://svn.apache.org/r1684199 Log: TIKA-1653 Re-do the XML parsing in the Tika Config, so that a parser tag with another inside it doesn't get accidentally duplicated at the top level
Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684199&r1=1684198&r2=1684199&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Mon Jun 8 14:41:48 2015 @@ -332,10 +332,25 @@ public class TikaConfig { Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { List<Parser> parsers = new ArrayList<Parser>(); - NodeList nodes = element.getElementsByTagName("parser"); - for (int i = 0; i < nodes.getLength(); i++) { - Element node = (Element) nodes.item(i); - parsers.add(parserFromParserDomElement(node, mimeTypes, loader)); + + // Should be only zero or one <parsers> tag + NodeList nodes = element.getElementsByTagName("parsers"); + if (nodes.getLength() > 1) { + throw new TikaException("Properties may not contain multiple Parsers entries"); + } + else if (nodes.getLength() == 1) { + // Find only the direct child parser objects + Node parsersE = nodes.item(0); + nodes = parsersE.getChildNodes(); + for (int i = 0; i < nodes.getLength(); i++) { + Node node = nodes.item(i); + if (node instanceof Element) { + Element nodeE = (Element)node; + if ("parser".equals(nodeE.getTagName())) { + parsers.add(parserFromParserDomElement(nodeE, mimeTypes, loader)); + } + } + } } if (parsers.isEmpty()) { @@ -444,21 +459,26 @@ public class TikaConfig { private static 
Set<MediaType> mediaTypesListFromDomElement( Element node, String tag) throws TikaException, IOException { - NodeList mimes = node.getElementsByTagName(tag); - if (mimes.getLength() > 0) { - Set<MediaType> types = new HashSet<MediaType>(); - for (int j = 0; j < mimes.getLength(); j++) { - String mime = getText(mimes.item(j)); - MediaType type = MediaType.parse(mime); - if (type != null) { - types.add(type); - } else { - throw new TikaException( - "Invalid media type name: " + mime); + Set<MediaType> types = null; + NodeList children = node.getChildNodes(); + for (int i=0; i<children.getLength(); i++) { + Node cNode = children.item(i); + if (cNode instanceof Element) { + Element cElement = (Element)cNode; + if (tag.equals(cElement.getTagName())) { + String mime = getText(cElement); + MediaType type = MediaType.parse(mime); + if (type != null) { + if (types == null) types = new HashSet<MediaType>(); + types.add(type); + } else { + throw new TikaException( + "Invalid media type name: " + mime); + } } } - return types; } + if (types != null) return types; return Collections.emptySet(); } Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1684199&r1=1684198&r2=1684199&view=diff ============================================================================== --- tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java (original) +++ tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java Mon Jun 8 14:41:48 2015 @@ -177,4 +177,39 @@ public class TikaConfigTest { System.clearProperty("tika.config"); } } + + /** + * TIKA-1653 If one parser has child parsers, those child parsers shouldn't + * show up at the top level as well + */ + @Test + public void parserWithChildParsers() throws Exception { + URL url = TikaConfigTest.class.getResource("TIKA-1653-norepeat.xml"); + 
System.setProperty("tika.config", url.toExternalForm()); + try { + TikaConfig config = new TikaConfig(); + + CompositeParser cp = (CompositeParser)config.getParser(); + List<Parser> parsers = cp.getAllComponentParsers(); + Parser p; + + // Just 2 top level parsers + assertEquals(2, parsers.size()); + + // Should have a CompositeParser with 2 child ones, and + // and a wrapped empty parser + p = parsers.get(0); + assertTrue(p.toString(), p instanceof CompositeParser); + assertEquals(2, ((CompositeParser)p).getAllComponentParsers().size()); + + p = parsers.get(1); + assertTrue(p.toString(), p instanceof ParserDecorator); + assertEquals(EmptyParser.class, ((ParserDecorator)p).getWrappedParser().getClass()); + assertEquals("hello/world", p.getSupportedTypes(null).iterator().next().toString()); + } catch (TikaException e) { + fail("Unexpected TikaException: " + e); + } finally { + System.clearProperty("tika.config"); + } + } } \ No newline at end of file Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml?rev=1684199&view=auto ============================================================================== --- tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml (added) +++ tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml Mon Jun 8 14:41:48 2015 @@ -0,0 +1,32 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.CompositeParser"> + <parser class="org.apache.tika.parser.EmptyParser"> + <mime>hello/world1</mime> + </parser> + <parser class="org.apache.tika.parser.EmptyParser"> + <mime>hello/world2</mime> + </parser> + </parser> + <parser class="org.apache.tika.parser.EmptyParser"> + <mime>hello/world</mime> + </parser> + </parsers> +</properties> Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1684199&r1=1684198&r2=1684199&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java Mon Jun 8 14:41:48 2015 @@ -19,18 +19,35 @@ package org.apache.tika.parser.ctakes; import java.io.IOException; import java.io.InputStream; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** - * CTAKESParser decorates {@see AutoDetectParser} and leverages on {@see - * CTAKESContentHandler} to extract biomedical 
information from clinical text using Apache cTAKES. - * + * CTAKESParser decorates a {@see Parser} and leverages on + * {@see CTAKESContentHandler} to extract biomedical information from + * clinical text using Apache cTAKES. + * <p>It is normally called by supplying an instance to + * {@link AutoDetectParser}, such as: + * <code>AutoDetectParser parser = new AutoDetectParser(new CTakesParser());</code> + * <p>It can also be used by giving a Tika Config file similar to: + * <code> + * >properties> + * >parsers> + * >parser class="org.apache.tika.parser.ctakes.CTAKESParser"> + * >parser class="org.apache.tika.parser.DefaultParser"/> + * >/parser> + * >/parsers> + * >/properties> + * </code> + * <p>Because this is a Parser Decorator, and not a normal Parser in + * it's own right, it isn't normally selected via the Parser Service Loader. */ public class CTAKESParser extends ParserDecorator { /** @@ -39,10 +56,22 @@ public class CTAKESParser extends Parser private static final long serialVersionUID = -2313482748027097961L; /** - * Default constructor. + * Wraps the default Parser */ public CTAKESParser() { - super(new AutoDetectParser()); + this(TikaConfig.getDefaultConfig()); + } + /** + * Wraps the default Parser for this Config + */ + public CTAKESParser(TikaConfig config) { + this(config.getParser()); + } + /** + * Wraps the specified Parser + */ + public CTAKESParser(Parser parser) { + super(parser); } @Override @@ -55,4 +84,9 @@ public class CTAKESParser extends Parser metadata, config); super.parse(stream, ctakesHandler, metadata, context); } + + @Override + public String getDecorationName() { + return "CTakes"; + } }