Author: nick
Date: Mon Jun 8 14:41:48 2015
New Revision: 1684199
URL: http://svn.apache.org/r1684199
Log:
TIKA-1653 Re-do the XML parsing in the Tika Config, so that a parser tag with
another inside it doesn't get accidentally duplicated at the top level
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Mon Jun 8 14:41:48 2015
@@ -332,10 +332,25 @@ public class TikaConfig {
Element element, MimeTypes mimeTypes, ServiceLoader loader)
throws TikaException, IOException {
List<Parser> parsers = new ArrayList<Parser>();
- NodeList nodes = element.getElementsByTagName("parser");
- for (int i = 0; i < nodes.getLength(); i++) {
- Element node = (Element) nodes.item(i);
- parsers.add(parserFromParserDomElement(node, mimeTypes, loader));
+
+ // Should be only zero or one <parsers> tag
+ NodeList nodes = element.getElementsByTagName("parsers");
+ if (nodes.getLength() > 1) {
+ throw new TikaException("Properties may not contain multiple
Parsers entries");
+ }
+ else if (nodes.getLength() == 1) {
+ // Find only the direct child parser objects
+ Node parsersE = nodes.item(0);
+ nodes = parsersE.getChildNodes();
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node node = nodes.item(i);
+ if (node instanceof Element) {
+ Element nodeE = (Element)node;
+ if ("parser".equals(nodeE.getTagName())) {
+ parsers.add(parserFromParserDomElement(nodeE,
mimeTypes, loader));
+ }
+ }
+ }
}
if (parsers.isEmpty()) {
@@ -444,21 +459,26 @@ public class TikaConfig {
private static Set<MediaType> mediaTypesListFromDomElement(
Element node, String tag)
throws TikaException, IOException {
- NodeList mimes = node.getElementsByTagName(tag);
- if (mimes.getLength() > 0) {
- Set<MediaType> types = new HashSet<MediaType>();
- for (int j = 0; j < mimes.getLength(); j++) {
- String mime = getText(mimes.item(j));
- MediaType type = MediaType.parse(mime);
- if (type != null) {
- types.add(type);
- } else {
- throw new TikaException(
- "Invalid media type name: " + mime);
+ Set<MediaType> types = null;
+ NodeList children = node.getChildNodes();
+ for (int i=0; i<children.getLength(); i++) {
+ Node cNode = children.item(i);
+ if (cNode instanceof Element) {
+ Element cElement = (Element)cNode;
+ if (tag.equals(cElement.getTagName())) {
+ String mime = getText(cElement);
+ MediaType type = MediaType.parse(mime);
+ if (type != null) {
+ if (types == null) types = new HashSet<MediaType>();
+ types.add(type);
+ } else {
+ throw new TikaException(
+ "Invalid media type name: " + mime);
+ }
}
}
- return types;
}
+ if (types != null) return types;
return Collections.emptySet();
}
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
Mon Jun 8 14:41:48 2015
@@ -177,4 +177,39 @@ public class TikaConfigTest {
System.clearProperty("tika.config");
}
}
+
+ /**
+ * TIKA-1653 If one parser has child parsers, those child parsers shouldn't
+ * show up at the top level as well
+ */
+ @Test
+ public void parserWithChildParsers() throws Exception {
+ URL url = TikaConfigTest.class.getResource("TIKA-1653-norepeat.xml");
+ System.setProperty("tika.config", url.toExternalForm());
+ try {
+ TikaConfig config = new TikaConfig();
+
+ CompositeParser cp = (CompositeParser)config.getParser();
+ List<Parser> parsers = cp.getAllComponentParsers();
+ Parser p;
+
+ // Just 2 top level parsers
+ assertEquals(2, parsers.size());
+
+ // Should have a CompositeParser with 2 child ones,
+ // and a wrapped empty parser
+ p = parsers.get(0);
+ assertTrue(p.toString(), p instanceof CompositeParser);
+ assertEquals(2,
((CompositeParser)p).getAllComponentParsers().size());
+
+ p = parsers.get(1);
+ assertTrue(p.toString(), p instanceof ParserDecorator);
+ assertEquals(EmptyParser.class,
((ParserDecorator)p).getWrappedParser().getClass());
+ assertEquals("hello/world",
p.getSupportedTypes(null).iterator().next().toString());
+ } catch (TikaException e) {
+ fail("Unexpected TikaException: " + e);
+ } finally {
+ System.clearProperty("tika.config");
+ }
+ }
}
\ No newline at end of file
Added:
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml?rev=1684199&view=auto
==============================================================================
---
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
(added)
+++
tika/trunk/tika-core/src/test/resources/org/apache/tika/config/TIKA-1653-norepeat.xml
Mon Jun 8 14:41:48 2015
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.CompositeParser">
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>hello/world1</mime>
+ </parser>
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>hello/world2</mime>
+ </parser>
+ </parser>
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>hello/world</mime>
+ </parser>
+ </parsers>
+</properties>
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java?rev=1684199&r1=1684198&r2=1684199&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
Mon Jun 8 14:41:48 2015
@@ -19,18 +19,35 @@ package org.apache.tika.parser.ctakes;
import java.io.IOException;
import java.io.InputStream;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * CTAKESParser decorates {@see AutoDetectParser} and leverages on {@see
- * CTAKESContentHandler} to extract biomedical information from clinical text
using Apache cTAKES.
- *
+ * CTAKESParser decorates a {@link Parser} and leverages on
+ * {@link CTAKESContentHandler} to extract biomedical information from
+ * clinical text using Apache cTAKES.
+ * <p>It is normally called by supplying an instance to
+ * {@link AutoDetectParser}, such as:
+ * <code>AutoDetectParser parser = new AutoDetectParser(new
CTAKESParser());</code>
+ * <p>It can also be used by giving a Tika Config file similar to:
+ * <code>
+ * &lt;properties>
+ * &lt;parsers>
+ * &lt;parser class="org.apache.tika.parser.ctakes.CTAKESParser">
+ * &lt;parser class="org.apache.tika.parser.DefaultParser"/>
+ * &lt;/parser>
+ * &lt;/parsers>
+ * &lt;/properties>
+ * </code>
+ * <p>Because this is a Parser Decorator, and not a normal Parser in
+ * its own right, it isn't normally selected via the Parser Service Loader.
*/
public class CTAKESParser extends ParserDecorator {
/**
@@ -39,10 +56,22 @@ public class CTAKESParser extends Parser
private static final long serialVersionUID = -2313482748027097961L;
/**
- * Default constructor.
+ * Wraps the default Parser
*/
public CTAKESParser() {
- super(new AutoDetectParser());
+ this(TikaConfig.getDefaultConfig());
+ }
+ /**
+ * Wraps the default Parser for this Config
+ */
+ public CTAKESParser(TikaConfig config) {
+ this(config.getParser());
+ }
+ /**
+ * Wraps the specified Parser
+ */
+ public CTAKESParser(Parser parser) {
+ super(parser);
}
@Override
@@ -55,4 +84,9 @@ public class CTAKESParser extends Parser
metadata, config);
super.parse(stream, ctakesHandler, metadata, context);
}
+
+ @Override
+ public String getDecorationName() {
+ return "CTakes";
+ }
}