Repository: tika Updated Branches: refs/heads/TIKA-1508 a20c46cc7 -> ea47b716e
Add uniformity to parser parameter configuration. 1. Added Configurable interface. This can be used for all services, like Parser and Detector, which can take configurable parameters. 2. Added ConfigurableParser interface which extends the Parser interface. I didn't add a new method to the existing Parser because that would break compatibility. 3. AbstractParser implements ConfigurableParser and has a default implementation for the configure() contract. I think it is safe to do so and it doesn't break anything. In addition, all parsers which extend AbstractParser can easily access config from TikaConfig if they want to. 4. Added a TODO to TikaConfig: in the future it should allow multiple instances of the same parser with different runtime configurations. 5. TikaConfig is modified to detect if an instance can be configured; if so, it checks whether params are available in the XML file, parses the params, and invokes the configure(ctx) method with these params. 6. Added DummyConfigurableParser that simply copies parameters to metadata for the sake of testing. 7. Added a sample XML config file for testing. Added ConfigurableParserTest that performs an end-to-end test of all the above. 
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b2cf2317 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b2cf2317 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b2cf2317 Branch: refs/heads/TIKA-1508 Commit: b2cf23178ede925b0ef23f88ebf1aff95c8c157c Parents: 1caa4fb Author: Thamme Gowda <tgow...@gmail.com> Authored: Tue Mar 8 18:23:19 2016 -0800 Committer: Thamme Gowda <tgow...@gmail.com> Committed: Tue Mar 8 18:23:19 2016 -0800 ---------------------------------------------------------------------- .../java/org/apache/tika/base/Configurable.java | 19 ++++++ .../java/org/apache/tika/config/TikaConfig.java | 41 ++++++++++- .../org/apache/tika/parser/AbstractParser.java | 18 ++++- .../apache/tika/parser/ConfigurableParser.java | 30 ++++++++ .../org/apache/tika/parser/ParseContext.java | 39 ++++++++++- .../java/org/apache/tika/parser/Parser.java | 1 + .../tika/parser/ConfigurableParserTest.java | 44 ++++++++++++ .../tika/parser/DummyConfigurableParser.java | 72 ++++++++++++++++++++ .../tika/config/TIKA-1508-configurable.xml | 27 ++++++++ 9 files changed, 288 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/base/Configurable.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/base/Configurable.java b/tika-core/src/main/java/org/apache/tika/base/Configurable.java new file mode 100644 index 0000000..8ae1b30 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/base/Configurable.java @@ -0,0 +1,19 @@ +package org.apache.tika.base; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; + +/** + * Defines contract for configurable services + * @since Apache Tika 1.13 + */ +public interface Configurable { + + /** + * Confure 
an instance with Tika Context + * @param context configuration instance in the form of context + * @throws TikaException when an instance fails to work at the given context + * @since Apache Tika 1.13 + */ + void configure(ParseContext context) throws TikaException; +} http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 17f36e0..a4dedae 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -28,8 +28,10 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; @@ -38,6 +40,7 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import org.apache.tika.base.Configurable; import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor; import org.apache.tika.concurrent.SimpleThreadPoolExecutor; import org.apache.tika.detect.CompositeDetector; @@ -54,6 +57,7 @@ import org.apache.tika.mime.MimeTypesFactory; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.w3c.dom.Document; @@ -465,6 +469,7 @@ public class TikaConfig { } private static abstract class XmlLoader<CT,T> { + protected static final String PARAMS_TAG_NAME = "params"; abstract boolean 
supportsComposite(); abstract String getParentTagName(); // eg parsers abstract String getLoaderTagName(); // eg parser @@ -510,6 +515,7 @@ public class TikaConfig { // Wrap the defined parsers/detectors up in a Composite return createComposite(loaded, mimeTypes, loader); } + T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { String name = element.getAttribute("class"); @@ -520,6 +526,7 @@ public class TikaConfig { loader.getServiceClass(getLoaderClass(), name); // Do pre-load checks and short-circuits + //TODO : allow duplicate instances with different configurations loaded = preLoadOne(loadedClass, name, mimeTypes); if (loaded != null) return loaded; @@ -563,7 +570,12 @@ public class TikaConfig { // Have any decoration performed, eg explicit mimetypes loaded = decorate(loaded, element); - + //if the instance is configurable, then call configure() + if (loaded instanceof Configurable){ + ParseContext context = new ParseContext(); + context.getParams().putAll(getParams(element)); + ((Configurable) loaded).configure(context); // initialize here + } // All done with setup return loaded; } catch (ClassNotFoundException e) { @@ -586,6 +598,33 @@ public class TikaConfig { "Unable to instantiate a "+getLoaderTagName()+" class: " + name, e); } } + + /** + * Gets parameters from a given + * @param el xml node which has {@link #PARAMS_TAG_NAME} child + * @return Map of key values read from xml + */ + Map<String, String> getParams(Element el){ + //TODO: move this constant to static final + Map<String, String> params = new HashMap<>(); + for (Node child = el.getFirstChild(); child != null; + child = child.getNextSibling()){ + if (PARAMS_TAG_NAME.equals(child.getNodeName())){ //found the node + if (child.hasChildNodes()) {//it has children + NodeList childNodes = child.getChildNodes(); + for (int i = 0; i < childNodes.getLength(); i++) { + Node item = childNodes.item(i); + if (item.getNodeType() == Node.ELEMENT_NODE){ + 
params.put(item.getNodeName().trim(), item.getTextContent().trim()); + } + } + } + break; //only the first one is used + } + } + return params; + } + } private static class ParserXmlLoader extends XmlLoader<CompositeParser,Parser> { boolean supportsComposite() { return true; } http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java index 2411f05..10f731e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java @@ -18,6 +18,7 @@ package org.apache.tika.parser; import java.io.IOException; import java.io.InputStream; +import java.util.Properties; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -30,7 +31,12 @@ import org.xml.sax.SAXException; * * @since Apache Tika 0.10 */ -public abstract class AbstractParser implements Parser { +public abstract class AbstractParser implements ConfigurableParser { + + /** + * Configuration supplied at runtime + */ + protected ParseContext context; /** * Serial version UID. 
@@ -53,4 +59,14 @@ public abstract class AbstractParser implements Parser { parse(stream, handler, metadata, new ParseContext()); } + /** + * called by the framework to supply runtime parameters which may be + * required for initialization + * @param context the parser context at runtime + * @since Apache Tika 1.13 + */ + @Override + public void configure(ParseContext context) throws TikaException { + this.context = context; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java b/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java new file mode 100644 index 0000000..3eabc02 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import org.apache.tika.base.Configurable; + +import java.io.Serializable; + +/** + * Extension of {@link Parser} with {@link Configurable} contract. 
+ * This interface shall be implemented to create parsers which accepts runtime parameters + * from tika configuration file + */ +public interface ConfigurableParser extends Parser, + Configurable, Serializable { +} http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 48a7841..20607d9 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -43,7 +43,11 @@ public class ParseContext implements Serializable { /** Map of objects in this context */ private final Map<String, Object> context = new HashMap<String, Object>(); - + /** + * Map of configurable arguments + */ + private final Map<String, String> params = new HashMap<>(); + /** * Adds the given value to the context as an implementation of the given * interface. 
@@ -145,4 +149,37 @@ public class ParseContext implements Serializable { return factory; } + /** + * Stores a key=value parameter + * @param key parameter name + * @param value value + */ + public void setParam(String key, String value){ + this.params.put(key, value); + } + + /** + * Gets the value associated with given parameter + * @param key parameter name + */ + public void getParam(String key){ + this.params.get(key); + } + + /** + * Gets all the params + * @return map of key values + */ + public Map<String, String> getParams() { + return params; + } + + /** + * Checks if parameter is available + * @param key parameter name + * @return true if parameter is available, false otherwise + */ + public boolean hasParam(String key){ + return params.containsKey(key); + } } http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/Parser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java b/tika-core/src/main/java/org/apache/tika/parser/Parser.java index 3ac2d1f..352b8d3 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java @@ -29,6 +29,7 @@ import org.xml.sax.SAXException; /** * Tika parser interface. 
+ * @see ConfigurableParser for parsers which adopts to runtime params */ public interface Parser extends Serializable { http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java new file mode 100644 index 0000000..f91a2b0 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser; + +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.junit.Assert; +import org.junit.Test; + +import java.net.URL; + +public class ConfigurableParserTest { + + public static final String TIKA_CFG_FILE = "org/apache/tika/config/TIKA-1508-configurable.xml"; + public static final String TEST_PARAM = "testparam"; + public static final String TEST_PARAM_VAL = "testparamval"; + + @Test + public void testConfigurableParser() throws Exception { + URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE); + assert configFileUrl != null; + TikaConfig config = new TikaConfig(configFileUrl); + Tika tika = new Tika(config); + Metadata md = new Metadata(); + tika.parse(configFileUrl.openStream(), md); + Assert.assertEquals(TEST_PARAM_VAL, md.get(TEST_PARAM)); + //assert that param from configuration file is read, given to parser and it copied to metadata + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java new file mode 100644 index 0000000..4bbeac9 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * + * This Parser is created to test runtime configuration to parser. + * This parser simply copies parameters to metadata so that a test + * suit can be developed to test that : + * 1. Parameters were parsed from configuration file + * 2. parameters were supplied to parser via configure(ctx) method + * 3. 
parameters were available at parse + * + */ +public class DummyConfigurableParser extends AbstractParser { + + private static Set<MediaType> MIMES = new HashSet<>(); + static { + MIMES.add(MediaType.TEXT_PLAIN); + MIMES.add(MediaType.TEXT_HTML); + MIMES.add(MediaType.OCTET_STREAM); + } + + private Map<String, String> params; + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return MIMES; + } + + @Override + public void configure(ParseContext context) throws TikaException { + super.configure(context); + this.params = context.getParams(); + // initialize here + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + for (Map.Entry<String, String> entry : this.params.entrySet()) { + metadata.add(entry.getKey(), entry.getValue()); + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml new file mode 100644 index 0000000..999cb45 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DummyConfigurableParser"> + <params> + <testparam>testparamval</testparam> + </params> + </parser> + + </parsers> +</properties>