Repository: tika
Updated Branches:
  refs/heads/TIKA-1508 a20c46cc7 -> ea47b716e


Add uniformity to parser parameter configuration.

1. Added Configurable interface.
 This can be used for all services like Parser, Detector which can take
  configurable parameters.

2. Added ConfigurableParser interface which extends the Parser interface.
   I didn't add a new method to the existing Parser interface because
    that would break compatibility.

3. AbstractParser implements ConfigurableParser and has
  a default implementation for the configure() contract.
  I think it is safe to do so and it doesn't break anything.
  In addition, all parsers which extend AbstractParser can easily
  access config from TikaConfig if they want to.

4. Added a TODO to TikaConfig:
 after this change we should allow multiple instances of the same parser
 with different runtime configurations.

5. TikaConfig is modified to detect whether an instance can be configured;
  if so, it checks whether params are available in the XML file, parses the
  params and invokes the configure(ctx) method with these params.

6. Added DummyConfigurableParser that simply copies parameters to
 metadata for the sake of testing.

7. Added a sample XML config file for testing.
Added ConfigurableParserTest that performs an end-to-end test of all
the above.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b2cf2317
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b2cf2317
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b2cf2317

Branch: refs/heads/TIKA-1508
Commit: b2cf23178ede925b0ef23f88ebf1aff95c8c157c
Parents: 1caa4fb
Author: Thamme Gowda <tgow...@gmail.com>
Authored: Tue Mar 8 18:23:19 2016 -0800
Committer: Thamme Gowda <tgow...@gmail.com>
Committed: Tue Mar 8 18:23:19 2016 -0800

----------------------------------------------------------------------
 .../java/org/apache/tika/base/Configurable.java | 19 ++++++
 .../java/org/apache/tika/config/TikaConfig.java | 41 ++++++++++-
 .../org/apache/tika/parser/AbstractParser.java  | 18 ++++-
 .../apache/tika/parser/ConfigurableParser.java  | 30 ++++++++
 .../org/apache/tika/parser/ParseContext.java    | 39 ++++++++++-
 .../java/org/apache/tika/parser/Parser.java     |  1 +
 .../tika/parser/ConfigurableParserTest.java     | 44 ++++++++++++
 .../tika/parser/DummyConfigurableParser.java    | 72 ++++++++++++++++++++
 .../tika/config/TIKA-1508-configurable.xml      | 27 ++++++++
 9 files changed, 288 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/base/Configurable.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/base/Configurable.java 
b/tika-core/src/main/java/org/apache/tika/base/Configurable.java
new file mode 100644
index 0000000..8ae1b30
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/base/Configurable.java
@@ -0,0 +1,19 @@
+package org.apache.tika.base;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Defines the contract for services that can be configured through a
+ * {@link ParseContext}.
+ * @since Apache Tika 1.13
+ */
+public interface Configurable {
+
+    /**
+     * Configures an instance with the given Tika context.
+     * @param context configuration instance in the form of a context
+     * @throws TikaException when an instance fails to work with the given context
+     * @since Apache Tika 1.13
+     */
+    void configure(ParseContext context) throws TikaException;
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 17f36e0..a4dedae 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -28,8 +28,10 @@ import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 
@@ -38,6 +40,7 @@ import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 
+import org.apache.tika.base.Configurable;
 import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
 import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
 import org.apache.tika.detect.CompositeDetector;
@@ -54,6 +57,7 @@ import org.apache.tika.mime.MimeTypesFactory;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.w3c.dom.Document;
@@ -465,6 +469,7 @@ public class TikaConfig {
     }
     
     private static abstract class XmlLoader<CT,T> {
+        protected static final String PARAMS_TAG_NAME = "params";
         abstract boolean supportsComposite();
         abstract String getParentTagName(); // eg parsers
         abstract String getLoaderTagName(); // eg parser
@@ -510,6 +515,7 @@ public class TikaConfig {
             // Wrap the defined parsers/detectors up in a Composite
             return createComposite(loaded, mimeTypes, loader);
         }
+
         T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) 
                 throws TikaException, IOException {
             String name = element.getAttribute("class");
@@ -520,6 +526,7 @@ public class TikaConfig {
                         loader.getServiceClass(getLoaderClass(), name);
 
                 // Do pre-load checks and short-circuits
+                //TODO : allow duplicate instances with different 
configurations
                 loaded = preLoadOne(loadedClass, name, mimeTypes);
                 if (loaded != null) return loaded;
                 
@@ -563,7 +570,12 @@ public class TikaConfig {
                 
                 // Have any decoration performed, eg explicit mimetypes
                 loaded = decorate(loaded, element);
-                
+                //if the instance is configurable, then call configure()
+                if (loaded instanceof Configurable){
+                    ParseContext context = new ParseContext();
+                    context.getParams().putAll(getParams(element));
+                    ((Configurable) loaded).configure(context); // initialize 
here
+                }
                 // All done with setup
                 return loaded;
             } catch (ClassNotFoundException e) {
@@ -586,6 +598,33 @@ public class TikaConfig {
                         "Unable to instantiate a "+getLoaderTagName()+" class: 
" + name, e);
             }
         }
+
+        /**
+         * Reads the key/value parameters declared under the first
+         * {@link #PARAMS_TAG_NAME} child of the given element.
+         * @param el xml node which may have a {@link #PARAMS_TAG_NAME} child
+         * @return map of parameter name to trimmed text value read from xml;
+         *         empty when no such child exists or it declares no elements
+         */
+        Map<String, String>  getParams(Element el){
+            Map<String, String> result = new HashMap<>();
+            // locate the first child carrying the params tag name
+            Node paramsNode = el.getFirstChild();
+            while (paramsNode != null
+                    && !PARAMS_TAG_NAME.equals(paramsNode.getNodeName())) {
+                paramsNode = paramsNode.getNextSibling();
+            }
+            if (paramsNode == null) {
+                return result; // nothing declared for this element
+            }
+            // only the first params node is used; later ones are ignored
+            NodeList entries = paramsNode.getChildNodes();
+            for (int idx = 0; idx < entries.getLength(); idx++) {
+                Node entry = entries.item(idx);
+                if (entry.getNodeType() != Node.ELEMENT_NODE) {
+                    continue; // skip whitespace text, comments, etc.
+                }
+                result.put(entry.getNodeName().trim(), entry.getTextContent().trim());
+            }
+            return result;
+        }
+
     }
     private static class ParserXmlLoader extends 
XmlLoader<CompositeParser,Parser> {
         boolean supportsComposite() { return true; }

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index 2411f05..10f731e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Properties;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -30,7 +31,12 @@ import org.xml.sax.SAXException;
  *
  * @since Apache Tika 0.10
  */
-public abstract class AbstractParser implements Parser {
+public abstract class AbstractParser implements ConfigurableParser {
+
+    /**
+     * Configuration supplied at runtime
+     */
+    protected ParseContext context;
 
     /**
      * Serial version UID.
@@ -53,4 +59,14 @@ public abstract class AbstractParser implements Parser {
         parse(stream, handler, metadata, new ParseContext());
     }
 
+    /**
+     * Called by the framework to supply runtime parameters which may be
+     * required for initialization. This default implementation only stores
+     * the supplied context in the {@code context} field.
+     * @param context the parser context at runtime
+     * @throws TikaException declared by the interface contract; this default
+     *         implementation does not throw it
+     * @since Apache Tika 1.13
+     */
+    @Override
+    public void configure(ParseContext context) throws TikaException {
+        this.context = context;
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
new file mode 100644
index 0000000..3eabc02
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.base.Configurable;
+
+import java.io.Serializable;
+
+/**
+ * Extension of {@link Parser} with the {@link Configurable} contract.
+ * This interface shall be implemented to create parsers which accept runtime
+ * parameters from the Tika configuration file.
+ */
+public interface ConfigurableParser extends Parser,
+        Configurable, Serializable {
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java 
b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 48a7841..20607d9 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -43,7 +43,11 @@ public class ParseContext implements Serializable {
 
     /** Map of objects in this context */
     private final Map<String, Object> context = new HashMap<String, Object>();
- 
+    /**
+     * Map of configurable arguments
+     */
+    private final Map<String, String> params = new HashMap<>();
+
     /**
      * Adds the given value to the context as an implementation of the given
      * interface.
@@ -145,4 +149,37 @@ public class ParseContext implements Serializable {
         return factory;
     }
 
+    /**
+     * Stores a key=value parameter, replacing any value previously stored
+     * under the same key.
+     * @param key parameter name
+     * @param value parameter value
+     */
+    public void setParam(String key, String value){
+        this.params.put(key, value);
+    }
+
+    /**
+     * Gets the value associated with the given parameter.
+     * @param key parameter name
+     * @return the value stored for {@code key}, or {@code null} when the
+     *         parameter has not been set
+     */
+    public String getParam(String key){
+        // previously declared void, which silently discarded the lookup result
+        return this.params.get(key);
+    }
+
+    /**
+     * Gets all the params.
+     * @return map of key values; this is the internal backing map itself,
+     *         not a copy, so changes made through it are visible to this context
+     */
+    public Map<String, String> getParams() {
+        return params;
+    }
+
+    /**
+     * Checks if a parameter is available.
+     * @param key parameter name
+     * @return true if the key is present in the parameter map, false otherwise
+     */
+    public boolean hasParam(String key){
+       return params.containsKey(key);
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/main/java/org/apache/tika/parser/Parser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java 
b/tika-core/src/main/java/org/apache/tika/parser/Parser.java
index 3ac2d1f..352b8d3 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java
@@ -29,6 +29,7 @@ import org.xml.sax.SAXException;
 
 /**
  * Tika parser interface.
+ * @see ConfigurableParser for parsers which adopts to runtime params
  */
 public interface Parser extends Serializable {
 

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java 
b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
new file mode 100644
index 0000000..f91a2b0
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.net.URL;
+
+public class ConfigurableParserTest {
+
+    public static final String TIKA_CFG_FILE = 
"org/apache/tika/config/TIKA-1508-configurable.xml";
+    public static final String TEST_PARAM = "testparam";
+    public static final String TEST_PARAM_VAL = "testparamval";
+
+    @Test
+    public void testConfigurableParser() throws Exception {
+        URL configFileUrl = 
getClass().getClassLoader().getResource(TIKA_CFG_FILE);
+        assert configFileUrl != null;
+        TikaConfig config = new TikaConfig(configFileUrl);
+        Tika tika = new Tika(config);
+        Metadata md = new Metadata();
+        tika.parse(configFileUrl.openStream(), md);
+        Assert.assertEquals(TEST_PARAM_VAL, md.get(TEST_PARAM));
+        //assert that param from configuration file is read, given to parser 
and it copied to metadata
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java 
b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
new file mode 100644
index 0000000..4bbeac9
--- /dev/null
+++ 
b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Parser created to test runtime configuration of parsers.
+ * It simply copies its configured parameters to the metadata so that a test
+ * suite can verify that:
+ * 1. parameters were parsed from the configuration file
+ * 2. parameters were supplied to the parser via the configure(ctx) method
+ * 3. parameters were available at parse time
+ */
+public class DummyConfigurableParser extends AbstractParser {
+
+    private static final Set<MediaType> MIMES = new HashSet<>();
+    static {
+        MIMES.add(MediaType.TEXT_PLAIN);
+        MIMES.add(MediaType.TEXT_HTML);
+        MIMES.add(MediaType.OCTET_STREAM);
+    }
+
+    /** Parameters captured by configure(); null until it has been called. */
+    private Map<String, String> params;
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return MIMES;
+    }
+
+    /**
+     * Stores the context in the superclass and keeps a reference to its
+     * parameter map for use during parsing.
+     * @param context context carrying the configured parameters
+     */
+    @Override
+    public void configure(ParseContext context) throws TikaException {
+        super.configure(context);
+        this.params = context.getParams();
+    }
+
+    /**
+     * Copies every configured parameter into the metadata; the stream and
+     * the handler are not touched.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (this.params == null) {
+            // configure() was never invoked, so there is nothing to copy
+            return;
+        }
+        for (Map.Entry<String, String> entry : this.params.entrySet()) {
+            metadata.add(entry.getKey(), entry.getValue());
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/b2cf2317/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml
----------------------------------------------------------------------
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml
 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml
new file mode 100644
index 0000000..999cb45
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1508-configurable.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DummyConfigurableParser">
+            <params>
+                <testparam>testparamval</testparam>
+            </params>
+        </parser>
+
+    </parsers>
+</properties>

Reply via email to