TIKA-1986 -- add Initializable, strip out handling of params passed via ParseContext in PDFParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/01320372 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/01320372 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/01320372 Branch: refs/heads/master Commit: 01320372fdbfc5e4ff0cfe0fe85fab91b5b369e7 Parents: d9dcd59 Author: tballison <[email protected]> Authored: Wed Jun 15 13:32:58 2016 -0400 Committer: tballison <[email protected]> Committed: Wed Jun 15 13:32:58 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/config/Initializable.java | 31 ++++++++++ .../java/org/apache/tika/config/TikaConfig.java | 3 + .../tika/parser/DummyInitializableParser.java | 64 ++++++++++++++++++++ .../tika/parser/DummyParameterizedParser.java | 1 + .../tika/parser/InitializableParserTest.java | 45 ++++++++++++++ .../tika/parser/ParameterizedParserTest.java | 2 +- .../tika/config/TIKA-1986-initializable.xml | 28 +++++++++ .../tika/config/TIKA-1986-some-parameters.xml | 2 +- .../org/apache/tika/parser/pdf/PDFParser.java | 10 +-- .../apache/tika/parser/pdf/PDFParserTest.java | 2 + 10 files changed, 178 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/main/java/org/apache/tika/config/Initializable.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/config/Initializable.java b/tika-core/src/main/java/org/apache/tika/config/Initializable.java new file mode 100644 index 0000000..bc7769c --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/config/Initializable.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import org.apache.tika.exception.TikaConfigException; + +/** + * Components that must do special processing across multiple fields + * at initialization time should implement this interface. + * <p> + * TikaConfig will call initialize on Initializable classes after + * setting the parameters. + */ +public interface Initializable { + + void initialize() throws TikaConfigException; +} http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 49c5e26..fbafe7e 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -565,6 +565,9 @@ public class TikaConfig { Map<String, Param<?>> params = getParams(element); //Assigning the params to bean fields/setters AnnotationUtils.assignFieldParams(loaded, params); + if (loaded instanceof Initializable) { + ((Initializable) loaded).initialize(); + } // Have any decoration performed, eg explicit mimetypes loaded = decorate(loaded, element); http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java new file mode 100644 index 0000000..4bb8668 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.parser; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.Set; + +/** + * This tests that initialize() is called after adding the parameters + * configured via TikaConfig + */ +public class DummyInitializableParser extends AbstractParser implements Initializable { + + public static String SUM_FIELD = "SUM"; + private static Set<MediaType> MIMES = new HashSet<>(); + static { + MIMES.add(MediaType.TEXT_PLAIN); + } + + @Field private short shortA = -2; + @Field private short shortB = -3; + private int sum = 0; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return MIMES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + metadata.set(SUM_FIELD, Integer.toString(sum)); + } + + @Override + public void initialize() throws TikaConfigException { + sum = shortA+shortB; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java index 801d65e..435dc52 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java @@ -63,6 +63,7 @@ public class DummyParameterizedParser extends AbstractParser { @Field private String missing = "default"; + private String inner = "inner"; private File xfile; http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java new file mode 100644 index 0000000..b9d378d --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser; + +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.junit.Assert; +import org.junit.Test; + +import java.net.URL; +import java.nio.charset.StandardCharsets; + +import static org.junit.Assert.assertEquals; + +public class InitializableParserTest { + + public static final String TIKA_CFG_FILE = "org/apache/tika/config/TIKA-1986-initializable.xml"; + + @Test + public void testInitializableParser() throws Exception { + URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE); + assert configFileUrl != null; + TikaConfig config = new TikaConfig(configFileUrl); + Tika tika = new Tika(config); + Metadata md = new Metadata(); + tika.parse(TikaInputStream.get("someString".getBytes(StandardCharsets.ISO_8859_1)), md); + assertEquals("5", md.get(DummyInitializableParser.SUM_FIELD)); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java index 31c59c0..1471504 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java +++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java @@ -82,7 +82,7 @@ public class ParameterizedParserTest { Metadata md = getMetadata("TIKA-1986-some-parameters.xml"); assertEquals("-6.0", md.get("xdouble")); assertEquals("testparamval", md.get("testparam")); - assertEquals("true", md.get("xbool")); + assertEquals("false", md.get("xbool")); } @Test http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml new file mode 100644 index 0000000..0b11bb4 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-initializable.xml @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DummyInitializableParser"> + <params> + <param name="shortA" type="short">2</param> + <param name="shortB" type="short">3</param> + </params> + </parser> + + </parsers> +</properties> http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml ---------------------------------------------------------------------- diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml index 250d439..dea8269 100644 --- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-some-parameters.xml @@ -20,7 +20,7 @@ <parser class="org.apache.tika.parser.DummyParameterizedParser"> <params> <param name="testparam" type="string">testparamval</param> - <param name="testbool" type="bool">false</param> + <param name="xbool" type="bool">false</param> </params> </parser> http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index a5673ee..7b12d58 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -109,14 +109,8 @@ public class PDFParser extends AbstractParser { Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - //step 1, check to see if there are params for the PDFParser class - Map<String, Param<?>> params = context.getParams(PDFParser.class); - PDFParserConfig localConfig = new PDFParserConfig(); - if (params != null) { - AnnotationUtils.assignFieldParams(localConfig, params); - } else if (context.get(PDFParserConfig.class) != null) { - localConfig = context.get(PDFParserConfig.class, defaultConfig); - } + PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); + PDDocument pdfDocument = null; TemporaryResources tmp = new TemporaryResources(); http://git-wip-us.apache.org/repos/asf/tika/blob/01320372/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 2ef29f3..e9f55fe 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -61,6 +61,7 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.junit.AfterClass; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import org.xml.sax.ContentHandler; @@ -1231,6 +1232,7 @@ public class PDFParserTest extends TikaTest { } @Test + @Ignore("We've turned this off for now") public void testParameterizationViaContext() throws Exception { ParseContext context = new ParseContext();
