This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4161f2281bbc5b476b80e00c4fad7d54d1d12827 Author: tballison <[email protected]> AuthorDate: Mon Jul 3 07:58:46 2017 -0400 TIKA-2389 -- allow users to configure warnings for problems during initialization --- .../java/org/apache/tika/config/Initializable.java | 28 ++++++- .../tika/config/InitializableProblemHandler.java | 90 ++++++++++++++++++++++ .../java/org/apache/tika/config/ServiceLoader.java | 38 ++++++++- .../java/org/apache/tika/config/TikaConfig.java | 48 ++++++++++-- .../org/apache/tika/config/TikaConfigTest.java | 31 ++++++++ .../tika/parser/DummyInitializableParser.java | 23 ++++-- .../org/apache/tika/config/TIKA-2389-illegal.xml | 29 +++++++ .../config/TIKA-2389-throw-default-overridden.xml | 30 ++++++++ .../apache/tika/config/TIKA-2389-throw-default.xml | 30 ++++++++ .../tika/config/TIKA-2389-throw-per-parser.xml | 29 +++++++ .../tika/config/TIKA-2389-warn-per-parser.xml | 29 +++++++ .../tika/dl/imagerec/DL4JInceptionV3Net.java | 38 +++++---- .../org/apache/tika/parser/image/ImageParser.java | 24 +++++- .../org/apache/tika/parser/jdbc/SQLite3Parser.java | 47 ++++++++--- .../apache/tika/parser/ocr/TesseractOCRParser.java | 41 +++++++--- .../java/org/apache/tika/parser/pdf/PDFParser.java | 73 +++++++++++++++++- .../recognition/ObjectRecognitionParser.java | 26 ++++--- .../recognition/tf/TensorflowImageRecParser.java | 32 ++++---- .../recognition/tf/TensorflowRESTRecogniser.java | 25 +++--- .../parser/sentiment/analysis/SentimentParser.java | 21 +++-- 20 files changed, 636 insertions(+), 96 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/config/Initializable.java b/tika-core/src/main/java/org/apache/tika/config/Initializable.java index 8beaf77..a325670 100644 --- a/tika-core/src/main/java/org/apache/tika/config/Initializable.java +++ b/tika-core/src/main/java/org/apache/tika/config/Initializable.java @@ -16,18 +16,40 @@ */ package org.apache.tika.config; -import org.apache.tika.exception.TikaConfigException; - import java.util.Map; 
+import org.apache.tika.exception.TikaConfigException; + /** * Components that must do special processing across multiple fields * at initialization time should implement this interface. * <p> * TikaConfig will call initialize on Initializable classes after - * setting the parameters. + * setting the parameters for non-statically service loaded classes. + * <p> + * TikaConfig will call checkInitialization on all Initializables, + * whether they were loaded statically by the service loader or configured explicitly. */ public interface Initializable { + /** + * @param params params to use for initialization + * @throws TikaConfigException + */ void initialize(Map<String, Param> params) throws TikaConfigException; + + + /** + * Checks whether this component is ready for use and reports any problem found. + * + * @param problemHandler if there is a problem and no + * custom initializableProblemHandler has been configured + * via Initializable parameters, + * this is called to respond. + * @throws TikaConfigException + */ + void checkInitialization(InitializableProblemHandler problemHandler) + throws TikaConfigException; + + } diff --git a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java new file mode 100644 index 0000000..db2bfc1 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + + +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.tika.exception.TikaConfigException; + + +/** + * This is to be used to handle potentially recoverable problems that + * might arise during initialization. + */ +public interface InitializableProblemHandler { + + + void handleInitializableProblem(String className, String message) throws TikaConfigException; + + /** + * Strategy that simply ignores all problems. + */ + InitializableProblemHandler IGNORE = new InitializableProblemHandler() { + public void handleInitializableProblem(String className, String message) { + } + @Override + public String toString() { + return "IGNORE"; + } + }; + + /** + * Strategy that logs all problems at INFO level using a {@link Logger} + * created using the given class name. + */ + InitializableProblemHandler INFO = new InitializableProblemHandler() { + public void handleInitializableProblem(String classname, String message) { + Logger.getLogger(classname).log( + Level.INFO, message); + } + @Override + public String toString() { + return "INFO"; + } + }; + + + /** + * Strategy that logs warnings of all problems using a {@link Logger} + * created using the given class name. 
+ */ + InitializableProblemHandler WARN = new InitializableProblemHandler() { + public void handleInitializableProblem(String classname, String message) { + Logger.getLogger(classname).log( + Level.WARNING, message); + } + @Override + public String toString() { + return "WARN"; + } + }; + + InitializableProblemHandler THROW = new InitializableProblemHandler() { + public void handleInitializableProblem(String classname, String message) throws TikaConfigException { + throw new TikaConfigException(message); + } + @Override + public String toString() { + return "THROW"; + } + }; + + InitializableProblemHandler DEFAULT = WARN; + +} diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java index 66c2883..c546b37 100644 --- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java +++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java @@ -120,14 +120,22 @@ public class ServiceLoader { private final ClassLoader loader; private final LoadErrorHandler handler; + private final InitializableProblemHandler initializableProblemHandler; private final boolean dynamic; - + public ServiceLoader( - ClassLoader loader, LoadErrorHandler handler, boolean dynamic) { + ClassLoader loader, LoadErrorHandler handler, + InitializableProblemHandler initializableProblemHandler, boolean dynamic) { this.loader = loader; this.handler = handler; + this.initializableProblemHandler = initializableProblemHandler; this.dynamic = dynamic; + + } + public ServiceLoader( + ClassLoader loader, LoadErrorHandler handler, boolean dynamic) { + this(loader, handler, InitializableProblemHandler.WARN, dynamic); } public ServiceLoader(ClassLoader loader, LoadErrorHandler handler) { @@ -165,6 +173,16 @@ public class ServiceLoader { } /** + * Returns the handler for problems with initializables + * + * @return handler for problems with initializables + * @since Apache Tika 1.15.1 + */ + public 
InitializableProblemHandler getInitializableProblemHandler() { + return initializableProblemHandler; + } + + /** * Returns an input stream for reading the specified resource from the * configured class loader. * @@ -182,6 +200,16 @@ public class ServiceLoader { } /** + * + * @return ClassLoader used by this ServiceLoader + * @see #getContextClassLoader() for the context's ClassLoader + * @since Apache Tika 1.15.1 + */ + public ClassLoader getLoader() { + return loader; + } + + /** * Loads and returns the named service class that's expected to implement * the given interface. * @@ -327,7 +355,11 @@ public class ServiceLoader { try { Class<?> klass = loader.loadClass(name); if (iface.isAssignableFrom(klass)) { - providers.add((T) klass.newInstance()); + T instance = (T) klass.newInstance(); + if (instance instanceof Initializable) { + ((Initializable)instance).checkInitialization(InitializableProblemHandler.WARN); + } + providers.add(instance); } } catch (Throwable t) { handler.handleLoadError(name, t); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 06eb43b..8ca0d6b 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -34,6 +34,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; @@ -47,6 +48,7 @@ import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; @@ -231,7 
+233,6 @@ public class TikaConfig { * @throws TikaException if problem with MimeTypes or parsing XML config */ public TikaConfig() throws TikaException, IOException { - this.serviceLoader = new ServiceLoader(); String config = System.getProperty("tika.config"); if (config == null) { @@ -239,6 +240,7 @@ public class TikaConfig { } if (config == null) { + this.serviceLoader = new ServiceLoader(); this.mimeTypes = getDefaultMimeTypes(ServiceLoader.getContextClassLoader()); this.encodingDetector = getDefaultEncodingDetector(serviceLoader); this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector); @@ -246,8 +248,10 @@ public class TikaConfig { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); } else { - try (InputStream stream = getConfigInputStream(config, serviceLoader)) { + ServiceLoader tmpServiceLoader = new ServiceLoader(); + try (InputStream stream = getConfigInputStream(config, tmpServiceLoader)) { Element element = getBuilder().parse(stream).getDocumentElement(); + serviceLoader = serviceLoaderFromDomElement(element, tmpServiceLoader.getLoader()); DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader(); TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader(); @@ -474,7 +478,7 @@ public class TikaConfig { return Collections.emptySet(); } - private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassLoader loader) { + private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassLoader loader) throws TikaConfigException { Element serviceLoaderElement = getChild(element, "service-loader"); ServiceLoader serviceLoader; if (serviceLoaderElement != null) { @@ -486,8 +490,9 @@ public class TikaConfig { } else if(LoadErrorHandler.THROW.toString().equalsIgnoreCase(loadErrorHandleConfig)) { loadErrorHandler = LoadErrorHandler.THROW; } - - serviceLoader = new 
ServiceLoader(loader, loadErrorHandler, dynamic); + + InitializableProblemHandler initializableProblemHandler = getInitializableProblemHandler(serviceLoaderElement.getAttribute("initializableProblemHandler")); + serviceLoader = new ServiceLoader(loader, loadErrorHandler, initializableProblemHandler, dynamic); } else if(loader != null) { serviceLoader = new ServiceLoader(loader); } else { @@ -496,6 +501,26 @@ public class TikaConfig { return serviceLoader; } + private static InitializableProblemHandler getInitializableProblemHandler(String initializableProblemHandler) + throws TikaConfigException { + if (initializableProblemHandler == null || initializableProblemHandler.length() == 0) { + return InitializableProblemHandler.DEFAULT; + } + if (InitializableProblemHandler.IGNORE.toString().equalsIgnoreCase(initializableProblemHandler)) { + return InitializableProblemHandler.IGNORE; + } else if (InitializableProblemHandler.INFO.toString().equalsIgnoreCase(initializableProblemHandler)) { + return InitializableProblemHandler.INFO; + } else if (InitializableProblemHandler.WARN.toString().equalsIgnoreCase(initializableProblemHandler)) { + return InitializableProblemHandler.WARN; + } else if (InitializableProblemHandler.THROW.toString().equalsIgnoreCase(initializableProblemHandler)) { + return InitializableProblemHandler.THROW; + } + throw new TikaConfigException( + String.format(Locale.US, "Couldn't parse non-null '%s'. 
Must be one of 'ignore', 'info', 'warn' or 'throw'", + initializableProblemHandler)); + } + + private static abstract class XmlLoader<CT,T> { protected static final String PARAMS_TAG_NAME = "params"; abstract boolean supportsComposite(); @@ -547,6 +572,16 @@ public class TikaConfig { T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) throws TikaException, IOException { String name = element.getAttribute("class"); + + String initProbHandler = element.getAttribute("initializableProblemHandler"); + InitializableProblemHandler initializableProblemHandler; + if (initProbHandler == null || initProbHandler.length() == 0) { + initializableProblemHandler = loader.getInitializableProblemHandler(); + } else { + initializableProblemHandler = + getInitializableProblemHandler(initProbHandler); + } + T loaded = null; try { @@ -601,6 +636,7 @@ public class TikaConfig { AnnotationUtils.assignFieldParams(loaded, params); if (loaded instanceof Initializable) { ((Initializable) loaded).initialize(params); + ((Initializable) loaded).checkInitialization(initializableProblemHandler); } // Have any decoration performed, eg explicit mimetypes loaded = decorate(loaded, element); @@ -630,6 +666,7 @@ public class TikaConfig { } } + T newInstance(Class<? 
extends T> loadedClass) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { @@ -1087,4 +1124,5 @@ public class TikaConfig { } } + } diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java index fed0968..aa10923 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java @@ -28,6 +28,7 @@ import org.apache.tika.ResourceLoggingClassLoader; import org.apache.tika.config.DummyExecutor; import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfigTest; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; @@ -261,4 +262,34 @@ public class TikaConfigTest extends AbstractTikaConfigTest { assertEquals("Should have configured Core Threads", 3, executorService.getCorePoolSize()); assertEquals("Should have configured Max Threads", 10, executorService.getMaximumPoolSize()); } + + @Test(expected = TikaConfigException.class) + public void testInitializerBadValue() throws Exception { + TikaConfig config = getConfig("TIKA-2389-illegal.xml"); + } + + + @Test(expected = TikaConfigException.class) + public void testInitializerPerParserThrow() throws Exception { + TikaConfig config = getConfig("TIKA-2389-throw-per-parser.xml"); + } + + @Test(expected = TikaConfigException.class) + public void testInitializerServiceLoaderThrow() throws Exception { + TikaConfig config = getConfig("TIKA-2389-throw-default.xml"); + } + + @Test + public void testInitializerServiceLoaderThrowButOverridden() throws Exception { + //TODO: test that this was logged at INFO level + TikaConfig config = getConfig("TIKA-2389-throw-default-overridden.xml"); + } + + + @Test + public void testInitializerPerParserWarn() 
throws Exception { + //TODO: test that this was logged at WARN level + TikaConfig config = getConfig("TIKA-2389-warn-per-parser.xml"); + } + } \ No newline at end of file diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java index 6365af2..8c80f3c 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java @@ -17,8 +17,15 @@ package org.apache.tika.parser; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -27,12 +34,6 @@ import org.apache.tika.mime.MediaType; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.IOException; -import java.io.InputStream; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - /** * This tests that initialize() is called after adding the parameters * configured via TikaConfig @@ -65,4 +66,14 @@ public class DummyInitializableParser extends AbstractParser implements Initiali shortB = (Short)params.get("shortB").getValue(); sum = shortA+shortB; } + + @Override + public void checkInitialization(InitializableProblemHandler + handler) throws TikaConfigException { + //completely arbitrary + if (sum > 1000) { + handler.handleInitializableProblem("DummyInitializableParser", + "sum cannot be > 1000: "+sum); + } + } } diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-illegal.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-illegal.xml new file mode 100644 index 
0000000..3869b1b --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-illegal.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser-exclude class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.DummyInitializableParser" initializableProblemHandler="impossible.method"> + <params> + <param name="shortA" type="short">500</param> + <param name="shortB" type="short">501</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-default-overridden.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-default-overridden.xml new file mode 100644 index 0000000..4c974ee --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-default-overridden.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <service-loader initializableProblemHandler="throw"/> + <parsers> + <parser-exclude class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.DummyInitializableParser" initializableProblemHandler="info"> + <params> + <param name="shortA" type="short">500</param> + <param name="shortB" type="short">501</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-default.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-default.xml new file mode 100644 index 0000000..675aecf --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-default.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <service-loader initializableProblemHandler="throw"/> + <parsers> + <parser-exclude class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.DummyInitializableParser"> + <params> + <param name="shortA" type="short">500</param> + <param name="shortB" type="short">501</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-per-parser.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-per-parser.xml new file mode 100644 index 0000000..85e2935 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-throw-per-parser.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <parsers> + <parser-exclude class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.DummyInitializableParser" initializableProblemHandler="throw"> + <params> + <param name="shortA" type="short">500</param> + <param name="shortB" type="short">501</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-warn-per-parser.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-warn-per-parser.xml new file mode 100644 index 0000000..c87c55d --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-2389-warn-per-parser.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<properties> + <parsers> + <parser-exclude class="org.apache.tika.parser.DefaultParser"/> + <parser class="org.apache.tika.parser.DummyInitializableParser" initializableProblemHandler="warn"> + <params> + <param name="shortA" type="short">500</param> + <param name="shortB" type="short">501</param> + </params> + </parser> + + </parsers> +</properties> diff --git a/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java b/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java index 62e6b1e..a6f8aa3 100644 --- a/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java +++ b/tika-dl/src/main/java/org/apache/tika/dl/imagerec/DL4JInceptionV3Net.java @@ -16,8 +16,25 @@ */ package org.apache.tika.dl.imagerec; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.commons.io.FileUtils; import org.apache.tika.config.Field; +import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -42,22 +59,6 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; -import java.net.URI; -import java.net.URISyntaxException; -import java.net.URL; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; 
-import java.util.Set; - /** * {@link DL4JInceptionV3Net} is an implementation of {@link ObjectRecogniser}. * This object recogniser is powered by <a href="http://deeplearning4j.org">Deeplearning4j</a>. @@ -273,6 +274,11 @@ public class DL4JInceptionV3Net implements ObjectRecogniser { } @Override + public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { + //TODO: what do we want to check here? + } + + @Override public boolean isAvailable() { return graph != null; } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java index ddb4f92..9cab73f 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java @@ -27,9 +27,14 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; +import java.util.Map; import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -45,7 +50,7 @@ import org.w3c.dom.Node; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -public class ImageParser extends AbstractParser { +public class ImageParser extends AbstractParser implements Initializable { /** * Serial version UID */ @@ -71,7 +76,6 @@ public class ImageParser extends AbstractParser { Class.forName("com.levigo.jbig2.JBIG2ImageReader"); TMP_SUPPORTED.add(MediaType.image("x-jbig2")); } catch (ClassNotFoundException e) { - LOG.warn("JBIG2ImageReader not loaded. 
jbig2 files will be ignored"); } } @@ -217,4 +221,20 @@ public class ImageParser extends AbstractParser { xhtml.startDocument(); xhtml.endDocument(); } + + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + + } + + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { + try { + Class.forName("com.levigo.jbig2.JBIG2ImageReader"); + } catch (ClassNotFoundException e) { + problemHandler.handleInitializableProblem("org.apache.tika.parser.ImageParser", + "com.levigo.jbig2.JBIG2ImageReader not on class path. The ImageParser will skip jbig2 images"); + } + + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java index 1cba2e4..1d0cdcd 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java @@ -19,8 +19,13 @@ package org.apache.tika.parser.jdbc; import java.io.IOException; import java.io.InputStream; import java.util.Collections; +import java.util.Map; import java.util.Set; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -45,7 +50,7 @@ import org.xml.sax.SAXException; * If using a TikaInputStream, make sure to close it to delete the temp file * that has to be created. 
*/ -public class SQLite3Parser extends AbstractParser { +public class SQLite3Parser extends AbstractParser implements Initializable { /** * Serial version UID */ @@ -53,14 +58,8 @@ public class SQLite3Parser extends AbstractParser { private static final MediaType MEDIA_TYPE = MediaType.application("x-sqlite3"); - private final Set<MediaType> SUPPORTED_TYPES; - - /** - * Checks to see if class is available for org.sqlite.JDBC. - * <p/> - * If not, this class will return an EMPTY_SET for getSupportedTypes() - */ - public SQLite3Parser() { + private static final Set<MediaType> SUPPORTED_TYPES; + static { Set<MediaType> tmp; try { Class.forName(SQLite3DBParser.SQLITE_CLASS_NAME); @@ -68,7 +67,15 @@ public class SQLite3Parser extends AbstractParser { } catch (ClassNotFoundException e) { tmp = Collections.EMPTY_SET; } - SUPPORTED_TYPES = tmp; + SUPPORTED_TYPES = Collections.unmodifiableSet(tmp); + } + /** + * Checks to see if class is available for org.sqlite.JDBC. + * <p/> + * If not, this class will return an EMPTY_SET for getSupportedTypes() + */ + public SQLite3Parser() { + } @Override @@ -81,4 +88,24 @@ public class SQLite3Parser extends AbstractParser { SQLite3DBParser p = new SQLite3DBParser(); p.parse(stream, handler, metadata, context); } + + /** + * No-op + * @param params params to use for initialization + * @throws TikaConfigException + */ + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + + } + + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { + if (SUPPORTED_TYPES.size() == 0) { + problemHandler.handleInitializableProblem("org.apache.tika.parser.SQLite3Parser", + "org.xerial's sqlite-jdbc is not loaded.\n" + + "Please provide the jar on your classpath to parse sqlite files.\n" + + "See tika-parsers/pom.xml for the correct version."); + } + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 121e096..bfa3ff5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -52,6 +52,10 @@ import org.apache.commons.exec.DefaultExecutor; import org.apache.commons.exec.PumpStreamHandler; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -89,7 +93,7 @@ import org.xml.sax.helpers.DefaultHandler; * * */ -public class TesseractOCRParser extends AbstractParser { +public class TesseractOCRParser extends AbstractParser implements Initializable { private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class); private static final long serialVersionUID = -8167538283213097265L; @@ -101,7 +105,6 @@ public class TesseractOCRParser extends AbstractParser { MediaType.image("jpx"), MediaType.image("x-portable-pixmap") }))); private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>(); - private static volatile boolean HAS_ALERTED = false; @Override @@ -109,14 +112,6 @@ public class TesseractOCRParser extends AbstractParser { // If Tesseract is installed, offer our supported image types TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); if (hasTesseract(config)) { - if (! 
HAS_ALERTED) { - LOG.info("Tesseract OCR is installed and will be automatically applied to image files.\n"+ - "This may dramatically slow down content extraction (TIKA-2359).\n"+ - "As of Tika 1.15 (and prior versions), Tesseract is automatically called.\n"+ - "In future versions of Tika, users may need to turn the TesseractOCRParser on via TikaConfig." - ); - HAS_ALERTED = true; - } return SUPPORTED_TYPES; } // Otherwise don't advertise anything, so the other image parsers @@ -395,9 +390,35 @@ public class TesseractOCRParser extends AbstractParser { } } + /** + * no-op + * @param params params to use for initialization + * @throws TikaConfigException + */ + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + + } + @Override + public void checkInitialization(InitializableProblemHandler problemHandler) + throws TikaConfigException { + //this will incorrectly trigger for people who turn off Tesseract + //by sending in a bogus tesseract path via a custom TesseractOCRConfig. + //TODO: figure out how to solve that. 
+ if (hasTesseract(DEFAULT_CONFIG)) { + problemHandler.handleInitializableProblem(this.getClass().getName(), + "Tesseract OCR is installed and will be automatically applied to image files.\n" + + "This may dramatically slow down content extraction (TIKA-2359).\n" + + "As of Tika 1.15 (and prior versions), Tesseract is automatically called.\n" + + "In future versions of Tika, users may need to turn the TesseractOCRParser on via TikaConfig."); + } + } // TIKA-1445 workaround parser private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser(); + + + private static class CompositeImageParser extends CompositeParser { private static final long serialVersionUID = -2398203346206381382L; private static List<Parser> imageParsers = Arrays.asList(new Parser[]{ diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 7adc281..35051a0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -26,6 +26,7 @@ import java.util.Calendar; import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.Map; import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; @@ -45,7 +46,11 @@ import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.poi.util.IOUtils; import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; @@ -94,7 +99,7 
@@ import org.xml.sax.SAXException; * <a href="http://tabula.technology/">tabula</a> for one project that * tries to maintain the structure of tables represented in PDFs. */ -public class PDFParser extends AbstractParser { +public class PDFParser extends AbstractParser implements Initializable { /** @@ -112,6 +117,7 @@ public class PDFParser extends AbstractParser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); private PDFParserConfig defaultConfig = new PDFParserConfig(); + private InitializableProblemHandler initializableProblemHandler = null; public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -690,6 +696,22 @@ public class PDFParser extends AbstractParser { defaultConfig.setExtractActions(extractActions); } + @Field + void setInitializableProblemHander(String name) { + if ("ignore".equals(name)) { + setInitializableProblemHandler(InitializableProblemHandler.IGNORE); + } else if ("info".equalsIgnoreCase(name)) { + setInitializableProblemHandler(InitializableProblemHandler.INFO); + } else if ("warn".equalsIgnoreCase(name)) { + setInitializableProblemHandler(InitializableProblemHandler.WARN); + } else if ("throw".equalsIgnoreCase(name)) { + setInitializableProblemHandler(InitializableProblemHandler.THROW); + } + } + + public void setInitializableProblemHandler(InitializableProblemHandler initializableProblemHandler) { + this.initializableProblemHandler = initializableProblemHandler; + } //can return null! private Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) { if (pdMetadata == null) { @@ -715,4 +737,53 @@ public class PDFParser extends AbstractParser { } + /** + * This is a no-op. There is no need to initialize multiple fields. + * The regular field loading should happen without this. 
+ * + * @param params params to use for initialization + * @throws TikaConfigException + */ + @Override + public void initialize(Map<String, Param> params) throws TikaConfigException { + //no-op + } + + @Override + public void checkInitialization(InitializableProblemHandler handler) throws TikaConfigException { + StringBuilder sb = new StringBuilder(); + try { + Class.forName("com.levigo.jbig2.JBIG2ImageReader"); + } catch (ClassNotFoundException e) { + sb.append("JBIG2ImageReader not loaded. jbig2 files will be ignored\n"); + sb.append("See https://pdfbox.apache.org/2.0/dependencies.html#jai-image-io\n"); + sb.append("for optional dependencies.\n"); + } + try { + Class.forName("com.github.jaiimageio.impl.plugins.tiff.TIFFImageWriter"); + } catch (ClassNotFoundException e) { + sb.append("TIFFImageWriter not loaded. tiff files will not be processed\n"); + sb.append("See https://pdfbox.apache.org/2.0/dependencies.html#jai-image-io\n"); + sb.append("for optional dependencies.\n"); + + } + + try { + Class.forName("com.github.jaiimageio.jpeg2000.impl.J2KImageReader"); + } catch (ClassNotFoundException e) { + sb.append("J2KImageReader not loaded. JPEG2000 files will not be processed.\n"); + sb.append("See https://pdfbox.apache.org/2.0/dependencies.html#jai-image-io\n"); + sb.append("for optional dependencies.\n"); + } + + if (sb.length() > 0) { + InitializableProblemHandler localInitializableProblemHandler = + (initializableProblemHandler == null) ? 
+ handler : initializableProblemHandler; + localInitializableProblemHandler.handleInitializableProblem("org.apache.tika.parsers.PDFParser", + sb.toString()); + } + + } + } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java index 00ba5cc..cf64989 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java @@ -16,8 +16,19 @@ */ package org.apache.tika.parser.recognition; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -34,16 +45,6 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.ArrayList; -import java.util.Locale; -import java.util.Map; -import java.util.Set; - /** * This parser recognises objects from Images. @@ -104,6 +105,11 @@ public class ObjectRecognitionParser extends AbstractParser implements Initializ } @Override + public void checkInitialization(InitializableProblemHandler handler) throws TikaConfigException { + //TODO -- what do we want to check? + } + + @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return recogniser.isAvailable() ? 
recogniser.getSupportedMimes() : Collections.<MediaType>emptySet(); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java index e00d48b..5793791 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java @@ -16,7 +16,21 @@ */ package org.apache.tika.parser.recognition.tf; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + import org.apache.tika.config.Field; +import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -32,19 +46,6 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - /** * This is an implementation of {@link ObjectRecogniser} powered by <a href="http://www.tensorflow.org"> Tensorflow <a/> * convolutional neural network (CNN). This implementation binds to Python API using {@link ExternalParser}. 
@@ -136,6 +137,11 @@ public class TensorflowImageRecParser extends ExternalParser implements ObjectRe } @Override + public void checkInitialization(InitializableProblemHandler handler) throws TikaConfigException { + //TODO -- what do we want to check? + } + + @Override public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java index 2b61408..7b8fcfb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java @@ -17,12 +17,22 @@ package org.apache.tika.parser.recognition.tf; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.ByteArrayEntity; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.tika.config.Field; +import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -39,15 +49,6 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.URI; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - /** * Tensor Flow 
image recogniser which has high performance. * This implementation uses Tensorflow via REST API. @@ -99,6 +100,12 @@ public class TensorflowRESTRecogniser implements ObjectRecogniser { } @Override + public void checkInitialization(InitializableProblemHandler handler) + throws TikaConfigException { + //TODO -- what do we want to check? + } + + @Override public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/sentiment/analysis/SentimentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/sentiment/analysis/SentimentParser.java index a3efc06..4192936 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/sentiment/analysis/SentimentParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/sentiment/analysis/SentimentParser.java @@ -17,11 +17,20 @@ package org.apache.tika.parser.sentiment.analysis; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.util.Collections; +import java.util.Map; +import java.util.Set; + import opennlp.tools.sentiment.SentimentME; import opennlp.tools.sentiment.SentimentModel; import org.apache.commons.io.IOUtils; import org.apache.tika.config.Field; import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; @@ -34,14 +43,6 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.util.Collections; -import java.util.Map; -import java.util.Set; - /** * This parser classifies documents based on the sentiment of document. 
* The classifier is powered by Apache OpenNLP's Maximum Entropy Classifier @@ -105,6 +106,10 @@ public class SentimentParser extends AbstractParser implements Initializable { } } + @Override + public void checkInitialization(InitializableProblemHandler handler) throws TikaConfigException { + //TODO -- what do we want to check? + } /** * Returns the types supported * -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
