This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit b2ca3781f7a27e7c0ca627359d13a66c56940039 Author: Ewan Mellor <[email protected]> AuthorDate: Wed Feb 21 13:43:44 2018 -0800 Fix for TIKA-2584 contributed by ewanmellor. Add TesseractOCRConfig.{add,get}OtherTesseractConfig, plus parsing of TesseractOCRConfig.properties to extract any key-value pair where the key has an underscore. Inside TesseractOCRParser, pass these key-value pairs to Tesseract using its -c command line option. This gives a mechanism by which user code can pass arbitrary options to Tesseract without Tika having to understand them. --- .../apache/tika/parser/ocr/TesseractOCRConfig.java | 42 ++++++++++++++++++++++ .../apache/tika/parser/ocr/TesseractOCRParser.java | 15 ++++++-- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index 4139cd2..07bb7f8 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -20,7 +20,9 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; +import java.util.HashMap; import java.util.Locale; +import java.util.Map; import java.util.Properties; /** @@ -100,6 +102,9 @@ public class TesseractOCRConfig implements Serializable { // whether or not to apply rotation calculated by the rotation.py script private boolean applyRotation = false; + // See addOtherTesseractConfig. + private Map<String, String> otherTesseractConfig = new HashMap<>(); + /** * Default contructor. @@ -178,6 +183,7 @@ public class TesseractOCRConfig implements Serializable { setApplyRotation( getProp(props, "applyRotation", getApplyRotation())); + loadOtherTesseractConfig(props); } /** @@ -517,6 +523,28 @@ public class TesseractOCRConfig implements Serializable { } /** + * @see #addOtherTesseractConfig(String, String) + */ + public Map<String, String> getOtherTesseractConfig() { + return otherTesseractConfig; + } + + /** + * Add a key-value pair to pass to Tesseract using its -c command line option. + * To see the possible options, run tesseract --print-parameters. + * + * You may also add these parameters in TesseractOCRConfig.properties; any + * key-value pair in the properties file where the key contains an underscore + * is passed directly to Tesseract. + * + * @param key + * @param value + */ + public void addOtherTesseractConfig(String key, String value) { + otherTesseractConfig.put(key, value); + } + + /** * Get property from the properties file passed in. * * @param properties properties file to read from. @@ -565,4 +593,18 @@ public class TesseractOCRConfig implements Serializable { property, propVal)); } + /** + * Populate otherTesseractConfig from the given properties. + * This assumes that any key-value pair where the key contains + * an underscore is an option to be passed opaquely to Tesseract. + * + * @param properties properties file to read from. + */ + private void loadOtherTesseractConfig(Properties properties) { + for (String k : properties.stringPropertyNames()) { + if (k.contains("_")) { + otherTesseractConfig.put(k, properties.getProperty(k)); + } + } + } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 3e15c44..6bf2ab4 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -34,6 +34,7 @@ import java.io.Reader; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -465,12 +466,20 @@ public class TesseractOCRParser extends AbstractParser implements Initializable * if an input error occurred */ private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { - String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", + ArrayList<String> cmd = new ArrayList<>(Arrays.asList( + config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", config.getLanguage(), "-psm", config.getPageSegMode(), - config.getOutputType().name().toLowerCase(Locale.US), + config.getOutputType().name().toLowerCase(Locale.US) + )); + for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) { + cmd.add("-c"); + cmd.add(entry.getKey() + "=" + entry.getValue()); + } + cmd.addAll(Arrays.asList( "-c", "page_separator=" + config.getPageSeparator(), "-c", - (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"}; + (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0" + )); ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(config, pb); final Process process = pb.start(); -- To stop receiving notification emails like this one, please contact [email protected].
