This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2efe3f97a7df0ac8863b225beb2deb41e99c1e90 Author: Ewan Mellor <[email protected]> AuthorDate: Mon Mar 26 16:25:31 2018 -0700 Fix for TIKA-2613 contributed by ewanmellor. Change -psm on the Tesseract command line to --psm, with two dashes. This matches a change in Tesseract 4.0 to remove the one-dash version. It has been deprecated since Nov 2016. The Tesseract cset is ee201e1f4. Also, move the config file (i.e. getOutputType in Tika's terms) so that it is the last parameter on the command line. Tesseract logs an error message (though otherwise doesn't fail) if the config file is not the last thing on the command line. --- .../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 6bf2ab4..f274ce1 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -468,8 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { ArrayList<String> cmd = new ArrayList<>(Arrays.asList( config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", - config.getLanguage(), "-psm", config.getPageSegMode(), - config.getOutputType().name().toLowerCase(Locale.US) + config.getLanguage(), "--psm", config.getPageSegMode() )); for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) { cmd.add("-c"); @@ -478,7 +477,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable cmd.addAll(Arrays.asList( "-c", "page_separator=" + config.getPageSeparator(), "-c", - (config.getPreserveInterwordSpacing())? 
"preserve_interword_spaces=1" : "preserve_interword_spaces=0" + (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0", + config.getOutputType().name().toLowerCase(Locale.US) )); ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(config, pb); -- To stop receiving notification emails like this one, please contact [email protected].
