[
https://issues.apache.org/jira/browse/TIKA-2613?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16419582#comment-16419582
]
ASF GitHub Bot commented on TIKA-2613:
--------------------------------------
tballison closed pull request #230: Fix for TIKA-2613 contributed by ewanmellor.
URL: https://github.com/apache/tika/pull/230
This is a PR merged from a forked repository.
Because GitHub hides the original diff of a foreign (forked) pull request
once it has been merged, the diff is reproduced below for the sake of
provenance:
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index c8c8bc93e..6ee050156 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -20,7 +20,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
+import java.util.HashMap;
import java.util.Locale;
+import java.util.Map;
import java.util.Properties;
/**
@@ -91,12 +93,18 @@
// factor by which image is to be scaled.
private int resize = 900;
+ // See setPageSeparator.
+ private String pageSeparator = "";
+
// whether or not to preserve interword spacing
private boolean preserveInterwordSpacing = false;
// whether or not to apply rotation calculated by the rotation.py script
private boolean applyRotation = false;
+ // See addOtherTesseractConfig.
+ private Map<String, String> otherTesseractConfig = new HashMap<>();
+
/**
* Default contructor.
@@ -175,6 +183,7 @@ private void init(InputStream is) {
setApplyRotation(
getProp(props, "applyRotation", getApplyRotation()));
+ loadOtherTesseractConfig(props);
}
/**
@@ -255,6 +264,25 @@ public void setPageSegMode(String pageSegMode) {
this.pageSegMode = pageSegMode;
}
+ /**
+ * @see #setPageSeparator(String pageSeparator)
+ */
+ public String getPageSeparator() {
+ return pageSeparator;
+ }
+
+ /**
+ * The page separator to use in plain text output. This corresponds to
Tesseract's page_separator config option.
+ * The default here is the empty string (i.e. no page separators). Note
that this is also the default in
+ * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed
control character. We are overriding
+ * Tesseract 4.0's default here.
+ *
+ * @param pageSeparator
+ */
+ public void setPageSeparator(String pageSeparator) {
+ this.pageSeparator = pageSeparator;
+ }
+
/**
* Whether or not to maintain interword spacing. Default is
<code>false</code>.
*
@@ -494,6 +522,28 @@ public void setApplyRotation(boolean applyRotation) {
this.applyRotation = applyRotation;
}
+ /**
+ * @see #addOtherTesseractConfig(String, String)
+ */
+ public Map<String, String> getOtherTesseractConfig() {
+ return otherTesseractConfig;
+ }
+
+ /**
+ * Add a key-value pair to pass to Tesseract using its -c command line
option.
+ * To see the possible options, run tesseract --print-parameters.
+ *
+ * You may also add these parameters in TesseractOCRConfig.properties; any
+ * key-value pair in the properties file where the key contains an
underscore
+ * is passed directly to Tesseract.
+ *
+ * @param key
+ * @param value
+ */
+ public void addOtherTesseractConfig(String key, String value) {
+ otherTesseractConfig.put(key, value);
+ }
+
/**
* Get property from the properties file passed in.
*
@@ -543,4 +593,18 @@ private boolean getProp(Properties properties, String
property, boolean defaultM
property, propVal));
}
+ /**
+ * Populate otherTesseractConfig from the given properties.
+ * This assumes that any key-value pair where the key contains
+ * an underscore is an option to be passed opaquely to Tesseract.
+ *
+ * @param properties properties file to read from.
+ */
+ private void loadOtherTesseractConfig(Properties properties) {
+ for (String k : properties.stringPropertyNames()) {
+ if (k.contains("_")) {
+ otherTesseractConfig.put(k, properties.getProperty(k));
+ }
+ }
+ }
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 08847fd74..f274ce164 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -34,6 +34,7 @@
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
@@ -465,11 +466,20 @@ public void
checkInitialization(InitializableProblemHandler problemHandler)
* if an input error occurred
*/
private void doOCR(File input, File output, TesseractOCRConfig config)
throws IOException, TikaException {
- String[] cmd = { config.getTesseractPath() + getTesseractProg(),
input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "-psm", config.getPageSegMode(),
- config.getOutputType().name().toLowerCase(Locale.US),
+ ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
+ config.getTesseractPath() + getTesseractProg(),
input.getPath(), output.getPath(), "-l",
+ config.getLanguage(), "--psm", config.getPageSegMode()
+ ));
+ for (Map.Entry<String, String> entry :
config.getOtherTesseractConfig().entrySet()) {
+ cmd.add("-c");
+ cmd.add(entry.getKey() + "=" + entry.getValue());
+ }
+ cmd.addAll(Arrays.asList(
+ "-c", "page_separator=" + config.getPageSeparator(),
"-c",
- (config.getPreserveInterwordSpacing())?
"preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
+ (config.getPreserveInterwordSpacing())?
"preserve_interword_spaces=1" : "preserve_interword_spaces=0",
+ config.getOutputType().name().toLowerCase(Locale.US)
+ ));
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
final Process process = pb.start();
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> Tesseract 4.0 has removed -psm, so Tika must update
> ---------------------------------------------------
>
> Key: TIKA-2613
> URL: https://issues.apache.org/jira/browse/TIKA-2613
> Project: Tika
> Issue Type: Improvement
> Components: parser
> Affects Versions: 1.17
> Reporter: Ewan Mellor
> Priority: Major
>
> Tesseract 4.0 (currently in beta-1) has removed the -psm flag, in favor
> of --psm (with two dashes).
> The --psm variant was introduced in Nov 2016, so it should be safe to
> simply switch Tika to use the two-dash variant, even for people still using
> Tesseract 3.05.
> For reference, the Tesseract cset is:
> {code}
> commit ee201e1f4fa277a4b2ecd751a45d3bf1eba6dfdb
> Author: Stefan Weil <[email protected]>
> Date: Sun Mar 25 17:28:33 2018 +0200
> Remove deprecated support for -psm argument (#1419)
> It was replaced by --psm and deprecated in commit 92d981b93.
> Signed-off-by: Stefan Weil <[email protected]>
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)