Author: mattmann
Date: Wed Aug 5 01:33:13 2015
New Revision: 1694133
URL: http://svn.apache.org/r1694133
Log:
Fix for TIKA-1703: Can't Specify Tesseract Data Folder Distinct from Tesseract
Executable Path. Contributed by Christian Wolfe <[email protected]>; this closes
#56.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Aug 5 01:33:13 2015
@@ -1,3 +1,9 @@
+Release 1.11 - Current Development
+
+ * The ability to specify the path to the Tesseract 'tessdata' folder,
+ separately from the Tesseract executable path, was added
+ to the OCR Parser (TIKA-1703).
+
+
Release 1.10 - 8/1/2015
* Tika Config XML can now be used to create composite detectors,
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
Wed Aug 5 01:33:13 2015
@@ -25,7 +25,7 @@ import java.util.Properties;
/**
* Configuration for TesseractOCRParser.
- *
+ *
* This allows to enable TesseractOCRParser and set its parameters:
* <p>
* TesseractOCRConfig config = new TesseractOCRConfig();<br>
@@ -36,27 +36,30 @@ import java.util.Properties;
* Parameters can also be set by either editing the existing
TesseractOCRConfig.properties file in,
* tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it
by creating your own
* and placing it in the package org/apache/tika/parser/ocr on the classpath.
- *
+ *
*/
public class TesseractOCRConfig implements Serializable{
private static final long serialVersionUID = -4861942486845757891L;
-
+
// Path to tesseract installation folder, if not on system path.
private String tesseractPath = "";
-
+
+ // Path to the 'tessdata' folder, which contains language files and config
files.
+ private String tessdataPath = "";
+
// Language dictionary to be used.
private String language = "eng";
-
+
// Tesseract page segmentation mode.
private String pageSegMode = "1";
-
+
// Minimum file size to submit file to ocr.
private int minFileSizeToOcr = 0;
-
+
// Maximum file size to submit file to ocr.
private int maxFileSizeToOcr = Integer.MAX_VALUE;
-
+
// Maximum time (seconds) to wait for the ocring process termination
private int timeout = 120;
@@ -98,6 +101,8 @@ public class TesseractOCRConfig implemen
setTesseractPath(
getProp(props, "tesseractPath",
getTesseractPath()));
+ setTessdataPath(
+ getProp(props, "tessdataPath", getTessdataPath()));
setLanguage(
getProp(props, "language", getLanguage()));
setPageSegMode(
@@ -107,7 +112,7 @@ public class TesseractOCRConfig implemen
setMaxFileSizeToOcr(
getProp(props, "maxFileSizeToOcr",
getMaxFileSizeToOcr()));
setTimeout(
- getProp(props, "timeout", getTimeout()));
+ getProp(props, "timeout", getTimeout()));
}
@@ -115,22 +120,43 @@ public class TesseractOCRConfig implemen
public String getTesseractPath() {
return tesseractPath;
}
-
+
/**
- * Set tesseract installation folder, needed if it is not on system
path.
+ * Set the path to the Tesseract executable, needed if it is not on
system path.
+ * <p>
+ * Note that if you set this value, it is highly recommended that you also
+ * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+ * </p>
*/
public void setTesseractPath(String tesseractPath) {
if(!tesseractPath.isEmpty() &&
!tesseractPath.endsWith(File.separator))
tesseractPath += File.separator;
-
+
this.tesseractPath = tesseractPath;
}
-
+
+ /** @see #setTessdataPath(String tessdataPath) */
+ public String getTessdataPath() {
+ return tessdataPath;
+ }
+
+ /**
+ * Set the path to the 'tessdata' folder, which contains language files
and config files. In some cases (such
+ * as on Windows), this folder is found in the Tesseract installation, but
in other cases
+ * (such as when Tesseract is built from source), it may be located
elsewhere.
+ */
+ public void setTessdataPath(String tessdataPath) {
+ if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
+ tessdataPath += File.separator;
+
+ this.tessdataPath = tessdataPath;
+ }
+
/** @see #setLanguage(String language)*/
public String getLanguage() {
return language;
}
-
+
/**
* Set tesseract language dictionary to be used. Default is "eng".
* Multiple languages may be specified, separated by plus characters.
@@ -141,12 +167,12 @@ public class TesseractOCRConfig implemen
}
this.language = language;
}
-
+
/** @see #setPageSegMode(String pageSegMode)*/
public String getPageSegMode() {
return pageSegMode;
}
-
+
/**
* Set tesseract page segmentation mode.
* Default is 1 = Automatic page segmentation with OSD (Orientation and
Script Detection)
@@ -157,12 +183,12 @@ public class TesseractOCRConfig implemen
}
this.pageSegMode = pageSegMode;
}
-
+
/** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
public int getMinFileSizeToOcr() {
return minFileSizeToOcr;
}
-
+
/**
* Set minimum file size to submit file to ocr.
* Default is 0.
@@ -170,12 +196,12 @@ public class TesseractOCRConfig implemen
public void setMinFileSizeToOcr(int minFileSizeToOcr) {
this.minFileSizeToOcr = minFileSizeToOcr;
}
-
+
/** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
public int getMaxFileSizeToOcr() {
return maxFileSizeToOcr;
}
-
+
/**
* Set maximum file size to submit file to ocr.
* Default is Integer.MAX_VALUE.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Wed Aug 5 01:33:13 2015
@@ -97,9 +97,14 @@ public class TesseractOCRParser extends
}
private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
- if (!config.getTesseractPath().isEmpty()) {
- Map<String, String> env = pb.environment();
- env.put("TESSDATA_PREFIX", config.getTesseractPath());
+ String tessdataPrefix = "TESSDATA_PREFIX";
+ Map<String, String> env = pb.environment();
+
+ if (!config.getTessdataPath().isEmpty()) {
+ env.put(tessdataPrefix, config.getTessdataPath());
+ }
+ else if(!config.getTesseractPath().isEmpty()) {
+ env.put(tessdataPrefix, config.getTesseractPath());
}
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
Wed Aug 5 01:33:13 2015
@@ -32,6 +32,7 @@ public class TesseractOCRConfigTest exte
public void testNoConfig() throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
assertEquals("Invalid default tesseractPath value", "",
config.getTesseractPath());
+ assertEquals("Invalid default tessdataPath value", "",
config.getTessdataPath());
assertEquals("Invalid default language value", "eng",
config.getLanguage());
assertEquals("Invalid default pageSegMode value", "1",
config.getPageSegMode());
assertEquals("Invalid default minFileSizeToOcr value", 0,
config.getMinFileSizeToOcr());
@@ -47,6 +48,7 @@ public class TesseractOCRConfigTest exte
TesseractOCRConfig config = new TesseractOCRConfig(stream);
assertEquals("Invalid default tesseractPath value", "",
config.getTesseractPath());
+ assertEquals("Invalid default tessdataPath value", "",
config.getTessdataPath());
assertEquals("Invalid overridden language value", "fra+deu",
config.getLanguage());
assertEquals("Invalid default pageSegMode value", "1",
config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1,
config.getMinFileSizeToOcr());
@@ -62,6 +64,7 @@ public class TesseractOCRConfigTest exte
TesseractOCRConfig config = new TesseractOCRConfig(stream);
assertEquals("Invalid overridden tesseractPath value",
"/opt/tesseract" + File.separator, config.getTesseractPath());
+ assertEquals("Invalid overridden tesseractPath value",
"/usr/local/share" + File.separator, config.getTessdataPath());
assertEquals("Invalid overridden language value", "fra+deu",
config.getLanguage());
assertEquals("Invalid overridden pageSegMode value", "2",
config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1,
config.getMinFileSizeToOcr());
Modified:
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
(original)
+++
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
Wed Aug 5 01:33:13 2015
@@ -14,6 +14,7 @@
# limitations under the License.
tesseractPath=/opt/tesseract
+tessdataPath=/usr/local/share
language=fra+deu
pageSegMode=2
maxFileSizeToOcr=2000000