This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 0aaa121 TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
0aaa121 is described below
commit 0aaa1215fd11632c349e9bdebac9829578276cb1
Author: David Meikle <[email protected]>
AuthorDate: Mon May 8 14:32:19 2017 +0100
TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
---
CHANGES.txt | 3 +++
.../src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java | 4 ++--
.../test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java | 2 +-
3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index ce0e247..416311e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -107,6 +107,9 @@ Release 1.15 - ??
* Further mime magic for WebVTT (TIKA-1772)
+ * Extend support for increased PSM options up to 13 for modern
+ versions of Tesseract (TIKA-2357).
+
Release 1.14 - 10/19/2016
* Extract all headers from MSG/RFC822 (TIKA-2122).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index e861876..624c97e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -244,8 +244,8 @@ public class TesseractOCRConfig implements Serializable {
* Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
*/
public void setPageSegMode(String pageSegMode) {
- if (!pageSegMode.matches("[1-9]|10")) {
- throw new IllegalArgumentException("Invalid language code");
+ if (!pageSegMode.matches("[0-9]|10|11|12|13")) {
+ throw new IllegalArgumentException("Invalid page segmentation mode");
}
this.pageSegMode = pageSegMode;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index fcdd271..adec5db 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -129,7 +129,7 @@ public class TesseractOCRConfigTest extends TikaTest {
config.setPageSegMode("0");
config.setPageSegMode("10");
assertTrue("Couldn't set valid values", true);
- config.setPageSegMode("11");
+ config.setPageSegMode("14");
}
@Test(expected=IllegalArgumentException.class)
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].