This is an automated email from the ASF dual-hosted git repository. dmeikle pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push: new 0aaa121 TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira 0aaa121 is described below commit 0aaa1215fd11632c349e9bdebac9829578276cb1 Author: David Meikle <da...@logicalspark.com> AuthorDate: Mon May 8 14:32:19 2017 +0100 TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira --- CHANGES.txt | 3 +++ .../src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java | 4 ++-- .../test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index ce0e247..416311e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -107,6 +107,9 @@ Release 1.15 - ?? * Further mime magic for WebVTT (TIKA-1772) + * Extend support for increased PSM options up to 13 for modern + versions of Tesseract (TIKA-2357). + Release 1.14 - 10/19/2016 * Extract all headers from MSG/RFC822 (TIKA-2122). diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index e861876..624c97e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -244,8 +244,8 @@ public class TesseractOCRConfig implements Serializable { * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection) */ public void setPageSegMode(String pageSegMode) { - if (!pageSegMode.matches("[1-9]|10")) { - throw new IllegalArgumentException("Invalid language code"); + if (!pageSegMode.matches("[0-9]|10|11|12|13")) { + throw new IllegalArgumentException("Invalid page segmentation mode"); } this.pageSegMode = pageSegMode; } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java 
b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java index fcdd271..adec5db 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java @@ -129,7 +129,7 @@ public class TesseractOCRConfigTest extends TikaTest { config.setPageSegMode("0"); config.setPageSegMode("10"); assertTrue("Couldn't set valid values", true); - config.setPageSegMode("11"); + config.setPageSegMode("14"); } @Test(expected=IllegalArgumentException.class) -- To stop receiving notification emails like this one, please contact commits@tika.apache.org.