This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new e0deb67 TIKA-3207 -- allow for _vert for language codes
e0deb67 is described below
commit e0deb67cab2c47216a334c51d5affc5a27642c40
Author: tallison <[email protected]>
AuthorDate: Fri Oct 9 11:51:30 2020 -0400
TIKA-3207 -- allow for _vert for language codes
---
.../src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java | 4 ++--
.../test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java | 3 ++-
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 1c65ece..bdd6eb5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -247,9 +247,9 @@ public class TesseractOCRConfig implements Serializable {
* e.g. "chi_tra+chi_sim"
*/
public void setLanguage(String language) {
- if (!language.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4})?(\\+?))+")
+ if (!language.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2}(\\+?))+")
|| language.endsWith("+")) {
- throw new IllegalArgumentException("Invalid language code");
+ throw new IllegalArgumentException("Invalid language code: "+language);
}
this.language = language;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 8dd9cf6..246c24a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -99,7 +99,8 @@ public class TesseractOCRConfigTest extends TikaTest {
@Test
public void testValidateValidLanguage() {
List<String> validLanguages = Arrays.asList(
- "eng", "slk_frak", "chi_tra", "eng+fra", "tgk+chi_tra+slk_frak");
+ "eng", "slk_frak", "chi_tra", "eng+fra", "tgk+chi_tra+slk_frak",
+ "chi_tra_vert", "tgk+chi_tra_vert+slk_frak");
TesseractOCRConfig config = new TesseractOCRConfig();