This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4c7affb  TIKA-3207 -- allow for _vert for language codes
4c7affb is described below

commit 4c7affbffc2e0723826edacb7ebdc4498cdd05b9
Author: tallison <[email protected]>
AuthorDate: Fri Oct 9 11:51:30 2020 -0400

    TIKA-3207 -- allow for _vert for language codes
---
 .../src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java  | 4 ++--
 .../test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java  | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 1c65ece..bdd6eb5 100644
--- a/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parser-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -247,9 +247,9 @@ public class TesseractOCRConfig implements Serializable {
      * e.g. "chi_tra+chi_sim"
      */
     public void setLanguage(String language) {
-        if (!language.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4})?(\\+?))+")
+        if (!language.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2}(\\+?))+")
                 || language.endsWith("+")) {
-            throw new IllegalArgumentException("Invalid language code");
+            throw new IllegalArgumentException("Invalid language code: "+language);
         }
         this.language = language;
     }
diff --git a/tika-parser-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parser-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 8dd9cf6..246c24a 100644
--- a/tika-parser-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parser-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -99,7 +99,8 @@ public class TesseractOCRConfigTest extends TikaTest {
     @Test
     public void testValidateValidLanguage() {
         List<String> validLanguages = Arrays.asList(
-                "eng", "slk_frak", "chi_tra", "eng+fra", "tgk+chi_tra+slk_frak");
+                "eng", "slk_frak", "chi_tra", "eng+fra", "tgk+chi_tra+slk_frak",
+                "chi_tra_vert", "tgk+chi_tra_vert+slk_frak");
 
         TesseractOCRConfig config = new TesseractOCRConfig();
 

Reply via email to