This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit a6f87f244f434098d0ee1e622b288ee2af6ba4a5 Author: tballison <[email protected]> AuthorDate: Thu Feb 4 13:19:23 2021 -0500 revert code that checks if language files actually exist --- .../apache/tika/parser/ocr/TesseractOCRConfig.java | 31 ---------------------- .../tika/parser/ocr/TesseractOCRParserTest.java | 2 +- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index d1fe016..faa5ec3 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -70,9 +70,6 @@ public class TesseractOCRConfig implements Serializable { // Path to the 'tessdata' folder, which contains language files and config files. private String tessdataPath = ""; - // Actual path to tessdata, if not specified by user and we have to find it ourselves - private static File windowsActualTessdataDir; - // Language dictionary to be used. 
private String language = "eng"; @@ -277,8 +274,6 @@ public class TesseractOCRConfig implements Serializable { // First, make sure it conforms to the correct syntax if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) { invalidCodes.add(lang + " (invalid syntax)"); - } else if (!langExists(lang)) { - invalidCodes.add(lang + " (not found)"); } } if (!invalidCodes.isEmpty()) { @@ -287,32 +282,6 @@ public class TesseractOCRConfig implements Serializable { this.language = language; } - - /** - * Check if tessdata language model exists - */ - private boolean langExists(String lang) { - if (windowsActualTessdataDir == null) { - // Use the same logic used in TesseractOCRParser.setEnv(). If tessdataPath is not specified then use tesseractPath, if specified - if (!tessdataPath.isEmpty()) { - windowsActualTessdataDir = new File(tessdataPath); - } else if (!tesseractPath.isEmpty()) { - windowsActualTessdataDir = new File(tesseractPath, "tessdata"); - } else { - // Neither path was specified, so we'll just assume - // the language is good and rely on Tesseract to tell us if there's a problem - return true; - } - } - - if (!windowsActualTessdataDir.isDirectory()) { - throw new RuntimeException(windowsActualTessdataDir + " is not a directory"); - } - String trainedDataName = lang + ".traineddata"; - return new File(windowsActualTessdataDir, trainedDataName).exists(); - } - - /** * @see #setPageSegMode(String pageSegMode) */ diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 64c8453..466db73 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -98,7 +98,7 @@ public class TesseractOCRParserTest extends TikaTest { assumeTrue("can run OCR", canRun()); TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig(); - tesseractOCRConfigconfig.setLanguage("kerplekistanese"); + tesseractOCRConfigconfig.setLanguage("zzz"); ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
