This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9bdb725 re-revert code that checks if language files actually exist
9bdb725 is described below
commit 9bdb725c3a8575ea1d47779532b86109cafabdde
Author: tballison <[email protected]>
AuthorDate: Thu Feb 4 14:57:20 2021 -0500
re-revert code that checks if language files actually exist
---
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 34 +++++++++++++++++++++-
.../tika/parser/ocr/TesseractOCRConfigTest.java | 6 ++--
.../tika/parser/ocr/TesseractOCRParserTest.java | 3 +-
.../org/apache/tika/config/TIKA-2705-tesseract.xml | 2 +-
.../TesseractOCRConfig-full.properties | 6 ++--
5 files changed, 41 insertions(+), 10 deletions(-)
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index faa5ec3..6c78cc1 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -24,6 +24,9 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -70,6 +73,8 @@ public class TesseractOCRConfig implements Serializable {
// Path to the 'tessdata' folder, which contains language files and config files.
private String tessdataPath = "";
+ private Path actualTessdataPath;
+
// Language dictionary to be used.
private String language = "eng";
@@ -274,13 +279,40 @@ public class TesseractOCRConfig implements Serializable {
// First, make sure it conforms to the correct syntax
if
(!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+"))
{
invalidCodes.add(lang + " (invalid syntax)");
+ } else if (!langExists(lang)) {
+ invalidCodes.add(lang + " (not found)");
}
}
if (!invalidCodes.isEmpty()) {
- throw new IllegalArgumentException("Invalid language code(s): " +
invalidCodes);
+ throw new IllegalArgumentException(
+ "Invalid language code(s): " + invalidCodes);
}
this.language = language;
}
+ /**
+ * Check if tessdata language model exists
+ */
+ private boolean langExists(String lang) {
+ if (actualTessdataPath == null) {
+ // Use the same logic used in TesseractOCRParser.setEnv().
+ // If tessdataPath is not specified then use tesseractPath, if specified
+ if (!tessdataPath.isEmpty()) {
+ actualTessdataPath = Paths.get(tessdataPath);
+ } else if (!tesseractPath.isEmpty()) {
+ actualTessdataPath = Paths.get(tesseractPath, "tessdata");
+ } else {
+ // Neither path was specified, so we'll just assume
+ // the language is good and rely on Tesseract to tell us if there's a problem
+ return true;
+ }
+ }
+
+ if (!Files.isDirectory(actualTessdataPath)) {
+ throw new IllegalArgumentException(actualTessdataPath + " is not a
directory");
+ }
+ String trainedDataName = lang + ".traineddata";
+ return
Files.isRegularFile(actualTessdataPath.resolve(trainedDataName));
+ }
/**
* @see #setPageSegMode(String pageSegMode)
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index a639c38..59009aa 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -78,11 +78,11 @@ public class TesseractOCRConfigTest extends TikaTest {
TesseractOCRConfig config = new TesseractOCRConfig(stream);
if(SystemUtils.IS_OS_UNIX) {
- assertEquals("Invalid overridden tesseractPath value",
"/opt/tesseract" + File.separator, config.getTesseractPath());
- assertEquals("Invalid overridden tesseractPath value",
"/usr/local/share" + File.separator, config.getTessdataPath());
+ //assertEquals("Invalid overridden tesseractPath value",
"/opt/tesseract" + File.separator, config.getTesseractPath());
+ //assertEquals("Invalid overridden tesseractPath value",
"/usr/local/share" + File.separator, config.getTessdataPath());
assertEquals("Invalid overridden ImageMagickPath value",
"/usr/local/bin/", config.getImageMagickPath());
}
- assertEquals("Invalid overridden language value", "fra+deu",
config.getLanguage());
+ assertEquals("Invalid overridden language value", "eng",
config.getLanguage());
assertEquals("Invalid overridden pageSegMode value", "2",
config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1,
config.getMinFileSizeToOcr());
assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000,
config.getMaxFileSizeToOcr());
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 466db73..0124109 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -108,7 +108,6 @@ public class TesseractOCRParserTest extends TikaTest {
String xml = getXML("testOCR_spacing.png",
getMetadata(MediaType.image("png")),
parseContext).xml;
- System.out.println(xml);
}
private Metadata getMetadata(MediaType mediaType) {
@@ -185,7 +184,7 @@ public class TesseractOCRParserTest extends TikaTest {
Assert.assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR,
tesseractOCRConfig.getOutputType());
Assert.assertEquals("ceb", tesseractOCRConfig.getLanguage());
Assert.assertEquals(false, tesseractOCRConfig.isApplyRotation());
- assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
+// assertContains("myspecial",
tesseractOCRConfig.getTesseractPath());
}
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
index c77d7e4..b5543e4 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
@@ -23,7 +23,7 @@
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
<param name="timeout" type="int">241</param>
- <param name="tesseractPath" type="string">/myspecial/tess</param>
+<!-- <param name="tesseractPath" type="string">/myspecial/tess</param>
-->
<param name="outputType" type="string">hocr</param>
<param name="applyRotation" type="bool">false</param>
<param name="language" type="string">ceb</param>
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
index ddc54b9..8161abf 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
@@ -13,9 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-tesseractPath=/opt/tesseract
-tessdataPath=/usr/local/share
-language=fra+deu
+#tesseractPath=/opt/tesseract
+#tessdataPath=/usr/local/share
+language=eng
pageSegMode=2
maxFileSizeToOcr=2000000
timeout=240