Repository: tika Updated Branches: refs/heads/master 8a45f67a2 -> 3a5431e20
add hOCR output format to TesseractParser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/10507d05 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/10507d05 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/10507d05 Branch: refs/heads/master Commit: 10507d0521a0f06c50f32aa6150228ef4ac773d4 Parents: 8a45f67 Author: Eric Pugh <[email protected]> Authored: Thu Sep 22 13:14:55 2016 -0400 Committer: Eric Pugh <[email protected]> Committed: Thu Sep 22 13:14:55 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/ocr/TesseractOCRConfig.java | 18 +++++++++++ .../tika/parser/ocr/TesseractOCRParser.java | 6 ++-- .../parser/ocr/TesseractOCRConfig.properties | 1 + .../tika/parser/ocr/TesseractOCRParserTest.java | 34 +++++++++++++++----- 4 files changed, 48 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index 84312d8..7b266f1 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -62,6 +62,9 @@ public class TesseractOCRConfig implements Serializable{ // Maximum time (seconds) to wait for the ocring process termination private int timeout = 120; + + // The format of the ocr'ed output to be returned, txt or hocr. 
+ private String outputType = "txt"; // enable image processing (optional) private int enableImageProcessing = 0; @@ -135,6 +138,8 @@ public class TesseractOCRConfig implements Serializable{ getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); setTimeout( getProp(props, "timeout", getTimeout())); + setOutputType( + getProp(props, "outputType", getOutputType())); // set parameters for ImageMagick setEnableImageProcessing( @@ -261,6 +266,19 @@ public class TesseractOCRConfig implements Serializable{ public int getTimeout() { return timeout; } + + /** + * Set output type from ocr process. Default is "txt", but can be "hocr". + * The value is also the extension Tesseract appends to the output file name. + */ + public void setOutputType(String outputType) { + this.outputType = outputType; + } + + /** @see #setOutputType(String outputType) */ + public String getOutputType() { + return outputType; + } /** @see #setEnableImageProcessing(boolean) * @return image processing is enabled or not */ http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index e0f0d2b..ccf21cb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -328,8 +328,8 @@ public class TesseractOCRParser extends AbstractParser { doOCR(tmpFile, tmpImgFile, config); - // Tesseract appends .txt to output file name - tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt"); + // Tesseract appends the output type (.txt or .hocr) to output file name + tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." 
+ config.getOutputType()); if (tmpTxtOutput.exists()) { try (InputStream is = new FileInputStream(tmpTxtOutput)) { @@ -375,7 +375,7 @@ public class TesseractOCRParser extends AbstractParser { */ private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", - config.getLanguage(), "-psm", config.getPageSegMode() }; + config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType()}; ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(config, pb); http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties index 7acc694..2380282 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties @@ -20,6 +20,7 @@ pageSegMode=1 maxFileSizeToOcr=2147483647 minFileSizeToOcr=0 timeout=120 +outputType=txt # properties for image processing # to enable processing, set enableImageProcessing to 1 http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index cc0288f..4490953 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -126,9 +126,31 @@ public class TesseractOCRParserTest extends TikaTest { }; testBasicOCR(resource, nonOCRContains, 3); } + + @Test + public void testOCROutputsHOCR() throws Exception { + String resource = "/test-documents/testOCR.pdf"; + String[] nonOCRContains = new String[0]; + String contents = runOCR(resource, nonOCRContains, 2, "hocr"); + assertTrue(contents.contains("<meta name='ocr-system' content='tesseract")); + + } - private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception { + private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{ + String contents = runOCR(resource, nonOCRContains, numMetadatas, "txt"); + if (canRun()) { + if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) { + assertTrue(contents.toString().contains("Apache")); + } else { + assertTrue(contents.toString().contains("Happy New Year 2003!")); + } + } + } + + private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, String outputType) throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); + config.setOutputType(outputType); + Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory( BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); @@ -151,13 +173,7 @@ public class TesseractOCRParserTest extends TikaTest { for (Metadata m : metadataList) { contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); } - if (canRun()) { - if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) { - assertTrue(contents.toString().contains("Apache")); - } else { - assertTrue(contents.toString().contains("Happy New Year 2003!")); - } - } + for (String needle : nonOCRContains) { assertContains(needle, contents.toString()); } @@ -165,6 +181,8 @@ public class TesseractOCRParserTest extends 
TikaTest { assertTrue(metadataList.get(1).names().length > 10); //test at least one value assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName")); + + return contents.toString(); } @Test
