TIKA-2093 -- add option for Tesseract's hOCR output, thanks to Eric Pugh! This closes #133.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a5431e2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a5431e2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a5431e2

Branch: refs/heads/master
Commit: 3a5431e200056d85b458bea766fd185225771c97
Parents: 10507d0
Author: tballison <talli...@mitre.org>
Authored: Thu Sep 22 21:12:44 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Thu Sep 22 21:12:44 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                      |   3 +
 .../tika/parser/ocr/TesseractOCRConfig.java      |  27 +++--
 .../tika/parser/ocr/TesseractOCRParser.java      | 117 ++++++++++++++++---
 .../tika/parser/ocr/TesseractOCRParserTest.java  |  23 ++--
 4 files changed, 140 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9a03b01..ef82775 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.14 - ???
 
+  * Add Tesseract's hOCR output format as an option, via Eric Pugh
+    (TIKA-2093)
+
   * Extract macros from MSOffice files (TIKA-2069).
 
   * Maintain passed-in mime in TXTParser (TIKA-2047).

http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index 7b266f1..7d6cd3f 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -42,6 +42,11 @@ public class TesseractOCRConfig implements Serializable{ private static final long serialVersionUID = -4861942486845757891L; + public enum OUTPUT_TYPE { + TXT, + HOCR + } + // Path to tesseract installation folder, if not on system path. private String tesseractPath = ""; @@ -64,7 +69,7 @@ public class TesseractOCRConfig implements Serializable{ private int timeout = 120; // The format of the ocr'ed output to be returned, txt or hocr. - private String outputType = "txt"; + private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT; // enable image processing (optional) private int enableImageProcessing = 0; @@ -138,9 +143,13 @@ public class TesseractOCRConfig implements Serializable{ getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); setTimeout( getProp(props, "timeout", getTimeout())); - setOutputType( - getProp(props, "outputType", getOutputType())); - + String outputTypeString = props.getProperty("outputType"); + if ("txt".equals(outputTypeString)) { + setOutputType(OUTPUT_TYPE.TXT); + } else if ("hocr".equals(outputTypeString)) { + setOutputType(OUTPUT_TYPE.HOCR); + } + // set parameters for ImageMagick setEnableImageProcessing( getProp(props, "enableImageProcessing", isEnableImageProcessing())); @@ -271,16 +280,16 @@ public class TesseractOCRConfig implements Serializable{ * Set output type from ocr process. Default is "txt", but can be "hocr". * Default value is 120s. 
*/ - public void setOutputType(String outputType) { + public void setOutputType(OUTPUT_TYPE outputType) { this.outputType = outputType; } - /** @see #setOutputType(String outputType) */ - public String getOutputType() { + /** @see #setOutputType(OUTPUT_TYPE outputType) */ + public OUTPUT_TYPE getOutputType() { return outputType; } - /** @see #setEnableImageProcessing(boolean) + /** @see #setEnableImageProcessing(int) * @return image processing is enabled or not */ public int isEnableImageProcessing() { return enableImageProcessing; @@ -411,7 +420,7 @@ public class TesseractOCRConfig implements Serializable{ /** * Set the path to the ImageMagick executable, needed if it is not on system path. - * @param path to ImageMagick file. + * @param ImageMagickPath to ImageMagick file. */ public void setImageMagickPath(String ImageMagickPath) { if(!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator)) http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index ccf21cb..36c831b 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -16,8 +16,10 @@ */ package org.apache.tika.parser.ocr; -import javax.imageio.ImageIO; +import static java.nio.charset.StandardCharsets.UTF_8; +import javax.imageio.ImageIO; +import javax.xml.parsers.SAXParser; import java.awt.Image; import java.awt.image.BufferedImage; import java.io.BufferedReader; @@ -36,6 +38,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; 
import java.util.concurrent.Callable; @@ -65,11 +68,12 @@ import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; - -import static java.nio.charset.StandardCharsets.UTF_8; +import org.xml.sax.helpers.DefaultHandler; /** * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser, @@ -95,6 +99,8 @@ public class TesseractOCRParser extends AbstractParser { }))); private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>(); + + @Override public Set<MediaType> getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types @@ -127,7 +133,6 @@ public class TesseractOCRParser extends AbstractParser { if (TESSERACT_PRESENT.containsKey(tesseract)) { return TESSERACT_PRESENT.get(tesseract); } - // Try running Tesseract from there, and see if it exists + works String[] checkCmd = { tesseract }; boolean hasTesseract = ExternalParser.check(checkCmd); @@ -199,9 +204,10 @@ public class TesseractOCRParser extends AbstractParser { } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { - TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); + + TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG); // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls 
this parser, not via DefaultParser or similar @@ -215,12 +221,12 @@ public class TesseractOCRParser extends AbstractParser { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); File tmpImgFile = tmp.createTemporaryFile(); - parse(tikaStream, tmpImgFile, xhtml, config); + parse(tikaStream, tmpImgFile, parseContext, xhtml, config); // Temporary workaround for TIKA-1445 - until we can specify // composite parsers with strategies (eg Composite, Try In Turn), // always send the image onwards to the regular parser to have // the metadata for them extracted as well - _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context); + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext); xhtml.endDocument(); } finally { tmp.dispose(); @@ -230,15 +236,37 @@ public class TesseractOCRParser extends AbstractParser { /** * Use this to parse content without starting a new document. * This appends SAX events to xhtml without re-adding the metadata, body start, etc. + * * @param stream inputstream * @param xhtml handler * @param config TesseractOCRConfig to use for this parse * @throws IOException * @throws SAXException * @throws TikaException + * + * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)} */ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { + parseInline(stream, xhtml, new ParseContext(), config); + } + + /** + * Use this to parse content without starting a new document. + * This appends SAX events to xhtml without re-adding the metadata, body start, etc. 
+ * + * @param stream inputstream + * @param xhtml handler + * @param config TesseractOCRConfig to use for this parse + * @throws IOException + * @throws SAXException + * @throws TikaException + * + * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)} + */ + public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, + TesseractOCRConfig config) + throws IOException, SAXException, TikaException { // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar @@ -249,7 +277,7 @@ public class TesseractOCRParser extends AbstractParser { try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); File tmpImgFile = tmp.createTemporaryFile(); - parse(tikaStream, tmpImgFile, xhtml, config); + parse(tikaStream, tmpImgFile, parseContext, xhtml, config); } finally { tmp.dispose(); } @@ -305,10 +333,10 @@ public class TesseractOCRParser extends AbstractParser { tmp.close(); } - private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config) + private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext, + XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { File tmpTxtOutput = null; - try { File input = tikaInputStream.getFile(); long size = tikaInputStream.getLength(); @@ -333,7 +361,11 @@ public class TesseractOCRParser extends AbstractParser { if (tmpTxtOutput.exists()) { try (InputStream is = new FileInputStream(tmpTxtOutput)) { - extractOutput(is, xhtml); + if (config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) { + extractHOCROutput(is, parseContext, xhtml); + } else { + extractOutput(is, xhtml); + } } } @@ -347,6 +379,7 @@ public 
class TesseractOCRParser extends AbstractParser { } } + // TIKA-1445 workaround parser private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser(); private static class CompositeImageParser extends CompositeParser { @@ -375,7 +408,7 @@ public class TesseractOCRParser extends AbstractParser { */ private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException { String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l", - config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType()}; + config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType().name().toLowerCase(Locale.US)}; ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(config, pb); @@ -441,7 +474,17 @@ public class TesseractOCRParser extends AbstractParser { } } xhtml.endElement("div"); + } + private void extractHOCROutput(InputStream is, ParseContext parseContext, + XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException { + if (parseContext == null) { + parseContext = new ParseContext(); + } + SAXParser parser = parseContext.getSAXParser(); + xhtml.startElement("div", "class", "ocr"); + parser.parse(is, new OfflineContentHandler(new HOCRPassThroughHandler(xhtml))); + xhtml.endElement("div"); } /** @@ -477,5 +520,53 @@ public class TesseractOCRParser extends AbstractParser { static String getImageMagickProg() { return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert"; } + + + private static class HOCRPassThroughHandler extends DefaultHandler { + private final ContentHandler xhtml; + public static final Set<String> IGNORE = unmodifiableSet( + "html", "head", "title", "meta", "body"); + + public HOCRPassThroughHandler(ContentHandler xhtml) { + this.xhtml = xhtml; + } + + /** + * Starts the given element. Table cells and list items are automatically + * indented by emitting a tab character as ignorable whitespace. 
+ */ + @Override + public void startElement( + String uri, String local, String name, Attributes attributes) + throws SAXException { + if (!IGNORE.contains(name)) { + xhtml.startElement(uri, local, name, attributes); + } + } + + /** + * Ends the given element. Block elements are automatically followed + * by a newline character. + */ + @Override + public void endElement(String uri, String local, String name) throws SAXException { + if (!IGNORE.contains(name)) { + xhtml.endElement(uri, local, name); + } + } + + /** + * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> + */ + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + xhtml.characters(ch, start, length); + } + + private static Set<String> unmodifiableSet(String... elements) { + return Collections.unmodifiableSet( + new HashSet<String>(Arrays.asList(elements))); + } + } } http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 4490953..b81ded3 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -21,10 +21,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assume.assumeTrue; -import java.io.BufferedReader; -import java.io.File; import java.io.InputStream; -import java.io.InputStreamReader; import java.util.List; import org.apache.tika.TikaTest; @@ -129,15 +126,23 @@ public class TesseractOCRParserTest extends TikaTest { @Test public void testOCROutputsHOCR() throws Exception { + assumeTrue(canRun()); + String 
resource = "/test-documents/testOCR.pdf"; + String[] nonOCRContains = new String[0]; - String contents = runOCR(resource, nonOCRContains, 2, "hocr"); - assertTrue(contents.contains("<meta name='ocr-system' content='tesseract")); + String contents = runOCR(resource, nonOCRContains, 2, + BasicContentHandlerFactory.HANDLER_TYPE.XML, + TesseractOCRConfig.OUTPUT_TYPE.HOCR); + + assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents); + assertContains("Happy</span>", contents); } private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{ - String contents = runOCR(resource, nonOCRContains, numMetadatas, "txt"); + String contents = runOCR(resource, nonOCRContains, numMetadatas, + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, TesseractOCRConfig.OUTPUT_TYPE.TXT); if (canRun()) { if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) { assertTrue(contents.toString().contains("Apache")); @@ -147,13 +152,15 @@ public class TesseractOCRParserTest extends TikaTest { } } - private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, String outputType) throws Exception { + private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, + BasicContentHandlerFactory.HANDLER_TYPE handlerType, + TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); config.setOutputType(outputType); Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), new BasicContentHandlerFactory( - BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + handlerType, -1)); PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true);