TIKA-2093 -- add option for Tesseract's hOCR output, thanks to Eric Pugh! This 
closes #133.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a5431e2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a5431e2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a5431e2

Branch: refs/heads/master
Commit: 3a5431e200056d85b458bea766fd185225771c97
Parents: 10507d0
Author: tballison <talli...@mitre.org>
Authored: Thu Sep 22 21:12:44 2016 -0400
Committer: tballison <talli...@mitre.org>
Committed: Thu Sep 22 21:12:44 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../tika/parser/ocr/TesseractOCRConfig.java     |  27 +++--
 .../tika/parser/ocr/TesseractOCRParser.java     | 117 ++++++++++++++++---
 .../tika/parser/ocr/TesseractOCRParserTest.java |  23 ++--
 4 files changed, 140 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9a03b01..ef82775 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.14 - ???
 
+  * Add Tesseract's hOCR output format as an option, via Eric Pugh
+    (TIKA-2093)
+
   * Extract macros from MSOffice files (TIKA-2069).
 
   * Maintain passed-in mime in TXTParser (TIKA-2047).

http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 7b266f1..7d6cd3f 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -42,6 +42,11 @@ public class TesseractOCRConfig implements Serializable{
 
        private static final long serialVersionUID = -4861942486845757891L;
 
+       public enum OUTPUT_TYPE {
+               TXT,
+               HOCR
+       }
+
        // Path to tesseract installation folder, if not on system path.
        private  String tesseractPath = "";
 
@@ -64,7 +69,7 @@ public class TesseractOCRConfig implements Serializable{
        private int timeout = 120;
        
        // The format of the ocr'ed output to be returned, txt or hocr.
-       private String outputType = "txt";
+       private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
 
        // enable image processing (optional)
        private int enableImageProcessing = 0;
@@ -138,9 +143,13 @@ public class TesseractOCRConfig implements Serializable{
                                getProp(props, "maxFileSizeToOcr", 
getMaxFileSizeToOcr()));
                setTimeout(
                 getProp(props, "timeout", getTimeout()));
-               setOutputType(
-                getProp(props, "outputType", getOutputType()));                
-               
+               String outputTypeString = props.getProperty("outputType");
+               if ("txt".equals(outputTypeString)) {
+                       setOutputType(OUTPUT_TYPE.TXT);
+               } else if ("hocr".equals(outputTypeString)) {
+                       setOutputType(OUTPUT_TYPE.HOCR);
+               }
+
                // set parameters for ImageMagick
                setEnableImageProcessing(
                                getProp(props, "enableImageProcessing", 
isEnableImageProcessing()));
@@ -271,16 +280,16 @@ public class TesseractOCRConfig implements Serializable{
         * Set output type from ocr process.  Default is "txt", but can be 
"hocr".
         * Default value is 120s.
         */
-       public void setOutputType(String outputType) {
+       public void setOutputType(OUTPUT_TYPE outputType) {
                this.outputType = outputType;
        }
 
-       /** @see #setOutputType(String outputType) */
-       public String getOutputType() {
+       /** @see #setOutputType(OUTPUT_TYPE outputType) */
+       public OUTPUT_TYPE getOutputType() {
                return outputType;
        }       
 
-       /** @see #setEnableImageProcessing(boolean)
+       /** @see #setEnableImageProcessing(int)
         * @return image processing is enabled or not */
        public int isEnableImageProcessing() {
                return enableImageProcessing;
@@ -411,7 +420,7 @@ public class TesseractOCRConfig implements Serializable{
        
        /**
         * Set the path to the ImageMagick executable, needed if it is not on 
system path.
-        * @param path to ImageMagick file.
+        * @param ImageMagickPath to ImageMagick file.
         */
        public void setImageMagickPath(String ImageMagickPath) {
                if(!ImageMagickPath.isEmpty() && 
!ImageMagickPath.endsWith(File.separator))

http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index ccf21cb..36c831b 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,8 +16,10 @@
  */
 package org.apache.tika.parser.ocr;
 
-import javax.imageio.ImageIO;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
+import javax.imageio.ImageIO;
+import javax.xml.parsers.SAXParser;
 import java.awt.Image;
 import java.awt.image.BufferedImage;
 import java.io.BufferedReader;
@@ -36,6 +38,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.Callable;
@@ -65,11 +68,12 @@ import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
 import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
@@ -95,6 +99,8 @@ public class TesseractOCRParser extends AbstractParser {
             })));
     private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, 
Boolean>();
 
+
+
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         // If Tesseract is installed, offer our supported image types
@@ -127,7 +133,6 @@ public class TesseractOCRParser extends AbstractParser {
         if (TESSERACT_PRESENT.containsKey(tesseract)) {
             return TESSERACT_PRESENT.get(tesseract);
         }
-
         // Try running Tesseract from there, and see if it exists + works
         String[] checkCmd = { tesseract };
         boolean hasTesseract = ExternalParser.check(checkCmd);
@@ -199,9 +204,10 @@ public class TesseractOCRParser extends AbstractParser {
     }
 
     @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext parseContext)
             throws IOException, SAXException, TikaException {
-        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, 
DEFAULT_CONFIG);
+
+        TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, 
DEFAULT_CONFIG);
         // If Tesseract is not on the path with the current config, do not try 
to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
         //  occur if someone directly calls this parser, not via DefaultParser 
or similar
@@ -215,12 +221,12 @@ public class TesseractOCRParser extends AbstractParser {
             XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
             xhtml.startDocument();
             File tmpImgFile = tmp.createTemporaryFile();
-            parse(tikaStream, tmpImgFile, xhtml, config);
+            parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
             // Temporary workaround for TIKA-1445 - until we can specify
             //  composite parsers with strategies (eg Composite, Try In Turn),
             //  always send the image onwards to the regular parser to have
             //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new 
EmbeddedContentHandler(xhtml), metadata, context);
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new 
EmbeddedContentHandler(xhtml), metadata, parseContext);
             xhtml.endDocument();
         } finally {
             tmp.dispose();
@@ -230,15 +236,37 @@ public class TesseractOCRParser extends AbstractParser {
     /**
      * Use this to parse content without starting a new document.
      * This appends SAX events to xhtml without re-adding the metadata, body 
start, etc.
+     *
      * @param stream inputstream
      * @param xhtml handler
      * @param config TesseractOCRConfig to use for this parse
      * @throws IOException
      * @throws SAXException
      * @throws TikaException
+     *
+     * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, 
ParseContext, TesseractOCRConfig)}
      */
     public void parseInline(InputStream stream, XHTMLContentHandler xhtml, 
TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
+        parseInline(stream, xhtml, new ParseContext(), config);
+    }
+
+    /**
+     * Use this to parse content without starting a new document.
+     * This appends SAX events to xhtml without re-adding the metadata, body 
start, etc.
+     *
+     * @param stream inputstream
+     * @param xhtml handler
+     * @param config TesseractOCRConfig to use for this parse
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     *
+     * @param parseContext context for this parse; provides the SAX parser for hOCR output
+     */
+    public void parseInline(InputStream stream, XHTMLContentHandler xhtml, 
ParseContext parseContext,
+                            TesseractOCRConfig config)
+            throws IOException, SAXException, TikaException {
         // If Tesseract is not on the path with the current config, do not try 
to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
         //  occur if someone directly calls this parser, not via DefaultParser 
or similar
@@ -249,7 +277,7 @@ public class TesseractOCRParser extends AbstractParser {
         try {
             TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
             File tmpImgFile = tmp.createTemporaryFile();
-            parse(tikaStream, tmpImgFile, xhtml, config);
+            parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
         } finally {
             tmp.dispose();
         }
@@ -305,10 +333,10 @@ public class TesseractOCRParser extends AbstractParser {
         tmp.close();
     }
     
-    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, 
XHTMLContentHandler xhtml, TesseractOCRConfig config)
+    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, 
ParseContext parseContext,
+                       XHTMLContentHandler xhtml, TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
         File tmpTxtOutput = null;
-
         try {
             File input = tikaInputStream.getFile();
             long size = tikaInputStream.getLength();
@@ -333,7 +361,11 @@ public class TesseractOCRParser extends AbstractParser {
 
                 if (tmpTxtOutput.exists()) {
                     try (InputStream is = new FileInputStream(tmpTxtOutput)) {
-                        extractOutput(is, xhtml);
+                        if 
(config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
+                            extractHOCROutput(is, parseContext, xhtml);
+                        } else {
+                            extractOutput(is, xhtml);
+                        }
                     }
                 }
              
@@ -347,6 +379,7 @@ public class TesseractOCRParser extends AbstractParser {
         }
     }
 
+
     // TIKA-1445 workaround parser
     private static Parser _TMP_IMAGE_METADATA_PARSER = new 
CompositeImageParser();
     private static class CompositeImageParser extends CompositeParser {
@@ -375,7 +408,7 @@ public class TesseractOCRParser extends AbstractParser {
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) 
throws IOException, TikaException {
         String[] cmd = { config.getTesseractPath() + getTesseractProg(), 
input.getPath(), output.getPath(), "-l",
-                config.getLanguage(), "-psm", config.getPageSegMode(), 
config.getOutputType()};
+                config.getLanguage(), "-psm", config.getPageSegMode(), 
config.getOutputType().name().toLowerCase(Locale.US)};
 
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
@@ -441,7 +474,17 @@ public class TesseractOCRParser extends AbstractParser {
             }
         }
         xhtml.endElement("div");
+    }
 
+    private void extractHOCROutput(InputStream is, ParseContext parseContext,
+                                   XHTMLContentHandler xhtml) throws 
TikaException, IOException, SAXException {
+        if (parseContext == null) {
+            parseContext = new ParseContext();
+        }
+        SAXParser parser = parseContext.getSAXParser();
+        xhtml.startElement("div", "class", "ocr");
+        parser.parse(is, new OfflineContentHandler(new 
HOCRPassThroughHandler(xhtml)));
+        xhtml.endElement("div");
     }
 
     /**
@@ -477,5 +520,53 @@ public class TesseractOCRParser extends AbstractParser {
     static String getImageMagickProg() {
        return System.getProperty("os.name").startsWith("Windows") ? 
"convert.exe" : "convert";
     }
+
+
+    private static class HOCRPassThroughHandler extends DefaultHandler {
+        private final ContentHandler xhtml;
+        public static final Set<String> IGNORE = unmodifiableSet(
+                "html", "head", "title", "meta", "body");
+
+        public HOCRPassThroughHandler(ContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /**
+         * Forwards the start of the given element to the wrapped handler,
+         * unless its name is listed in {@link #IGNORE}.
+         */
+        @Override
+        public void startElement(
+                String uri, String local, String name, Attributes attributes)
+                throws SAXException {
+            if (!IGNORE.contains(name)) {
+                xhtml.startElement(uri, local, name, attributes);
+            }
+        }
+
+        /**
+         * Forwards the end of the given element to the wrapped handler,
+         * unless its name is listed in {@link #IGNORE}.
+         */
+        @Override
+        public void endElement(String uri, String local, String name) throws 
SAXException {
+            if (!IGNORE.contains(name)) {
+                xhtml.endElement(uri, local, name);
+            }
+        }
+
+        /**
+         * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+         */
+        @Override
+        public void characters(char[] ch, int start, int length) throws 
SAXException {
+            xhtml.characters(ch, start, length);
+        }
+
+        private static Set<String> unmodifiableSet(String... elements) {
+            return Collections.unmodifiableSet(
+                    new HashSet<String>(Arrays.asList(elements)));
+        }
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 4490953..b81ded3 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -21,10 +21,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assume.assumeTrue;
 
-import java.io.BufferedReader;
-import java.io.File;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.util.List;
 
 import org.apache.tika.TikaTest;
@@ -129,15 +126,23 @@ public class TesseractOCRParserTest extends TikaTest {
     
     @Test
     public void testOCROutputsHOCR() throws Exception {
+        assumeTrue(canRun());
+
         String resource = "/test-documents/testOCR.pdf";
+
         String[] nonOCRContains = new String[0];
-        String contents = runOCR(resource, nonOCRContains, 2, "hocr");        
-        assertTrue(contents.contains("<meta name='ocr-system' 
content='tesseract"));
+        String contents = runOCR(resource, nonOCRContains, 2,
+                BasicContentHandlerFactory.HANDLER_TYPE.XML,
+                TesseractOCRConfig.OUTPUT_TYPE.HOCR);
+
+        assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents);
+        assertContains("Happy</span>", contents);
 
     }
 
     private void testBasicOCR(String resource, String[] nonOCRContains, int 
numMetadatas) throws Exception{
-       String contents = runOCR(resource, nonOCRContains, numMetadatas, "txt");
+       String contents = runOCR(resource, nonOCRContains, numMetadatas,
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 
TesseractOCRConfig.OUTPUT_TYPE.TXT);
         if (canRun()) {
                if(resource.substring(resource.lastIndexOf('.'), 
resource.length()).equals(".jpg")) {
                        assertTrue(contents.toString().contains("Apache"));
@@ -147,13 +152,15 @@ public class TesseractOCRParserTest extends TikaTest {
         }
     }
     
-    private String runOCR(String resource, String[] nonOCRContains, int 
numMetadatas, String outputType) throws Exception {
+    private String runOCR(String resource, String[] nonOCRContains, int 
numMetadatas,
+                          BasicContentHandlerFactory.HANDLER_TYPE handlerType,
+                          TesseractOCRConfig.OUTPUT_TYPE outputType) throws 
Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
         config.setOutputType(outputType);
         
         Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
                 new BasicContentHandlerFactory(
-                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+                        handlerType, -1));
 
         PDFParserConfig pdfConfig = new PDFParserConfig();
         pdfConfig.setExtractInlineImages(true);

Reply via email to