tika git commit: TIKA-2169 -- fix xhtml markup caused by bug in OCR parser

tallison Mon, 28 Nov 2016 07:36:18 -0800

Repository: tika
Updated Branches:
  refs/heads/master 81fad8c97 -> 2df8567ff



TIKA-2169 -- fix xhtml markup caused by bug in OCR parser


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2df8567f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2df8567f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2df8567f

Branch: refs/heads/master
Commit: 2df8567ffc688a29de1394a208e651961a8ab53a
Parents: 81fad8c
Author: tballison <[email protected]>
Authored: Mon Nov 28 10:34:57 2016 -0500
Committer: tballison <[email protected]>
Committed: Mon Nov 28 10:34:57 2016 -0500

----------------------------------------------------------------------
 .../src/test/java/org/apache/tika/TikaTest.java | 16 +++++-
 .../tika/parser/ocr/TesseractOCRParser.java     | 52 +++++++++++---------
 .../tika/parser/ocr/TesseractOCRParserTest.java | 10 ++++
 3 files changed, 55 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/2df8567f/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 462c1e5..aa673f0 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -85,9 +86,22 @@ public abstract class TikaTest {
        return stream;
    }
 
+    public static void assertContainsCount(String needle, String haystack, int targetCount) {
+        int i = haystack.indexOf(needle);
+        int count = 0;
+        while (i > -1) {
+            count++;
+            i = haystack.indexOf(needle, i+1);
+        }
+        assertEquals("found "+count +" but should have found: "+targetCount,
+                targetCount, count);
+    }
+
+
     public static void assertContains(String needle, String haystack) {
-       assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
+        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
     }
+
     public static <T> void assertContains(T needle, Collection<? extends T> haystack) {
         assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/2df8567f/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 8e11e00..ffbef1c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -67,7 +67,6 @@ import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
-import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.Attributes;
@@ -219,15 +218,22 @@ public class TesseractOCRParser extends AbstractParser {
         try {
             TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
 
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-            xhtml.startDocument();
-            File tmpImgFile = tmp.createTemporaryFile();
-            parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
+            //trigger the spooling to a tmp file if the stream wasn't
+            //already a TikaInputStream that contained a file
+            tikaStream.getPath();
+            //this is the text output file name specified on the tesseract
+            //commandline.  The actual output file name will have a suffix added.
+            File tmpOCROutputFile = tmp.createTemporaryFile();
+
             // Temporary workaround for TIKA-1445 - until we can specify
             //  composite parsers with strategies (eg Composite, Try In Turn),
             //  always send the image onwards to the regular parser to have
             //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext);
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            parse(tikaStream, tmpOCROutputFile, parseContext, xhtml, config);
             xhtml.endDocument();
         } finally {
             tmp.dispose();
@@ -263,7 +269,6 @@ public class TesseractOCRParser extends AbstractParser {
      * @throws SAXException
      * @throws TikaException
      *
-     * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)}
      */
     public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext,
                             TesseractOCRConfig config)
@@ -334,7 +339,7 @@ public class TesseractOCRParser extends AbstractParser {
         tmp.close();
     }
     
-    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext,
+    private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext,
                        XHTMLContentHandler xhtml, TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
         File tmpTxtOutput = null;
@@ -344,21 +349,27 @@ public class TesseractOCRParser extends AbstractParser {
 
             if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
 
-               // copy the contents of the original input file into a temporary file
-               // which will be processed for OCR
-               TemporaryResources tmp = new TemporaryResources();
-               File tmpFile = tmp.createTemporaryFile();
-               FileUtils.copyFile(input, tmpFile);
-               
                // Process image if ImageMagick Tool is present
                if(config.isEnableImageProcessing() == 1 && hasImageMagick(config)) {
-                       processImage(tmpFile,config);
-               }
-               
-                doOCR(tmpFile, tmpImgFile, config);                
+                    // copy the contents of the original input file into a temporary file
+                    // which will be preprocessed for OCR
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        File tmpFile = tmp.createTemporaryFile();
+                        FileUtils.copyFile(input, tmpFile);
+                        processImage(tmpFile, config);
+                        doOCR(tmpFile, tmpOCROutputFile, config);
+                    } finally {
+                        if (tmp != null) {
+                            tmp.dispose();
+                        }
+                    }
+               } else {
+                    doOCR(input, tmpOCROutputFile, config);
+                }
 
                 // Tesseract appends the output type (.txt or .hocr) to output file name
-                tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." +
+                tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." +
                         config.getOutputType().toString().toLowerCase(Locale.US));
 
                 if (tmpTxtOutput.exists()) {
@@ -370,10 +381,7 @@ public class TesseractOCRParser extends AbstractParser {
                         }
                     }
                 }
-             
-                tmp.close();
             }
-
         } finally {
             if (tmpTxtOutput != null) {
                 tmpTxtOutput.delete();

http://git-wip-us.apache.org/repos/asf/tika/blob/2df8567f/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 956a71b..e0f89ac 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -197,6 +197,16 @@ public class TesseractOCRParserTest extends TikaTest {
         assumeTrue(canRun());
         String xml = getXML("testOCR.jpg").xml;
         assertContains("OCR Testing", xml);
+        //test metadata extraction
+        assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml);
+
+        //TIKA-2169
+        assertContainsCount("<html", xml, 1);
+        assertContainsCount("<title", xml, 1);
+        assertContainsCount("</title", xml, 1);
+        assertContainsCount("<body", xml, 1);
+        assertContainsCount("</body", xml, 1);
+        assertContainsCount("</html", xml, 1);
     }
 
     @Test

tika git commit: TIKA-2169 -- fix xhtml markup caused by bug in OCR parser

Reply via email to