Repository: tika Updated Branches: refs/heads/master 81fad8c97 -> 2df8567ff
TIKA-2169 -- fix xhtml markup caused by bug in OCR parser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2df8567f Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2df8567f Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2df8567f Branch: refs/heads/master Commit: 2df8567ffc688a29de1394a208e651961a8ab53a Parents: 81fad8c Author: tballison <[email protected]> Authored: Mon Nov 28 10:34:57 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Nov 28 10:34:57 2016 -0500 ---------------------------------------------------------------------- .../src/test/java/org/apache/tika/TikaTest.java | 16 +++++- .../tika/parser/ocr/TesseractOCRParser.java | 52 +++++++++++--------- .../tika/parser/ocr/TesseractOCRParserTest.java | 10 ++++ 3 files changed, 55 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/2df8567f/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 462c1e5..aa673f0 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -16,6 +16,7 @@ */ package org.apache.tika; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -85,9 +86,22 @@ public abstract class TikaTest { return stream; } + public static void assertContainsCount(String needle, String haystack, int targetCount) { + int i = haystack.indexOf(needle); + int count = 0; + while (i > -1) { + count++; + i = haystack.indexOf(needle, i+1); + } + assertEquals("found "+count +" but should have found: "+targetCount, + targetCount, count); + } + + public static void assertContains(String needle, String haystack) { - assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); + assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); } + public static <T> void assertContains(T needle, Collection<? extends T> haystack) { assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); } http://git-wip-us.apache.org/repos/asf/tika/blob/2df8567f/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 8e11e00..ffbef1c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -67,7 +67,6 @@ import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; -import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.OfflineContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.Attributes; @@ -219,15 +218,22 @@ public class TesseractOCRParser extends AbstractParser { try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - File tmpImgFile = tmp.createTemporaryFile(); - parse(tikaStream, tmpImgFile, parseContext, xhtml, config); + //trigger the spooling to a tmp file if the stream wasn't + //already a TikaInputStream that contained a file + tikaStream.getPath(); + //this is the text output file name specified on the tesseract + //commandline. The actual output file name will have a suffix added. + File tmpOCROutputFile = tmp.createTemporaryFile(); + // Temporary workaround for TIKA-1445 - until we can specify // composite parsers with strategies (eg Composite, Try In Turn), // always send the image onwards to the regular parser to have // the metadata for them extracted as well - _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext); + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + parse(tikaStream, tmpOCROutputFile, parseContext, xhtml, config); xhtml.endDocument(); } finally { tmp.dispose(); @@ -263,7 +269,6 @@ public class TesseractOCRParser extends AbstractParser { * @throws SAXException * @throws TikaException * - * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)} */ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, TesseractOCRConfig config) @@ -334,7 +339,7 @@ public class TesseractOCRParser extends AbstractParser { tmp.close(); } - private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext, + private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { File tmpTxtOutput = null; @@ -344,21 +349,27 @@ public class TesseractOCRParser extends AbstractParser { if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { - // copy the contents of the original input file into a temporary file - // which will be processed for OCR - TemporaryResources tmp = new TemporaryResources(); - File tmpFile = tmp.createTemporaryFile(); - FileUtils.copyFile(input, tmpFile); - // Process image if ImageMagick Tool is present if(config.isEnableImageProcessing() == 1 && hasImageMagick(config)) { - processImage(tmpFile,config); - } - - doOCR(tmpFile, tmpImgFile, config); + // copy the contents of the original input file into a temporary file + // which will be preprocessed for OCR + TemporaryResources tmp = new TemporaryResources(); + try { + File tmpFile = tmp.createTemporaryFile(); + FileUtils.copyFile(input, tmpFile); + processImage(tmpFile, config); + doOCR(tmpFile, tmpOCROutputFile, config); + } finally { + if (tmp != null) { + tmp.dispose(); + } + } + } else { + doOCR(input, tmpOCROutputFile, config); + } // Tesseract appends the output type (.txt or .hocr) to output file name - tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." + + tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." + config.getOutputType().toString().toLowerCase(Locale.US)); if (tmpTxtOutput.exists()) { @@ -370,10 +381,7 @@ public class TesseractOCRParser extends AbstractParser { } } } - - tmp.close(); } - } finally { if (tmpTxtOutput != null) { tmpTxtOutput.delete(); http://git-wip-us.apache.org/repos/asf/tika/blob/2df8567f/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 956a71b..e0f89ac 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -197,6 +197,16 @@ public class TesseractOCRParserTest extends TikaTest { assumeTrue(canRun()); String xml = getXML("testOCR.jpg").xml; assertContains("OCR Testing", xml); + //test metadata extraction + assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml); + + //TIKA-2169 + assertContainsCount("<html", xml, 1); + assertContainsCount("<title", xml, 1); + assertContainsCount("</title", xml, 1); + assertContainsCount("<body", xml, 1); + assertContainsCount("</body", xml, 1); + assertContainsCount("</html", xml, 1); } @Test
