Repository: tika Updated Branches: refs/heads/2.x 2f452304b -> a47a69933
TIKA-2169 fix xhtml in ocr Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a47a6993 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a47a6993 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a47a6993 Branch: refs/heads/2.x Commit: a47a6993375f4105b16c84872a48b327e213084b Parents: 2f45230 Author: tballison <[email protected]> Authored: Mon Nov 28 10:41:53 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Nov 28 10:41:53 2016 -0500 ---------------------------------------------------------------------- .../src/test/java/org/apache/tika/TikaTest.java | 13 +++++ .../tika/parser/ocr/TesseractOCRParser.java | 50 ++++++++++++-------- .../tika/parser/ocr/TesseractOCRParserTest.java | 10 ++++ 3 files changed, 52 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-core/src/test/java/org/apache/tika/TikaTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 0f6303e..34e9a94 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -16,6 +16,7 @@ */ package org.apache.tika; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -106,6 +107,18 @@ public abstract class TikaTest { return stream; } + public static void assertContainsCount(String needle, String haystack, int targetCount) { + int i = haystack.indexOf(needle); + int count = 0; + while (i > -1) { + count++; + i = haystack.indexOf(needle, i+1); + } + assertEquals("found "+count +" but should have found: "+targetCount, + targetCount, count); + } + + public static void assertContains(String needle, String haystack) { assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); } http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index a63eae1..0ac2b6b 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -67,7 +67,6 @@ import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; -import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.OfflineContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.Attributes; @@ -220,15 +219,22 @@ public class TesseractOCRParser extends AbstractParser { try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - File tmpImgFile = tmp.createTemporaryFile(); - parse(tikaStream, tmpImgFile, parseContext, xhtml, config); + //trigger the spooling to a tmp file if the stream wasn't + //already a TikaInputStream that contained a backing file + tikaStream.getPath(); + //this is the text output file name specified on the tesseract + //commandline. The actual output file name will have a suffix added. + File tmpOCROutputFile = tmp.createTemporaryFile(); + // Temporary workaround for TIKA-1445 - until we can specify // composite parsers with strategies (eg Composite, Try In Turn), // always send the image onwards to the regular parser to have // the metadata for them extracted as well - _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext); + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + parse(tikaStream, tmpOCROutputFile, parseContext, xhtml, config); xhtml.endDocument(); } finally { tmp.dispose(); @@ -264,7 +270,6 @@ public class TesseractOCRParser extends AbstractParser { * @throws SAXException * @throws TikaException * - * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)} */ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext, TesseractOCRConfig config) @@ -335,7 +340,7 @@ public class TesseractOCRParser extends AbstractParser { tmp.close(); } - private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext, + private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { File tmpTxtOutput = null; @@ -345,21 +350,27 @@ public class TesseractOCRParser extends AbstractParser { if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { - // copy the contents of the original input file into a temporary file - // which will be processed for OCR - TemporaryResources tmp = new TemporaryResources(); - File tmpFile = tmp.createTemporaryFile(); - FileUtils.copyFile(input, tmpFile); - // Process image if ImageMagick Tool is present if(config.isEnableImageProcessing() == 1 && hasImageMagick(config)) { - processImage(tmpFile,config); + // copy the contents of the original input file into a temporary file + // which will be preprocessed for OCR + TemporaryResources tmp = new TemporaryResources(); + try { + File tmpFile = tmp.createTemporaryFile(); + FileUtils.copyFile(input, tmpFile); + processImage(tmpFile, config); + doOCR(tmpFile, tmpOCROutputFile, config); + } finally { + if (tmp != null) { + tmp.dispose(); + } + } + } else { + doOCR(input, tmpOCROutputFile, config); } - doOCR(tmpFile, tmpImgFile, config); - // Tesseract appends the output type (.txt or .hocr) to output file name - tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." + + tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." + config.getOutputType().toString().toLowerCase(Locale.US)); if (tmpTxtOutput.exists()) { @@ -371,10 +382,7 @@ public class TesseractOCRParser extends AbstractParser { } } } - - tmp.close(); } - } finally { if (tmpTxtOutput != null) { tmpTxtOutput.delete(); http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index c0befa1..82414ef 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -197,6 +197,16 @@ public class TesseractOCRParserTest extends TikaTest { assumeTrue(canRun()); String xml = getXML("testOCR.jpg").xml; assertContains("OCR Testing", xml); + //test metadata extraction + assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml); + + //TIKA-2169 + assertContainsCount("<html", xml, 1); + assertContainsCount("<title", xml, 1); + assertContainsCount("</title", xml, 1); + assertContainsCount("<body", xml, 1); + assertContainsCount("</body", xml, 1); + assertContainsCount("</html", xml, 1); } @Test
