Author: tallison Date: Wed Jan 7 16:48:43 2015 New Revision: 1650117 URL: http://svn.apache.org/r1650117 Log: TIKA-1445: add tests to TesseractOCRParserTest to ensure metadata is extracted
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1650117&r1=1650116&r2=1650117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan 7 16:48:43 2015 @@ -22,6 +22,7 @@ import static org.junit.Assert.assertTru import static org.junit.Assume.assumeTrue; import java.io.InputStream; +import java.util.List; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; @@ -30,11 +31,14 @@ import org.apache.tika.parser.AutoDetect import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.pdf.PDFParserConfig; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; public class TesseractOCRParserTest extends TikaTest { @@ -49,151 +53,148 @@ public class TesseractOCRParserTest exte // If Tesseract is not on the path, do not run the test. return ExternalParser.check(checkCmd); } - + @Test public void offersNoTypesIfNotFound() throws Exception { TesseractOCRParser parser = new TesseractOCRParser(); DefaultParser defaultParser = new DefaultParser(); MediaType png = MediaType.image("png"); - + // With an invalid path, will offer no types TesseractOCRConfig invalidConfig = new TesseractOCRConfig(); invalidConfig.setTesseractPath("/made/up/path"); - + ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, invalidConfig); // No types offered assertEquals(0, parser.getSupportedTypes(parseContext).size()); - + // And DefaultParser won't use us assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); - - + + // With a correct path, with offer the usual types TesseractOCRConfig normalConfig = new TesseractOCRConfig(); assumeTrue(canRun(normalConfig)); parseContext.set(TesseractOCRConfig.class, normalConfig); - + assertEquals(5, parser.getSupportedTypes(parseContext).size()); assertTrue(parser.getSupportedTypes(parseContext).contains(png)); - + // DefaultParser now will assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); } @Test public void testPDFOCR() throws Exception { - TesseractOCRConfig config = new TesseractOCRConfig(); - assumeTrue(canRun(config)); - - Parser parser = new AutoDetectParser(); - BodyContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - PDFParserConfig pdfConfig = new PDFParserConfig(); - pdfConfig.setExtractInlineImages(true); - - ParseContext parseContext = new ParseContext(); - parseContext.set(TesseractOCRConfig.class, config); - parseContext.set(Parser.class, new TesseractOCRParser()); - parseContext.set(PDFParserConfig.class, pdfConfig); - - InputStream stream = TesseractOCRParserTest.class.getResourceAsStream( - "/test-documents/testOCR.pdf"); - - try { - parser.parse(stream, handler, metadata, parseContext); - assertContains("Happy New Year 2003!", handler.toString()); - } finally { - stream.close(); - } + String resource = "/test-documents/testOCR.pdf"; + String[] nonOCRContains = new String[0]; + testBasicOCR(resource, nonOCRContains, 2); } @Test public void testDOCXOCR() throws Exception { - TesseractOCRConfig config = new TesseractOCRConfig(); - assumeTrue(canRun(config)); - - Parser parser = new AutoDetectParser(); - BodyContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - ParseContext parseContext = new ParseContext(); - parseContext.set(TesseractOCRConfig.class, config); - parseContext.set(Parser.class, new TesseractOCRParser()); - - InputStream stream = TesseractOCRParserTest.class.getResourceAsStream( - "/test-documents/testOCR.docx"); - - try { - parser.parse(stream, handler, metadata, parseContext); - - assertContains("Happy New Year 2003!", handler.toString()); - assertContains("This is some text.", handler.toString()); - assertContains("Here is an embedded image:", handler.toString()); - } finally { - stream.close(); - } + String resource = "/test-documents/testOCR.docx"; + String[] nonOCRContains = { + "This is some text.", + "Here is an embedded image:" + }; + testBasicOCR(resource, nonOCRContains, 3); } @Test public void testPPTXOCR() throws Exception { + String resource = "/test-documents/testOCR.pptx"; + String[] nonOCRContains = { + "This is some text" + }; + testBasicOCR(resource, nonOCRContains, 3); + } + + private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); - assumeTrue(canRun(config)); + Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), + new BasicContentHandlerFactory( + BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); - Parser parser = new AutoDetectParser(); - BodyContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + PDFParserConfig pdfConfig = new PDFParserConfig(); + pdfConfig.setExtractInlineImages(true); ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); - parseContext.set(Parser.class, new TesseractOCRParser()); + parseContext.set(Parser.class, parser); + parseContext.set(PDFParserConfig.class, pdfConfig); InputStream stream = TesseractOCRParserTest.class.getResourceAsStream( - "/test-documents/testOCR.pptx"); + resource); try { - parser.parse(stream, handler, metadata, parseContext); - - assertTrue("Check for the image's text.", handler.toString().contains("Happy New Year 2003!")); - assertTrue("Check for the standard text.", handler.toString().contains("This is some text")); + parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext); } finally { stream.close(); } + List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata(); + assertEquals(numMetadatas, metadataList.size()); + + StringBuilder contents = new StringBuilder(); + for (Metadata m : metadataList) { + contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); + } + if (canRun()) { + assertTrue(contents.toString().contains("Happy New Year 2003!")); + } + for (String needle : nonOCRContains) { + assertContains(needle, contents.toString()); + } + assertTrue(metadataList.get(0).names().length > 10); + assertTrue(metadataList.get(1).names().length > 10); + //test at least one value + assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName")); } - + @Test - public void getNormalMetadataToo() throws Exception { + public void testSingleImage() throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); assumeTrue(canRun(config)); + String xml = getXML("testOCR.jpg").xml; + assertContains("OCR Testing", xml); + } - Parser parser = new AutoDetectParser(); - BodyContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - ParseContext parseContext = new ParseContext(); - parseContext.set(TesseractOCRConfig.class, config); - parseContext.set(Parser.class, new TesseractOCRParser()); - - InputStream stream = TesseractOCRParserTest.class.getResourceAsStream( - "/test-documents/testOCR.jpg"); - - try { - parser.parse(stream, handler, metadata, parseContext); - - // OCR text - assertContains("Apache", handler.toString()); - assertContains("OCR Testing", handler.toString()); - - // Core JPEG properties from JPEGParser should still come through - assertEquals("136", metadata.get(Metadata.IMAGE_WIDTH)); - assertEquals("66", metadata.get(Metadata.IMAGE_LENGTH)); - assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE)); - assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); - assertContains("This is a test Apache Tika imag", metadata.get(Metadata.COMMENTS)); - } finally { - stream.close(); - } + @Test + public void getNormalMetadataToo() throws Exception { + //this should be successful whether or not TesseractOCR is installed/active + //If tesseract is installed, the internal metadata extraction parser should + //work; and if tesseract isn't installed, the regular parsers should take over. + + //gif + Metadata m = getXML("testGIF.gif").metadata; + assertTrue(m.names().length > 20); + assertEquals("RGB", m.get("Chroma ColorSpaceType")); + + //jpg + m = getXML("testOCR.jpg").metadata; + assertEquals("136", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("66", m.get(Metadata.IMAGE_LENGTH)); + assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE)); + assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL)); + assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS)); + + //bmp + m = getXML("testBMP.bmp").metadata; + assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); + + //png + m = getXML("testPNG.png").metadata; + assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); + assertEquals("UnsignedIntegral", m.get("Data SampleFormat")); + + //tiff + m = getXML("testTIFF.tif").metadata; + assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); + assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); + assertEquals("72 dots per inch", m.get("Y Resolution")); } }