TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ebe70289 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ebe70289 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ebe70289 Branch: refs/heads/2.x Commit: ebe70289815776f6ce6c271c7faf8d23cfd31337 Parents: e5a7604 Author: tballison <[email protected]> Authored: Fri Jun 3 14:52:19 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Jun 3 14:52:19 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 2 + .../apache/tika/module/journal/BundleIT.java | 2 +- .../tika-parser-pdf-bundle/pom.xml | 21 +- .../org/apache/tika/module/pdf/BundleIT.java | 2 +- .../tika-parser-multimedia-module/pom.xml | 6 - .../tika/parser/ocr/TesseractOCRParser.java | 93 ++- .../tika/parser/ocr/TesseractOCRParserTest.java | 527 +++++++++-------- .../tika-parser-pdf-module/pom.xml | 5 + .../tika/parser/pdf/AbstractPDF2XHTML.java | 575 +++++++++++++++++++ .../org/apache/tika/parser/pdf/OCR2XHTML.java | 125 ++++ .../org/apache/tika/parser/pdf/PDF2XHTML.java | 498 +--------------- .../org/apache/tika/parser/pdf/PDFParser.java | 8 + .../apache/tika/parser/pdf/PDFParserConfig.java | 274 ++++++--- .../apache/tika/parser/pdf/PDFParser.properties | 10 +- .../apache/tika/parser/pdf/PDFParserTest.java | 37 ++ 15 files changed, 1322 insertions(+), 863 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index f359484..fbc2236 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -10,6 +10,8 @@ Release 2.0 - Future Development Release 1.14 - ??? + * Integrate TesseractOCR with full page image rendering for PDFs (TIKA-1994). + * Add mime detection via Nick C and parser for DBF files (TIKA-1513). 
* Add mime detection and parsers for MSOffice 2003 XML Word http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java index 6d65164..c8e8448 100644 --- a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java +++ b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java @@ -92,6 +92,6 @@ public class BundleIT { @Test public void testServicesCreated() throws Exception { ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null); - assertEquals("Not all Services have started", 2, services.length); + assertEquals("Not all Services have started", 16, services.length); } } http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml index dbd65e1..25eef2e 100644 --- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml @@ -47,6 +47,7 @@ <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator> <Embed-Dependency> tika-parser-pdf-module;inline=true, + tika-parser-multimedia-module;inline=true, tika-parser-xmp-commons;inline=true, commons-io;inline=true, pdfbox;inline=true, @@ -65,6 +66,22 @@ <Import-Package> *, com.ibm.icu.text;resolution:=optional, + com.coremedia.iso;resolution:=optional, + com.coremedia.iso.boxes;resolution:=optional, 
+ com.coremedia.iso.boxes.apple;resolution:=optional, + com.coremedia.iso.boxes.sampleentry;resolution:=optional, + com.drew.imaging.jpeg;resolution:=optional, + com.drew.imaging.riff;resolution:=optional, + com.drew.imaging.tiff;resolution:=optional, + com.drew.imaging.webp;resolution:=optional, + com.drew.lang;resolution:=optional, + com.drew.metadata;resolution:=optional, + com.drew.metadata.exif;resolution:=optional, + com.drew.metadata.iptc;resolution:=optional, + com.drew.metadata.jpeg;resolution:=optional, + com.googlecode.mp4parser;resolution:=optional, + com.googlecode.mp4parser.boxes.apple;resolution:=optional, + com.googlecode.mp4parser.util;resolution:=optional, javax.mail;resolution:=optional, javax.mail.internet;resolution:=optional, org.bouncycastle.cert;resolution:=optional, @@ -73,7 +90,9 @@ org.bouncycastle.cms.bc;resolution:=optional, org.bouncycastle.operator;resolution:=optional, org.bouncycastle.operator.bc;resolution:=optional, - org.bouncycastle.tsp;resolution:=optional + org.bouncycastle.tsp;resolution:=optional, + org.apache.commons.exec;resolution:=optional, + org.apache.commons.exec.environment;resolution:=optional </Import-Package> </instructions> </configuration> http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java index bbc72bb..8e1d010 100644 --- a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java +++ b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java @@ -91,6 +91,6 @@ public class BundleIT { @Test public void testServicesCreated() throws Exception { 
ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null); - assertEquals("Not all Services have started", 1, services.length); + assertEquals("Not all Services have started", 15, services.length); } } http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml index 0192b8b..7a3a704 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml +++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml @@ -83,12 +83,6 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-pdf-module</artifactId> - <version>${project.version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> <artifactId>tika-parser-office-module</artifactId> <version>${project.version}</version> <scope>test</scope> http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 7db29c8..83fe7fe 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -16,9 +16,7 @@ */ package org.apache.tika.parser.ocr; -import javax.imageio.ImageIO; - -import java.awt.Image; +import java.awt.*; import 
java.awt.image.BufferedImage; import java.io.File; import java.io.FileInputStream; @@ -40,6 +38,7 @@ import java.util.concurrent.FutureTask; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import javax.imageio.ImageIO; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.LogFactory; import org.apache.tika.exception.TikaException; @@ -56,10 +55,10 @@ import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; +import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; - import static java.nio.charset.StandardCharsets.UTF_8; /** @@ -110,7 +109,7 @@ public class TesseractOCRParser extends AbstractParser { } } - private boolean hasTesseract(TesseractOCRConfig config) { + public boolean hasTesseract(TesseractOCRConfig config) { // Fetch where the config says to find Tesseract String tesseract = config.getTesseractPath() + getTesseractProg(); @@ -157,47 +156,90 @@ public class TesseractOCRParser extends AbstractParser { public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); - // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar if (! 
hasTesseract(config)) return; - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + File tmpImgFile = tmp.createTemporaryFile(); + parse(tikaStream, tmpImgFile, xhtml, config); + // Temporary workaround for TIKA-1445 - until we can specify + // composite parsers with strategies (eg Composite, Try In Turn), + // always send the image onwards to the regular parser to have + // the metadata for them extracted as well + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context); + xhtml.endDocument(); + } finally { + tmp.dispose(); + } + } + + /** + * Use this to parse content without starting a new document. + * This appends SAX events to xhtml without re-adding the metadata, body start, etc. + * @param stream inputstream + * @param xhtml handler + * @param config TesseractOCRConfig to use for this parse + * @throws IOException + * @throws SAXException + * @throws TikaException + */ + public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config) + throws IOException, SAXException, TikaException { + // If Tesseract is not on the path with the current config, do not try to run OCR + // getSupportedTypes shouldn't have listed us as handling it, so this should only + // occur if someone directly calls this parser, not via DefaultParser or similar + if (! 
hasTesseract(config)) + return; TemporaryResources tmp = new TemporaryResources(); - File output = null; try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); - File input = tikaStream.getFile(); - long size = tikaStream.getLength(); + File tmpImgFile = tmp.createTemporaryFile(); + parse(tikaStream, tmpImgFile, xhtml, config); + } finally { + tmp.dispose(); + } + + } + + private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config) + throws IOException, SAXException, TikaException { + File tmpTxtOutput = null; + + try { + File input = tikaInputStream.getFile(); + long size = tikaInputStream.getLength(); if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { - output = tmp.createTemporaryFile(); - doOCR(input, output, config); + doOCR(input, tmpImgFile, config); // Tesseract appends .txt to output file name - output = new File(output.getAbsolutePath() + ".txt"); + tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt"); - if (output.exists()) - extractOutput(new FileInputStream(output), xhtml); + if (tmpTxtOutput.exists()) { + try (InputStream is = new FileInputStream(tmpTxtOutput)) { + extractOutput(is, xhtml); + } + } } - // Temporary workaround for TIKA-1445 - until we can specify - // composite parsers with strategies (eg Composite, Try In Turn), - // always send the image onwards to the regular parser to have - // the metadata for them extracted as well - _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context); } finally { - tmp.dispose(); - if (output != null) { - output.delete(); + if (tmpTxtOutput != null) { + tmpTxtOutput.delete(); } } } + // TIKA-1445 workaround parser private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser(); private static class CompositeImageParser extends CompositeParser { @@ -283,8 +325,7 @@ public class TesseractOCRParser extends AbstractParser { */ private void extractOutput(InputStream 
stream, XHTMLContentHandler xhtml) throws SAXException, IOException { - xhtml.startDocument(); - xhtml.startElement("div"); + xhtml.startElement("div", "class", "ocr"); try (Reader reader = new InputStreamReader(stream, UTF_8)) { char[] buffer = new char[1024]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { @@ -293,7 +334,7 @@ public class TesseractOCRParser extends AbstractParser { } } xhtml.endElement("div"); - xhtml.endDocument(); + } /** http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index e99f6ae..9ab958e 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -1,265 +1,262 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.ocr; - -import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; -import static org.junit.Assume.assumeTrue; -import static org.mockito.Matchers.any; -import static org.mockito.Matchers.eq; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; - -import java.io.InputStream; -import java.util.List; - -import org.apache.tika.TikaTest; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.parser.external.ExternalParser; -import org.apache.tika.parser.image.ImageParser; -import org.apache.tika.parser.mail.RFC822Parser; -import org.apache.tika.parser.pdf.PDFParserConfig; -import org.apache.tika.sax.BasicContentHandlerFactory; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; -import org.junit.Test; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; - -public class TesseractOCRParserTest extends TikaTest { - - public static boolean canRun() { - TesseractOCRConfig config = new TesseractOCRConfig(); - TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest(); - return tesseractOCRTest.canRun(config); - } - - private boolean canRun(TesseractOCRConfig config) { - String[] checkCmd = {config.getTesseractPath() + 
getTesseractProg()}; - // If Tesseract is not on the path, do not run the test. - return ExternalParser.check(checkCmd); - } - - /* - Check that if Tesseract is not found, the TesseractOCRParser claims to not support - any file types. So, the standard image parser is called instead. - */ - @Test - public void offersNoTypesIfNotFound() throws Exception { - TesseractOCRParser parser = new TesseractOCRParser(); - DefaultParser defaultParser = new DefaultParser(); - MediaType png = MediaType.image("png"); - - // With an invalid path, will offer no types - TesseractOCRConfig invalidConfig = new TesseractOCRConfig(); - invalidConfig.setTesseractPath("/made/up/path"); - - ParseContext parseContext = new ParseContext(); - parseContext.set(TesseractOCRConfig.class, invalidConfig); - - // No types offered - assertEquals(0, parser.getSupportedTypes(parseContext).size()); - - // And DefaultParser won't use us - assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); - } - - /* - If Tesseract is found, test we retrieve the proper number of supporting Parsers. - */ - @Test - public void offersTypesIfFound() throws Exception { - TesseractOCRParser parser = new TesseractOCRParser(); - DefaultParser defaultParser = new DefaultParser(); - - ParseContext parseContext = new ParseContext(); - MediaType png = MediaType.image("png"); - - // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. - assumeTrue(canRun()); - - assertEquals(5, parser.getSupportedTypes(parseContext).size()); - assertTrue(parser.getSupportedTypes(parseContext).contains(png)); - - // DefaultParser will now select the TesseractOCRParser. 
- assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); - } - - @Test - public void testPDFOCR() throws Exception { - String resource = "/test-documents/testOCR.pdf"; - String[] nonOCRContains = new String[0]; - testBasicOCR(resource, nonOCRContains, 2); - } - - @Test - public void testDOCXOCR() throws Exception { - String resource = "/test-documents/testOCR.docx"; - String[] nonOCRContains = { - "This is some text.", - "Here is an embedded image:" - }; - testBasicOCR(resource, nonOCRContains, 3); - } - - @Test - public void testPPTXOCR() throws Exception { - String resource = "/test-documents/testOCR.pptx"; - String[] nonOCRContains = { - "This is some text" - }; - testBasicOCR(resource, nonOCRContains, 3); - } - - private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception { - TesseractOCRConfig config = new TesseractOCRConfig(); - Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), - new BasicContentHandlerFactory( - BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); - - PDFParserConfig pdfConfig = new PDFParserConfig(); - pdfConfig.setExtractInlineImages(true); - - ParseContext parseContext = new ParseContext(); - parseContext.set(TesseractOCRConfig.class, config); - parseContext.set(Parser.class, parser); - parseContext.set(PDFParserConfig.class, pdfConfig); - - try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) { - parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext); - } - List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata(); - assertEquals(numMetadatas, metadataList.size()); - - StringBuilder contents = new StringBuilder(); - for (Metadata m : metadataList) { - contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT)); - } - if (canRun()) { - assertTrue(contents.toString().contains("Happy New Year 2003!")); - } - for (String needle : nonOCRContains) { - 
assertContains(needle, contents.toString()); - } - assertTrue(metadataList.get(0).names().length > 10); - assertTrue(metadataList.get(1).names().length > 10); - //test at least one value - assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName")); - } - - @Test - public void testSingleImage() throws Exception { - assumeTrue(canRun()); - String xml = getXML("testOCR.jpg").xml; - assertContains("OCR Testing", xml); - } - - @Test - public void getNormalMetadataToo() throws Exception { - //this should be successful whether or not TesseractOCR is installed/active - //If tesseract is installed, the internal metadata extraction parser should - //work; and if tesseract isn't installed, the regular parsers should take over. - - //gif - Metadata m = getXML("testGIF.gif").metadata; - assertTrue(m.names().length > 20); - assertEquals("RGB", m.get("Chroma ColorSpaceType")); - - //jpg - m = getXML("testOCR.jpg").metadata; - assertEquals("136", m.get(Metadata.IMAGE_WIDTH)); - assertEquals("66", m.get(Metadata.IMAGE_LENGTH)); - assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE)); - assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL)); - assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS)); - - //bmp - m = getXML("testBMP.bmp").metadata; - assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); - assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); - - //png - m = getXML("testPNG.png").metadata; - assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); - assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); - assertEquals("UnsignedIntegral", m.get("Data SampleFormat")); - - //tiff - m = getXML("testTIFF.tif").metadata; - assertEquals("100", m.get(Metadata.IMAGE_WIDTH)); - assertEquals("75", m.get(Metadata.IMAGE_LENGTH)); - assertEquals("72 dots per inch", m.get("Y Resolution")); - } - - @Test - public void testMultipart() { - Parser parser = new RFC822Parser(); - Metadata metadata = new Metadata(); - InputStream stream = 
getStream("test-documents/testRFC822-multipart"); - ContentHandler handler = mock(XHTMLContentHandler.class); - - try { - parser.parse(stream, handler, metadata, new ParseContext()); - verify(handler).startDocument(); - int bodyExpectedTimes = 4, multipackExpectedTimes = 5; - // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked. - // But, different versions of Tesseract lead to a different number of invocations. So, we - // only verify the handler if Tesseract cannot run. - if (!TesseractOCRParserTest.canRun()) { - verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class)); - verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div"); - } - } catch (Exception e) { - fail("Exception thrown: " + e.getMessage()); - } - - //repeat, this time looking at content - parser = new RFC822Parser(); - metadata = new Metadata(); - stream = getStream("test-documents/testRFC822-multipart"); - handler = new BodyContentHandler(); - try { - parser.parse(stream, handler, metadata, new ParseContext()); - //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode - String bodyText = handler.toString(); - assertTrue(bodyText.contains("body 1")); - assertTrue(bodyText.contains("body 2")); - assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif - } catch (Exception e) { - fail("Exception thrown: " + e.getMessage()); - } - } - - private static InputStream getStream(String name) { - InputStream stream = Thread.currentThread().getContextClassLoader() - .getResourceAsStream(name); - assertNotNull("Test file not found " + name, stream); - return stream; - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ocr; + +import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.junit.Assume.assumeTrue; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.InputStream; +import java.util.List; + +import org.apache.tika.TikaTest; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.parser.image.ImageParser; +import org.apache.tika.parser.mail.RFC822Parser; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.Attributes; +import 
org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +public class TesseractOCRParserTest extends TikaTest { + + public static boolean canRun() { + TesseractOCRConfig config = new TesseractOCRConfig(); + TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest(); + return tesseractOCRTest.canRun(config); + } + + private boolean canRun(TesseractOCRConfig config) { + String[] checkCmd = {config.getTesseractPath() + getTesseractProg()}; + // If Tesseract is not on the path, do not run the test. + return ExternalParser.check(checkCmd); + } + + /* + Check that if Tesseract is not found, the TesseractOCRParser claims to not support + any file types. So, the standard image parser is called instead. + */ + @Test + public void offersNoTypesIfNotFound() throws Exception { + TesseractOCRParser parser = new TesseractOCRParser(); + DefaultParser defaultParser = new DefaultParser(); + MediaType png = MediaType.image("png"); + + // With an invalid path, will offer no types + TesseractOCRConfig invalidConfig = new TesseractOCRConfig(); + invalidConfig.setTesseractPath("/made/up/path"); + + ParseContext parseContext = new ParseContext(); + parseContext.set(TesseractOCRConfig.class, invalidConfig); + + // No types offered + assertEquals(0, parser.getSupportedTypes(parseContext).size()); + + // And DefaultParser won't use us + assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass()); + } + + /* + If Tesseract is found, test we retrieve the proper number of supporting Parsers. + */ + @Test + public void offersTypesIfFound() throws Exception { + TesseractOCRParser parser = new TesseractOCRParser(); + DefaultParser defaultParser = new DefaultParser(); + + ParseContext parseContext = new ParseContext(); + MediaType png = MediaType.image("png"); + + // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. 
+        assumeTrue(canRun());
+
+        assertEquals(5, parser.getSupportedTypes(parseContext).size());
+        assertTrue(parser.getSupportedTypes(parseContext).contains(png));
+
+        // DefaultParser will now select the TesseractOCRParser.
+        assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    /**
+     * OCR of a PDF test document; currently disabled (see the {@code @Ignore} reason).
+     */
+    @Test
+    @Ignore("TODO: cyclic reference to pdf-module...maybe move these all to tika-app?")
+    public void testPDFOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pdf";
+        String[] nonOCRContains = new String[0];
+        testBasicOCR(resource, nonOCRContains, 2);
+    }
+
+    /**
+     * A DOCX with an embedded image: the regular text must always be present.
+     */
+    @Test
+    public void testDOCXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.docx";
+        String[] nonOCRContains = {
+                "This is some text.",
+                "Here is an embedded image:"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    /**
+     * A PPTX with an embedded image: the regular text must always be present.
+     */
+    @Test
+    public void testPPTXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pptx";
+        String[] nonOCRContains = {
+                "This is some text"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    /**
+     * Parses {@code resource} recursively and checks the collected metadata/content.
+     *
+     * @param resource       classpath location of the test document
+     * @param nonOCRContains strings that must appear whether or not Tesseract can run
+     * @param numMetadatas   expected number of Metadata objects (container plus embedded documents)
+     */
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        // Keep the concrete type so that the collected metadata can be read without a cast.
+        RecursiveParserWrapper parser = new RecursiveParserWrapper(new AutoDetectParser(),
+                new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, parser);
+
+        try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
+            parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
+        }
+        List<Metadata> metadataList = parser.getMetadata();
+        assertEquals(numMetadatas, metadataList.size());
+
+        StringBuilder contents = new StringBuilder();
+        for (Metadata m : metadataList) {
+            contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+        }
+        // Build the haystack once instead of on every assertion.
+        String allContent = contents.toString();
+
+        // The OCR'd text can only be expected when Tesseract is actually runnable.
+        if (canRun()) {
+            assertTrue(allContent.contains("Happy New Year 2003!"));
+        }
+        for (String needle : nonOCRContains) {
+            assertContains(needle, allContent);
+        }
+        assertTrue(metadataList.get(0).names().length > 10);
+        assertTrue(metadataList.get(1).names().length > 10);
+        //test at least one value
+        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+    }
+
+    /**
+     * OCR of a stand-alone image; skipped when Tesseract cannot run.
+     */
+    @Test
+    public void testSingleImage() throws Exception {
+        assumeTrue(canRun());
+        String xml = getXML("testOCR.jpg").xml;
+        assertContains("OCR Testing", xml);
+    }
+
+    @Test
+    public void getNormalMetadataToo() throws Exception {
+        //this should be successful whether or not TesseractOCR is installed/active
+        //If tesseract is installed, the internal metadata extraction parser should
+        //work; and if tesseract isn't installed, the regular parsers should take over.
+
+        //gif
+        Metadata m = getXML("testGIF.gif").metadata;
+        assertTrue(m.names().length > 20);
+        assertEquals("RGB", m.get("Chroma ColorSpaceType"));
+
+        //jpg
+        m = getXML("testOCR.jpg").metadata;
+        assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
+        assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
+
+        //bmp
+        m = getXML("testBMP.bmp").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+
+        //png
+        m = getXML("testPNG.png").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
+
+        //tiff
+        m = getXML("testTIFF.tif").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("72 dots per inch", m.get("Y Resolution"));
+    }
+
+    /**
+     * Parses a multipart RFC822 message twice: once against a mocked handler to
+     * check the SAX events, once against a BodyContentHandler to check the text.
+     * Any exception now propagates, so a failure keeps its full stack trace.
+     */
+    @Test
+    public void testMultipart() throws Exception {
+        Parser parser = new RFC822Parser();
+        ContentHandler mockHandler = mock(XHTMLContentHandler.class);
+
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            parser.parse(stream, mockHandler, new Metadata(), new ParseContext());
+        }
+        verify(mockHandler).startDocument();
+        int bodyExpectedTimes = 4;
+        // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
+        // But, different versions of Tesseract lead to a different number of invocations. So, we
+        // only verify the handler if Tesseract cannot run.
+        if (!TesseractOCRParserTest.canRun()) {
+            verify(mockHandler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+            verify(mockHandler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+        }
+
+        //repeat, this time looking at content
+        parser = new RFC822Parser();
+        ContentHandler bodyHandler = new BodyContentHandler();
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            parser.parse(stream, bodyHandler, new Metadata(), new ParseContext());
+        }
+        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+        String bodyText = bodyHandler.toString();
+        assertTrue(bodyText.contains("body 1"));
+        assertTrue(bodyText.contains("body 2"));
+        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+    }
+
+    /**
+     * Opens a test resource through the context class loader and fails the
+     * test immediately if the resource is missing.
+     */
+    private static InputStream getStream(String name) {
+        InputStream stream = Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream(name);
+        assertNotNull("Test file not found " + name, stream);
+        return stream;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/pom.xml
---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml index 2156b95..11f259e 100644 --- a/tika-parser-modules/tika-parser-pdf-module/pom.xml +++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml @@ -35,6 +35,11 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-multimedia-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> <artifactId>tika-parser-xmp-commons</artifactId> <version>${project.version}</version> </dependency> http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java new file mode 100644 index 0000000..9a73bde --- /dev/null +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -0,0 +1,575 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.awt.image.BufferedImage; +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.List; +import java.util.ListIterator; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; + +import javax.xml.stream.XMLStreamException; +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; +import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; +import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; +import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; +import org.apache.pdfbox.pdmodel.interactive.action.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; +import 
org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+/**
+ * Shared base for the PDF-to-XHTML converters in this package. It owns the
+ * XHTML output handler and implements the parts that do not depend on how the
+ * page text itself is obtained: page start/end events, annotations, embedded
+ * files, bookmarks, AcroForm/XFA content and per-page OCR.
+ */
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Maximum recursive depth during AcroForm processing.
+     * Prevents theoretical AcroForm recursion bomb.
+     */
+    private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+    /** Fallback OCR configuration used when the ParseContext does not carry one. */
+    private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
+
+    /**
+     * Format used for signature dates
+     * TODO Make this thread-safe
+     */
+    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+
+    /** IOExceptions that were recorded instead of aborting the parse. */
+    final List<IOException> exceptions = new ArrayList<>();
+    final PDDocument pdDocument;
+    final XHTMLContentHandler xhtml;
+    private final ParseContext context;
+    private final Metadata metadata;
+    final PDFParserConfig config;
+
+    /** Zero-based index of the page being processed; advanced at the end of {@link #endPage}. */
+    private int pageIndex = 0;
+
+    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
+                      PDFParserConfig config) throws IOException {
+        this.pdDocument = pdDocument;
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+        this.context = context;
+        this.metadata = metadata;
+        this.config = config;
+    }
+
+    /**
+     * Opens the {@code <div class="page">} element for a page.
+     *
+     * @throws IOException wrapping the SAXException if the handler rejects the element
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            xhtml.startElement("div", "class", "page");
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a page", e);
+        }
+        writeParagraphStart();
+    }
+
+    /**
+     * Returns the extractor registered in the parse context, or a
+     * {@link ParsingEmbeddedDocumentExtractor} when none is registered.
+     */
+    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor extractor =
+                context.get(EmbeddedDocumentExtractor.class);
+        if (extractor == null) {
+            extractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+        return extractor;
+    }
+
+    /**
+     * Extracts files listed in the document-level embedded-files name tree,
+     * looking at the root node and, failing that, at its direct kids.
+     */
+    private void extractEmbeddedDocuments(PDDocument document)
+            throws IOException, SAXException, TikaException {
+        PDDocumentNameDictionary namesDictionary =
+                new PDDocumentNameDictionary(document.getDocumentCatalog());
+        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+        if (efTree == null) {
+            return;
+        }
+
+        Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
+        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
+        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+        //If there is a need we could add a fully recursive search to find a non-null
+        //Map<String, COSObjectable> that contains the doc info.
+        if (embeddedFileNames != null) {
+            processEmbeddedDocNames(embeddedFileNames);
+            return;
+        }
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+        if (kids == null) {
+            return;
+        }
+        for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
+            embeddedFileNames = node.getNames();
+            if (embeddedFileNames != null) {
+                processEmbeddedDocNames(embeddedFileNames);
+            }
+        }
+    }
+
+    /** Hands every file specification in the map to the embedded-document extractor. */
+    private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
+            throws IOException, SAXException, TikaException {
+        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
+            PDComplexFileSpecification spec = ent.getValue();
+            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
+        }
+    }
+
+    /**
+     * Extracts every variant (generic, Mac, DOS, Unix) stored in a file specification.
+     *
+     * @param defaultName name to use when a variant carries no file name of its own
+     * @param spec        file specification; {@code null} is ignored
+     */
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+                                               PDComplexFileSpecification spec,
+                                               EmbeddedDocumentExtractor extractor) throws IOException,
+            SAXException, TikaException {
+
+        if (spec == null) {
+            return;
+        }
+        //current strategy is to pull all, not just first non-null
+        extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+    }
+
+    /**
+     * Parses one embedded file (if the extractor wants it) and then writes an
+     * empty {@code <div class="embedded" id="fileName">} marker.
+     */
+    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
+
+        if (file == null) {
+            //skip silently
+            return;
+        }
+
+        fileName = (fileName == null) ? defaultName : fileName;
+
+        // TODO: other metadata?
+        // Named differently from the 'metadata' field, which describes the container PDF.
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+        if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+        TikaInputStream stream = null;
+        try {
+            stream = TikaInputStream.get(file.createInputStream());
+            extractor.parseEmbedded(
+                    stream,
+                    new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
+
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", fileName);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+    }
+
+    /**
+     * Records {@code e} (as a metadata warning and in {@link #exceptions}) when
+     * the configuration asks for intermediate IOExceptions to be caught;
+     * otherwise rethrows it.
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!config.isCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+        String msg = e.getMessage();
+        if (msg == null) {
+            msg = "IOException, no message";
+        }
+        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
+        exceptions.add(e);
+    }
+
+    /**
+     * Renders the current page to a temporary image file and streams the
+     * Tesseract output into the XHTML handler. Does nothing for NO_OCR.
+     *
+     * @throws TikaException if Tesseract is not available
+     * @throws IOException   if rendering/OCR fails and the failure is not catchable,
+     *                       or wrapping a SAXException raised while writing OCR output
+     */
+    void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+        if (config.getOCRStrategy() == NO_OCR) {
+            return;
+        }
+        TesseractOCRConfig tesseractConfig =
+                context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
+
+        TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
+        if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
+            throw new TikaException("Tesseract is not available. "+
+                    "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
+        }
+
+        PDFRenderer renderer = new PDFRenderer(pdDocument);
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            // NOTE(review): the page is rendered at a fixed scale of 2.0 while the image is
+            // written with config.getOCRDPI(); confirm whether the render should follow the DPI.
+            BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType());
+            Path tmpFile = tmp.createTempFile();
+            try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(image, config.getOCRImageFormatName(),
+                        os, config.getOCRDPI());
+            }
+            try (InputStream is = TikaInputStream.get(tmpFile)) {
+                tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
+            }
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("error writing OCR content from PDF", e);
+        } finally {
+            // always remove the temporary image file
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Writes the page's annotations (file attachments always; link and markup
+     * text only when annotation-text extraction is enabled), runs OCR when the
+     * strategy is OCR_AND_TEXT_EXTRACTION, and closes the page {@code <div>}.
+     * The page index is advanced even when an exception occurs.
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+
+        try {
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    extractFileAttachment((PDAnnotationFileAttachment) annotation, extractor);
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    if (annotation instanceof PDAnnotationLink) {
+                        writeLinkAnnotation((PDAnnotationLink) annotation);
+                    }
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        writeMarkupAnnotation((PDAnnotationMarkup) annotation);
+                    }
+                }
+            }
+            if (config.getOCRStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
+                doOCROnCurrentPage();
+            }
+            xhtml.endElement("div");
+        } catch (SAXException|TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            // NOTE(review): recorded directly rather than through handleCatchableIOE, so the
+            // catch-intermediate setting is not consulted here and no metadata warning is added.
+            exceptions.add(e);
+        } finally {
+            pageIndex++;
+        }
+    }
+
+    /** Extracts the file carried by a file-attachment annotation. */
+    private void extractFileAttachment(PDAnnotationFileAttachment attachment,
+                                       EmbeddedDocumentExtractor extractor) throws IOException {
+        PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) attachment.getFile();
+        try {
+            extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+        } catch (TikaException e) {
+            throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
+    /** Writes an empty anchor for a link annotation whose action is a URI with a non-null target. */
+    private void writeLinkAnnotation(PDAnnotationLink annotationLink) throws SAXException, IOException {
+        PDAction action = annotationLink.getAction();
+        if (!(action instanceof PDActionURI)) {
+            return;
+        }
+        String link = ((PDActionURI) action).getURI();
+        if (link != null) {
+            xhtml.startElement("div", "class", "annotation");
+            xhtml.startElement("a", "href", link);
+            xhtml.endElement("a");
+            xhtml.endElement("div");
+        }
+    }
+
+    /** Writes the title, subject and contents of a markup annotation, skipping null parts. */
+    private void writeMarkupAnnotation(PDAnnotationMarkup annotationMarkup) throws SAXException, IOException {
+        String title = annotationMarkup.getTitlePopup();
+        String subject = annotationMarkup.getSubject();
+        String contents = annotationMarkup.getContents();
+        // TODO: maybe also annotationMarkup.getRichContents()?
+        if (title == null && subject == null && contents == null) {
+            return;
+        }
+        xhtml.startElement("div", "class", "annotation");
+        writeAnnotationPart("annotationTitle", title);
+        writeAnnotationPart("annotationSubject", subject);
+        writeAnnotationPart("annotationContents", contents);
+        xhtml.endElement("div");
+    }
+
+    /** Writes {@code <div class="cssClass">text</div>} unless {@code text} is null. */
+    private void writeAnnotationPart(String cssClass, String text) throws SAXException {
+        if (text == null) {
+            return;
+        }
+        xhtml.startElement("div", "class", cssClass);
+        xhtml.characters(text);
+        xhtml.endElement("div");
+    }
+
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            xhtml.startDocument();
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("Unable to start a document", e);
+        }
+    }
+
+    /**
+     * Writes the trailing document-level content (bookmarks, embedded files and,
+     * if enabled, AcroForm data) and ends the XHTML document.
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+            try {
+                extractEmbeddedDocuments(pdf);
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+            }
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent()) {
+                try {
+                    extractAcroForm(pdf);
+                } catch (IOException e) {
+                    handleCatchableIOE(e);
+                }
+            }
+            xhtml.endDocument();
+        } catch (TikaException | SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    /** Writes the document outline, if there is one, as nested lists. */
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
+        if (outline != null) {
+            extractBookmarkText(outline);
+        }
+    }
+
+    /**
+     * Writes the children of {@code bookmark} as a {@code <ul>}, recursing into
+     * each child after its own {@code <li>} has been closed.
+     */
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem current = bookmark.getFirstChild();
+        if (current == null) {
+            return;
+        }
+        xhtml.startElement("ul");
+        while (current != null) {
+            xhtml.startElement("li");
+            xhtml.characters(current.getTitle());
+            xhtml.endElement("li");
+            // Recurse:
+            extractBookmarkText(current);
+            current = current.getNextSibling();
+        }
+        xhtml.endElement("ul");
+    }
+
+    /**
+     * Writes form content. XFA is tried first; if the document has no XFA or
+     * XFA extraction fails, the traditional AcroForm fields are written as a
+     * nested ordered list inside {@code <div class="acroform">}.
+     */
+    void extractAcroForm(PDDocument pdf) throws IOException,
+            SAXException {
+        //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
+        //this code derives from Ben's code
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+
+        if (catalog == null) {
+            return;
+        }
+
+        PDAcroForm form = catalog.getAcroForm();
+        if (form == null) {
+            return;
+        }
+
+        //if it has xfa, try that.
+        //if it doesn't exist or there's an exception,
+        //go with traditional AcroForm
+        PDXFAResource pdxfa = form.getXFA();
+
+        if (pdxfa != null) {
+            //if successful, return
+            XFAExtractor xfaExtractor = new XFAExtractor();
+            try (InputStream is = new BufferedInputStream(
+                    new ByteArrayInputStream(pdxfa.getBytes()))) {
+                xfaExtractor.extract(is, xhtml, metadata, context);
+                return;
+            } catch (XMLStreamException |IOException e) {
+                //deliberately ignored:
+                //if there was an xml parse exception in xfa, try the AcroForm
+            }
+        }
+
+        List<PDField> fields = form.getFields();
+
+        if (fields == null) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "acroform");
+        xhtml.startElement("ol");
+
+        for (PDField field : fields) {
+            if (field != null) {
+                processAcroField(field, 0);
+            }
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes one field and, for non-terminal fields, its children in a nested
+     * {@code <ol>}. Recursion stops silently at {@link #MAX_ACROFORM_RECURSIONS}.
+     */
+    private void processAcroField(PDField field, final int currentRecursiveDepth)
+            throws SAXException, IOException {
+
+        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+            return;
+        }
+        addFieldString(field);
+        if (field instanceof PDNonTerminalField) {
+            int r = currentRecursiveDepth + 1;
+            xhtml.startElement("ol");
+            for (PDField child : ((PDNonTerminalField)field).getChildren()) {
+                processAcroField(child, r);
+            }
+            xhtml.endElement("ol");
+        }
+    }
+
+    /**
+     * Writes a field as {@code <li altName="...">partialName: value</li>}.
+     * Signature fields are delegated to {@link #handleSignature}; a field with
+     * neither attributes nor text produces no output.
+     */
+    private void addFieldString(PDField field) throws SAXException {
+        //Pick partial name to present in content and altName for attribute
+        //Ignoring FullyQualifiedName for now
+        String partName = field.getPartialName();
+        String altName = field.getAlternateFieldName();
+
+        StringBuilder sb = new StringBuilder();
+        AttributesImpl attrs = new AttributesImpl();
+
+        if (partName != null) {
+            sb.append(partName).append(": ");
+        }
+        if (altName != null) {
+            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+        }
+        //return early if PDSignature field
+        if (field instanceof PDSignatureField) {
+            handleSignature(attrs, (PDSignatureField) field);
+            return;
+        }
+        String value = field.getValueAsString();
+        if (value != null && !value.equals("null")) {
+            sb.append(value);
+        }
+
+        if (attrs.getLength() > 0 || sb.length() > 0) {
+            xhtml.startElement("li", attrs);
+            xhtml.characters(sb.toString());
+            xhtml.endElement("li");
+        }
+    }
+
+    /**
+     * Writes the signature's name, contact info, location, reason and date as
+     * a nested list. Nothing is written when the field has no signature or
+     * when every one of those values is null or empty.
+     */
+    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
+            throws SAXException {
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        Map<String, String> vals = new TreeMap<>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+        //see if there is any data; this must look at the values --
+        //the keys are the fixed labels above and are never empty
+        boolean hasData = false;
+        for (String val : vals.values()) {
+            if (val != null && !val.equals("")) {
+                hasData = true;
+                break;
+            }
+        }
+        if (!hasData) {
+            return;
+        }
+        //if there is, process it
+        xhtml.startElement("li", parentAttributes);
+
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+        xhtml.startElement("ol", attrs);
+        for (Map.Entry<String, String> e : vals.entrySet()) {
+            if (e.getValue() == null || e.getValue().equals("")) {
+                continue;
+            }
+            attrs = new AttributesImpl();
+            attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
+            xhtml.startElement("li", attrs);
+            xhtml.characters(e.getValue());
+            xhtml.endElement("li");
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("li");
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java new file mode 100644 index 0000000..3ad551d --- /dev/null +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.io.IOException; +import java.io.Writer; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + + +/** + * Utility class that overrides the {@link PDFTextStripper} functionality + * to integrate text extraction via OCR only. 
+ *
+ */
+class OCR2XHTML extends AbstractPDF2XHTML {
+
+    private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+                      PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Runs the OCR-only conversion of a PDF document, emitting XHTML SAX
+     * events to the given content handler.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param context  parse context handed to the base class
+     * @param metadata PDF metadata
+     * @param config   PDF parser configuration
+     * @throws SAXException  if the content handler fails to process SAX events
+     * @throws TikaException if there was an exception outside of per page processing,
+     *                       or (carrying the first one) if per-page exceptions were recorded
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+        OCR2XHTML converter;
+        try {
+            converter = new OCR2XHTML(document, handler, context, metadata, config);
+            // The stripper's own text output is irrelevant here, so it goes to a sink.
+            converter.writeText(document, new DiscardingWriter());
+        } catch (IOException e) {
+            Throwable cause = e.getCause();
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", e);
+        }
+        if (!converter.exceptions.isEmpty()) {
+            //throw the first
+            throw new TikaException("Unable to extract all PDF content",
+                    converter.exceptions.get(0));
+        }
+    }
+
+    /**
+     * Handles a page as start-page, OCR, end-page instead of running the
+     * regular text stripping.
+     */
+    @Override
+    public void processPage(PDPage pdPage) throws IOException {
+        try {
+            startPage(pdPage);
+            doOCROnCurrentPage();
+            endPage(pdPage);
+        } catch (TikaException |SAXException e) {
+            throw new IOExceptionWithCause(e);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
+    // The four writers below are intentionally empty: this class emits no
+    // text from the PDF content stream.
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        //no-op
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        //no-op
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        //no-op
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        //no-op
+    }
+
+    /** Writer that ignores everything written to it. */
+    private static final class DiscardingWriter extends Writer {
+        @Override
+        public void write(char[] cbuf, int off, int len) {
+        }
+
+        @Override
+        public void flush() {
+        }
+
+        @Override
+        public void close() {
+        }
+    }
+
+}
+
