[2/2] tika git commit: TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs

tallison Fri, 03 Jun 2016 11:52:40 -0700

TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs



Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ebe70289
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ebe70289
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ebe70289

Branch: refs/heads/2.x
Commit: ebe70289815776f6ce6c271c7faf8d23cfd31337
Parents: e5a7604
Author: tballison <[email protected]>
Authored: Fri Jun 3 14:52:19 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Jun 3 14:52:19 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 .../apache/tika/module/journal/BundleIT.java    |   2 +-
 .../tika-parser-pdf-bundle/pom.xml              |  21 +-
 .../org/apache/tika/module/pdf/BundleIT.java    |   2 +-
 .../tika-parser-multimedia-module/pom.xml       |   6 -
 .../tika/parser/ocr/TesseractOCRParser.java     |  93 ++-
 .../tika/parser/ocr/TesseractOCRParserTest.java | 527 +++++++++--------
 .../tika-parser-pdf-module/pom.xml              |   5 +
 .../tika/parser/pdf/AbstractPDF2XHTML.java      | 575 +++++++++++++++++++
 .../org/apache/tika/parser/pdf/OCR2XHTML.java   | 125 ++++
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   | 498 +---------------
 .../org/apache/tika/parser/pdf/PDFParser.java   |   8 +
 .../apache/tika/parser/pdf/PDFParserConfig.java | 274 ++++++---
 .../apache/tika/parser/pdf/PDFParser.properties |  10 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  37 ++
 15 files changed, 1322 insertions(+), 863 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index f359484..fbc2236 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ Release 2.0 - Future Development
 
 Release 1.14 - ???
 
+  * Integrate TesseractOCR with full page image rendering for PDFs (TIKA-1994).
+
   * Add mime detection via Nick C and parser for DBF files (TIKA-1513).
 
   * Add mime detection and parsers for MSOffice 2003 XML Word

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
index 6d65164..c8e8448 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
@@ -92,6 +92,6 @@ public class BundleIT {
     @Test
     public void testServicesCreated() throws Exception {
         ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
-        assertEquals("Not all Services have started", 2, services.length);
+        assertEquals("Not all Services have started", 16, services.length);
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
index dbd65e1..25eef2e 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
@@ -47,6 +47,7 @@
             <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
             <Embed-Dependency>
               tika-parser-pdf-module;inline=true,
+              tika-parser-multimedia-module;inline=true,
               tika-parser-xmp-commons;inline=true,
               commons-io;inline=true,
               pdfbox;inline=true,
@@ -65,6 +66,22 @@
             <Import-Package>
               *,
               com.ibm.icu.text;resolution:=optional,
+              com.coremedia.iso;resolution:=optional,
+              com.coremedia.iso.boxes;resolution:=optional,
+              com.coremedia.iso.boxes.apple;resolution:=optional,
+              com.coremedia.iso.boxes.sampleentry;resolution:=optional,
+              com.drew.imaging.jpeg;resolution:=optional,
+              com.drew.imaging.riff;resolution:=optional,
+              com.drew.imaging.tiff;resolution:=optional,
+              com.drew.imaging.webp;resolution:=optional,
+              com.drew.lang;resolution:=optional,
+              com.drew.metadata;resolution:=optional,
+              com.drew.metadata.exif;resolution:=optional,
+              com.drew.metadata.iptc;resolution:=optional,
+              com.drew.metadata.jpeg;resolution:=optional,
+              com.googlecode.mp4parser;resolution:=optional,
+              com.googlecode.mp4parser.boxes.apple;resolution:=optional,
+              com.googlecode.mp4parser.util;resolution:=optional,
               javax.mail;resolution:=optional,
               javax.mail.internet;resolution:=optional,
               org.bouncycastle.cert;resolution:=optional,
@@ -73,7 +90,9 @@
               org.bouncycastle.cms.bc;resolution:=optional,
               org.bouncycastle.operator;resolution:=optional,
               org.bouncycastle.operator.bc;resolution:=optional,
-              org.bouncycastle.tsp;resolution:=optional
+              org.bouncycastle.tsp;resolution:=optional,
+              org.apache.commons.exec;resolution:=optional,
+              org.apache.commons.exec.environment;resolution:=optional
             </Import-Package>
           </instructions>
         </configuration>

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
index bbc72bb..8e1d010 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
@@ -91,6 +91,6 @@ public class BundleIT {
     @Test
     public void testServicesCreated() throws Exception {
         ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
-        assertEquals("Not all Services have started", 1, services.length);
+        assertEquals("Not all Services have started", 15, services.length);
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index 0192b8b..7a3a704 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -83,12 +83,6 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-pdf-module</artifactId>
-      <version>${project.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-office-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 7db29c8..83fe7fe 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,9 +16,7 @@
  */
 package org.apache.tika.parser.ocr;
 
-import javax.imageio.ImageIO;
-
-import java.awt.Image;
+import java.awt.*;
 import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.FileInputStream;
@@ -40,6 +38,7 @@ import java.util.concurrent.FutureTask;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
+import javax.imageio.ImageIO;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.LogFactory;
 import org.apache.tika.exception.TikaException;
@@ -56,10 +55,10 @@ import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 /**
@@ -110,7 +109,7 @@ public class TesseractOCRParser extends AbstractParser {
         }
     }
 
-    private boolean hasTesseract(TesseractOCRConfig config) {
+    public boolean hasTesseract(TesseractOCRConfig config) {
         // Fetch where the config says to find Tesseract
         String tesseract = config.getTesseractPath() + getTesseractProg();
 
@@ -157,47 +156,90 @@ public class TesseractOCRParser extends AbstractParser {
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-
         // If Tesseract is not on the path with the current config, do not try to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this should only
         //  occur if someone directly calls this parser, not via DefaultParser or similar
         if (! hasTesseract(config))
             return;
 
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            File tmpImgFile = tmp.createTemporaryFile();
+            parse(tikaStream, tmpImgFile, xhtml, config);
+            // Temporary workaround for TIKA-1445 - until we can specify
+            //  composite parsers with strategies (eg Composite, Try In Turn),
+            //  always send the image onwards to the regular parser to have
+            //  the metadata for them extracted as well
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context);
+            xhtml.endDocument();
+        } finally {
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Use this to parse content without starting a new document.
+     * This appends SAX events to xhtml without re-adding the metadata, body start, etc.
+     * @param stream inputstream
+     * @param xhtml handler
+     * @param config TesseractOCRConfig to use for this parse
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+            throws IOException, SAXException, TikaException {
+        // If Tesseract is not on the path with the current config, do not try to run OCR
+        // getSupportedTypes shouldn't have listed us as handling it, so this should only
+        //  occur if someone directly calls this parser, not via DefaultParser or similar
+        if (! hasTesseract(config))
+            return;
 
         TemporaryResources tmp = new TemporaryResources();
-        File output = null;
         try {
             TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
-            File input = tikaStream.getFile();
-            long size = tikaStream.getLength();
+            File tmpImgFile = tmp.createTemporaryFile();
+            parse(tikaStream, tmpImgFile, xhtml, config);
+        } finally {
+            tmp.dispose();
+        }
+
+    }
+
+    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+            throws IOException, SAXException, TikaException {
+        File tmpTxtOutput = null;
+
+        try {
+            File input = tikaInputStream.getFile();
+            long size = tikaInputStream.getLength();
 
             if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
 
-                output = tmp.createTemporaryFile();
-                doOCR(input, output, config);
+                doOCR(input, tmpImgFile, config);
 
                 // Tesseract appends .txt to output file name
-                output = new File(output.getAbsolutePath() + ".txt");
+                tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
 
-                if (output.exists())
-                    extractOutput(new FileInputStream(output), xhtml);
+                if (tmpTxtOutput.exists()) {
+                    try (InputStream is = new FileInputStream(tmpTxtOutput)) {
+                        extractOutput(is, xhtml);
+                    }
+                }
 
             }
 
-            // Temporary workaround for TIKA-1445 - until we can specify
-            //  composite parsers with strategies (eg Composite, Try In Turn),
-            //  always send the image onwards to the regular parser to have
-            //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
         } finally {
-            tmp.dispose();
-            if (output != null) {
-                output.delete();
+            if (tmpTxtOutput != null) {
+                tmpTxtOutput.delete();
             }
         }
     }
+
     // TIKA-1445 workaround parser
     private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
     private static class CompositeImageParser extends CompositeParser {
@@ -283,8 +325,7 @@ public class TesseractOCRParser extends AbstractParser {
      */
     private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
 
-        xhtml.startDocument();
-        xhtml.startElement("div");
+        xhtml.startElement("div", "class", "ocr");
         try (Reader reader = new InputStreamReader(stream, UTF_8)) {
             char[] buffer = new char[1024];
             for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
@@ -293,7 +334,7 @@ public class TesseractOCRParser extends AbstractParser {
             }
         }
         xhtml.endElement("div");
-        xhtml.endDocument();
+
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index e99f6ae..9ab958e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -1,265 +1,262 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.ocr;
-
-import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-import static org.junit.Assume.assumeTrue;
-import static org.mockito.Matchers.any;
-import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-
-import java.io.InputStream;
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.external.ExternalParser;
-import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.mail.RFC822Parser;
-import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class TesseractOCRParserTest extends TikaTest {
-
-    public static boolean canRun() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
-        return tesseractOCRTest.canRun(config);
-    }
-
-    private boolean canRun(TesseractOCRConfig config) {
-        String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
-        // If Tesseract is not on the path, do not run the test.
-        return ExternalParser.check(checkCmd);
-    }
-
-    /*
-    Check that if Tesseract is not found, the TesseractOCRParser claims to not support
-    any file types. So, the standard image parser is called instead.
-     */
-    @Test
-    public void offersNoTypesIfNotFound() throws Exception {
-        TesseractOCRParser parser = new TesseractOCRParser();
-        DefaultParser defaultParser = new DefaultParser();
-        MediaType png = MediaType.image("png");
-
-        // With an invalid path, will offer no types
-        TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
-        invalidConfig.setTesseractPath("/made/up/path");
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, invalidConfig);
-
-        // No types offered
-        assertEquals(0, parser.getSupportedTypes(parseContext).size());
-
-        // And DefaultParser won't use us
-        assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
-    }
-
-    /*
-    If Tesseract is found, test we retrieve the proper number of supporting Parsers.
-     */
-    @Test
-    public void offersTypesIfFound() throws Exception {
-        TesseractOCRParser parser = new TesseractOCRParser();
-        DefaultParser defaultParser = new DefaultParser();
-
-        ParseContext parseContext = new ParseContext();
-        MediaType png = MediaType.image("png");
-
-        // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
-        assumeTrue(canRun());
-
-        assertEquals(5, parser.getSupportedTypes(parseContext).size());
-        assertTrue(parser.getSupportedTypes(parseContext).contains(png));
-
-        // DefaultParser will now select the TesseractOCRParser.
-        assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
-    }
-
-    @Test
-    public void testPDFOCR() throws Exception {
-        String resource = "/test-documents/testOCR.pdf";
-        String[] nonOCRContains = new String[0];
-        testBasicOCR(resource, nonOCRContains, 2);
-    }
-
-    @Test
-    public void testDOCXOCR() throws Exception {
-        String resource = "/test-documents/testOCR.docx";
-        String[] nonOCRContains = {
-                "This is some text.",
-                "Here is an embedded image:"
-        };
-        testBasicOCR(resource, nonOCRContains, 3);
-    }
-
-    @Test
-    public void testPPTXOCR() throws Exception {
-        String resource = "/test-documents/testOCR.pptx";
-        String[] nonOCRContains = {
-                "This is some text"
-        };
-        testBasicOCR(resource, nonOCRContains, 3);
-    }
-
-    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
-                new BasicContentHandlerFactory(
-                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
-
-        PDFParserConfig pdfConfig = new PDFParserConfig();
-        pdfConfig.setExtractInlineImages(true);
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, parser);
-        parseContext.set(PDFParserConfig.class, pdfConfig);
-
-        try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
-            parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
-        }
-        List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
-        assertEquals(numMetadatas, metadataList.size());
-
-        StringBuilder contents = new StringBuilder();
-        for (Metadata m : metadataList) {
-            contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
-        }
-        if (canRun()) {
-            assertTrue(contents.toString().contains("Happy New Year 2003!"));
-        }
-        for (String needle : nonOCRContains) {
-            assertContains(needle, contents.toString());
-        }
-        assertTrue(metadataList.get(0).names().length > 10);
-        assertTrue(metadataList.get(1).names().length > 10);
-        //test at least one value
-        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
-    }
-
-    @Test
-    public void testSingleImage() throws Exception {
-        assumeTrue(canRun());
-        String xml = getXML("testOCR.jpg").xml;
-        assertContains("OCR Testing", xml);
-    }
-
-    @Test
-    public void getNormalMetadataToo() throws Exception {
-        //this should be successful whether or not TesseractOCR is installed/active
-        //If tesseract is installed, the internal metadata extraction parser should
-        //work; and if tesseract isn't installed, the regular parsers should take over.
-
-        //gif
-        Metadata m = getXML("testGIF.gif").metadata;
-        assertTrue(m.names().length > 20);
-        assertEquals("RGB", m.get("Chroma ColorSpaceType"));
-
-        //jpg
-        m = getXML("testOCR.jpg").metadata;
-        assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
-        assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
-        assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
-        assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
-        assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
-
-        //bmp
-        m = getXML("testBMP.bmp").metadata;
-        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
-
-        //png
-        m = getXML("testPNG.png").metadata;
-        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
-        assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
-
-        //tiff
-        m = getXML("testTIFF.tif").metadata;
-        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
-        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
-        assertEquals("72 dots per inch", m.get("Y Resolution"));
-    }
-    
-    @Test
-    public void testMultipart() {
-        Parser parser = new RFC822Parser();
-        Metadata metadata = new Metadata();
-        InputStream stream = getStream("test-documents/testRFC822-multipart");
-        ContentHandler handler = mock(XHTMLContentHandler.class);
-
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            verify(handler).startDocument();
-            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
-            // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
-            // But, different versions of Tesseract lead to a different number of invocations. So, we
-            // only verify the handler if Tesseract cannot run.
-            if (!TesseractOCRParserTest.canRun()) {
-                verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
-                verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
-            }
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-
-        //repeat, this time looking at content
-        parser = new RFC822Parser();
-        metadata = new Metadata();
-        stream = getStream("test-documents/testRFC822-multipart");
-        handler = new BodyContentHandler();
-        try {
-            parser.parse(stream, handler, metadata, new ParseContext());
-            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
-            String bodyText = handler.toString();
-            assertTrue(bodyText.contains("body 1"));
-            assertTrue(bodyText.contains("body 2"));
-            assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
-        } catch (Exception e) {
-            fail("Exception thrown: " + e.getMessage());
-        }
-    }
-    
-    private static InputStream getStream(String name) {
-        InputStream stream = Thread.currentThread().getContextClassLoader()
-                .getResourceAsStream(name);
-        assertNotNull("Test file not found " + name, stream);
-        return stream;
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.mail.RFC822Parser;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TesseractOCRParserTest extends TikaTest {
+
+    public static boolean canRun() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
+        return tesseractOCRTest.canRun(config);
+    }
+
+    private boolean canRun(TesseractOCRConfig config) {
+        String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
+        // If Tesseract is not on the path, do not run the test.
+        return ExternalParser.check(checkCmd);
+    }
+
+    /*
+    Check that if Tesseract is not found, the TesseractOCRParser claims to not support
+    any file types. So, the standard image parser is called instead.
+     */
+    @Test
+    public void offersNoTypesIfNotFound() throws Exception {
+        TesseractOCRParser parser = new TesseractOCRParser();
+        DefaultParser defaultParser = new DefaultParser();
+        MediaType png = MediaType.image("png");
+
+        // With an invalid path, will offer no types
+        TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
+        invalidConfig.setTesseractPath("/made/up/path");
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, invalidConfig);
+
+        // No types offered
+        assertEquals(0, parser.getSupportedTypes(parseContext).size());
+
+        // And DefaultParser won't use us
+        assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    /*
+    If Tesseract is found, test we retrieve the proper number of supporting Parsers.
+     */
+    @Test
+    public void offersTypesIfFound() throws Exception {
+        TesseractOCRParser parser = new TesseractOCRParser();
+        DefaultParser defaultParser = new DefaultParser();
+
+        ParseContext parseContext = new ParseContext();
+        MediaType png = MediaType.image("png");
+
+        // Assuming that Tesseract is on the path, we should find 5 Parsers 
that support PNG.
+        assumeTrue(canRun());
+
+        assertEquals(5, parser.getSupportedTypes(parseContext).size());
+        assertTrue(parser.getSupportedTypes(parseContext).contains(png));
+
+        // DefaultParser will now select the TesseractOCRParser.
+        assertEquals(TesseractOCRParser.class, 
defaultParser.getParsers(parseContext).get(png).getClass());
+    }
+
+    /**
+     * Runs the shared OCR check against a PDF, expecting two metadata objects
+     * and no non-OCR text. Currently ignored (see the annotation).
+     */
+    @Test
+    @Ignore("TODO: cyclic reference to pdf-module...maybe move these all to tika-app?")
+    public void testPDFOCR() throws Exception {
+        testBasicOCR("/test-documents/testOCR.pdf", new String[0], 2);
+    }
+
+    /**
+     * Runs the shared OCR check against a DOCX file, expecting three metadata
+     * objects and the document's regular text.
+     */
+    @Test
+    public void testDOCXOCR() throws Exception {
+        String[] expectedPlainText = new String[]{
+                "This is some text.",
+                "Here is an embedded image:"
+        };
+        testBasicOCR("/test-documents/testOCR.docx", expectedPlainText, 3);
+    }
+
+    /**
+     * Runs the shared OCR check against a PPTX file, expecting three metadata
+     * objects and the slide's regular text.
+     */
+    @Test
+    public void testPPTXOCR() throws Exception {
+        String[] expectedPlainText = new String[]{"This is some text"};
+        testBasicOCR("/test-documents/testOCR.pptx", expectedPlainText, 3);
+    }
+
+    /**
+     * Parses a test resource recursively and checks the collected content and metadata.
+     *
+     * @param resource       classpath location of the document to parse
+     * @param nonOCRContains strings that must appear in the extracted text whether or not OCR ran
+     * @param numMetadatas   expected number of metadata objects collected by the recursive wrapper
+     */
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(),
+                new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, new TesseractOCRConfig());
+        parseContext.set(Parser.class, wrapper);
+
+        try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
+            wrapper.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
+        }
+        List<Metadata> metadataList = wrapper.getMetadata();
+        assertEquals(numMetadatas, metadataList.size());
+
+        // Concatenate the text of the container and of every embedded document
+        StringBuilder collected = new StringBuilder();
+        for (Metadata m : metadataList) {
+            collected.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+        }
+        String text = collected.toString();
+
+        // The OCR'd sentence is only required when Tesseract can actually run
+        if (canRun()) {
+            assertTrue(text.contains("Happy New Year 2003!"));
+        }
+        for (String needle : nonOCRContains) {
+            assertContains(needle, text);
+        }
+        assertTrue(metadataList.get(0).names().length > 10);
+        assertTrue(metadataList.get(1).names().length > 10);
+        //test at least one value
+        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+    }
+
+    /**
+     * Checks that the text in a single JPEG is extracted; skipped when
+     * Tesseract cannot run.
+     */
+    @Test
+    public void testSingleImage() throws Exception {
+        assumeTrue(canRun());
+        assertContains("OCR Testing", getXML("testOCR.jpg").xml);
+    }
+
+    /**
+     * Checks that regular image metadata is extracted for several image formats.
+     * This should be successful whether or not TesseractOCR is installed/active:
+     * if tesseract is installed, the internal metadata extraction parser should
+     * work; and if tesseract isn't installed, the regular parsers should take over.
+     */
+    @Test
+    public void getNormalMetadataToo() throws Exception {
+        //gif
+        Metadata gif = getXML("testGIF.gif").metadata;
+        assertTrue(gif.names().length > 20);
+        assertEquals("RGB", gif.get("Chroma ColorSpaceType"));
+
+        //jpg
+        Metadata jpg = getXML("testOCR.jpg").metadata;
+        assertEquals("136", jpg.get(Metadata.IMAGE_WIDTH));
+        assertEquals("66", jpg.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", jpg.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, jpg.get(Metadata.SAMPLES_PER_PIXEL));
+        assertContains("This is a test Apache Tika imag", jpg.get(Metadata.COMMENTS));
+
+        //bmp
+        Metadata bmp = getXML("testBMP.bmp").metadata;
+        assertEquals("100", bmp.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", bmp.get(Metadata.IMAGE_LENGTH));
+
+        //png
+        Metadata png = getXML("testPNG.png").metadata;
+        assertEquals("100", png.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", png.get(Metadata.IMAGE_LENGTH));
+        assertEquals("UnsignedIntegral", png.get("Data SampleFormat"));
+
+        //tiff
+        Metadata tiff = getXML("testTIFF.tif").metadata;
+        assertEquals("100", tiff.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", tiff.get(Metadata.IMAGE_LENGTH));
+        assertEquals("72 dots per inch", tiff.get("Y Resolution"));
+    }
+    
+    /**
+     * Parses a multipart RFC822 message twice: once against a mocked handler to
+     * verify the SAX events, and once against a {@link BodyContentHandler} to
+     * verify the extracted text.
+     *
+     * Any exception raised while parsing is propagated so that the test report
+     * carries the full stack trace, and both input streams are closed.
+     */
+    @Test
+    public void testMultipart() throws Exception {
+        int bodyExpectedTimes = 4;
+
+        ContentHandler mockHandler = mock(XHTMLContentHandler.class);
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            new RFC822Parser().parse(stream, mockHandler, new Metadata(), new ParseContext());
+        }
+        verify(mockHandler).startDocument();
+        // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
+        // But, different versions of Tesseract lead to a different number of invocations. So, we
+        // only verify the handler if Tesseract cannot run.
+        if (!TesseractOCRParserTest.canRun()) {
+            verify(mockHandler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML),
+                    eq("div"), eq("div"), any(Attributes.class));
+            verify(mockHandler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+        }
+
+        //repeat, this time looking at content
+        ContentHandler bodyHandler = new BodyContentHandler();
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            new RFC822Parser().parse(stream, bodyHandler, new Metadata(), new ParseContext());
+        }
+        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+        String bodyText = bodyHandler.toString();
+        assertTrue(bodyText.contains("body 1"));
+        assertTrue(bodyText.contains("body 2"));
+        assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+    }
+    
+    /**
+     * Opens a test resource through the current thread's context class loader
+     * and fails the test if the resource does not exist.
+     *
+     * @param name resource name passed to the class loader
+     * @return the opened, non-null stream
+     */
+    private static InputStream getStream(String name) {
+        ClassLoader loader = Thread.currentThread().getContextClassLoader();
+        InputStream resource = loader.getResourceAsStream(name);
+        assertNotNull("Test file not found " + name, resource);
+        return resource;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml 
b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index 2156b95..11f259e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -35,6 +35,11 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-multimedia-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-xmp-commons</artifactId>
       <version>${project.version}</version>
     </dependency>

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
new file mode 100644
index 0000000..9a73bde
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -0,0 +1,575 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+import javax.xml.stream.XMLStreamException;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import 
org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Maximum recursive depth during AcroForm processing.
+     * Prevents theoretical AcroForm recursion bomb.
+     */
+    private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+    private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new 
TesseractOCRConfig();
+
+    /**
+     * Format used for signature dates
+     * TODO Make this thread-safe
+     */
+    private final SimpleDateFormat dateFormat = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+
+    final List<IOException> exceptions = new ArrayList<>();
+    final PDDocument pdDocument;
+    final XHTMLContentHandler xhtml;
+    private final ParseContext context;
+    private final Metadata metadata;
+    final PDFParserConfig config;
+
+    private int pageIndex = 0;
+
+    /**
+     * Stores the collaborators used while converting a PDF and wraps the
+     * caller's handler in an {@link XHTMLContentHandler}.
+     *
+     * @param pdDocument document being processed
+     * @param handler    handler that receives the XHTML SAX events
+     * @param context    parse context consulted for optional collaborators
+     * @param metadata   metadata handed to the XHTML handler and kept for later updates
+     * @param config     PDF parser configuration
+     * @throws IOException declared by this constructor; NOTE(review): presumably raised by the
+     *                     superclass constructor -- confirm
+     */
+    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
+                      PDFParserConfig config) throws IOException {
+        this.config = config;
+        this.context = context;
+        this.metadata = metadata;
+        this.pdDocument = pdDocument;
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Opens the {@code <div class="page">} element for a page and then starts
+     * the first paragraph.
+     *
+     * @param page the page being started (not inspected here)
+     * @throws IOException wrapping any SAXException raised while opening the element
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            xhtml.startElement("div", "class", "page");
+        } catch (SAXException saxException) {
+            throw new IOExceptionWithCause("Unable to start a page", saxException);
+        }
+        writeParagraphStart();
+    }
+
+    /**
+     * Returns the extractor registered in the parse context, or a new
+     * {@link ParsingEmbeddedDocumentExtractor} when none is registered.
+     *
+     * @return a non-null extractor for embedded documents
+     */
+    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor registered = context.get(EmbeddedDocumentExtractor.class);
+        return (registered != null)
+                ? registered
+                : new ParsingEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * Processes the files listed in the document's embedded-files name tree.
+     * Names are taken from the root node when it has any; otherwise from each
+     * direct kid of the root. Deeper levels of the tree are not searched.
+     *
+     * @param document document whose catalog is inspected
+     */
+    private void extractEmbeddedDocuments(PDDocument document)
+            throws IOException, SAXException, TikaException {
+        PDEmbeddedFilesNameTreeNode embeddedFilesTree =
+                new PDDocumentNameDictionary(document.getDocumentCatalog()).getEmbeddedFiles();
+        if (embeddedFilesTree == null) {
+            return;
+        }
+
+        //For now, try to get the names out of the root node or its kids.
+        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+        //If there is a need we could add a fully recursive search to find a non-null
+        //Map<String, COSObjectable> that contains the doc info.
+        Map<String, PDComplexFileSpecification> rootNames = embeddedFilesTree.getNames();
+        if (rootNames != null) {
+            processEmbeddedDocNames(rootNames);
+            return;
+        }
+
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesTree.getKids();
+        if (kids == null) {
+            return;
+        }
+        for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
+            Map<String, PDComplexFileSpecification> kidNames = kid.getNames();
+            if (kidNames != null) {
+                processEmbeddedDocNames(kidNames);
+            }
+        }
+    }
+
+    /**
+     * Extracts every file specification in the given name map; a null or
+     * empty map is ignored.
+     *
+     * @param embeddedFileNames map from embedded file name to its file specification
+     */
+    private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
+            throws IOException, SAXException, TikaException {
+        boolean nothingToProcess = embeddedFileNames == null || embeddedFileNames.isEmpty();
+        if (nothingToProcess) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+        for (Map.Entry<String, PDComplexFileSpecification> entry : embeddedFileNames.entrySet()) {
+            extractMultiOSPDEmbeddedFiles(entry.getKey(), entry.getValue(), extractor);
+        }
+    }
+
+    /**
+     * Extracts the generic, Mac, DOS and Unix variants of an embedded file.
+     * All variants are attempted, not just the first non-null one; a null
+     * specification is ignored.
+     *
+     * @param defaultName name passed on for use when a variant has no file name
+     * @param spec        file specification holding the variants, may be null
+     * @param extractor   extractor that parses each embedded file
+     */
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+                                       PDComplexFileSpecification spec,
+                                       EmbeddedDocumentExtractor extractor) throws IOException,
+            SAXException, TikaException {
+
+        if (spec != null) {
+            extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+        }
+    }
+
+    /**
+     * Hands one embedded file to the extractor and then writes an empty
+     * {@code <div class="embedded" id="...">} marker for it.
+     *
+     * @param defaultName name used when {@code fileName} is null
+     * @param fileName    file name from the specification, may be null
+     * @param file        embedded file; a null value is skipped silently
+     * @param extractor   extractor that decides whether to parse and does the parsing
+     */
+    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
+
+        if (file == null) {
+            //skip silently
+            return;
+        }
+
+        String resourceName = (fileName != null) ? fileName : defaultName;
+
+        // TODO: other metadata?
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
+        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+        if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+
+        TikaInputStream stream = null;
+        try {
+            stream = TikaInputStream.get(file.createInputStream());
+            extractor.parseEmbedded(
+                    stream,
+                    new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
+
+            // marker element written after the embedded content has been parsed
+            AttributesImpl markerAttributes = new AttributesImpl();
+            markerAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            markerAttributes.addAttribute("", "id", "id", "CDATA", resourceName);
+            xhtml.startElement("div", markerAttributes);
+            xhtml.endElement("div");
+        } finally {
+            // a failure while closing is deliberately ignored
+            IOUtils.closeQuietly(stream);
+        }
+    }
+
+    /**
+     * Either records or rethrows an IOException, depending on the configuration.
+     * When intermediate IOExceptions are caught, the message is added to the
+     * metadata as an exception warning and the exception is appended to
+     * {@code exceptions}; otherwise the exception is rethrown unchanged.
+     *
+     * @param e the exception to handle
+     * @throws IOException {@code e} itself, when the configuration does not catch intermediate exceptions
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!config.isCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+        String warning = e.getMessage();
+        if (warning == null) {
+            warning = "IOException, no message";
+        }
+        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
+        exceptions.add(e);
+    }
+
+    void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+        if (config.getOCRStrategy().equals(NO_OCR)) {
+            return;
+        }
+        TesseractOCRConfig tesseractConfig =
+                context.get(TesseractOCRConfig.class, 
DEFAULT_TESSERACT_CONFIG);
+
+        TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
+        if (! tesseractOCRParser.hasTesseract(tesseractConfig)) {
+            throw new TikaException("Tesseract is not available. "+
+                    "Please set the OCR_STRATEGY to NO_OCR or configure 
Tesseract correctly");
+        }
+
+        PDFRenderer renderer = new PDFRenderer(pdDocument);
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            BufferedImage image = renderer.renderImage(pageIndex, 2.0f, 
config.getOCRImageType());
+            Path tmpFile = tmp.createTempFile();
+            try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(image, config.getOCRImageFormatName(),
+                        os, config.getOCRDPI());
+            }
+            try (InputStream is = TikaInputStream.get(tmpFile)) {
+                tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
+            }
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("error writing OCR content from 
PDF", e);
+        } finally {
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Finishes a page: extracts files attached through annotations, optionally
+     * writes link and markup annotation text, optionally runs OCR on the page,
+     * and closes the page {@code <div>}. The page index is advanced even when
+     * an exception occurs.
+     *
+     * An IOException raised while processing the page is added to
+     * {@code exceptions} rather than propagated; SAX and Tika exceptions raised
+     * directly in this method are rethrown wrapped in an IOException.
+     *
+     * @param page the page that has just been processed
+     * @throws IOException wrapping a SAXException or TikaException raised while ending the page
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+
+        try {
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    extractFileAttachment((PDAnnotationFileAttachment) annotation, extractor);
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    if (annotation instanceof PDAnnotationLink) {
+                        writeLinkAnnotation((PDAnnotationLink) annotation);
+                    }
+
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        writeMarkupAnnotation((PDAnnotationMarkup) annotation);
+                    }
+                }
+            }
+            if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+                doOCROnCurrentPage();
+            }
+            xhtml.endElement("div");
+        } catch (SAXException|TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            exceptions.add(e);
+        } finally {
+            pageIndex++;
+        }
+    }
+
+    /**
+     * Extracts the file attached through a file-attachment annotation.
+     * Attachments whose file specification is not a
+     * {@link PDComplexFileSpecification} carry no embedded file and are skipped
+     * instead of failing on a cast.
+     */
+    private void extractFileAttachment(PDAnnotationFileAttachment attachment,
+                                       EmbeddedDocumentExtractor extractor) throws IOException {
+        Object fileSpec = attachment.getFile();
+        if (!(fileSpec instanceof PDComplexFileSpecification)) {
+            return;
+        }
+        try {
+            extractMultiOSPDEmbeddedFiles("", (PDComplexFileSpecification) fileSpec, extractor);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+        } catch (TikaException e) {
+            throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
+    /**
+     * Writes a link annotation whose action is a URI action with a non-null
+     * URI as {@code <div class="annotation"><a href="..."/></div>}.
+     */
+    private void writeLinkAnnotation(PDAnnotationLink annotationLink) throws SAXException, IOException {
+        PDAction action = annotationLink.getAction();
+        if (!(action instanceof PDActionURI)) {
+            return;
+        }
+        String link = ((PDActionURI) action).getURI();
+        if (link == null) {
+            return;
+        }
+        xhtml.startElement("div", "class", "annotation");
+        xhtml.startElement("a", "href", link);
+        xhtml.endElement("a");
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes the title, subject and contents of a markup annotation, each in
+     * its own {@code <div>}; nothing is written when all three are null.
+     */
+    private void writeMarkupAnnotation(PDAnnotationMarkup annotationMarkup) throws SAXException, IOException {
+        String title = annotationMarkup.getTitlePopup();
+        String subject = annotationMarkup.getSubject();
+        String contents = annotationMarkup.getContents();
+        // TODO: maybe also annotationMarkup.getRichContents()?
+        if (title == null && subject == null && contents == null) {
+            return;
+        }
+        xhtml.startElement("div", "class", "annotation");
+
+        if (title != null) {
+            xhtml.startElement("div", "class", "annotationTitle");
+            xhtml.characters(title);
+            xhtml.endElement("div");
+        }
+
+        if (subject != null) {
+            xhtml.startElement("div", "class", "annotationSubject");
+            xhtml.characters(subject);
+            xhtml.endElement("div");
+        }
+
+        if (contents != null) {
+            xhtml.startElement("div", "class", "annotationContents");
+            xhtml.characters(contents);
+            xhtml.endElement("div");
+        }
+
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Starts the XHTML document.
+     *
+     * @param pdf the document being processed (not inspected here)
+     * @throws IOException wrapping any SAXException raised by the handler
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            xhtml.startDocument();
+        } catch (SAXException saxException) {
+            throw new IOExceptionWithCause("Unable to start a document", saxException);
+        }
+    }
+
+    /**
+     * Finishes the document: writes bookmark text, extracts embedded documents,
+     * optionally extracts AcroForm content, and ends the XHTML document.
+     * IOExceptions from the embedded-document and AcroForm steps go through
+     * {@link #handleCatchableIOE(IOException)}.
+     *
+     * @param pdf the document being processed
+     * @throws IOException wrapping a TikaException or SAXException, or rethrown by the IOException handler
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+
+            try {
+                extractEmbeddedDocuments(pdf);
+            } catch (IOException ioe) {
+                handleCatchableIOE(ioe);
+            }
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent()) {
+                try {
+                    extractAcroForm(pdf);
+                } catch (IOException ioe) {
+                    handleCatchableIOE(ioe);
+                }
+            }
+            xhtml.endDocument();
+        } catch (TikaException | SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Writes the bookmark titles of the current document, if it has an outline.
+     */
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline root = document.getDocumentCatalog().getDocumentOutline();
+        if (root == null) {
+            return;
+        }
+        extractBookmarkText(root);
+    }
+
+    /**
+     * Writes the children of an outline node as a {@code <ul>} with one
+     * {@code <li>} per child title, recursing into each child after its item
+     * has been closed. Nothing is written for a node without children.
+     *
+     * @param bookmark outline node whose children are written
+     */
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem child = bookmark.getFirstChild();
+        if (child == null) {
+            return;
+        }
+        xhtml.startElement("ul");
+        for (; child != null; child = child.getNextSibling()) {
+            xhtml.startElement("li");
+            xhtml.characters(child.getTitle());
+            xhtml.endElement("li");
+            // Recurse:
+            extractBookmarkText(child);
+        }
+        xhtml.endElement("ul");
+    }
+
+    /**
+     * Writes the document's form data. If the form has an XFA resource it is
+     * tried first; when XFA extraction succeeds nothing else is written.
+     * Otherwise the traditional AcroForm fields are written as an ordered list
+     * inside {@code <div class="acroform">}.
+     *
+     * @param pdf document whose catalog is inspected for a form
+     */
+    void extractAcroForm(PDDocument pdf) throws IOException,
+            SAXException {
+        //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
+        //this code derives from Ben's code
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+        if (catalog == null) {
+            return;
+        }
+
+        PDAcroForm form = catalog.getAcroForm();
+        if (form == null) {
+            return;
+        }
+
+        //if it has xfa, try that.
+        //if it doesn't exist or there's an exception,
+        //go with traditional AcroForm
+        PDXFAResource pdxfa = form.getXFA();
+        if (pdxfa != null) {
+            XFAExtractor xfaExtractor = new XFAExtractor();
+            try (InputStream is = new BufferedInputStream(
+                    new ByteArrayInputStream(pdxfa.getBytes()))) {
+                xfaExtractor.extract(is, xhtml, metadata, context);
+                //successful, so there is nothing more to do
+                return;
+            } catch (XMLStreamException | IOException e) {
+                //if there was an xml parse exception in xfa, try the AcroForm
+            }
+        }
+
+        List<?> fields = form.getFields();
+        if (fields == null) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "acroform");
+        xhtml.startElement("ol");
+
+        for (Object candidate : fields) {
+            // instanceof is false for null, so null entries are skipped as well
+            if (candidate instanceof PDField) {
+                processAcroField((PDField) candidate, 0);
+            }
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");
+    }
+
+    private void processAcroField(PDField field, final int 
currentRecursiveDepth)
+            throws SAXException, IOException {
+
+        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+            return;
+        }
+        addFieldString(field);
+        if (field instanceof PDNonTerminalField) {
+            int r = currentRecursiveDepth + 1;
+            xhtml.startElement("ol");
+            for (PDField child : ((PDNonTerminalField)field).getChildren()) {
+                processAcroField(child, r);
+            }
+            xhtml.endElement("ol");
+        }
+    }
+
+    /**
+     * Writes a form field as a list item whose text is
+     * {@code "<partial name>: <value>"} and whose {@code altName} attribute
+     * holds the alternate field name. Signature fields are delegated to
+     * {@code handleSignature}. Nothing is written when there is neither text
+     * nor an attribute.
+     *
+     * @param field the field to write
+     */
+    private void addFieldString(PDField field) throws SAXException {
+        //Pick partial name to present in content and altName for attribute
+        //Ignoring FullyQualifiedName for now
+        String partialName = field.getPartialName();
+        String alternateName = field.getAlternateFieldName();
+
+        StringBuilder text = new StringBuilder();
+        if (partialName != null) {
+            text.append(partialName).append(": ");
+        }
+
+        AttributesImpl attributes = new AttributesImpl();
+        if (alternateName != null) {
+            attributes.addAttribute("", "altName", "altName", "CDATA", alternateName);
+        }
+
+        //signature fields are written by their own handler
+        if (field instanceof PDSignatureField) {
+            handleSignature(attributes, (PDSignatureField) field);
+            return;
+        }
+
+        String value = field.getValueAsString();
+        boolean hasValue = value != null && !value.equals("null");
+        if (hasValue) {
+            text.append(value);
+        }
+
+        boolean nothingToWrite = attributes.getLength() == 0 && text.length() == 0;
+        if (nothingToWrite) {
+            return;
+        }
+        xhtml.startElement("li", attributes);
+        xhtml.characters(text.toString());
+        xhtml.endElement("li");
+    }
+
+    /**
+     * Writes the data of a signature field as a list item that contains an
+     * ordered list ({@code type="signaturedata"}) with one entry per populated
+     * signature property: name, contact info, location, reason and sign date.
+     * Each entry carries the property name in its {@code signdata} attribute.
+     *
+     * Nothing is written when the field has no signature or when every
+     * property is null or empty.
+     *
+     * @param parentAttributes attributes placed on the outer list item
+     * @param sigField         the signature field to write
+     */
+    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
+            throws SAXException {
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        // TreeMap keeps the entries in a stable, alphabetical order
+        Map<String, String> vals = new TreeMap<>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            // format the date in the signature's own time zone
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+
+        //see if there is any data; the values must be inspected here because
+        //the keys are fixed, non-empty literals
+        boolean hasData = false;
+        for (String val : vals.values()) {
+            if (val != null && !val.isEmpty()) {
+                hasData = true;
+                break;
+            }
+        }
+        if (!hasData) {
+            return;
+        }
+
+        //there is data, so process it
+        xhtml.startElement("li", parentAttributes);
+
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+        xhtml.startElement("ol", attrs);
+        for (Map.Entry<String, String> e : vals.entrySet()) {
+            if (e.getValue() == null || e.getValue().equals("")) {
+                continue;
+            }
+            attrs = new AttributesImpl();
+            attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
+            xhtml.startElement("li", attrs);
+            xhtml.characters(e.getValue());
+            xhtml.endElement("li");
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("li");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
new file mode 100644
index 0000000..3ad551d
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to integrate text extraction via OCR only.
+ *
+ */
+class OCR2XHTML extends AbstractPDF2XHTML {
+
+    private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context,
+                      Metadata metadata, PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Runs the OCR-only conversion of a PDF document, emitting the result as
+     * XHTML SAX events on the given content handler.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param context  parse context handed to the converter
+     * @param metadata PDF metadata
+     * @param config   PDF parser configuration handed to the converter
+     * @throws SAXException  if the content handler fails to process SAX events
+     * @throws TikaException if there was an exception outside of per page processing,
+     *                       or if exceptions were recorded during processing (the first
+     *                       recorded one is attached as the cause)
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, ParseContext context,
+            Metadata metadata, PDFParserConfig config)
+            throws SAXException, TikaException {
+        OCR2XHTML converter;
+        try {
+            converter = new OCR2XHTML(document, handler, context, metadata, config);
+            converter.writeText(document, new DiscardingWriter());
+        } catch (IOException ioe) {
+            //a SAXException travelling inside an IOException is unwrapped and rethrown as-is
+            Throwable cause = ioe.getCause();
+            if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            throw new TikaException("Unable to extract PDF content", ioe);
+        }
+        if (converter.exceptions.size() > 0) {
+            //only the first recorded exception is reported
+            throw new TikaException("Unable to extract all PDF content",
+                    converter.exceptions.get(0));
+        }
+    }
+
+    /**
+     * Handles one page: starts it, runs OCR on the current page, then ends it.
+     * An IOException is passed to handleCatchableIOE; a TikaException or
+     * SAXException is rethrown wrapped in an IOExceptionWithCause.
+     */
+    @Override
+    public void processPage(PDPage pdPage) throws IOException {
+        try {
+            startPage(pdPage);
+            doOCROnCurrentPage();
+            endPage(pdPage);
+        } catch (IOException ioe) {
+            handleCatchableIOE(ioe);
+        } catch (TikaException | SAXException wrapped) {
+            throw new IOExceptionWithCause(wrapped);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        //intentionally empty
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        //intentionally empty
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        //intentionally empty
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        //intentionally empty
+    }
+
+    /** Writer whose methods are all empty, so everything written to it is dropped. */
+    private static final class DiscardingWriter extends Writer {
+        @Override
+        public void write(char[] cbuf, int off, int len) {
+        }
+
+        @Override
+        public void flush() {
+        }
+
+        @Override
+        public void close() {
+        }
+    }
+
+}
+

[2/2] tika git commit: TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs

Reply via email to