[1/4] tika git commit: TIKA-1994 -- integrate OCR with PDFParser

tallison Fri, 03 Jun 2016 12:56:01 -0700

Repository: tika
Updated Branches:
  refs/heads/TIKA-1508 1202f459e -> 18ab8f91f



TIKA-1994 -- integrate OCR with PDFParser


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7aeb95d6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7aeb95d6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7aeb95d6

Branch: refs/heads/TIKA-1508
Commit: 7aeb95d6c7a6ac3611f2dd975baa73f566631061
Parents: a20c46c
Author: tballison <[email protected]>
Authored: Thu Jun 2 12:04:30 2016 -0400
Committer: tballison <[email protected]>
Committed: Thu Jun 2 12:04:30 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/ocr/TesseractOCRParser.java     |  87 ++-
 .../tika/parser/pdf/AbstractPDF2XHTML.java      | 576 +++++++++++++++++++
 .../org/apache/tika/parser/pdf/OCR2XHTML.java   | 127 ++++
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   | 492 +---------------
 .../org/apache/tika/parser/pdf/PDFParser.java   |   7 +
 .../apache/tika/parser/pdf/PDFParserConfig.java | 274 ++++++---
 .../apache/tika/parser/pdf/PDFParser.properties |  10 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  38 ++
 8 files changed, 1029 insertions(+), 582 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 7db29c8..a238a7c 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -56,6 +56,7 @@ import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -110,7 +111,7 @@ public class TesseractOCRParser extends AbstractParser {
         }
     }
 
-    private boolean hasTesseract(TesseractOCRConfig config) {
+    public boolean hasTesseract(TesseractOCRConfig config) {
         // Fetch where the config says to find Tesseract
         String tesseract = config.getTesseractPath() + getTesseractProg();
 
@@ -157,47 +158,90 @@ public class TesseractOCRParser extends AbstractParser {
     public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         TesseractOCRConfig config = context.get(TesseractOCRConfig.class, 
DEFAULT_CONFIG);
-
         // If Tesseract is not on the path with the current config, do not try 
to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
         //  occur if someone directly calls this parser, not via DefaultParser 
or similar
         if (! hasTesseract(config))
             return;
 
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+            xhtml.startDocument();
+            File tmpImgFile = tmp.createTemporaryFile();
+            parse(tikaStream, tmpImgFile, xhtml, config);
+            // Temporary workaround for TIKA-1445 - until we can specify
+            //  composite parsers with strategies (eg Composite, Try In Turn),
+            //  always send the image onwards to the regular parser to have
+            //  the metadata for them extracted as well
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new 
EmbeddedContentHandler(xhtml), metadata, context);
+            xhtml.endDocument();
+        } finally {
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Use this to parse content without starting a new document.
+     * This appends SAX events to xhtml without re-adding the metadata, body 
start, etc.
+     * @param stream inputstream
+     * @param xhtml handler
+     * @param config TesseractOCRConfig to use for this parse
+     * @throws IOException
+     * @throws SAXException
+     * @throws TikaException
+     */
+    public void parseInline(InputStream stream, XHTMLContentHandler xhtml, 
TesseractOCRConfig config)
+            throws IOException, SAXException, TikaException {
+        // If Tesseract is not on the path with the current config, do not try 
to run OCR
+        // getSupportedTypes shouldn't have listed us as handling it, so this 
should only
+        //  occur if someone directly calls this parser, not via DefaultParser 
or similar
+        if (! hasTesseract(config))
+            return;
 
         TemporaryResources tmp = new TemporaryResources();
-        File output = null;
         try {
             TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
-            File input = tikaStream.getFile();
-            long size = tikaStream.getLength();
+            File tmpImgFile = tmp.createTemporaryFile();
+            parse(tikaStream, tmpImgFile, xhtml, config);
+        } finally {
+            tmp.dispose();
+        }
+
+    }
+
+    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, 
XHTMLContentHandler xhtml, TesseractOCRConfig config)
+            throws IOException, SAXException, TikaException {
+        File tmpTxtOutput = null;
+
+        try {
+            File input = tikaInputStream.getFile();
+            long size = tikaInputStream.getLength();
 
             if (size >= config.getMinFileSizeToOcr() && size <= 
config.getMaxFileSizeToOcr()) {
 
-                output = tmp.createTemporaryFile();
-                doOCR(input, output, config);
+                doOCR(input, tmpImgFile, config);
 
                 // Tesseract appends .txt to output file name
-                output = new File(output.getAbsolutePath() + ".txt");
+                tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
 
-                if (output.exists())
-                    extractOutput(new FileInputStream(output), xhtml);
+                if (tmpTxtOutput.exists()) {
+                    try (InputStream is = new FileInputStream(tmpTxtOutput)) {
+                        extractOutput(is, xhtml);
+                    }
+                }
 
             }
 
-            // Temporary workaround for TIKA-1445 - until we can specify
-            //  composite parsers with strategies (eg Composite, Try In Turn),
-            //  always send the image onwards to the regular parser to have
-            //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, 
context);
         } finally {
-            tmp.dispose();
-            if (output != null) {
-                output.delete();
+            if (tmpTxtOutput != null) {
+                tmpTxtOutput.delete();
             }
         }
     }
+
     // TIKA-1445 workaround parser
     private static Parser _TMP_IMAGE_METADATA_PARSER = new 
CompositeImageParser();
     private static class CompositeImageParser extends CompositeParser {
@@ -283,8 +327,7 @@ public class TesseractOCRParser extends AbstractParser {
      */
     private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) 
throws SAXException, IOException {
 
-        xhtml.startDocument();
-        xhtml.startElement("div");
+        xhtml.startElement("div", "class", "ocr");
         try (Reader reader = new InputStreamReader(stream, UTF_8)) {
             char[] buffer = new char[1024];
             for (int n = reader.read(buffer); n != -1; n = 
reader.read(buffer)) {
@@ -293,7 +336,7 @@ public class TesseractOCRParser extends AbstractParser {
             }
         }
         xhtml.endElement("div");
-        xhtml.endDocument();
+
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
new file mode 100644
index 0000000..d8a46a2
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -0,0 +1,576 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import javax.xml.stream.XMLStreamException;
+import java.awt.image.BufferedImage;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import 
org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Maximum recursive depth during AcroForm processing.
+     * Prevents theoretical AcroForm recursion bomb.
+     */
+    private final static int MAX_ACROFORM_RECURSIONS = 10;
+
+    private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new 
TesseractOCRConfig();
+
+    /**
+     * Format used for signature dates
+     * TODO Make this thread-safe
+     */
+    private final SimpleDateFormat dateFormat = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+
+    final List<IOException> exceptions = new ArrayList<>();
+    final PDDocument pdDocument;
+    final XHTMLContentHandler xhtml;
+    private final ParseContext context;
+    private final Metadata metadata;
+    final PDFParserConfig config;
+
+    private int pageIndex = 0;
+
+    /**
+     * Builds a stripper that writes XHTML SAX events for one PDF document.
+     *
+     * @param pdDocument PDF document to process
+     * @param handler    downstream handler; wrapped, together with the
+     *                   metadata, in an {@link XHTMLContentHandler}
+     * @param context    parse context kept for later lookups
+     * @param metadata   metadata of the document being parsed
+     * @param config     PDF parser configuration
+     * @throws IOException declared on the signature; nothing in this body
+     *                     throws it explicitly
+     */
+    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler,
+                      ParseContext context, Metadata metadata,
+                      PDFParserConfig config) throws IOException {
+        this.config = config;
+        this.metadata = metadata;
+        this.context = context;
+        this.pdDocument = pdDocument;
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Opens the {@code <div class="page">} element for a page and then
+     * calls {@code writeParagraphStart()}.
+     *
+     * @param page page that is about to be processed (not read here)
+     * @throws IOException wrapping the SAXException if the start element
+     *                     cannot be written
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            xhtml.startElement("div", "class", "page");
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause(
+                    "Unable to start a page", saxFailure);
+        }
+        writeParagraphStart();
+    }
+
+    /**
+     * Looks up the extractor to use for embedded documents.
+     *
+     * @return the {@link EmbeddedDocumentExtractor} registered in the parse
+     *         context, or a new {@link ParsingEmbeddedDocumentExtractor}
+     *         built on that context when none is registered
+     */
+    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor fromContext =
+                context.get(EmbeddedDocumentExtractor.class);
+        if (fromContext != null) {
+            return fromContext;
+        }
+        return new ParsingEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * Extracts the files listed in the document's embedded-files name tree.
+     * The names stored directly on the root node are used when present;
+     * otherwise the names of each direct kid are processed. Deeper levels
+     * of the tree are not visited.
+     *
+     * @param document PDF document whose catalog is inspected
+     * @throws IOException   propagated from processing an embedded file
+     * @throws SAXException  propagated from processing an embedded file
+     * @throws TikaException propagated from processing an embedded file
+     */
+    private void extractEmbeddedDocuments(PDDocument document)
+            throws IOException, SAXException, TikaException {
+        PDEmbeddedFilesNameTreeNode root =
+                new PDDocumentNameDictionary(document.getDocumentCatalog())
+                        .getEmbeddedFiles();
+        if (root == null) {
+            return;
+        }
+
+        //For now, try to get the names out of the root node or its kids.
+        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+        //If there is a need we could add a fully recursive search to find
+        //a non-null Map<String, COSObjectable> that contains the doc info.
+        Map<String, PDComplexFileSpecification> rootNames = root.getNames();
+        if (rootNames != null) {
+            processEmbeddedDocNames(rootNames);
+            return;
+        }
+
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids =
+                root.getKids();
+        if (kids == null) {
+            return;
+        }
+        for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
+            Map<String, PDComplexFileSpecification> kidNames =
+                    kid.getNames();
+            if (kidNames != null) {
+                processEmbeddedDocNames(kidNames);
+            }
+        }
+    }
+
+    /**
+     * Extracts every file specification in the given name map. A null or
+     * empty map is ignored.
+     *
+     * @param embeddedFileNames map from name to file specification; the
+     *                          key is passed on as the default file name
+     * @throws IOException   propagated from the extraction
+     * @throws SAXException  propagated from the extraction
+     * @throws TikaException propagated from the extraction
+     */
+    private void processEmbeddedDocNames(
+            Map<String, PDComplexFileSpecification> embeddedFileNames)
+            throws IOException, SAXException, TikaException {
+        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+        for (Map.Entry<String, PDComplexFileSpecification> entry
+                : embeddedFileNames.entrySet()) {
+            extractMultiOSPDEmbeddedFiles(
+                    entry.getKey(), entry.getValue(), extractor);
+        }
+    }
+
+    /**
+     * Extracts the generic, Mac, DOS and Unix variants of one file
+     * specification, in that order. A null specification is ignored.
+     *
+     * @param defaultName name passed on for use when a variant carries no
+     *                    file name of its own
+     * @param spec        file specification to read the variants from
+     * @param extractor   extractor that receives each embedded file
+     * @throws IOException   propagated from extracting a variant
+     * @throws SAXException  propagated from extracting a variant
+     * @throws TikaException propagated from extracting a variant
+     */
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+                                       PDComplexFileSpecification spec,
+                                       EmbeddedDocumentExtractor extractor)
+            throws IOException, SAXException, TikaException {
+
+        if (spec != null) {
+            //current strategy is to pull all, not just first non-null
+            extractPDEmbeddedFile(defaultName,
+                    spec.getFile(), spec.getEmbeddedFile(), extractor);
+            extractPDEmbeddedFile(defaultName,
+                    spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+            extractPDEmbeddedFile(defaultName,
+                    spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+            extractPDEmbeddedFile(defaultName,
+                    spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+        }
+    }
+
+    /**
+     * Hands one embedded file to the extractor and then writes an empty
+     * {@code <div class="embedded" id="...">} marker carrying its name.
+     * Nothing happens when {@code file} is null or when the extractor
+     * declines the file.
+     *
+     * @param defaultName name to use when {@code fileName} is null
+     * @param fileName    file name from the specification, may be null
+     * @param file        embedded file, may be null
+     * @param extractor   extractor that decides on and parses the file
+     * @throws SAXException  propagated from the extractor or the handler
+     * @throws IOException   propagated from opening or parsing the stream
+     * @throws TikaException declared on the signature
+     */
+    private void extractPDEmbeddedFile(String defaultName, String fileName,
+                                       PDEmbeddedFile file,
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
+
+        if (file == null) {
+            //skip silently
+            return;
+        }
+
+        String name = fileName;
+        if (name == null) {
+            name = defaultName;
+        }
+
+        // TODO: other metadata?
+        // kept in its own object, separate from the container's metadata
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        embeddedMetadata.set(Metadata.CONTENT_LENGTH,
+                Long.toString(file.getSize()));
+        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+        if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+
+        TikaInputStream embeddedStream = null;
+        try {
+            embeddedStream = TikaInputStream.get(file.createInputStream());
+            extractor.parseEmbedded(
+                    embeddedStream,
+                    new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
+
+            AttributesImpl divAttributes = new AttributesImpl();
+            divAttributes.addAttribute(
+                    "", "class", "class", "CDATA", "embedded");
+            divAttributes.addAttribute("", "id", "id", "CDATA", name);
+            xhtml.startElement("div", divAttributes);
+            xhtml.endElement("div");
+        } finally {
+            // close failures are deliberately ignored
+            IOUtils.closeQuietly(embeddedStream);
+        }
+    }
+
+    /**
+     * Either records or rethrows an IOException, depending on the
+     * configuration. When intermediate IOExceptions are to be caught, the
+     * message is added to the metadata as a warning and the exception is
+     * appended to {@code exceptions}.
+     *
+     * @param e exception raised while processing part of the document
+     * @throws IOException {@code e} itself, when the configuration does
+     *                     not ask for intermediate IOExceptions to be caught
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!config.isCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+        String rawMessage = e.getMessage();
+        String warning = (rawMessage == null)
+                ? "IOException, no message" : rawMessage;
+        metadata.add(
+                TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
+        exceptions.add(e);
+    }
+
+    /**
+     * Runs OCR on the page at {@code pageIndex} and appends the result to
+     * {@code xhtml}. The page is rendered to an image, written to a
+     * temporary file and passed to
+     * {@link TesseractOCRParser#parseInline}. Does nothing when the OCR
+     * strategy is {@code NO_OCR}.
+     *
+     * @throws IOException   wrapping a SAXException from the OCR output,
+     *                       or an IOException that
+     *                       {@code handleCatchableIOE} chose to rethrow
+     * @throws TikaException if Tesseract is not available
+     * @throws SAXException  declared on the signature
+     */
+    void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+        if (config.getOCRStrategy().equals(NO_OCR)) {
+            return;
+        }
+        TesseractOCRConfig ocrConfig = context.get(
+                TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
+
+        TesseractOCRParser ocrParser = new TesseractOCRParser();
+        boolean tesseractUsable = ocrParser.hasTesseract(ocrConfig);
+        if (!tesseractUsable) {
+            throw new TikaException("Tesseract is not available. "
+                    + "Please set the OCR_STRATEGY to NO_OCR or configure "
+                    + "Tesseract correctly");
+        }
+
+        PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
+        TemporaryResources tmpResources = new TemporaryResources();
+        try {
+            BufferedImage pageImage = pageRenderer.renderImage(
+                    pageIndex, 2.0f, config.getOCRImageType());
+            Path imagePath = tmpResources.createTempFile();
+            try (OutputStream imageOut = Files.newOutputStream(imagePath)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(pageImage,
+                        config.getOCRImageFormatName(),
+                        imageOut, config.getOCRDPI());
+            }
+            try (InputStream imageIn = TikaInputStream.get(imagePath)) {
+                ocrParser.parseInline(imageIn, xhtml, ocrConfig);
+            }
+        } catch (IOException ioFailure) {
+            handleCatchableIOE(ioFailure);
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause(
+                    "error writing OCR content from PDF", saxFailure);
+        } finally {
+            // removes the temporary image file
+            tmpResources.dispose();
+        }
+    }
+
+    /**
+     * Finishes a page. For every annotation on the page, files attached
+     * through a file-attachment annotation are extracted and, when
+     * annotation text is enabled, URI links and markup annotations are
+     * written as {@code <div class="annotation">} elements. OCR is then
+     * run when the strategy is {@code OCR_AND_TEXT_EXTRACTION}, and the
+     * page element is closed. {@code pageIndex} is advanced in all cases.
+     *
+     * @param page page that has just been processed
+     * @throws IOException wrapping a SAXException or TikaException; other
+     *                     IOExceptions reaching the outer handler are
+     *                     stored in {@code exceptions} instead of thrown
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+
+        try {
+            EmbeddedDocumentExtractor extractor =
+                    getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDAnnotationFileAttachment attachment =
+                            (PDAnnotationFileAttachment) annotation;
+                    PDComplexFileSpecification attached =
+                            (PDComplexFileSpecification) attachment.getFile();
+                    try {
+                        extractMultiOSPDEmbeddedFiles("", attached, extractor);
+                    } catch (SAXException e) {
+                        throw new IOExceptionWithCause(
+                                "file embedded in annotation sax exception",
+                                e);
+                    } catch (TikaException e) {
+                        throw new IOExceptionWithCause(
+                                "file embedded in annotation tika exception",
+                                e);
+                    } catch (IOException e) {
+                        handleCatchableIOE(e);
+                    }
+                }
+
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (!config.getExtractAnnotationText()) {
+                    continue;
+                }
+
+                if (annotation instanceof PDAnnotationLink) {
+                    // instanceof is false for a null action
+                    PDAction action =
+                            ((PDAnnotationLink) annotation).getAction();
+                    if (action instanceof PDActionURI) {
+                        String link = ((PDActionURI) action).getURI();
+                        if (link != null) {
+                            xhtml.startElement("div", "class", "annotation");
+                            xhtml.startElement("a", "href", link);
+                            xhtml.endElement("a");
+                            xhtml.endElement("div");
+                        }
+                    }
+                }
+
+                if (annotation instanceof PDAnnotationMarkup) {
+                    PDAnnotationMarkup markup =
+                            (PDAnnotationMarkup) annotation;
+                    // TODO: maybe also annotationMarkup.getRichContents()?
+                    // each row is { css class, text }, in output order
+                    String[][] parts = {
+                        {"annotationTitle", markup.getTitlePopup()},
+                        {"annotationSubject", markup.getSubject()},
+                        {"annotationContents", markup.getContents()},
+                    };
+                    boolean anyText = false;
+                    for (String[] part : parts) {
+                        anyText |= part[1] != null;
+                    }
+                    if (anyText) {
+                        xhtml.startElement("div", "class", "annotation");
+                        for (String[] part : parts) {
+                            if (part[1] == null) {
+                                continue;
+                            }
+                            xhtml.startElement("div", "class", part[0]);
+                            xhtml.characters(part[1]);
+                            xhtml.endElement("div");
+                        }
+                        xhtml.endElement("div");
+                    }
+                }
+            }
+
+            PDFParserConfig.OCR_STRATEGY ocrAndText =
+                    PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION;
+            if (config.getOCRStrategy().equals(ocrAndText)) {
+                doOCROnCurrentPage();
+            }
+            // closes the page element
+            xhtml.endElement("div");
+        } catch (SAXException|TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            // stored, not rethrown
+            exceptions.add(e);
+        } finally {
+            pageIndex++;
+        }
+    }
+
+    /**
+     * Starts the XHTML document on the wrapped handler.
+     *
+     * @param pdf document being processed (not read here)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            xhtml.startDocument();
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause(
+                    "Unable to start a document", saxFailure);
+        }
+    }
+
+    /**
+     * Writes the document-level content and ends the XHTML document:
+     * bookmark text first, then embedded documents, then (when enabled in
+     * the configuration) AcroForm content. IOExceptions from the embedded
+     * documents and the AcroForm go through {@code handleCatchableIOE}.
+     *
+     * @param pdf document that has been processed
+     * @throws IOException wrapping a TikaException or SAXException, or
+     *                     rethrown by {@code handleCatchableIOE}
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+
+            try {
+                extractEmbeddedDocuments(pdf);
+            } catch (IOException embeddedFailure) {
+                handleCatchableIOE(embeddedFailure);
+            }
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent()) {
+                try {
+                    extractAcroForm(pdf);
+                } catch (IOException acroFormFailure) {
+                    handleCatchableIOE(acroFormFailure);
+                }
+            }
+
+            xhtml.endDocument();
+        } catch (TikaException | SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Writes the text of the document outline (bookmarks), if the catalog
+     * has one.
+     *
+     * @throws SAXException propagated from writing the bookmark list
+     */
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline outline =
+                document.getDocumentCatalog().getDocumentOutline();
+        if (outline == null) {
+            return;
+        }
+        extractBookmarkText(outline);
+    }
+
+    /**
+     * Writes the children of an outline node as a {@code <ul>}, one
+     * {@code <li>} per child title. Each child's own children are written
+     * recursively right after its {@code <li>}. Nothing is written for a
+     * node without children.
+     *
+     * @param bookmark outline node whose children are written
+     * @throws SAXException propagated from the content handler
+     */
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem firstChild = bookmark.getFirstChild();
+        if (firstChild == null) {
+            return;
+        }
+        xhtml.startElement("ul");
+        for (PDOutlineItem item = firstChild;
+                item != null;
+                item = item.getNextSibling()) {
+            xhtml.startElement("li");
+            xhtml.characters(item.getTitle());
+            xhtml.endElement("li");
+            // Recurse:
+            extractBookmarkText(item);
+        }
+        xhtml.endElement("ul");
+    }
+
+    /**
+     * Writes the document's form content. When the form has an XFA
+     * resource, XFA extraction is tried first and, if it succeeds, nothing
+     * else is written. Otherwise the AcroForm fields are written as an
+     * ordered list inside {@code <div class="acroform">}.
+     *
+     * @param pdf document whose catalog and form are read
+     * @throws IOException  propagated from processing a field
+     * @throws SAXException propagated from the content handler
+     */
+    void extractAcroForm(PDDocument pdf) throws IOException,
+            SAXException {
+        //Thank you, Ben Litchfield, for
+        //org.apache.pdfbox.examples.fdf.PrintFields
+        //this code derives from Ben's code
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+        if (catalog == null) {
+            return;
+        }
+
+        PDAcroForm form = catalog.getAcroForm();
+        if (form == null) {
+            return;
+        }
+
+        //if it has xfa, try that.
+        //if it doesn't exist or there's an exception,
+        //go with traditional AcroForm
+        PDXFAResource xfa = form.getXFA();
+        if (xfa != null) {
+            XFAExtractor xfaExtractor = new XFAExtractor();
+            try (InputStream xfaStream = new BufferedInputStream(
+                    new ByteArrayInputStream(xfa.getBytes()))) {
+                xfaExtractor.extract(xfaStream, xhtml, metadata, context);
+                //if successful, return
+                return;
+            } catch (XMLStreamException | IOException ignored) {
+                //if there was an xml parse exception in xfa, try the AcroForm
+            }
+        }
+
+        @SuppressWarnings("rawtypes")
+        List fields = form.getFields();
+        if (fields == null) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "acroform");
+        xhtml.startElement("ol");
+        for (Object candidate : fields) {
+            // instanceof is false for null entries
+            if (candidate instanceof PDField) {
+                processAcroField((PDField) candidate, 0);
+            }
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes one form field and, for a non-terminal field, its children
+     * as a nested {@code <ol>}. Fields at or beyond
+     * {@code MAX_ACROFORM_RECURSIONS} levels are skipped.
+     *
+     * @param field                 field to write
+     * @param currentRecursiveDepth depth of {@code field}; top-level
+     *                              fields are passed 0
+     * @throws SAXException propagated from the content handler
+     * @throws IOException  declared on the signature
+     */
+    private void processAcroField(PDField field,
+                                  final int currentRecursiveDepth)
+            throws SAXException, IOException {
+
+        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+            return;
+        }
+        addFieldString(field);
+        if (!(field instanceof PDNonTerminalField)) {
+            return;
+        }
+
+        PDNonTerminalField parent = (PDNonTerminalField) field;
+        xhtml.startElement("ol");
+        for (PDField child : parent.getChildren()) {
+            processAcroField(child, currentRecursiveDepth + 1);
+        }
+        xhtml.endElement("ol");
+    }
+
+    /**
+     * Writes one form field as an {@code <li>}. The text is
+     * "partialName: value"; the alternate field name, when present, is
+     * written as the {@code altName} attribute. Signature fields are
+     * delegated to {@code handleSignature}. Nothing is written when there
+     * is neither an attribute nor any text.
+     *
+     * @param field field to write
+     * @throws SAXException propagated from the content handler
+     */
+    private void addFieldString(PDField field) throws SAXException {
+        //Pick partial name to present in content and altName for attribute
+        //Ignoring FullyQualifiedName for now
+        String partName = field.getPartialName();
+        String altName = field.getAlternateFieldName();
+
+        AttributesImpl attrs = new AttributesImpl();
+        if (altName != null) {
+            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+        }
+
+        //return early if PDSignature field
+        if (field instanceof PDSignatureField) {
+            handleSignature(attrs, (PDSignatureField) field);
+            return;
+        }
+
+        StringBuilder text = new StringBuilder();
+        if (partName != null) {
+            text.append(partName).append(": ");
+        }
+        String value = field.getValueAsString();
+        // the literal string "null" is treated like a missing value
+        if (value != null && !value.equals("null")) {
+            text.append(value);
+        }
+
+        if (attrs.getLength() == 0 && text.length() == 0) {
+            return;
+        }
+        xhtml.startElement("li", attrs);
+        xhtml.characters(text.toString());
+        xhtml.endElement("li");
+    }
+
+    /**
+     * Writes the data of a signature field as an {@code <li>} that holds
+     * an {@code <ol type="signaturedata">} with one {@code <li>} per
+     * non-empty item (name, contact info, location, reason, date).
+     * Nothing is written when the field has no signature or when every
+     * item is null or empty.
+     *
+     * @param parentAttributes attributes for the outer {@code <li>}
+     * @param sigField         signature field to read
+     * @throws SAXException propagated from the content handler
+     */
+    private void handleSignature(AttributesImpl parentAttributes,
+                                 PDSignatureField sigField)
+            throws SAXException {
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        // TreeMap keeps the items in alphabetical key order
+        Map<String, String> vals = new TreeMap<>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+        //see if there is any data; the values must be inspected here,
+        //because the keys are fixed literals and are never empty
+        boolean hasData = false;
+        for (String val : vals.values()) {
+            if (val != null && !val.equals("")) {
+                hasData = true;
+                break;
+            }
+        }
+        if (!hasData) {
+            return;
+        }
+
+        //there is data, process it
+        xhtml.startElement("li", parentAttributes);
+
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+        xhtml.startElement("ol", attrs);
+        for (Map.Entry<String, String> e : vals.entrySet()) {
+            if (e.getValue() == null || e.getValue().equals("")) {
+                continue;
+            }
+            attrs = new AttributesImpl();
+            attrs.addAttribute("", "signdata", "signdata", "CDATA",
+                    e.getKey());
+            xhtml.startElement("li", attrs);
+            xhtml.characters(e.getValue());
+            xhtml.endElement("li");
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("li");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
new file mode 100644
index 0000000..539cd50
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to integrate text extraction via OCR only.
+ * <p>
+ * {@link #processPage(PDPage)} calls {@code startPage}, then
+ * {@code doOCROnCurrentPage}, then {@code endPage} for each page. The
+ * {@code writeString}, {@code writeCharacters}, {@code writeWordSeparator}
+ * and {@code writeLineSeparator} callbacks are overridden as no-ops, so
+ * this class itself writes no text-layer content to the handler.
+ * <p>
+ * Error handling in {@code processPage}: a {@link TikaException} or
+ * {@link SAXException} is wrapped in an {@link IOExceptionWithCause} and
+ * rethrown; any other {@link IOException} is passed to
+ * {@code handleCatchableIOE}.
+ * NOTE(review): if {@code handleCatchableIOE} returns normally after an
+ * IOException raised before {@code endPage}, {@code endPage} is skipped for
+ * that page -- confirm against AbstractPDF2XHTML that this is intended.
+ * <p>
+ * Instances are created only by {@code process(...)}; the private
+ * constructor forwards all of its arguments to the superclass constructor.
+ */
+class OCR2XHTML extends AbstractPDF2XHTML {
+
+    private OCR2XHTML(PDDocument document, ContentHandler handler, 
ParseContext context, Metadata metadata,
+                      PDFParserConfig config)
+            throws IOException {
+        super(document, handler, context, metadata, config);
+    }
+
+    /**
+     * Converts the given PDF document (and related metadata) to a stream
+     * of XHTML SAX events sent to the given content handler.
+     * <p>
+     * The {@link Writer} handed to {@code writeText} discards everything
+     * that is written to it.
+     *
+     * @param document PDF document
+     * @param handler  SAX content handler
+     * @param context  parse context, forwarded to the constructor
+     * @param metadata PDF metadata
+     * @param config   PDF parser configuration, forwarded to the constructor
+     * @throws SAXException  if the content handler fails to process SAX events;
+     *                       rethrown here when it is the cause of an
+     *                       IOException raised during processing
+     * @throws TikaException if there was an exception outside of per page
+     *                       processing (an IOException whose cause is not a
+     *                       SAXException), or if the inherited
+     *                       {@code exceptions} list is non-empty once
+     *                       {@code writeText} returns; the first recorded
+     *                       exception is attached as the cause
+     */
+    public static void process(
+            PDDocument document, ContentHandler handler, ParseContext context, 
Metadata metadata,
+            PDFParserConfig config)
+            throws SAXException, TikaException {
+        OCR2XHTML ocr2XHTML = null;
+        try {
+            ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, 
config);
+            ocr2XHTML.writeText(document, new Writer() {
+                @Override
+                public void write(char[] cbuf, int off, int len) {
+                }
+
+                @Override
+                public void flush() {
+                }
+
+                @Override
+                public void close() {
+                }
+            });
+        } catch (IOException e) {
+            if (e.getCause() instanceof SAXException) {
+                throw (SAXException) e.getCause();
+            } else {
+                throw new TikaException("Unable to extract PDF content", e);
+            }
+        }
+        if (ocr2XHTML.exceptions.size() > 0) {
+            //throw the first; any further recorded exceptions are not reported here
+            throw new TikaException("Unable to extract all PDF content",
+                    ocr2XHTML.exceptions.get(0));
+        }
+    }
+
+    @Override
+    public void processPage(PDPage pdPage) throws IOException {
+        try {
+            startPage(pdPage);
+            doOCROnCurrentPage();
+            endPage(pdPage);
+        } catch (TikaException|SAXException e) {
+            throw new IOExceptionWithCause(e);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
+    @Override
+    protected void writeString(String text) throws IOException {
+        //no-op: nothing is written for text-layer strings
+    }
+
+    @Override
+    protected void writeCharacters(TextPosition text) throws IOException {
+        //no-op: nothing is written for individual text positions
+    }
+
+    @Override
+    protected void writeWordSeparator() throws IOException {
+        //no-op: no word separator is written
+    }
+
+    @Override
+    protected void writeLineSeparator() throws IOException {
+        //no-op: no line separator is written
+    }
+
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 1a8bfb4..ac9823e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,74 +16,41 @@
  */
 package org.apache.tika.parser.pdf;
 
-import javax.xml.stream.XMLStreamException;
 import java.awt.image.BufferedImage;
-import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.Writer;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Calendar;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.ListIterator;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
 
 import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.IOUtils;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
-import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
 import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
-import 
org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
-import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
-import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
-import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
-import org.apache.pdfbox.pdmodel.interactive.form.PDField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
@@ -93,28 +60,12 @@ import org.xml.sax.helpers.AttributesImpl;
  * to produce a semi-structured XHTML SAX events instead of a plain text
  * stream.
  */
-class PDF2XHTML extends PDFTextStripper {
+class PDF2XHTML extends AbstractPDF2XHTML {
 
-    /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
-     */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
 
     private static final List<String> JPEG = Arrays.asList(
             COSName.DCT_DECODE.getName(),
             COSName.DCT_DECODE_ABBREVIATION.getName());
-    /**
-     * Format used for signature dates
-     * TODO Make this thread-safe
-     */
-    private final SimpleDateFormat dateFormat = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
-    private final ContentHandler originalHandler;
-    private final ParseContext context;
-    private final XHTMLContentHandler handler;
-    private final PDFParserConfig config;
-    private final Metadata metadata;
-    private final List<IOException> exceptions = new ArrayList<>();
 
     /**
      * This keeps track of the pdf object ids for inline
@@ -129,16 +80,10 @@ class PDF2XHTML extends PDFTextStripper {
      */
     private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
     private int inlineImageCounter = 0;
-    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata 
metadata,
+    private PDF2XHTML(PDDocument document, ContentHandler handler, 
ParseContext context, Metadata metadata,
                       PDFParserConfig config)
             throws IOException {
-        //source of config (derives from context or PDFParser?) is
-        //already determined in PDFParser.  No need to check context here.
-        this.config = config;
-        this.originalHandler = handler;
-        this.context = context;
-        this.handler = new XHTMLContentHandler(handler, metadata);
-        this.metadata = metadata;
+        super(document, handler, context, metadata, config);
     }
 
     /**
@@ -160,7 +105,7 @@ class PDF2XHTML extends PDFTextStripper {
             // Extract text using a dummy Writer as we override the
             // key methods to output to the given content
             // handler.
-            pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);
+            pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, 
config);
 
             config.configure(pdf2XHTML);
 
@@ -192,28 +137,6 @@ class PDF2XHTML extends PDFTextStripper {
         }
     }
 
-    void extractBookmarkText() throws SAXException {
-        PDDocumentOutline outline = 
document.getDocumentCatalog().getDocumentOutline();
-        if (outline != null) {
-            extractBookmarkText(outline);
-        }
-    }
-
-    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
-        PDOutlineItem current = bookmark.getFirstChild();
-        if (current != null) {
-            handler.startElement("ul");
-            while (current != null) {
-                handler.startElement("li");
-                handler.characters(current.getTitle());
-                handler.endElement("li");
-                // Recurse:
-                extractBookmarkText(current);
-                current = current.getNextSibling();
-            }
-            handler.endElement("ul");
-        }
-    }
 
     @Override
     public void processPage(PDPage page) throws IOException {
@@ -225,52 +148,6 @@ class PDF2XHTML extends PDFTextStripper {
     }
 
     @Override
-    protected void startDocument(PDDocument pdf) throws IOException {
-        try {
-            handler.startDocument();
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a document", e);
-        }
-    }
-
-    @Override
-    protected void endDocument(PDDocument pdf) throws IOException {
-        try {
-            // Extract text for any bookmarks:
-            extractBookmarkText();
-            try {
-                extractEmbeddedDocuments(pdf, originalHandler);
-            } catch (IOException e) {
-                handleCatchableIOE(e);
-            }
-
-            //extract acroform data at end of doc
-            if (config.getExtractAcroFormContent() == true) {
-                try {
-                    extractAcroForm(pdf, handler);
-                } catch (IOException e) {
-                    handleCatchableIOE(e);
-                }
-            }
-            handler.endDocument();
-        } catch (TikaException e) {
-            throw new IOExceptionWithCause("Unable to end a document", e);
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to end a document", e);
-        }
-    }
-
-    @Override
-    protected void startPage(PDPage page) throws IOException {
-        try {
-            handler.startElement("div", "class", "page");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a page", e);
-        }
-        writeParagraphStart();
-    }
-
-    @Override
     protected void endPage(PDPage page) throws IOException {
         try {
             writeParagraphEnd();
@@ -279,76 +156,7 @@ class PDF2XHTML extends PDFTextStripper {
             } catch (IOException e) {
                 handleCatchableIOE(e);
             }
-
-            EmbeddedDocumentExtractor extractor = 
getEmbeddedDocumentExtractor();
-            for (PDAnnotation annotation : page.getAnnotations()) {
-
-                if (annotation instanceof PDAnnotationFileAttachment) {
-                    PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
-                    PDComplexFileSpecification fileSpec = 
(PDComplexFileSpecification) fann.getFile();
-                    try {
-                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
-                    } catch (SAXException e) {
-                        throw new IOExceptionWithCause("file embedded in 
annotation sax exception", e);
-                    } catch (TikaException e) {
-                        throw new IOExceptionWithCause("file embedded in 
annotation tika exception", e);
-                    } catch (IOException e) {
-                        handleCatchableIOE(e);
-                    }
-                }
-                // TODO: remove once PDFBOX-1143 is fixed:
-                if (config.getExtractAnnotationText()) {
-                    if (annotation instanceof PDAnnotationLink) {
-                        PDAnnotationLink annotationlink = (PDAnnotationLink) 
annotation;
-                        if (annotationlink.getAction() != null) {
-                            PDAction action = annotationlink.getAction();
-                            if (action instanceof PDActionURI) {
-                                PDActionURI uri = (PDActionURI) action;
-                                String link = uri.getURI();
-                                if (link != null) {
-                                    handler.startElement("div", "class", 
"annotation");
-                                    handler.startElement("a", "href", link);
-                                    handler.endElement("a");
-                                    handler.endElement("div");
-                                }
-                            }
-                        }
-                    }
-
-                    if (annotation instanceof PDAnnotationMarkup) {
-                        PDAnnotationMarkup annotationMarkup = 
(PDAnnotationMarkup) annotation;
-                        String title = annotationMarkup.getTitlePopup();
-                        String subject = annotationMarkup.getSubject();
-                        String contents = annotationMarkup.getContents();
-                        // TODO: maybe also annotationMarkup.getRichContents()?
-                        if (title != null || subject != null || contents != 
null) {
-                            handler.startElement("div", "class", "annotation");
-
-                            if (title != null) {
-                                handler.startElement("div", "class", 
"annotationTitle");
-                                handler.characters(title);
-                                handler.endElement("div");
-                            }
-
-                            if (subject != null) {
-                                handler.startElement("div", "class", 
"annotationSubject");
-                                handler.characters(subject);
-                                handler.endElement("div");
-                            }
-
-                            if (contents != null) {
-                                handler.startElement("div", "class", 
"annotationContents");
-                                handler.characters(contents);
-                                handler.endElement("div");
-                            }
-
-                            handler.endElement("div");
-                        }
-                    }
-                }
-            }
-
-            handler.endElement("div");
+            super.endPage(page);
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
         } catch (IOException e) {
@@ -406,8 +214,8 @@ class PDF2XHTML extends PDFTextStripper {
                 AttributesImpl attr = new AttributesImpl();
                 attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
fileName);
                 attr.addAttribute("", "alt", "alt", "CDATA", fileName);
-                handler.startElement("img", attr);
-                handler.endElement("img");
+                xhtml.startElement("img", attr);
+                xhtml.endElement("img");
 
                 //Do we only want to process unique COSObject ids?
                 //If so, have we already processed this one?
@@ -430,7 +238,7 @@ class PDF2XHTML extends PDFTextStripper {
                         writeToBuffer(image, extension, buffer);
                         extractor.parseEmbedded(
                                 new ByteArrayInputStream(buffer.toByteArray()),
-                                new EmbeddedContentHandler(handler),
+                                new EmbeddedContentHandler(xhtml),
                                 metadata, false);
                     } catch (IOException e) {
                         handleCatchableIOE(e);
@@ -467,20 +275,11 @@ class PDF2XHTML extends PDFTextStripper {
         out.flush();
     }
 
-    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
-        EmbeddedDocumentExtractor extractor =
-                context.get(EmbeddedDocumentExtractor.class);
-        if (extractor == null) {
-            extractor = new ParsingEmbeddedDocumentExtractor(context);
-        }
-        return extractor;
-    }
-
     @Override
     protected void writeParagraphStart() throws IOException {
         super.writeParagraphStart();
         try {
-            handler.startElement("p");
+            xhtml.startElement("p");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to start a paragraph", e);
         }
@@ -490,7 +289,7 @@ class PDF2XHTML extends PDFTextStripper {
     protected void writeParagraphEnd() throws IOException {
         super.writeParagraphEnd();
         try {
-            handler.endElement("p");
+            xhtml.endElement("p");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a paragraph", e);
         }
@@ -499,7 +298,7 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeString(String text) throws IOException {
         try {
-            handler.characters(text);
+            xhtml.characters(text);
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a string: " + text, e);
@@ -509,7 +308,7 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeCharacters(TextPosition text) throws IOException {
         try {
-            handler.characters(text.getUnicode());
+            xhtml.characters(text.getUnicode());
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a character: " + text.getUnicode(), e);
@@ -519,7 +318,7 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeWordSeparator() throws IOException {
         try {
-            handler.characters(getWordSeparator());
+            xhtml.characters(getWordSeparator());
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a space character", e);
@@ -529,275 +328,12 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeLineSeparator() throws IOException {
         try {
-            handler.newline();
+            xhtml.newline();
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a newline character", e);
         }
     }
 
-    private void extractEmbeddedDocuments(PDDocument document, ContentHandler 
handler)
-            throws IOException, SAXException, TikaException {
-        PDDocumentNameDictionary namesDictionary =
-                new PDDocumentNameDictionary( document.getDocumentCatalog() );
-        PDEmbeddedFilesNameTreeNode efTree = 
namesDictionary.getEmbeddedFiles();
-        if (efTree == null) {
-            return;
-        }
-
-        Map<String, PDComplexFileSpecification> embeddedFileNames = 
efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or 
its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a 
non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
-        } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
-            if (kids == null) {
-                return;
-            }
-            for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
-            }
-        }
-    }
-
-    private void processEmbeddedDocNames(Map<String, 
PDComplexFileSpecification> embeddedFileNames)
-            throws IOException, SAXException, TikaException {
-        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
-            return;
-        }
-
-        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
-        for (Map.Entry<String, PDComplexFileSpecification> ent : 
embeddedFileNames.entrySet()) {
-            PDComplexFileSpecification spec = ent.getValue();
-            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
-        }
-    }
-
-    private void extractMultiOSPDEmbeddedFiles(String defaultName,
-                                               PDComplexFileSpecification spec,
-                                               EmbeddedDocumentExtractor 
extractor) throws IOException,
-            SAXException, TikaException {
-
-        if (spec == null) {
-            return;
-        }
-        //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(defaultName, spec.getFile(), 
spec.getEmbeddedFile(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileMac(), 
spec.getEmbeddedFileMac(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileDos(), 
spec.getEmbeddedFileDos(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), 
spec.getEmbeddedFileUnix(), extractor);
-    }
-
-    private void extractPDEmbeddedFile(String defaultName, String fileName, 
PDEmbeddedFile file,
-                                       EmbeddedDocumentExtractor extractor)
-            throws SAXException, IOException, TikaException {
-
-        if (file == null) {
-            //skip silently
-            return;
-        }
-
-        fileName = (fileName == null) ? defaultName : fileName;
-
-        // TODO: other metadata?
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
-        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-
-        if (extractor.shouldParseEmbedded(metadata)) {
-            TikaInputStream stream = null;
-            try {
-                stream = TikaInputStream.get(file.createInputStream());
-                extractor.parseEmbedded(
-                        stream,
-                        new EmbeddedContentHandler(handler),
-                        metadata, false);
-
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", fileName);
-                handler.startElement("div", attributes);
-                handler.endElement("div");
-            } finally {
-                IOUtils.closeQuietly(stream);
-            }
-        }
-    }
-
-    private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) 
throws IOException,
-            SAXException {
-        //Thank you, Ben Litchfield, for 
org.apache.pdfbox.examples.fdf.PrintFields
-        //this code derives from Ben's code
-        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
-
-        if (catalog == null)
-            return;
-
-        PDAcroForm form = catalog.getAcroForm();
-        if (form == null)
-            return;
-
-        //if it has xfa, try that.
-        //if it doesn't exist or there's an exception,
-        //go with traditional AcroForm
-        PDXFAResource pdxfa = form.getXFA();
-
-        if (pdxfa != null) {
-            //if successful, return
-            XFAExtractor xfaExtractor = new XFAExtractor();
-            try (InputStream is = new BufferedInputStream(
-                    new ByteArrayInputStream(pdxfa.getBytes()))) {
-                xfaExtractor.extract(is, handler, metadata, context);
-                return;
-            } catch (XMLStreamException |IOException e) {
-                //if there was an xml parse exception in xfa, try the AcroForm
-            }
-        }
-
-        @SuppressWarnings("rawtypes")
-        List fields = form.getFields();
-
-        if (fields == null)
-            return;
-
-        @SuppressWarnings("rawtypes")
-        ListIterator itr = fields.listIterator();
-
-        if (itr == null)
-            return;
-
-        handler.startElement("div", "class", "acroform");
-        handler.startElement("ol");
-
-        while (itr.hasNext()) {
-            Object obj = itr.next();
-            if (obj != null && obj instanceof PDField) {
-                processAcroField((PDField) obj, handler, 0);
-            }
-        }
-        handler.endElement("ol");
-        handler.endElement("div");
-    }
-
-    private void processAcroField(PDField field,
-                                  XHTMLContentHandler handler, final int 
currentRecursiveDepth)
-            throws SAXException, IOException {
-
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
-            return;
-        }
-        addFieldString(field, handler);
-        if (field instanceof PDNonTerminalField) {
-            int r = currentRecursiveDepth + 1;
-            handler.startElement("ol");
-            for (PDField child : ((PDNonTerminalField)field).getChildren()) {
-                processAcroField(child, handler, r);
-            }
-            handler.endElement("ol");
-        }
-    }
-
-    private void addFieldString(PDField field, XHTMLContentHandler handler) 
throws SAXException {
-        //Pick partial name to present in content and altName for attribute
-        //Ignoring FullyQualifiedName for now
-        String partName = field.getPartialName();
-        String altName = field.getAlternateFieldName();
-
-        StringBuilder sb = new StringBuilder();
-        AttributesImpl attrs = new AttributesImpl();
-
-        if (partName != null) {
-            sb.append(partName).append(": ");
-        }
-        if (altName != null) {
-            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
-        }
-        //return early if PDSignature field
-        if (field instanceof PDSignatureField) {
-            handleSignature(attrs, (PDSignatureField) field, handler);
-            return;
-        }
-        String value = field.getValueAsString();
-        if (value != null && !value.equals("null")) {
-            sb.append(value);
-        }
-
-        if (attrs.getLength() > 0 || sb.length() > 0) {
-            handler.startElement("li", attrs);
-            handler.characters(sb.toString());
-            handler.endElement("li");
-        }
-    }
-
-    private void handleSignature(AttributesImpl parentAttributes, 
PDSignatureField sigField,
-                                 XHTMLContentHandler handler) throws 
SAXException {
-
-
-        PDSignature sig = sigField.getSignature();
-        if (sig == null) {
-            return;
-        }
-        Map<String, String> vals = new TreeMap<>();
-        vals.put("name", sig.getName());
-        vals.put("contactInfo", sig.getContactInfo());
-        vals.put("location", sig.getLocation());
-        vals.put("reason", sig.getReason());
-
-        Calendar cal = sig.getSignDate();
-        if (cal != null) {
-            dateFormat.setTimeZone(cal.getTimeZone());
-            vals.put("date", dateFormat.format(cal.getTime()));
-        }
-        //see if there is any data
-        int nonNull = 0;
-        for (String val : vals.keySet()) {
-            if (val != null && !val.equals("")) {
-                nonNull++;
-            }
-        }
-        //if there is, process it
-        if (nonNull > 0) {
-            handler.startElement("li", parentAttributes);
-
-            AttributesImpl attrs = new AttributesImpl();
-            attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
-
-            handler.startElement("ol", attrs);
-            for (Map.Entry<String, String> e : vals.entrySet()) {
-                if (e.getValue() == null || e.getValue().equals("")) {
-                    continue;
-                }
-                attrs = new AttributesImpl();
-                attrs.addAttribute("", "signdata", "signdata", "CDATA", 
e.getKey());
-                handler.startElement("li", attrs);
-                handler.characters(e.getValue());
-                handler.endElement("li");
-            }
-            handler.endElement("ol");
-            handler.endElement("li");
-        }
-    }
-
-    private void handleCatchableIOE(IOException e) throws IOException {
-        if (config.isCatchIntermediateIOExceptions()) {
-            String msg = e.getMessage();
-            if (msg == null) {
-                msg = "IOException, no message";
-            }
-            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
-            exceptions.add(e);
-        } else {
-            throw e;
-        }
-    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index b677d84..3e33962 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -58,6 +58,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.w3c.dom.Document;
 import org.xml.sax.ContentHandler;
@@ -140,7 +141,13 @@ public class PDFParser extends AbstractParser {
             if (handler != null) {
                 if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                     handleXFAOnly(pdfDocument, handler, metadata, context);
+                } else if 
(localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
+                    metadata.add("X-Parsed-By", 
TesseractOCRParser.class.toString());
+                    OCR2XHTML.process(pdfDocument, handler, context, metadata, 
localConfig);
                 } else {
+                    if 
(localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION))
 {
+                        metadata.add("X-Parsed-By", 
TesseractOCRParser.class.toString());
+                    }
                     PDF2XHTML.process(pdfDocument, handler, context, metadata, 
localConfig);
                 }
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 9baeb37..296b191 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -23,6 +23,7 @@ import java.io.Serializable;
 import java.util.Locale;
 import java.util.Properties;
 
+import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.text.PDFTextStripper;
 
 /**
@@ -44,6 +45,26 @@ import org.apache.pdfbox.text.PDFTextStripper;
  */
 public class PDFParserConfig implements Serializable {
 
+    public enum OCR_STRATEGY {
+        NO_OCR,
+        OCR_ONLY,
+        OCR_AND_TEXT_EXTRACTION;
+
+        private static OCR_STRATEGY parse(String s) {
+            if (s == null) {
+                return NO_OCR;
+            } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) {
+                return NO_OCR;
+            } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) {
+                return OCR_ONLY;
+            } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) {
+                return OCR_AND_TEXT_EXTRACTION;
+            }
+            //default -- no ocr
+            return NO_OCR;
+        }
+    }
+
     private static final long serialVersionUID = 6492570218190936986L;
 
     // True if we let PDFBox "guess" where spaces should go:
@@ -80,6 +101,12 @@ public class PDFParserConfig implements Serializable {
     //content from elsewhere in the document.
     private boolean ifXFAExtractOnlyXFA = false;
 
+    private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;
+
+    private int ocrDPI = 200;
+    private ImageType ocrImageType = ImageType.GRAY;
+    private String ocrImageFormatName = "png";
+
     private AccessChecker accessChecker;
 
     //The PDFParser can throw IOExceptions if there is a problem
@@ -123,36 +150,45 @@ public class PDFParserConfig implements Serializable {
             }
         }
         setEnableAutoSpace(
-                getProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
+                getBooleanProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
         setSuppressDuplicateOverlappingText(
-                getProp(props.getProperty("suppressDuplicateOverlappingText"),
+                
getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
                         getSuppressDuplicateOverlappingText()));
         setExtractAnnotationText(
-                getProp(props.getProperty("extractAnnotationText"),
+                getBooleanProp(props.getProperty("extractAnnotationText"),
                         getExtractAnnotationText()));
         setSortByPosition(
-                getProp(props.getProperty("sortByPosition"),
+                getBooleanProp(props.getProperty("sortByPosition"),
                         getSortByPosition()));
         setExtractAcroFormContent(
-                getProp(props.getProperty("extractAcroFormContent"),
+                getBooleanProp(props.getProperty("extractAcroFormContent"),
                         getExtractAcroFormContent()));
         setExtractInlineImages(
-                getProp(props.getProperty("extractInlineImages"),
+                getBooleanProp(props.getProperty("extractInlineImages"),
                         getExtractInlineImages()));
         setExtractUniqueInlineImagesOnly(
-                getProp(props.getProperty("extractUniqueInlineImagesOnly"),
+                
getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
                         getExtractUniqueInlineImagesOnly()));
 
         setIfXFAExtractOnlyXFA(
-            getProp(props.getProperty("ifXFAExtractOnlyXFA"),
+            getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
                 getIfXFAExtractOnlyXFA()));
 
         setCatchIntermediateIOExceptions(
-                getProp(props.getProperty("catchIntermediateIOExceptions"),
+                
getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
                 isCatchIntermediateIOExceptions()));
 
-        boolean checkExtractAccessPermission = 
getProp(props.getProperty("checkExtractAccessPermission"), false);
-        boolean allowExtractionForAccessibility = 
getProp(props.getProperty("allowExtractionForAccessibility"), true);
+        setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
+
+        setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
+
+        setOCRImageFormatName(props.getProperty("ocrImageFormatName"));
+
+        setOCRImageType(parseImageType(props.getProperty("ocrImageType")));
+
+
+        boolean checkExtractAccessPermission = 
getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
+        boolean allowExtractionForAccessibility = 
getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
 
         if (checkExtractAccessPermission == false) {
             //silently ignore the crazy configuration of 
checkExtractAccessPermission = false,
@@ -408,7 +444,23 @@ public class PDFParserConfig implements Serializable {
         isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
     }
 
-    private boolean getProp(String p, boolean defaultMissing) {
+    /**
+     * Which strategy to use for OCR
+     * @param ocrStrategy
+     */
+    public void setOCRStrategy(OCR_STRATEGY ocrStrategy) {
+        this.ocrStrategy = ocrStrategy;
+    }
+
+    /**
+     *
+     * @return strategy to use for OCR
+     */
+    public OCR_STRATEGY getOCRStrategy() {
+        return ocrStrategy;
+    }
+
+    private boolean getBooleanProp(String p, boolean defaultMissing) {
         if (p == null) {
             return defaultMissing;
         }
@@ -420,83 +472,143 @@ public class PDFParserConfig implements Serializable {
             return defaultMissing;
         }
     }
+    //throws NumberFormatException if there's a non-null unparseable
+    //string passed in
+    private int getIntProp(String p, int defaultMissing) {
+        if (p == null) {
+            return defaultMissing;
+        }
 
-    @Override
-    public int hashCode() {
-        final int prime = 31;
-        int result = 1;
-        result = prime
-                * result
-                + ((averageCharTolerance == null) ? 0 : averageCharTolerance
-                .hashCode());
-        result = prime * result + (enableAutoSpace ? 1231 : 1237);
-        result = prime * result + (extractAcroFormContent ? 1231 : 1237);
-        result = prime * result + (extractAnnotationText ? 1231 : 1237);
-        result = prime * result + (extractInlineImages ? 1231 : 1237);
-        result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 
1237);
-        result = prime * result + (sortByPosition ? 1231 : 1237);
-        result = prime
-                * result
-                + ((spacingTolerance == null) ? 0 : 
spacingTolerance.hashCode());
-        result = prime * result
-                + (suppressDuplicateOverlappingText ? 1231 : 1237);
-        result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237);
-        return result;
+        return Integer.parseInt(p);
     }
 
-    @Override
-    public boolean equals(Object obj) {
-        if (this == obj)
-            return true;
-        if (obj == null)
-            return false;
-        if (getClass() != obj.getClass())
-            return false;
-        PDFParserConfig other = (PDFParserConfig) obj;
-        if (averageCharTolerance == null) {
-            if (other.averageCharTolerance != null)
-                return false;
-        } else if (!averageCharTolerance.equals(other.averageCharTolerance))
-            return false;
-        if (enableAutoSpace != other.enableAutoSpace)
-            return false;
-        if (extractAcroFormContent != other.extractAcroFormContent)
-            return false;
-        if (extractAnnotationText != other.extractAnnotationText)
-            return false;
-        if (extractInlineImages != other.extractInlineImages)
-            return false;
-        if (extractUniqueInlineImagesOnly != 
other.extractUniqueInlineImagesOnly)
-            return false;
-        if (sortByPosition != other.sortByPosition)
-            return false;
-        if (spacingTolerance == null) {
-            if (other.spacingTolerance != null)
-                return false;
-        } else if (!spacingTolerance.equals(other.spacingTolerance))
-            return false;
-        if (suppressDuplicateOverlappingText != 
other.suppressDuplicateOverlappingText)
-            return false;
-        if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA)
-            return false;
+    /**
+     * String representation of the image format used to render
+     * the page image for OCR (examples: png, tiff, jpeg)
+     * @return
+     */
+    public String getOCRImageFormatName() {
+        return ocrImageFormatName;
+    }
 
-        return true;
+    /**
+     * @see #getOCRImageFormatName()
+     *
+     * @param ocrImageFormatName name of image format used to render
+     *                           page image
+     */
+    public void setOCRImageFormatName(String ocrImageFormatName) {
+        this.ocrImageFormatName = ocrImageFormatName;
+    }
+
+    /**
+     * Image type used to render the page image for OCR.
+     * @see #setOCRImageType(ImageType)
+     * @return image type
+     */
+    public ImageType getOCRImageType() {
+        return ocrImageType;
+    }
+
+    /**
+     * Image type used to render the page image for OCR.
+     * @param ocrImageType
+     */
+    public void setOCRImageType(ImageType ocrImageType) {
+        this.ocrImageType = ocrImageType;
+    }
+
+    /**
+     * Dots per inch used to render the page image for OCR
+     * @return dots per inch
+     */
+    public int getOCRDPI() {
+        return ocrDPI;
+    }
+
+    /**
+     * Dots per inch used to render the page image for OCR
+     * @param ocrDPI
+     */
+    public void setOCRDPI(int ocrDPI) {
+        this.ocrDPI = ocrDPI;
+    }
+
+    private ImageType parseImageType(String ocrImageType) {
+        for (ImageType t : ImageType.values()) {
+            if (ocrImageType.equalsIgnoreCase(t.toString())) {
+                return t;
+            }
+        }
+        return null;
     }
 
     @Override
-    public String toString() {
-        return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
-                + ", suppressDuplicateOverlappingText="
-                + suppressDuplicateOverlappingText + ", extractAnnotationText="
-                + extractAnnotationText + ", sortByPosition=" + sortByPosition
-                + ", extractAcroFormContent=" + extractAcroFormContent
-                + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA
-                + ", extractInlineImages=" + extractInlineImages
-                + ", extractUniqueInlineImagesOnly="
-                + extractUniqueInlineImagesOnly + ", averageCharTolerance="
-                + averageCharTolerance + ", spacingTolerance="
-                + spacingTolerance + "]";
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof PDFParserConfig)) return false;
+
+        PDFParserConfig config = (PDFParserConfig) o;
+
+        if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false;
+        if (getSuppressDuplicateOverlappingText() != 
config.getSuppressDuplicateOverlappingText()) return false;
+        if (getExtractAnnotationText() != config.getExtractAnnotationText()) 
return false;
+        if (getSortByPosition() != config.getSortByPosition()) return false;
+        if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) 
return false;
+        if (getExtractInlineImages() != config.getExtractInlineImages()) 
return false;
+        if (getExtractUniqueInlineImagesOnly() != 
config.getExtractUniqueInlineImagesOnly()) return false;
+        if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) 
return false;
+        if (getOCRDPI() != config.getOCRDPI()) return false;
+        if (isCatchIntermediateIOExceptions() != 
config.isCatchIntermediateIOExceptions()) return false;
+        if 
(!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return 
false;
+        if (!getSpacingTolerance().equals(config.getSpacingTolerance())) 
return false;
+        if (!getOCRStrategy().equals(config.getOCRStrategy())) return false;
+        if (getOCRImageType() != config.getOCRImageType()) return false;
+        if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) 
return false;
+        return getAccessChecker().equals(config.getAccessChecker());
+
     }
 
+    @Override
+    public int hashCode() {
+        int result = (getEnableAutoSpace() ? 1 : 0);
+        result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
+        result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
+        result = 31 * result + (getSortByPosition() ? 1 : 0);
+        result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
+        result = 31 * result + (getExtractInlineImages() ? 1 : 0);
+        result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
+        result = 31 * result + getAverageCharTolerance().hashCode();
+        result = 31 * result + getSpacingTolerance().hashCode();
+        result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
+        result = 31 * result + ocrStrategy.hashCode();
+        result = 31 * result + getOCRDPI();
+        result = 31 * result + getOCRImageType().hashCode();
+        result = 31 * result + getOCRImageFormatName().hashCode();
+        result = 31 * result + getAccessChecker().hashCode();
+        result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+        return result;
+    }
 
+    @Override
+    public String toString() {
+        return "PDFParserConfig{" +
+                "enableAutoSpace=" + enableAutoSpace +
+                ", suppressDuplicateOverlappingText=" + 
suppressDuplicateOverlappingText +
+                ", extractAnnotationText=" + extractAnnotationText +
+                ", sortByPosition=" + sortByPosition +
+                ", extractAcroFormContent=" + extractAcroFormContent +
+                ", extractInlineImages=" + extractInlineImages +
+                ", extractUniqueInlineImagesOnly=" + 
extractUniqueInlineImagesOnly +
+                ", averageCharTolerance=" + averageCharTolerance +
+                ", spacingTolerance=" + spacingTolerance +
+                ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
+                ", ocrStrategy=" + ocrStrategy +
+                ", ocrDPI=" + ocrDPI +
+                ", ocrImageType=" + ocrImageType +
+                ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
+                ", accessChecker=" + accessChecker +
+                ", isCatchIntermediateIOExceptions=" + 
isCatchIntermediateIOExceptions +
+                '}';
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 
b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
index 9b404a3..319e693 100644
--- 
a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++ 
b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -23,4 +23,12 @@ extractUniqueInlineImagesOnly true
 checkExtractAccessPermission false
 allowExtractionForAccessibility true
 ifXFAExtractOnlyXFA false
-catchIntermediateIOExceptions true
\ No newline at end of file
+catchIntermediateIOExceptions true
+#options: no_ocr, ocr_only, ocr_and_text_extraction
+ocrStrategy no_ocr
+#dots per inch for the ocr rendering of the page image
+ocrDPI 200
+#if you request tif, make sure you have imageio jars on your classpath!
+ocrImageFormatName png
+#options: argb, binary, gray, rgb
+ocrImageType gray

http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java 
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6d07c59..df2e27c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -50,6 +50,8 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
@@ -70,6 +72,16 @@ public class PDFParserTest extends TikaTest {
     public static final MediaType TYPE_DOC = MediaType.application("msword");
     public static Level PDFBOX_LOG_LEVEL = Level.INFO;
 
+    private static Boolean hasTesseract = null;
+
+    public static boolean canRunOCR() {
+        if (hasTesseract != null) {
+            return hasTesseract;
+        }
+        hasTesseract = new TesseractOCRParser().hasTesseract(new 
TesseractOCRConfig());
+        return hasTesseract;
+    }
+
     @BeforeClass
     public static void setup() {
         //remember default logging level, but turn off for PDFParserTest
@@ -1175,6 +1187,32 @@ public class PDFParserTest extends TikaTest {
         assertEquals("1425", jpegMetadata.get(Metadata.IMAGE_LENGTH));
     }
 
+    @Test
+    public void testEmbeddedDocsWithOCROnly() throws Exception {
+        if (! canRunOCR()) { return; }
+
+        for (PDFParserConfig.OCR_STRATEGY strategy : 
PDFParserConfig.OCR_STRATEGY.values()) {
+            PDFParserConfig config = new PDFParserConfig();
+            config.setOCRStrategy(strategy);
+            ParseContext context = new ParseContext();
+            context.set(PDFParserConfig.class, config);
+            context.set(Parser.class, new AutoDetectParser());
+            //make sure everything works with regular xml _and_ with recursive
+            XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", 
context);
+            assertContains("pdf_haystack", xmlResult.xml);
+            assertContains("Haystack", xmlResult.xml);
+            assertContains("Needle", xmlResult.xml);
+            if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
+                assertContains("<div class=\"ocr\">pdf_haystack", 
xmlResult.xml);
+            } else {
+                assertNotContained("<div class=\"ocr\">pdf_haystack", 
xmlResult.xml);
+            }
+            assertEquals(4, 
getRecursiveJson("testPDFEmbeddingAndEmbedded.docx", context).size());
+        }
+
+    }
+
+
     private void assertException(String path, Parser parser, ParseContext 
context, Class expected) {
         boolean noEx = false;
         InputStream is = getResourceAsStream(path);

[1/4] tika git commit: TIKA-1994 -- integrate OCR with PDFParser

Reply via email to