[1/2] tika git commit: TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs

tallison Fri, 03 Jun 2016 11:53:44 -0700

Repository: tika
Updated Branches:
  refs/heads/2.x e5a7604bc -> ebe702898



http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index e98bead..ac9823e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,74 +16,41 @@
  */
 package org.apache.tika.parser.pdf;
 
-import javax.xml.stream.XMLStreamException;
 import java.awt.image.BufferedImage;
-import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.Writer;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Calendar;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.ListIterator;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
 
 import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.commons.io.IOUtils;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
 import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
-import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
-import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
-import org.apache.pdfbox.pdmodel.interactive.form.PDField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
-import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
@@ -93,30 +60,14 @@ import org.xml.sax.helpers.AttributesImpl;
  * to produce a semi-structured XHTML SAX events instead of a plain text
  * stream.
  */
-class PDF2XHTML extends PDFTextStripper {
+class PDF2XHTML extends AbstractPDF2XHTML {
 
-    /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
-     */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
 
     private static final List<String> JPEG = Arrays.asList(
             COSName.DCT_DECODE.getName(),
             COSName.DCT_DECODE_ABBREVIATION.getName());
 
     /**
-     * Format used for signature dates
-     * TODO Make this thread-safe
-     */
-    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
-    private final ContentHandler originalHandler;
-    private final ParseContext context;
-    private final XHTMLContentHandler handler;
-    private final PDFParserConfig config;
-    private final Metadata metadata;
-    private final List<IOException> exceptions = new ArrayList<>();
-    /**
      * This keeps track of the pdf object ids for inline
      * images that have been processed.
      * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()
@@ -129,16 +80,10 @@ class PDF2XHTML extends PDFTextStripper {
      */
     private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
     private int inlineImageCounter = 0;
-    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
+    private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
                       PDFParserConfig config)
             throws IOException {
-        //source of config (derives from context or PDFParser?) is
-        //already determined in PDFParser.  No need to check context here.
-        this.config = config;
-        this.originalHandler = handler;
-        this.context = context;
-        this.handler = new XHTMLContentHandler(handler, metadata);
-        this.metadata = metadata;
+        super(document, handler, context, metadata, config);
     }
 
     /**
@@ -160,7 +105,7 @@ class PDF2XHTML extends PDFTextStripper {
             // Extract text using a dummy Writer as we override the
             // key methods to output to the given content
             // handler.
-            pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);
+            pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
 
             config.configure(pdf2XHTML);
 
@@ -192,28 +137,6 @@ class PDF2XHTML extends PDFTextStripper {
         }
     }
 
-    void extractBookmarkText() throws SAXException {
-        PDDocumentOutline outline = 
document.getDocumentCatalog().getDocumentOutline();
-        if (outline != null) {
-            extractBookmarkText(outline);
-        }
-    }
-
-    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
-        PDOutlineItem current = bookmark.getFirstChild();
-        if (current != null) {
-            handler.startElement("ul");
-            while (current != null) {
-                handler.startElement("li");
-                handler.characters(current.getTitle());
-                handler.endElement("li");
-                // Recurse:
-                extractBookmarkText(current);
-                current = current.getNextSibling();
-            }
-            handler.endElement("ul");
-        }
-    }
 
     @Override
     public void processPage(PDPage page) throws IOException {
@@ -225,130 +148,15 @@ class PDF2XHTML extends PDFTextStripper {
     }
 
     @Override
-    protected void startDocument(PDDocument pdf) throws IOException {
-        try {
-            handler.startDocument();
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a document", e);
-        }
-    }
-
-    @Override
-    protected void endDocument(PDDocument pdf) throws IOException {
-        try {
-            // Extract text for any bookmarks:
-            extractBookmarkText();
-            try {
-                extractEmbeddedDocuments(pdf, originalHandler);
-            } catch (IOException e) {
-                handleCatchableIOE(e);
-            }
-
-            //extract acroform data at end of doc
-            if (config.getExtractAcroFormContent() == true) {
-                try {
-                    extractAcroForm(pdf, handler);
-                } catch (IOException e) {
-                    handleCatchableIOE(e);
-                }
-            }
-            handler.endDocument();
-        } catch (TikaException e) {
-            throw new IOExceptionWithCause("Unable to end a document", e);
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to end a document", e);
-        }
-    }
-
-    @Override
-    protected void startPage(PDPage page) throws IOException {
-        try {
-            handler.startElement("div", "class", "page");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a page", e);
-        }
-        writeParagraphStart();
-    }
-
-    @Override
     protected void endPage(PDPage page) throws IOException {
         try {
             writeParagraphEnd();
             try {
-                extractImages(page.getResources(), new HashSet<COSStream>());
+                extractImages(page.getResources(), new HashSet<COSBase>());
             } catch (IOException e) {
                 handleCatchableIOE(e);
             }
-
-            EmbeddedDocumentExtractor extractor = 
getEmbeddedDocumentExtractor();
-            for (PDAnnotation annotation : page.getAnnotations()) {
-
-                if (annotation instanceof PDAnnotationFileAttachment) {
-                    PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
-                    PDComplexFileSpecification fileSpec = 
(PDComplexFileSpecification) fann.getFile();
-                    try {
-                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
-                    } catch (SAXException e) {
-                        throw new IOExceptionWithCause("file embedded in 
annotation sax exception", e);
-                    } catch (TikaException e) {
-                        throw new IOExceptionWithCause("file embedded in 
annotation tika exception", e);
-                    } catch (IOException e) {
-                        handleCatchableIOE(e);
-                    }
-                }
-                // TODO: remove once PDFBOX-1143 is fixed:
-                if (config.getExtractAnnotationText()) {
-                    if (annotation instanceof PDAnnotationLink) {
-                        PDAnnotationLink annotationlink = (PDAnnotationLink) 
annotation;
-                        if (annotationlink.getAction() != null) {
-                            PDAction action = annotationlink.getAction();
-                            if (action instanceof PDActionURI) {
-                                PDActionURI uri = (PDActionURI) action;
-                                String link = uri.getURI();
-                                if (link != null) {
-                                    handler.startElement("div", "class", 
"annotation");
-                                    handler.startElement("a", "href", link);
-                                    handler.endElement("a");
-                                    handler.endElement("div");
-                                }
-                            }
-                        }
-                    }
-
-                    if (annotation instanceof PDAnnotationMarkup) {
-                        PDAnnotationMarkup annotationMarkup = 
(PDAnnotationMarkup) annotation;
-                        String title = annotationMarkup.getTitlePopup();
-                        String subject = annotationMarkup.getSubject();
-                        String contents = annotationMarkup.getContents();
-                        // TODO: maybe also annotationMarkup.getRichContents()?
-                        if (title != null || subject != null || contents != 
null) {
-                            handler.startElement("div", "class", "annotation");
-
-                            if (title != null) {
-                                handler.startElement("div", "class", 
"annotationTitle");
-                                handler.characters(title);
-                                handler.endElement("div");
-                            }
-
-                            if (subject != null) {
-                                handler.startElement("div", "class", 
"annotationSubject");
-                                handler.characters(subject);
-                                handler.endElement("div");
-                            }
-
-                            if (contents != null) {
-                                handler.startElement("div", "class", 
"annotationContents");
-                                handler.characters(contents);
-                                handler.endElement("div");
-                            }
-
-                            handler.endElement("div");
-                        }
-                    }
-                }
-            }
-
-            handler.endElement("div");
+            super.endPage(page);
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
         } catch (IOException e) {
@@ -356,7 +164,7 @@ class PDF2XHTML extends PDFTextStripper {
         }
     }
 
-    private void extractImages(PDResources resources, Set<COSStream> seenThisPage) throws SAXException, IOException {
+    private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
         if (resources == null || config.getExtractInlineImages() == false) {
             return;
         }
@@ -395,7 +203,7 @@ class PDF2XHTML extends PDFTextStripper {
                     //throw new RuntimeException("EXTEN:" + extension);
                 }
 
-                Integer imageNumber = processedInlineImages.get(object.getCOSObject());
+                Integer imageNumber = processedInlineImages.get(cosStream);
                 if (imageNumber == null) {
                     imageNumber = inlineImageCounter++;
                 }
@@ -406,8 +214,8 @@ class PDF2XHTML extends PDFTextStripper {
                 AttributesImpl attr = new AttributesImpl();
                 attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
fileName);
                 attr.addAttribute("", "alt", "alt", "CDATA", fileName);
-                handler.startElement("img", attr);
-                handler.endElement("img");
+                xhtml.startElement("img", attr);
+                xhtml.endElement("img");
 
                 //Do we only want to process unique COSObject ids?
                 //If so, have we already processed this one?
@@ -430,7 +238,7 @@ class PDF2XHTML extends PDFTextStripper {
                         writeToBuffer(image, extension, buffer);
                         extractor.parseEmbedded(
                                 new ByteArrayInputStream(buffer.toByteArray()),
-                                new EmbeddedContentHandler(handler),
+                                new EmbeddedContentHandler(xhtml),
                                 metadata, false);
                     } catch (IOException e) {
                         handleCatchableIOE(e);
@@ -467,20 +275,11 @@ class PDF2XHTML extends PDFTextStripper {
         out.flush();
     }
 
-    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
-        EmbeddedDocumentExtractor extractor =
-                context.get(EmbeddedDocumentExtractor.class);
-        if (extractor == null) {
-            extractor = new ParsingEmbeddedDocumentExtractor(context);
-        }
-        return extractor;
-    }
-
     @Override
     protected void writeParagraphStart() throws IOException {
         super.writeParagraphStart();
         try {
-            handler.startElement("p");
+            xhtml.startElement("p");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to start a paragraph", e);
         }
@@ -490,7 +289,7 @@ class PDF2XHTML extends PDFTextStripper {
     protected void writeParagraphEnd() throws IOException {
         super.writeParagraphEnd();
         try {
-            handler.endElement("p");
+            xhtml.endElement("p");
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a paragraph", e);
         }
@@ -499,7 +298,7 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeString(String text) throws IOException {
         try {
-            handler.characters(text);
+            xhtml.characters(text);
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a string: " + text, e);
@@ -509,7 +308,7 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeCharacters(TextPosition text) throws IOException {
         try {
-            handler.characters(text.getUnicode());
+            xhtml.characters(text.getUnicode());
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a character: " + text.getUnicode(), e);
@@ -519,7 +318,7 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeWordSeparator() throws IOException {
         try {
-            handler.characters(getWordSeparator());
+            xhtml.characters(getWordSeparator());
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a space character", e);
@@ -529,275 +328,12 @@ class PDF2XHTML extends PDFTextStripper {
     @Override
     protected void writeLineSeparator() throws IOException {
         try {
-            handler.newline();
+            xhtml.newline();
         } catch (SAXException e) {
             throw new IOExceptionWithCause(
                     "Unable to write a newline character", e);
         }
     }
 
-    private void extractEmbeddedDocuments(PDDocument document, ContentHandler 
handler)
-            throws IOException, SAXException, TikaException {
-        PDDocumentNameDictionary namesDictionary =
-                new PDDocumentNameDictionary( document.getDocumentCatalog() );
-        PDEmbeddedFilesNameTreeNode efTree = 
namesDictionary.getEmbeddedFiles();
-        if (efTree == null) {
-            return;
-        }
-
-        Map<String, PDComplexFileSpecification> embeddedFileNames = 
efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or 
its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a 
non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
-        } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
-            if (kids == null) {
-                return;
-            }
-            for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
-            }
-        }
-    }
-
-    private void processEmbeddedDocNames(Map<String, 
PDComplexFileSpecification> embeddedFileNames)
-            throws IOException, SAXException, TikaException {
-        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
-            return;
-        }
-
-        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
-        for (Map.Entry<String, PDComplexFileSpecification> ent : 
embeddedFileNames.entrySet()) {
-            PDComplexFileSpecification spec = ent.getValue();
-            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
-        }
-    }
-
-    private void extractMultiOSPDEmbeddedFiles(String defaultName,
-                                               PDComplexFileSpecification spec,
-                                               EmbeddedDocumentExtractor 
extractor) throws IOException,
-            SAXException, TikaException {
-
-        if (spec == null) {
-            return;
-        }
-        //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(defaultName, spec.getFile(), 
spec.getEmbeddedFile(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileMac(), 
spec.getEmbeddedFileMac(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileDos(), 
spec.getEmbeddedFileDos(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), 
spec.getEmbeddedFileUnix(), extractor);
-    }
-
-    private void extractPDEmbeddedFile(String defaultName, String fileName, 
PDEmbeddedFile file,
-                                       EmbeddedDocumentExtractor extractor)
-            throws SAXException, IOException, TikaException {
-
-        if (file == null) {
-            //skip silently
-            return;
-        }
-
-        fileName = (fileName == null) ? defaultName : fileName;
-
-        // TODO: other metadata?
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
-        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-
-        if (extractor.shouldParseEmbedded(metadata)) {
-            TikaInputStream stream = null;
-            try {
-                stream = TikaInputStream.get(file.createInputStream());
-                extractor.parseEmbedded(
-                        stream,
-                        new EmbeddedContentHandler(handler),
-                        metadata, false);
-
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", fileName);
-                handler.startElement("div", attributes);
-                handler.endElement("div");
-            } finally {
-                IOUtils.closeQuietly(stream);
-            }
-        }
-    }
-
-    private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) 
throws IOException,
-            SAXException {
-        //Thank you, Ben Litchfield, for 
org.apache.pdfbox.examples.fdf.PrintFields
-        //this code derives from Ben's code
-        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
-
-        if (catalog == null)
-            return;
-
-        PDAcroForm form = catalog.getAcroForm();
-        if (form == null)
-            return;
-
-        //if it has xfa, try that.
-        //if it doesn't exist or there's an exception,
-        //go with traditional AcroForm
-        PDXFAResource pdxfa = form.getXFA();
-
-        if (pdxfa != null) {
-            //if successful, return
-            XFAExtractor xfaExtractor = new XFAExtractor();
-            try (InputStream is = new BufferedInputStream(
-                    new ByteArrayInputStream(pdxfa.getBytes()))) {
-                xfaExtractor.extract(is, handler, metadata, context);
-                return;
-            } catch (XMLStreamException |IOException e) {
-                //if there was a potentially expected failure in xfa, try the 
AcroForm
-            }
-        }
-
-        @SuppressWarnings("rawtypes")
-        List fields = form.getFields();
-
-        if (fields == null)
-            return;
-
-        @SuppressWarnings("rawtypes")
-        ListIterator itr = fields.listIterator();
-
-        if (itr == null)
-            return;
-
-        handler.startElement("div", "class", "acroform");
-        handler.startElement("ol");
-
-        while (itr.hasNext()) {
-            Object obj = itr.next();
-            if (obj != null && obj instanceof PDField) {
-                processAcroField((PDField) obj, handler, 0);
-            }
-        }
-        handler.endElement("ol");
-        handler.endElement("div");
-    }
-
-    private void processAcroField(PDField field,
-                                  XHTMLContentHandler handler, final int 
currentRecursiveDepth)
-            throws SAXException, IOException {
-
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
-            return;
-        }
-        addFieldString(field, handler);
-        if (field instanceof PDNonTerminalField) {
-            int r = currentRecursiveDepth + 1;
-            handler.startElement("ol");
-            for (PDField child : ((PDNonTerminalField)field).getChildren()) {
-                processAcroField(child, handler, r);
-            }
-            handler.endElement("ol");
-        }
-    }
-
-    private void addFieldString(PDField field, XHTMLContentHandler handler) 
throws SAXException {
-        //Pick partial name to present in content and altName for attribute
-        //Ignoring FullyQualifiedName for now
-        String partName = field.getPartialName();
-        String altName = field.getAlternateFieldName();
-
-        StringBuilder sb = new StringBuilder();
-        AttributesImpl attrs = new AttributesImpl();
-
-        if (partName != null) {
-            sb.append(partName).append(": ");
-        }
-        if (altName != null) {
-            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
-        }
-        //return early if PDSignature field
-        if (field instanceof PDSignatureField) {
-            handleSignature(attrs, (PDSignatureField) field, handler);
-            return;
-        }
-        String value = field.getValueAsString();
-        if (value != null && !value.equals("null")) {
-            sb.append(value);
-        }
-
-        if (attrs.getLength() > 0 || sb.length() > 0) {
-            handler.startElement("li", attrs);
-            handler.characters(sb.toString());
-            handler.endElement("li");
-        }
-    }
-
-    private void handleSignature(AttributesImpl parentAttributes, 
PDSignatureField sigField,
-                                 XHTMLContentHandler handler) throws 
SAXException {
-
-
-        PDSignature sig = sigField.getSignature();
-        if (sig == null) {
-            return;
-        }
-        Map<String, String> vals = new TreeMap<>();
-        vals.put("name", sig.getName());
-        vals.put("contactInfo", sig.getContactInfo());
-        vals.put("location", sig.getLocation());
-        vals.put("reason", sig.getReason());
-
-        Calendar cal = sig.getSignDate();
-        if (cal != null) {
-            dateFormat.setTimeZone(cal.getTimeZone());
-            vals.put("date", dateFormat.format(cal.getTime()));
-        }
-        //see if there is any data
-        int nonNull = 0;
-        for (String val : vals.keySet()) {
-            if (val != null && !val.equals("")) {
-                nonNull++;
-            }
-        }
-        //if there is, process it
-        if (nonNull > 0) {
-            handler.startElement("li", parentAttributes);
-
-            AttributesImpl attrs = new AttributesImpl();
-            attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
-
-            handler.startElement("ol", attrs);
-            for (Map.Entry<String, String> e : vals.entrySet()) {
-                if (e.getValue() == null || e.getValue().equals("")) {
-                    continue;
-                }
-                attrs = new AttributesImpl();
-                attrs.addAttribute("", "signdata", "signdata", "CDATA", 
e.getKey());
-                handler.startElement("li", attrs);
-                handler.characters(e.getValue());
-                handler.endElement("li");
-            }
-            handler.endElement("ol");
-            handler.endElement("li");
-        }
-    }
-
-    private void handleCatchableIOE(IOException e) throws IOException {
-        if (config.isCatchIntermediateIOExceptions()) {
-            String msg = e.getMessage();
-            if (msg == null) {
-                msg = "IOException, no message";
-            }
-            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
-            exceptions.add(e);
-        } else {
-            throw e;
-        }
-    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 4dee7dd..f735f25 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -57,6 +57,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.parser.xmp.JempboxExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.w3c.dom.Document;
@@ -140,9 +141,16 @@ public class PDFParser extends AbstractParser {
             if (handler != null) {
                 if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
                     handleXFAOnly(pdfDocument, handler, metadata, context);
+                } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
+                    metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
+                    OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                 } else {
+                    if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+                        metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
+                    }
                     PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
                 }
+
             }
         } catch (InvalidPasswordException e) {
             metadata.set("pdf:encrypted", "true");

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 9baeb37..296b191 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -23,6 +23,7 @@ import java.io.Serializable;
 import java.util.Locale;
 import java.util.Properties;
 
+import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.text.PDFTextStripper;
 
 /**
@@ -44,6 +45,26 @@ import org.apache.pdfbox.text.PDFTextStripper;
  */
 public class PDFParserConfig implements Serializable {
 
+    public enum OCR_STRATEGY {
+        NO_OCR,
+        OCR_ONLY,
+        OCR_AND_TEXT_EXTRACTION;
+
+        private static OCR_STRATEGY parse(String s) {
+            if (s == null) {
+                return NO_OCR;
+            } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) {
+                return NO_OCR;
+            } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) {
+                return OCR_ONLY;
+            } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) {
+                return OCR_AND_TEXT_EXTRACTION;
+            }
+            //default -- no ocr
+            return NO_OCR;
+        }
+    }
+
     private static final long serialVersionUID = 6492570218190936986L;
 
     // True if we let PDFBox "guess" where spaces should go:
@@ -80,6 +101,12 @@ public class PDFParserConfig implements Serializable {
     //content from elsewhere in the document.
     private boolean ifXFAExtractOnlyXFA = false;
 
+    private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;
+
+    private int ocrDPI = 200;
+    private ImageType ocrImageType = ImageType.GRAY;
+    private String ocrImageFormatName = "png";
+
     private AccessChecker accessChecker;
 
     //The PDFParser can throw IOExceptions if there is a problem
@@ -123,36 +150,45 @@ public class PDFParserConfig implements Serializable {
             }
         }
         setEnableAutoSpace(
-                getProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
+                getBooleanProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
         setSuppressDuplicateOverlappingText(
-                getProp(props.getProperty("suppressDuplicateOverlappingText"),
+                
getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
                         getSuppressDuplicateOverlappingText()));
         setExtractAnnotationText(
-                getProp(props.getProperty("extractAnnotationText"),
+                getBooleanProp(props.getProperty("extractAnnotationText"),
                         getExtractAnnotationText()));
         setSortByPosition(
-                getProp(props.getProperty("sortByPosition"),
+                getBooleanProp(props.getProperty("sortByPosition"),
                         getSortByPosition()));
         setExtractAcroFormContent(
-                getProp(props.getProperty("extractAcroFormContent"),
+                getBooleanProp(props.getProperty("extractAcroFormContent"),
                         getExtractAcroFormContent()));
         setExtractInlineImages(
-                getProp(props.getProperty("extractInlineImages"),
+                getBooleanProp(props.getProperty("extractInlineImages"),
                         getExtractInlineImages()));
         setExtractUniqueInlineImagesOnly(
-                getProp(props.getProperty("extractUniqueInlineImagesOnly"),
+                
getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
                         getExtractUniqueInlineImagesOnly()));
 
         setIfXFAExtractOnlyXFA(
-            getProp(props.getProperty("ifXFAExtractOnlyXFA"),
+            getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
                 getIfXFAExtractOnlyXFA()));
 
         setCatchIntermediateIOExceptions(
-                getProp(props.getProperty("catchIntermediateIOExceptions"),
+                
getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
                 isCatchIntermediateIOExceptions()));
 
-        boolean checkExtractAccessPermission = 
getProp(props.getProperty("checkExtractAccessPermission"), false);
-        boolean allowExtractionForAccessibility = 
getProp(props.getProperty("allowExtractionForAccessibility"), true);
+        setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
+
+        setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
+
+        setOCRImageFormatName(props.getProperty("ocrImageFormatName"));
+
+        setOCRImageType(parseImageType(props.getProperty("ocrImageType")));
+
+
+        boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
+        boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
 
         if (checkExtractAccessPermission == false) {
             //silently ignore the crazy configuration of 
checkExtractAccessPermission = false,
@@ -408,7 +444,23 @@ public class PDFParserConfig implements Serializable {
         isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
     }
 
-    private boolean getProp(String p, boolean defaultMissing) {
+    /**
+     * Which strategy to use for OCR
+     * @param ocrStrategy
+     */
+    public void setOCRStrategy(OCR_STRATEGY ocrStrategy) {
+        this.ocrStrategy = ocrStrategy;
+    }
+
+    /**
+     *
+     * @return strategy to use for OCR
+     */
+    public OCR_STRATEGY getOCRStrategy() {
+        return ocrStrategy;
+    }
+
+    private boolean getBooleanProp(String p, boolean defaultMissing) {
         if (p == null) {
             return defaultMissing;
         }
@@ -420,83 +472,143 @@ public class PDFParserConfig implements Serializable {
             return defaultMissing;
         }
     }
+    //throws NumberFormatException if there's a non-null unparseable
+    //string passed in
+    private int getIntProp(String p, int defaultMissing) {
+        if (p == null) {
+            return defaultMissing;
+        }
 
-    @Override
-    public int hashCode() {
-        final int prime = 31;
-        int result = 1;
-        result = prime
-                * result
-                + ((averageCharTolerance == null) ? 0 : averageCharTolerance
-                .hashCode());
-        result = prime * result + (enableAutoSpace ? 1231 : 1237);
-        result = prime * result + (extractAcroFormContent ? 1231 : 1237);
-        result = prime * result + (extractAnnotationText ? 1231 : 1237);
-        result = prime * result + (extractInlineImages ? 1231 : 1237);
-        result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 
1237);
-        result = prime * result + (sortByPosition ? 1231 : 1237);
-        result = prime
-                * result
-                + ((spacingTolerance == null) ? 0 : 
spacingTolerance.hashCode());
-        result = prime * result
-                + (suppressDuplicateOverlappingText ? 1231 : 1237);
-        result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237);
-        return result;
+        return Integer.parseInt(p);
     }
 
-    @Override
-    public boolean equals(Object obj) {
-        if (this == obj)
-            return true;
-        if (obj == null)
-            return false;
-        if (getClass() != obj.getClass())
-            return false;
-        PDFParserConfig other = (PDFParserConfig) obj;
-        if (averageCharTolerance == null) {
-            if (other.averageCharTolerance != null)
-                return false;
-        } else if (!averageCharTolerance.equals(other.averageCharTolerance))
-            return false;
-        if (enableAutoSpace != other.enableAutoSpace)
-            return false;
-        if (extractAcroFormContent != other.extractAcroFormContent)
-            return false;
-        if (extractAnnotationText != other.extractAnnotationText)
-            return false;
-        if (extractInlineImages != other.extractInlineImages)
-            return false;
-        if (extractUniqueInlineImagesOnly != 
other.extractUniqueInlineImagesOnly)
-            return false;
-        if (sortByPosition != other.sortByPosition)
-            return false;
-        if (spacingTolerance == null) {
-            if (other.spacingTolerance != null)
-                return false;
-        } else if (!spacingTolerance.equals(other.spacingTolerance))
-            return false;
-        if (suppressDuplicateOverlappingText != 
other.suppressDuplicateOverlappingText)
-            return false;
-        if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA)
-            return false;
+    /**
+     * String representation of the image format used to render
+     * the page image for OCR (examples: png, tiff, jpeg)
+     * @return
+     */
+    public String getOCRImageFormatName() {
+        return ocrImageFormatName;
+    }
 
-        return true;
+    /**
+     * @see #getOCRImageFormatName()
+     *
+     * @param ocrImageFormatName name of image format used to render
+     *                           page image
+     */
+    public void setOCRImageFormatName(String ocrImageFormatName) {
+        this.ocrImageFormatName = ocrImageFormatName;
+    }
+
+    /**
+     * Image type used to render the page image for OCR.
+     * @see #setOCRImageType(ImageType)
+     * @return image type
+     */
+    public ImageType getOCRImageType() {
+        return ocrImageType;
+    }
+
+    /**
+     * Image type used to render the page image for OCR.
+     * @param ocrImageType
+     */
+    public void setOCRImageType(ImageType ocrImageType) {
+        this.ocrImageType = ocrImageType;
+    }
+
+    /**
+     * Dots per inch used to render the page image for OCR
+     * @return dots per inch
+     */
+    public int getOCRDPI() {
+        return ocrDPI;
+    }
+
+    /**
+     * Dots per inch used to render the page image for OCR
+     * @param ocrDPI
+     */
+    public void setOCRDPI(int ocrDPI) {
+        this.ocrDPI = ocrDPI;
+    }
+
+    private ImageType parseImageType(String ocrImageType) {
+        for (ImageType t : ImageType.values()) {
+            if (ocrImageType.equalsIgnoreCase(t.toString())) {
+                return t;
+            }
+        }
+        return null;
     }
 
     @Override
-    public String toString() {
-        return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace
-                + ", suppressDuplicateOverlappingText="
-                + suppressDuplicateOverlappingText + ", extractAnnotationText="
-                + extractAnnotationText + ", sortByPosition=" + sortByPosition
-                + ", extractAcroFormContent=" + extractAcroFormContent
-                + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA
-                + ", extractInlineImages=" + extractInlineImages
-                + ", extractUniqueInlineImagesOnly="
-                + extractUniqueInlineImagesOnly + ", averageCharTolerance="
-                + averageCharTolerance + ", spacingTolerance="
-                + spacingTolerance + "]";
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof PDFParserConfig)) return false;
+
+        PDFParserConfig config = (PDFParserConfig) o;
+
+        if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false;
+        if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) return false;
+        if (getExtractAnnotationText() != config.getExtractAnnotationText()) return false;
+        if (getSortByPosition() != config.getSortByPosition()) return false;
+        if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) return false;
+        if (getExtractInlineImages() != config.getExtractInlineImages()) return false;
+        if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false;
+        if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false;
+        if (getOCRDPI() != config.getOCRDPI()) return false;
+        if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false;
+        if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
+        if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
+        if (!getOCRStrategy().equals(config.getOCRStrategy())) return false;
+        if (getOCRImageType() != config.getOCRImageType()) return false;
+        if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false;
+        return getAccessChecker().equals(config.getAccessChecker());
+
     }
 
+    @Override
+    public int hashCode() {
+        int result = (getEnableAutoSpace() ? 1 : 0);
+        result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
+        result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
+        result = 31 * result + (getSortByPosition() ? 1 : 0);
+        result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
+        result = 31 * result + (getExtractInlineImages() ? 1 : 0);
+        result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
+        result = 31 * result + getAverageCharTolerance().hashCode();
+        result = 31 * result + getSpacingTolerance().hashCode();
+        result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
+        result = 31 * result + ocrStrategy.hashCode();
+        result = 31 * result + getOCRDPI();
+        result = 31 * result + getOCRImageType().hashCode();
+        result = 31 * result + getOCRImageFormatName().hashCode();
+        result = 31 * result + getAccessChecker().hashCode();
+        result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+        return result;
+    }
 
+    @Override
+    public String toString() {
+        return "PDFParserConfig{" +
+                "enableAutoSpace=" + enableAutoSpace +
+                ", suppressDuplicateOverlappingText=" + 
suppressDuplicateOverlappingText +
+                ", extractAnnotationText=" + extractAnnotationText +
+                ", sortByPosition=" + sortByPosition +
+                ", extractAcroFormContent=" + extractAcroFormContent +
+                ", extractInlineImages=" + extractInlineImages +
+                ", extractUniqueInlineImagesOnly=" + 
extractUniqueInlineImagesOnly +
+                ", averageCharTolerance=" + averageCharTolerance +
+                ", spacingTolerance=" + spacingTolerance +
+                ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
+                ", ocrStrategy=" + ocrStrategy +
+                ", ocrDPI=" + ocrDPI +
+                ", ocrImageType=" + ocrImageType +
+                ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
+                ", accessChecker=" + accessChecker +
+                ", isCatchIntermediateIOExceptions=" + 
isCatchIntermediateIOExceptions +
+                '}';
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 
b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
index 9b404a3..319e693 100644
--- 
a/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++ 
b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -23,4 +23,12 @@ extractUniqueInlineImagesOnly true
 checkExtractAccessPermission false
 allowExtractionForAccessibility true
 ifXFAExtractOnlyXFA false
-catchIntermediateIOExceptions true
\ No newline at end of file
+catchIntermediateIOExceptions true
+#options: no_ocr, ocr_only, ocr_and_text_extraction
+ocrStrategy no_ocr
+#dots per inch for the ocr rendering of the page image
+ocrDPI 200
+#if you request tif, make sure you have imageio jars on your classpath!
+ocrImageFormatName png
+#options: argb, binary, gray, rgb
+ocrImageType gray

http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index bd2e5ad..b7582f2 100644
--- 
a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -50,6 +50,8 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
@@ -69,6 +71,15 @@ public class PDFParserTest extends TikaTest {
     public static final MediaType TYPE_DOCX = 
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
     public static final MediaType TYPE_DOC = MediaType.application("msword");
     public static Level PDFBOX_LOG_LEVEL = Level.INFO;
+    private static Boolean hasTesseract = null;
+
+    public static boolean canRunOCR() {
+        if (hasTesseract != null) {
+            return hasTesseract;
+        }
+        hasTesseract = new TesseractOCRParser().hasTesseract(new 
TesseractOCRConfig());
+        return hasTesseract;
+    }
 
     @BeforeClass
     public static void setup() {
@@ -1158,6 +1169,32 @@ public class PDFParserTest extends TikaTest {
         assertEquals(0, 
m.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
         assertNotContained("1309.61", content);
     }
+
+    @Test
+    public void testEmbeddedDocsWithOCR() throws Exception {
+        if (! canRunOCR()) { return; }
+
+        for (PDFParserConfig.OCR_STRATEGY strategy : 
PDFParserConfig.OCR_STRATEGY.values()) {
+            PDFParserConfig config = new PDFParserConfig();
+            config.setOCRStrategy(strategy);
+            ParseContext context = new ParseContext();
+            context.set(PDFParserConfig.class, config);
+            context.set(Parser.class, new AutoDetectParser());
+            //make sure everything works with regular xml _and_ with recursive
+            XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", 
context);
+            assertContains("pdf_haystack", xmlResult.xml);
+            assertContains("Haystack", xmlResult.xml);
+            assertContains("Needle", xmlResult.xml);
+            if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
+                assertContains("<div class=\"ocr\">pdf_haystack", 
xmlResult.xml);
+            } else {
+                assertNotContained("<div class=\"ocr\">pdf_haystack", 
xmlResult.xml);
+            }
+            assertEquals(4, 
getRecursiveJson("testPDFEmbeddingAndEmbedded.docx", context).size());
+        }
+
+    }
+
     private void assertException(String path, Parser parser, ParseContext 
context, Class expected) {
         boolean noEx = false;
         InputStream is = getResourceAsStream(path);

[1/2] tika git commit: TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs

Reply via email to