Repository: tika Updated Branches: refs/heads/TIKA-1508 1202f459e -> 18ab8f91f
TIKA-1994 -- integrate OCR with PDFParser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7aeb95d6 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7aeb95d6 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7aeb95d6 Branch: refs/heads/TIKA-1508 Commit: 7aeb95d6c7a6ac3611f2dd975baa73f566631061 Parents: a20c46c Author: tballison <[email protected]> Authored: Thu Jun 2 12:04:30 2016 -0400 Committer: tballison <[email protected]> Committed: Thu Jun 2 12:04:30 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/ocr/TesseractOCRParser.java | 87 ++- .../tika/parser/pdf/AbstractPDF2XHTML.java | 576 +++++++++++++++++++ .../org/apache/tika/parser/pdf/OCR2XHTML.java | 127 ++++ .../org/apache/tika/parser/pdf/PDF2XHTML.java | 492 +--------------- .../org/apache/tika/parser/pdf/PDFParser.java | 7 + .../apache/tika/parser/pdf/PDFParserConfig.java | 274 ++++++--- .../apache/tika/parser/pdf/PDFParser.properties | 10 +- .../apache/tika/parser/pdf/PDFParserTest.java | 38 ++ 8 files changed, 1029 insertions(+), 582 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 7db29c8..a238a7c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -56,6 +56,7 @@ import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageParser; import org.apache.tika.parser.image.TiffParser; import org.apache.tika.parser.jpeg.JpegParser; +import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -110,7 +111,7 @@ public class TesseractOCRParser extends AbstractParser { } } - private boolean hasTesseract(TesseractOCRConfig config) { + public boolean hasTesseract(TesseractOCRConfig config) { // Fetch where the config says to find Tesseract String tesseract = config.getTesseractPath() + getTesseractProg(); @@ -157,47 +158,90 @@ public class TesseractOCRParser extends AbstractParser { public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); - // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar if (! hasTesseract(config)) return; - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + TemporaryResources tmp = new TemporaryResources(); + try { + TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + File tmpImgFile = tmp.createTemporaryFile(); + parse(tikaStream, tmpImgFile, xhtml, config); + // Temporary workaround for TIKA-1445 - until we can specify + // composite parsers with strategies (eg Composite, Try In Turn), + // always send the image onwards to the regular parser to have + // the metadata for them extracted as well + _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context); + xhtml.endDocument(); + } finally { + tmp.dispose(); + } + } + + /** + * Use this to parse content without starting a new document. + * This appends SAX events to xhtml without re-adding the metadata, body start, etc. + * @param stream inputstream + * @param xhtml handler + * @param config TesseractOCRConfig to use for this parse + * @throws IOException + * @throws SAXException + * @throws TikaException + */ + public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config) + throws IOException, SAXException, TikaException { + // If Tesseract is not on the path with the current config, do not try to run OCR + // getSupportedTypes shouldn't have listed us as handling it, so this should only + // occur if someone directly calls this parser, not via DefaultParser or similar + if (! hasTesseract(config)) + return; TemporaryResources tmp = new TemporaryResources(); - File output = null; try { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp); - File input = tikaStream.getFile(); - long size = tikaStream.getLength(); + File tmpImgFile = tmp.createTemporaryFile(); + parse(tikaStream, tmpImgFile, xhtml, config); + } finally { + tmp.dispose(); + } + + } + + private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config) + throws IOException, SAXException, TikaException { + File tmpTxtOutput = null; + + try { + File input = tikaInputStream.getFile(); + long size = tikaInputStream.getLength(); if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { - output = tmp.createTemporaryFile(); - doOCR(input, output, config); + doOCR(input, tmpImgFile, config); // Tesseract appends .txt to output file name - output = new File(output.getAbsolutePath() + ".txt"); + tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt"); - if (output.exists()) - extractOutput(new FileInputStream(output), xhtml); + if (tmpTxtOutput.exists()) { + try (InputStream is = new FileInputStream(tmpTxtOutput)) { + extractOutput(is, xhtml); + } + } } - // Temporary workaround for TIKA-1445 - until we can specify - // composite parsers with strategies (eg Composite, Try In Turn), - // always send the image onwards to the regular parser to have - // the metadata for them extracted as well - _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context); } finally { - tmp.dispose(); - if (output != null) { - output.delete(); + if (tmpTxtOutput != null) { + tmpTxtOutput.delete(); } } } + // TIKA-1445 workaround parser private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser(); private static class CompositeImageParser extends CompositeParser { @@ -283,8 +327,7 @@ public class TesseractOCRParser extends AbstractParser { */ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException { - xhtml.startDocument(); - xhtml.startElement("div"); + xhtml.startElement("div", "class", "ocr"); try (Reader reader = new InputStreamReader(stream, UTF_8)) { char[] buffer = new char[1024]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { @@ -293,7 +336,7 @@ public class TesseractOCRParser extends AbstractParser { } } xhtml.endElement("div"); - xhtml.endDocument(); + } /** http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java new file mode 100644 index 0000000..d8a46a2 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -0,0 +1,576 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import javax.xml.stream.XMLStreamException; +import java.awt.image.BufferedImage; +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.List; +import java.util.ListIterator; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.commons.io.IOUtils; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; +import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; +import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; +import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; +import org.apache.pdfbox.pdmodel.interactive.action.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; +import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; +import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDField; +import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; +import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; +import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.tools.imageio.ImageIOUtil; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.ocr.TesseractOCRConfig; +import org.apache.tika.parser.ocr.TesseractOCRParser; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; + +class AbstractPDF2XHTML extends PDFTextStripper { + + /** + * Maximum recursive depth during AcroForm processing. + * Prevents theoretical AcroForm recursion bomb. + */ + private final static int MAX_ACROFORM_RECURSIONS = 10; + + private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig(); + + /** + * Format used for signature dates + * TODO Make this thread-safe + */ + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); + + + final List<IOException> exceptions = new ArrayList<>(); + final PDDocument pdDocument; + final XHTMLContentHandler xhtml; + private final ParseContext context; + private final Metadata metadata; + final PDFParserConfig config; + + private int pageIndex = 0; + + AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) throws IOException { + this.pdDocument = pdDocument; + this.xhtml = new XHTMLContentHandler(handler, metadata); + this.context = context; + this.metadata = metadata; + this.config = config; + } + + @Override + protected void startPage(PDPage page) throws IOException { + try { + xhtml.startElement("div", "class", "page"); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to start a page", e); + } + writeParagraphStart(); + } + + EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { + EmbeddedDocumentExtractor extractor = + context.get(EmbeddedDocumentExtractor.class); + if (extractor == null) { + extractor = new ParsingEmbeddedDocumentExtractor(context); + } + return extractor; + } + + private void extractEmbeddedDocuments(PDDocument document) + throws IOException, SAXException, TikaException { + PDDocumentNameDictionary namesDictionary = + new PDDocumentNameDictionary(document.getDocumentCatalog()); + PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); + if (efTree == null) { + return; + } + + Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); + //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. + //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java + //If there is a need we could add a fully recursive search to find a non-null + //Map<String, COSObjectable> that contains the doc info. + if (embeddedFileNames != null) { + processEmbeddedDocNames(embeddedFileNames); + } else { + List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); + if (kids == null) { + return; + } + for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { + embeddedFileNames = node.getNames(); + if (embeddedFileNames != null) { + processEmbeddedDocNames(embeddedFileNames); + } + } + } + } + + private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) + throws IOException, SAXException, TikaException { + if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { + return; + } + + EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); + for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { + PDComplexFileSpecification spec = ent.getValue(); + extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); + } + } + + private void extractMultiOSPDEmbeddedFiles(String defaultName, + PDComplexFileSpecification spec, + EmbeddedDocumentExtractor extractor) throws IOException, + SAXException, TikaException { + + if (spec == null) { + return; + } + //current strategy is to pull all, not just first non-null + extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor); + extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor); + extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor); + extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor); + } + + private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, + EmbeddedDocumentExtractor extractor) + throws SAXException, IOException, TikaException { + + if (file == null) { + //skip silently + return; + } + + fileName = (fileName == null) ? defaultName : fileName; + + // TODO: other metadata? + Metadata metadata = new Metadata(); + metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); + metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); + metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); + + if (extractor.shouldParseEmbedded(metadata)) { + TikaInputStream stream = null; + try { + stream = TikaInputStream.get(file.createInputStream()); + extractor.parseEmbedded( + stream, + new EmbeddedContentHandler(xhtml), + metadata, false); + + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", fileName); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } finally { + IOUtils.closeQuietly(stream); + } + } + } + + void handleCatchableIOE(IOException e) throws IOException { + if (config.isCatchIntermediateIOExceptions()) { + String msg = e.getMessage(); + if (msg == null) { + msg = "IOException, no message"; + } + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg); + exceptions.add(e); + } else { + throw e; + } + } + + void doOCROnCurrentPage() throws IOException, TikaException, SAXException { + if (config.getOCRStrategy().equals(NO_OCR)) { + return; + } + TesseractOCRConfig tesseractConfig = + context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); + + TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); + if (! tesseractOCRParser.hasTesseract(tesseractConfig)) { + throw new TikaException("Tesseract is not available. "+ + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); + } + + PDFRenderer renderer = new PDFRenderer(pdDocument); + TemporaryResources tmp = new TemporaryResources(); + try { + BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType()); + Path tmpFile = tmp.createTempFile(); + try (OutputStream os = Files.newOutputStream(tmpFile)) { + //TODO: get output format from TesseractConfig + ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), + os, config.getOCRDPI()); + } + try (InputStream is = TikaInputStream.get(tmpFile)) { + tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); + } + } catch (IOException e) { + handleCatchableIOE(e); + } catch (SAXException e) { + throw new IOExceptionWithCause("error writing OCR content from PDF", e); + } finally { + tmp.dispose(); + } + } + + @Override + protected void endPage(PDPage page) throws IOException { + + try { + EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); + for (PDAnnotation annotation : page.getAnnotations()) { + + if (annotation instanceof PDAnnotationFileAttachment) { + PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; + PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); + try { + extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); + } catch (SAXException e) { + throw new IOExceptionWithCause("file embedded in annotation sax exception", e); + } catch (TikaException e) { + throw new IOExceptionWithCause("file embedded in annotation tika exception", e); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + // TODO: remove once PDFBOX-1143 is fixed: + if (config.getExtractAnnotationText()) { + if (annotation instanceof PDAnnotationLink) { + PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; + if (annotationlink.getAction() != null) { + PDAction action = annotationlink.getAction(); + if (action instanceof PDActionURI) { + PDActionURI uri = (PDActionURI) action; + String link = uri.getURI(); + if (link != null) { + xhtml.startElement("div", "class", "annotation"); + xhtml.startElement("a", "href", link); + xhtml.endElement("a"); + xhtml.endElement("div"); + } + } + } + } + + if (annotation instanceof PDAnnotationMarkup) { + PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; + String title = annotationMarkup.getTitlePopup(); + String subject = annotationMarkup.getSubject(); + String contents = annotationMarkup.getContents(); + // TODO: maybe also annotationMarkup.getRichContents()? + if (title != null || subject != null || contents != null) { + xhtml.startElement("div", "class", "annotation"); + + if (title != null) { + xhtml.startElement("div", "class", "annotationTitle"); + xhtml.characters(title); + xhtml.endElement("div"); + } + + if (subject != null) { + xhtml.startElement("div", "class", "annotationSubject"); + xhtml.characters(subject); + xhtml.endElement("div"); + } + + if (contents != null) { + xhtml.startElement("div", "class", "annotationContents"); + xhtml.characters(contents); + xhtml.endElement("div"); + } + + xhtml.endElement("div"); + } + } + } + } + if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + doOCROnCurrentPage(); + } + xhtml.endElement("div"); + } catch (SAXException|TikaException e) { + throw new IOExceptionWithCause("Unable to end a page", e); + } catch (IOException e) { + exceptions.add(e); + } finally { + pageIndex++; + } + } + + @Override + protected void startDocument(PDDocument pdf) throws IOException { + try { + xhtml.startDocument(); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to start a document", e); + } + } + + @Override + protected void endDocument(PDDocument pdf) throws IOException { + try { + // Extract text for any bookmarks: + extractBookmarkText(); + try { + extractEmbeddedDocuments(pdf); + } catch (IOException e) { + handleCatchableIOE(e); + } + + //extract acroform data at end of doc + if (config.getExtractAcroFormContent() == true) { + try { + extractAcroForm(pdf); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + xhtml.endDocument(); + } catch (TikaException e) { + throw new IOExceptionWithCause("Unable to end a document", e); + } catch (SAXException e) { + throw new IOExceptionWithCause("Unable to end a document", e); + } + } + + void extractBookmarkText() throws SAXException { + PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); + if (outline != null) { + extractBookmarkText(outline); + } + } + + void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { + PDOutlineItem current = bookmark.getFirstChild(); + if (current != null) { + xhtml.startElement("ul"); + while (current != null) { + xhtml.startElement("li"); + xhtml.characters(current.getTitle()); + xhtml.endElement("li"); + // Recurse: + extractBookmarkText(current); + current = current.getNextSibling(); + } + xhtml.endElement("ul"); + } + } + + void extractAcroForm(PDDocument pdf) throws IOException, + SAXException { + //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields + //this code derives from Ben's code + PDDocumentCatalog catalog = pdf.getDocumentCatalog(); + + if (catalog == null) + return; + + PDAcroForm form = catalog.getAcroForm(); + if (form == null) + return; + + //if it has xfa, try that. + //if it doesn't exist or there's an exception, + //go with traditional AcroForm + PDXFAResource pdxfa = form.getXFA(); + + if (pdxfa != null) { + //if successful, return + XFAExtractor xfaExtractor = new XFAExtractor(); + try (InputStream is = new BufferedInputStream( + new ByteArrayInputStream(pdxfa.getBytes()))) { + xfaExtractor.extract(is, xhtml, metadata, context); + return; + } catch (XMLStreamException |IOException e) { + //if there was an xml parse exception in xfa, try the AcroForm + } + } + + @SuppressWarnings("rawtypes") + List fields = form.getFields(); + + if (fields == null) + return; + + @SuppressWarnings("rawtypes") + ListIterator itr = fields.listIterator(); + + if (itr == null) + return; + + xhtml.startElement("div", "class", "acroform"); + xhtml.startElement("ol"); + + while (itr.hasNext()) { + Object obj = itr.next(); + if (obj != null && obj instanceof PDField) { + processAcroField((PDField) obj, 0); + } + } + xhtml.endElement("ol"); + xhtml.endElement("div"); + } + + private void processAcroField(PDField field, final int currentRecursiveDepth) + throws SAXException, IOException { + + if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { + return; + } + addFieldString(field); + if (field instanceof PDNonTerminalField) { + int r = currentRecursiveDepth + 1; + xhtml.startElement("ol"); + for (PDField child : ((PDNonTerminalField)field).getChildren()) { + processAcroField(child, r); + } + xhtml.endElement("ol"); + } + } + + private void addFieldString(PDField field) throws SAXException { + //Pick partial name to present in content and altName for attribute + //Ignoring FullyQualifiedName for now + String partName = field.getPartialName(); + String altName = field.getAlternateFieldName(); + + StringBuilder sb = new StringBuilder(); + AttributesImpl attrs = new AttributesImpl(); + + if (partName != null) { + sb.append(partName).append(": "); + } + if (altName != null) { + attrs.addAttribute("", "altName", "altName", "CDATA", altName); + } + //return early if PDSignature field + if (field instanceof PDSignatureField) { + handleSignature(attrs, (PDSignatureField) field); + return; + } + String value = field.getValueAsString(); + if (value != null && !value.equals("null")) { + sb.append(value); + } + + if (attrs.getLength() > 0 || sb.length() > 0) { + xhtml.startElement("li", attrs); + xhtml.characters(sb.toString()); + xhtml.endElement("li"); + } + } + + private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField) + throws SAXException { + + PDSignature sig = sigField.getSignature(); + if (sig == null) { + return; + } + Map<String, String> vals = new TreeMap<>(); + vals.put("name", sig.getName()); + vals.put("contactInfo", sig.getContactInfo()); + vals.put("location", sig.getLocation()); + vals.put("reason", sig.getReason()); + + Calendar cal = sig.getSignDate(); + if (cal != null) { + dateFormat.setTimeZone(cal.getTimeZone()); + vals.put("date", dateFormat.format(cal.getTime())); + } + //see if there is any data + int nonNull = 0; + for (String val : vals.keySet()) { + if (val != null && !val.equals("")) { + nonNull++; + } + } + //if there is, process it + if (nonNull > 0) { + xhtml.startElement("li", parentAttributes); + + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); + + xhtml.startElement("ol", attrs); + for (Map.Entry<String, String> e : vals.entrySet()) { + if (e.getValue() == null || e.getValue().equals("")) { + continue; + } + attrs = new AttributesImpl(); + attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); + xhtml.startElement("li", attrs); + xhtml.characters(e.getValue()); + xhtml.endElement("li"); + } + xhtml.endElement("ol"); + xhtml.endElement("li"); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java new file mode 100644 index 0000000..539cd50 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pdf; + +import java.io.IOException; +import java.io.Writer; + +import org.apache.commons.io.IOExceptionWithCause; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; + + +/** + * Utility class that overrides the {@link PDFTextStripper} functionality + * to integrate text extraction via OCR only. + * + */ +class OCR2XHTML extends AbstractPDF2XHTML { + + private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws IOException { + super(document, handler, context, metadata, config); + } + + /** + * Converts the given PDF document (and related metadata) to a stream + * of XHTML SAX events sent to the given content handler. + * + * @param document PDF document + * @param handler SAX content handler + * @param metadata PDF metadata + * @throws SAXException if the content handler fails to process SAX events + * @throws TikaException if there was an exception outside of per page processing + */ + public static void process( + PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws SAXException, TikaException { + OCR2XHTML ocr2XHTML = null; + try { + ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config); + ocr2XHTML.writeText(document, new Writer() { + @Override + public void write(char[] cbuf, int off, int len) { + } + + @Override + public void flush() { + } + + @Override + public void close() { + } + }); + } catch (IOException e) { + if (e.getCause() instanceof SAXException) { + throw (SAXException) e.getCause(); + } else { + throw new TikaException("Unable to extract PDF content", e); + } + } + if (ocr2XHTML.exceptions.size() > 0) { + //throw the first + throw new TikaException("Unable to extract all PDF content", + ocr2XHTML.exceptions.get(0)); + } + } + + @Override + public void processPage(PDPage pdPage) throws IOException { + try { + startPage(pdPage); + doOCROnCurrentPage(); + endPage(pdPage); + } catch (TikaException|SAXException e) { + throw new IOExceptionWithCause(e); + } catch (IOException e) { + handleCatchableIOE(e); + } + } + + @Override + protected void writeString(String text) throws IOException { + //no-op + } + + @Override + protected void writeCharacters(TextPosition text) throws IOException { + //no-op + } + + @Override + protected void writeWordSeparator() throws IOException { + //no-op + } + + @Override + protected void writeLineSeparator() throws IOException { + //no-op + } + +} + http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 1a8bfb4..ac9823e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -16,74 +16,41 @@ */ package org.apache.tika.parser.pdf; -import javax.xml.stream.XMLStreamException; import java.awt.image.BufferedImage; -import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Writer; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Calendar; import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.ListIterator; -import java.util.Locale; import java.util.Map; import java.util.Set; -import java.util.TreeMap; import org.apache.commons.io.IOExceptionWithCause; -import org.apache.commons.io.IOUtils; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; -import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; -import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; -import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.pdmodel.interactive.action.PDAction; -import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; -import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; -import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; -import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; -import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; -import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; -import org.apache.pdfbox.pdmodel.interactive.form.PDField; -import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; -import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; -import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.EmbeddedContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -93,28 +60,12 @@ import org.xml.sax.helpers.AttributesImpl; * to produce a semi-structured XHTML SAX events instead of a plain text * stream. */ -class PDF2XHTML extends PDFTextStripper { +class PDF2XHTML extends AbstractPDF2XHTML { - /** - * Maximum recursive depth during AcroForm processing. - * Prevents theoretical AcroForm recursion bomb. - */ - private final static int MAX_ACROFORM_RECURSIONS = 10; private static final List<String> JPEG = Arrays.asList( COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); - /** - * Format used for signature dates - * TODO Make this thread-safe - */ - private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); - private final ContentHandler originalHandler; - private final ParseContext context; - private final XHTMLContentHandler handler; - private final PDFParserConfig config; - private final Metadata metadata; - private final List<IOException> exceptions = new ArrayList<>(); /** * This keeps track of the pdf object ids for inline @@ -129,16 +80,10 @@ class PDF2XHTML extends PDFTextStripper { */ private Map<COSStream, Integer> processedInlineImages = new HashMap<>(); private int inlineImageCounter = 0; - private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata, + private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws IOException { - //source of config (derives from context or PDFParser?) is - //already determined in PDFParser. No need to check context here. - this.config = config; - this.originalHandler = handler; - this.context = context; - this.handler = new XHTMLContentHandler(handler, metadata); - this.metadata = metadata; + super(document, handler, context, metadata, config); } /** @@ -160,7 +105,7 @@ class PDF2XHTML extends PDFTextStripper { // Extract text using a dummy Writer as we override the // key methods to output to the given content // handler. - pdf2XHTML = new PDF2XHTML(handler, context, metadata, config); + pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); config.configure(pdf2XHTML); @@ -192,28 +137,6 @@ class PDF2XHTML extends PDFTextStripper { } } - void extractBookmarkText() throws SAXException { - PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); - if (outline != null) { - extractBookmarkText(outline); - } - } - - void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { - PDOutlineItem current = bookmark.getFirstChild(); - if (current != null) { - handler.startElement("ul"); - while (current != null) { - handler.startElement("li"); - handler.characters(current.getTitle()); - handler.endElement("li"); - // Recurse: - extractBookmarkText(current); - current = current.getNextSibling(); - } - handler.endElement("ul"); - } - } @Override public void processPage(PDPage page) throws IOException { @@ -225,52 +148,6 @@ class PDF2XHTML extends PDFTextStripper { } @Override - protected void startDocument(PDDocument pdf) throws IOException { - try { - handler.startDocument(); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to start a document", e); - } - } - - @Override - protected void endDocument(PDDocument pdf) throws IOException { - try { - // Extract text for any bookmarks: - extractBookmarkText(); - try { - extractEmbeddedDocuments(pdf, originalHandler); - } catch (IOException e) { - handleCatchableIOE(e); - } - - //extract acroform data at end of doc - if (config.getExtractAcroFormContent() == true) { - try { - extractAcroForm(pdf, handler); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - handler.endDocument(); - } catch (TikaException e) { - throw new IOExceptionWithCause("Unable to end a document", e); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to end a document", e); - } - } - - @Override - protected void startPage(PDPage page) throws IOException { - try { - handler.startElement("div", "class", "page"); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to start a page", e); - } - writeParagraphStart(); - } - - @Override protected void endPage(PDPage page) throws IOException { try { writeParagraphEnd(); @@ -279,76 +156,7 @@ class PDF2XHTML extends PDFTextStripper { } catch (IOException e) { handleCatchableIOE(e); } - - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); - for (PDAnnotation annotation : page.getAnnotations()) { - - if (annotation instanceof PDAnnotationFileAttachment) { - PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; - PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); - try { - extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); - } catch (SAXException e) { - throw new IOExceptionWithCause("file embedded in annotation sax exception", e); - } catch (TikaException e) { - throw new IOExceptionWithCause("file embedded in annotation tika exception", e); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - // TODO: remove once PDFBOX-1143 is fixed: - if (config.getExtractAnnotationText()) { - if (annotation instanceof PDAnnotationLink) { - PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; - if (annotationlink.getAction() != null) { - PDAction action = annotationlink.getAction(); - if (action instanceof PDActionURI) { - PDActionURI uri = (PDActionURI) action; - String link = uri.getURI(); - if (link != null) { - handler.startElement("div", "class", "annotation"); - handler.startElement("a", "href", link); - handler.endElement("a"); - handler.endElement("div"); - } - } - } - } - - if (annotation instanceof PDAnnotationMarkup) { - PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; - String title = annotationMarkup.getTitlePopup(); - String subject = annotationMarkup.getSubject(); - String contents = annotationMarkup.getContents(); - // TODO: maybe also annotationMarkup.getRichContents()? - if (title != null || subject != null || contents != null) { - handler.startElement("div", "class", "annotation"); - - if (title != null) { - handler.startElement("div", "class", "annotationTitle"); - handler.characters(title); - handler.endElement("div"); - } - - if (subject != null) { - handler.startElement("div", "class", "annotationSubject"); - handler.characters(subject); - handler.endElement("div"); - } - - if (contents != null) { - handler.startElement("div", "class", "annotationContents"); - handler.characters(contents); - handler.endElement("div"); - } - - handler.endElement("div"); - } - } - } - } - - handler.endElement("div"); + super.endPage(page); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { @@ -406,8 +214,8 @@ class PDF2XHTML extends PDFTextStripper { AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); - handler.startElement("img", attr); - handler.endElement("img"); + xhtml.startElement("img", attr); + xhtml.endElement("img"); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? @@ -430,7 +238,7 @@ class PDF2XHTML extends PDFTextStripper { writeToBuffer(image, extension, buffer); extractor.parseEmbedded( new ByteArrayInputStream(buffer.toByteArray()), - new EmbeddedContentHandler(handler), + new EmbeddedContentHandler(xhtml), metadata, false); } catch (IOException e) { handleCatchableIOE(e); @@ -467,20 +275,11 @@ class PDF2XHTML extends PDFTextStripper { out.flush(); } - protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { - EmbeddedDocumentExtractor extractor = - context.get(EmbeddedDocumentExtractor.class); - if (extractor == null) { - extractor = new ParsingEmbeddedDocumentExtractor(context); - } - return extractor; - } - @Override protected void writeParagraphStart() throws IOException { super.writeParagraphStart(); try { - handler.startElement("p"); + xhtml.startElement("p"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to start a paragraph", e); } @@ -490,7 +289,7 @@ class PDF2XHTML extends PDFTextStripper { protected void writeParagraphEnd() throws IOException { super.writeParagraphEnd(); try { - handler.endElement("p"); + xhtml.endElement("p"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a paragraph", e); } @@ -499,7 +298,7 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeString(String text) throws IOException { try { - handler.characters(text); + xhtml.characters(text); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a string: " + text, e); @@ -509,7 +308,7 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeCharacters(TextPosition text) throws IOException { try { - handler.characters(text.getUnicode()); + xhtml.characters(text.getUnicode()); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a character: " + text.getUnicode(), e); @@ -519,7 +318,7 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeWordSeparator() throws IOException { try { - handler.characters(getWordSeparator()); + xhtml.characters(getWordSeparator()); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a space character", e); @@ -529,275 +328,12 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeLineSeparator() throws IOException { try { - handler.newline(); + xhtml.newline(); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a newline character", e); } } - private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) - throws IOException, SAXException, TikaException { - PDDocumentNameDictionary namesDictionary = - new PDDocumentNameDictionary( document.getDocumentCatalog() ); - PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); - if (efTree == null) { - return; - } - - Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); - //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. - //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java - //If there is a need we could add a fully recursive search to find a non-null - //Map<String, COSObjectable> that contains the doc info. - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } else { - List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); - if (kids == null) { - return; - } - for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { - embeddedFileNames = node.getNames(); - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } - } - } - } - - private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) - throws IOException, SAXException, TikaException { - if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { - return; - } - - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); - for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { - PDComplexFileSpecification spec = ent.getValue(); - extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); - } - } - - private void extractMultiOSPDEmbeddedFiles(String defaultName, - PDComplexFileSpecification spec, - EmbeddedDocumentExtractor extractor) throws IOException, - SAXException, TikaException { - - if (spec == null) { - return; - } - //current strategy is to pull all, not just first non-null - extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor); - extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor); - extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor); - extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor); - } - - private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, - EmbeddedDocumentExtractor extractor) - throws SAXException, IOException, TikaException { - - if (file == null) { - //skip silently - return; - } - - fileName = (fileName == null) ? defaultName : fileName; - - // TODO: other metadata? - Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); - metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); - metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); - - if (extractor.shouldParseEmbedded(metadata)) { - TikaInputStream stream = null; - try { - stream = TikaInputStream.get(file.createInputStream()); - extractor.parseEmbedded( - stream, - new EmbeddedContentHandler(handler), - metadata, false); - - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", fileName); - handler.startElement("div", attributes); - handler.endElement("div"); - } finally { - IOUtils.closeQuietly(stream); - } - } - } - - private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, - SAXException { - //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields - //this code derives from Ben's code - PDDocumentCatalog catalog = pdf.getDocumentCatalog(); - - if (catalog == null) - return; - - PDAcroForm form = catalog.getAcroForm(); - if (form == null) - return; - - //if it has xfa, try that. - //if it doesn't exist or there's an exception, - //go with traditional AcroForm - PDXFAResource pdxfa = form.getXFA(); - - if (pdxfa != null) { - //if successful, return - XFAExtractor xfaExtractor = new XFAExtractor(); - try (InputStream is = new BufferedInputStream( - new ByteArrayInputStream(pdxfa.getBytes()))) { - xfaExtractor.extract(is, handler, metadata, context); - return; - } catch (XMLStreamException |IOException e) { - //if there was an xml parse exception in xfa, try the AcroForm - } - } - - @SuppressWarnings("rawtypes") - List fields = form.getFields(); - - if (fields == null) - return; - - @SuppressWarnings("rawtypes") - ListIterator itr = fields.listIterator(); - - if (itr == null) - return; - - handler.startElement("div", "class", "acroform"); - handler.startElement("ol"); - - while (itr.hasNext()) { - Object obj = itr.next(); - if (obj != null && obj instanceof PDField) { - processAcroField((PDField) obj, handler, 0); - } - } - handler.endElement("ol"); - handler.endElement("div"); - } - - private void processAcroField(PDField field, - XHTMLContentHandler handler, final int currentRecursiveDepth) - throws SAXException, IOException { - - if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { - return; - } - addFieldString(field, handler); - if (field instanceof PDNonTerminalField) { - int r = currentRecursiveDepth + 1; - handler.startElement("ol"); - for (PDField child : ((PDNonTerminalField)field).getChildren()) { - processAcroField(child, handler, r); - } - handler.endElement("ol"); - } - } - - private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException { - //Pick partial name to present in content and altName for attribute - //Ignoring FullyQualifiedName for now - String partName = field.getPartialName(); - String altName = field.getAlternateFieldName(); - - StringBuilder sb = new StringBuilder(); - AttributesImpl attrs = new AttributesImpl(); - - if (partName != null) { - sb.append(partName).append(": "); - } - if (altName != null) { - attrs.addAttribute("", "altName", "altName", "CDATA", altName); - } - //return early if PDSignature field - if (field instanceof PDSignatureField) { - handleSignature(attrs, (PDSignatureField) field, handler); - return; - } - String value = field.getValueAsString(); - if (value != null && !value.equals("null")) { - sb.append(value); - } - - if (attrs.getLength() > 0 || sb.length() > 0) { - handler.startElement("li", attrs); - handler.characters(sb.toString()); - handler.endElement("li"); - } - } - - private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, - XHTMLContentHandler handler) throws SAXException { - - - PDSignature sig = sigField.getSignature(); - if (sig == null) { - return; - } - Map<String, String> vals = new TreeMap<>(); - vals.put("name", sig.getName()); - vals.put("contactInfo", sig.getContactInfo()); - vals.put("location", sig.getLocation()); - vals.put("reason", sig.getReason()); - - Calendar cal = sig.getSignDate(); - if (cal != null) { - dateFormat.setTimeZone(cal.getTimeZone()); - vals.put("date", dateFormat.format(cal.getTime())); - } - //see if there is any data - int nonNull = 0; - for (String val : vals.keySet()) { - if (val != null && !val.equals("")) { - nonNull++; - } - } - //if there is, process it - if (nonNull > 0) { - handler.startElement("li", parentAttributes); - - AttributesImpl attrs = new AttributesImpl(); - attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); - - handler.startElement("ol", attrs); - for (Map.Entry<String, String> e : vals.entrySet()) { - if (e.getValue() == null || e.getValue().equals("")) { - continue; - } - attrs = new AttributesImpl(); - attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); - handler.startElement("li", attrs); - handler.characters(e.getValue()); - handler.endElement("li"); - } - handler.endElement("ol"); - handler.endElement("li"); - } - } - - private void handleCatchableIOE(IOException e) throws IOException { - if (config.isCatchIntermediateIOExceptions()) { - String msg = e.getMessage(); - if (msg == null) { - msg = "IOException, no message"; - } - metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg); - exceptions.add(e); - } else { - throw e; - } - } } http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index b677d84..3e33962 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -58,6 +58,7 @@ import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.image.xmp.JempboxExtractor; +import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.sax.XHTMLContentHandler; import org.w3c.dom.Document; import org.xml.sax.ContentHandler; @@ -140,7 +141,13 @@ public class PDFParser extends AbstractParser { if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); + } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { + metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); + OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } else { + if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); + } PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 9baeb37..296b191 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -23,6 +23,7 @@ import java.io.Serializable; import java.util.Locale; import java.util.Properties; +import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.text.PDFTextStripper; /** @@ -44,6 +45,26 @@ import org.apache.pdfbox.text.PDFTextStripper; */ public class PDFParserConfig implements Serializable { + public enum OCR_STRATEGY { + NO_OCR, + OCR_ONLY, + OCR_AND_TEXT_EXTRACTION; + + private static OCR_STRATEGY parse(String s) { + if (s == null) { + return NO_OCR; + } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) { + return NO_OCR; + } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) { + return OCR_ONLY; + } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) { + return OCR_AND_TEXT_EXTRACTION; + } + //default -- no ocr + return NO_OCR; + } + } + private static final long serialVersionUID = 6492570218190936986L; // True if we let PDFBox "guess" where spaces should go: @@ -80,6 +101,12 @@ public class PDFParserConfig implements Serializable { //content from elsewhere in the document. private boolean ifXFAExtractOnlyXFA = false; + private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR; + + private int ocrDPI = 200; + private ImageType ocrImageType = ImageType.GRAY; + private String ocrImageFormatName = "png"; + private AccessChecker accessChecker; //The PDFParser can throw IOExceptions if there is a problem @@ -123,36 +150,45 @@ public class PDFParserConfig implements Serializable { } } setEnableAutoSpace( - getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); + getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); setSuppressDuplicateOverlappingText( - getProp(props.getProperty("suppressDuplicateOverlappingText"), + getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"), getSuppressDuplicateOverlappingText())); setExtractAnnotationText( - getProp(props.getProperty("extractAnnotationText"), + getBooleanProp(props.getProperty("extractAnnotationText"), getExtractAnnotationText())); setSortByPosition( - getProp(props.getProperty("sortByPosition"), + getBooleanProp(props.getProperty("sortByPosition"), getSortByPosition())); setExtractAcroFormContent( - getProp(props.getProperty("extractAcroFormContent"), + getBooleanProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); setExtractInlineImages( - getProp(props.getProperty("extractInlineImages"), + getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages())); setExtractUniqueInlineImagesOnly( - getProp(props.getProperty("extractUniqueInlineImagesOnly"), + getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly())); setIfXFAExtractOnlyXFA( - getProp(props.getProperty("ifXFAExtractOnlyXFA"), + getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"), getIfXFAExtractOnlyXFA())); setCatchIntermediateIOExceptions( - getProp(props.getProperty("catchIntermediateIOExceptions"), + getBooleanProp(props.getProperty("catchIntermediateIOExceptions"), isCatchIntermediateIOExceptions())); - boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); - boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); + setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy"))); + + setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI())); + + setOCRImageFormatName(props.getProperty("ocrImageFormatName")); + + setOCRImageType(parseImageType(props.getProperty("ocrImageType"))); + + + boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false); + boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true); if (checkExtractAccessPermission == false) { //silently ignore the crazy configuration of checkExtractAccessPermission = false, @@ -408,7 +444,23 @@ public class PDFParserConfig implements Serializable { isCatchIntermediateIOExceptions = catchIntermediateIOExceptions; } - private boolean getProp(String p, boolean defaultMissing) { + /** + * Which strategy to use for OCR + * @param ocrStrategy + */ + public void setOCRStrategy(OCR_STRATEGY ocrStrategy) { + this.ocrStrategy = ocrStrategy; + } + + /** + * + * @return strategy to use for OCR + */ + public OCR_STRATEGY getOCRStrategy() { + return ocrStrategy; + } + + private boolean getBooleanProp(String p, boolean defaultMissing) { if (p == null) { return defaultMissing; } @@ -420,83 +472,143 @@ public class PDFParserConfig implements Serializable { return defaultMissing; } } + //throws NumberFormatException if there's a non-null unparseable + //string passed in + private int getIntProp(String p, int defaultMissing) { + if (p == null) { + return defaultMissing; + } - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime - * result - + ((averageCharTolerance == null) ? 0 : averageCharTolerance - .hashCode()); - result = prime * result + (enableAutoSpace ? 1231 : 1237); - result = prime * result + (extractAcroFormContent ? 1231 : 1237); - result = prime * result + (extractAnnotationText ? 1231 : 1237); - result = prime * result + (extractInlineImages ? 1231 : 1237); - result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 1237); - result = prime * result + (sortByPosition ? 1231 : 1237); - result = prime - * result - + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode()); - result = prime * result - + (suppressDuplicateOverlappingText ? 1231 : 1237); - result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237); - return result; + return Integer.parseInt(p); } - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - PDFParserConfig other = (PDFParserConfig) obj; - if (averageCharTolerance == null) { - if (other.averageCharTolerance != null) - return false; - } else if (!averageCharTolerance.equals(other.averageCharTolerance)) - return false; - if (enableAutoSpace != other.enableAutoSpace) - return false; - if (extractAcroFormContent != other.extractAcroFormContent) - return false; - if (extractAnnotationText != other.extractAnnotationText) - return false; - if (extractInlineImages != other.extractInlineImages) - return false; - if (extractUniqueInlineImagesOnly != other.extractUniqueInlineImagesOnly) - return false; - if (sortByPosition != other.sortByPosition) - return false; - if (spacingTolerance == null) { - if (other.spacingTolerance != null) - return false; - } else if (!spacingTolerance.equals(other.spacingTolerance)) - return false; - if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText) - return false; - if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA) - return false; + /** + * String representation of the image format used to render + * the page image for OCR (examples: png, tiff, jpeg) + * @return + */ + public String getOCRImageFormatName() { + return ocrImageFormatName; + } - return true; + /** + * @see #getOCRImageFormatName() + * + * @param ocrImageFormatName name of image format used to render + * page image + */ + public void setOCRImageFormatName(String ocrImageFormatName) { + this.ocrImageFormatName = ocrImageFormatName; + } + + /** + * Image type used to render the page image for OCR. + * @see #setOCRImageType(ImageType) + * @return image type + */ + public ImageType getOCRImageType() { + return ocrImageType; + } + + /** + * Image type used to render the page image for OCR. + * @param ocrImageType + */ + public void setOCRImageType(ImageType ocrImageType) { + this.ocrImageType = ocrImageType; + } + + /** + * Dots per inch used to render the page image for OCR + * @return dots per inch + */ + public int getOCRDPI() { + return ocrDPI; + } + + /** + * Dots per inche used to render the page image for OCR + * @param ocrDPI + */ + public void setOCRDPI(int ocrDPI) { + this.ocrDPI = ocrDPI; + } + + private ImageType parseImageType(String ocrImageType) { + for (ImageType t : ImageType.values()) { + if (ocrImageType.equalsIgnoreCase(t.toString())) { + return t; + } + } + return null; } @Override - public String toString() { - return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace - + ", suppressDuplicateOverlappingText=" - + suppressDuplicateOverlappingText + ", extractAnnotationText=" - + extractAnnotationText + ", sortByPosition=" + sortByPosition - + ", extractAcroFormContent=" + extractAcroFormContent - + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA - + ", extractInlineImages=" + extractInlineImages - + ", extractUniqueInlineImagesOnly=" - + extractUniqueInlineImagesOnly + ", averageCharTolerance=" - + averageCharTolerance + ", spacingTolerance=" - + spacingTolerance + "]"; + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof PDFParserConfig)) return false; + + PDFParserConfig config = (PDFParserConfig) o; + + if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false; + if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) return false; + if (getExtractAnnotationText() != config.getExtractAnnotationText()) return false; + if (getSortByPosition() != config.getSortByPosition()) return false; + if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) return false; + if (getExtractInlineImages() != config.getExtractInlineImages()) return false; + if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false; + if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false; + if (getOCRDPI() != config.getOCRDPI()) return false; + if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false; + if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false; + if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false; + if (!getOCRStrategy().equals(config.getOCRStrategy())) return false; + if (getOCRImageType() != config.getOCRImageType()) return false; + if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false; + return getAccessChecker().equals(config.getAccessChecker()); + } + @Override + public int hashCode() { + int result = (getEnableAutoSpace() ? 1 : 0); + result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0); + result = 31 * result + (getExtractAnnotationText() ? 1 : 0); + result = 31 * result + (getSortByPosition() ? 1 : 0); + result = 31 * result + (getExtractAcroFormContent() ? 1 : 0); + result = 31 * result + (getExtractInlineImages() ? 1 : 0); + result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0); + result = 31 * result + getAverageCharTolerance().hashCode(); + result = 31 * result + getSpacingTolerance().hashCode(); + result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0); + result = 31 * result + ocrStrategy.hashCode(); + result = 31 * result + getOCRDPI(); + result = 31 * result + getOCRImageType().hashCode(); + result = 31 * result + getOCRImageFormatName().hashCode(); + result = 31 * result + getAccessChecker().hashCode(); + result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0); + return result; + } + @Override + public String toString() { + return "PDFParserConfig{" + + "enableAutoSpace=" + enableAutoSpace + + ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText + + ", extractAnnotationText=" + extractAnnotationText + + ", sortByPosition=" + sortByPosition + + ", extractAcroFormContent=" + extractAcroFormContent + + ", extractInlineImages=" + extractInlineImages + + ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly + + ", averageCharTolerance=" + averageCharTolerance + + ", spacingTolerance=" + spacingTolerance + + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + + ", ocrStrategy=" + ocrStrategy + + ", ocrDPI=" + ocrDPI + + ", ocrImageType=" + ocrImageType + + ", ocrImageFormatName='" + ocrImageFormatName + '\'' + + ", accessChecker=" + accessChecker + + ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions + + '}'; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties index 9b404a3..319e693 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties @@ -23,4 +23,12 @@ extractUniqueInlineImagesOnly true checkExtractAccessPermission false allowExtractionForAccessibility true ifXFAExtractOnlyXFA false -catchIntermediateIOExceptions true \ No newline at end of file +catchIntermediateIOExceptions true +#options: no_ocr, ocr_only, ocr_and_text_extraction +ocrStrategy no_ocr +#dots per inch for the ocr rendering of the page image +ocrDPI 200 +#if you request tif, make sure you have imageio jars on your classpath! +ocrImageFormatName png +#options: argb, binary, gray, rgb +ocrImageType gray http://git-wip-us.apache.org/repos/asf/tika/blob/7aeb95d6/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 6d07c59..df2e27c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -50,6 +50,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.ocr.TesseractOCRConfig; +import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; @@ -70,6 +72,16 @@ public class PDFParserTest extends TikaTest { public static final MediaType TYPE_DOC = MediaType.application("msword"); public static Level PDFBOX_LOG_LEVEL = Level.INFO; + private static Boolean hasTesseract = null; + + public static boolean canRunOCR() { + if (hasTesseract != null) { + return hasTesseract; + } + hasTesseract = new TesseractOCRParser().hasTesseract(new TesseractOCRConfig()); + return hasTesseract; + } + @BeforeClass public static void setup() { //remember default logging level, but turn off for PDFParserTest @@ -1175,6 +1187,32 @@ public class PDFParserTest extends TikaTest { assertEquals("1425", jpegMetadata.get(Metadata.IMAGE_LENGTH)); } + @Test + public void testEmbeddedDocsWithOCROnly() throws Exception { + if (! canRunOCR()) { return; } + + for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) { + PDFParserConfig config = new PDFParserConfig(); + config.setOCRStrategy(strategy); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + context.set(Parser.class, new AutoDetectParser()); + //make sure everything works with regular xml _and_ with recursive + XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context); + assertContains("pdf_haystack", xmlResult.xml); + assertContains("Haystack", xmlResult.xml); + assertContains("Needle", xmlResult.xml); + if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) { + assertContains("<div class=\"ocr\">pdf_haystack", xmlResult.xml); + } else { + assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml); + } + assertEquals(4, getRecursiveJson("testPDFEmbeddingAndEmbedded.docx", context).size()); + } + + } + + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
