Repository: tika Updated Branches: refs/heads/2.x e5a7604bc -> ebe702898
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index e98bead..ac9823e 100644 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -16,74 +16,41 @@ */ package org.apache.tika.parser.pdf; -import javax.xml.stream.XMLStreamException; import java.awt.image.BufferedImage; -import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Writer; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Calendar; import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.ListIterator; -import java.util.Locale; import java.util.Map; import java.util.Set; -import java.util.TreeMap; import org.apache.commons.io.IOExceptionWithCause; -import org.apache.commons.io.IOUtils; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; -import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; -import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; -import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.pdfbox.pdmodel.graphics.PDXObject; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.pdmodel.interactive.action.PDAction; -import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; -import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; -import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; -import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; -import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; -import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; -import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; -import org.apache.pdfbox.pdmodel.interactive.form.PDField; -import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; -import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; -import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition; import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; -import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import 
org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.EmbeddedContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -93,30 +60,14 @@ import org.xml.sax.helpers.AttributesImpl; * to produce a semi-structured XHTML SAX events instead of a plain text * stream. */ -class PDF2XHTML extends PDFTextStripper { +class PDF2XHTML extends AbstractPDF2XHTML { - /** - * Maximum recursive depth during AcroForm processing. - * Prevents theoretical AcroForm recursion bomb. - */ - private final static int MAX_ACROFORM_RECURSIONS = 10; private static final List<String> JPEG = Arrays.asList( COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); /** - * Format used for signature dates - * TODO Make this thread-safe - */ - private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); - private final ContentHandler originalHandler; - private final ParseContext context; - private final XHTMLContentHandler handler; - private final PDFParserConfig config; - private final Metadata metadata; - private final List<IOException> exceptions = new ArrayList<>(); - /** * This keeps track of the pdf object ids for inline * images that have been processed. * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly() @@ -129,16 +80,10 @@ class PDF2XHTML extends PDFTextStripper { */ private Map<COSStream, Integer> processedInlineImages = new HashMap<>(); private int inlineImageCounter = 0; - private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata, + private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws IOException { - //source of config (derives from context or PDFParser?) is - //already determined in PDFParser. No need to check context here. 
- this.config = config; - this.originalHandler = handler; - this.context = context; - this.handler = new XHTMLContentHandler(handler, metadata); - this.metadata = metadata; + super(document, handler, context, metadata, config); } /** @@ -160,7 +105,7 @@ class PDF2XHTML extends PDFTextStripper { // Extract text using a dummy Writer as we override the // key methods to output to the given content // handler. - pdf2XHTML = new PDF2XHTML(handler, context, metadata, config); + pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); config.configure(pdf2XHTML); @@ -192,28 +137,6 @@ class PDF2XHTML extends PDFTextStripper { } } - void extractBookmarkText() throws SAXException { - PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); - if (outline != null) { - extractBookmarkText(outline); - } - } - - void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { - PDOutlineItem current = bookmark.getFirstChild(); - if (current != null) { - handler.startElement("ul"); - while (current != null) { - handler.startElement("li"); - handler.characters(current.getTitle()); - handler.endElement("li"); - // Recurse: - extractBookmarkText(current); - current = current.getNextSibling(); - } - handler.endElement("ul"); - } - } @Override public void processPage(PDPage page) throws IOException { @@ -225,130 +148,15 @@ class PDF2XHTML extends PDFTextStripper { } @Override - protected void startDocument(PDDocument pdf) throws IOException { - try { - handler.startDocument(); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to start a document", e); - } - } - - @Override - protected void endDocument(PDDocument pdf) throws IOException { - try { - // Extract text for any bookmarks: - extractBookmarkText(); - try { - extractEmbeddedDocuments(pdf, originalHandler); - } catch (IOException e) { - handleCatchableIOE(e); - } - - //extract acroform data at end of doc - if (config.getExtractAcroFormContent() == true) { - try { - 
extractAcroForm(pdf, handler); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - handler.endDocument(); - } catch (TikaException e) { - throw new IOExceptionWithCause("Unable to end a document", e); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to end a document", e); - } - } - - @Override - protected void startPage(PDPage page) throws IOException { - try { - handler.startElement("div", "class", "page"); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to start a page", e); - } - writeParagraphStart(); - } - - @Override protected void endPage(PDPage page) throws IOException { try { writeParagraphEnd(); try { - extractImages(page.getResources(), new HashSet<COSStream>()); + extractImages(page.getResources(), new HashSet<COSBase>()); } catch (IOException e) { handleCatchableIOE(e); } - - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); - for (PDAnnotation annotation : page.getAnnotations()) { - - if (annotation instanceof PDAnnotationFileAttachment) { - PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; - PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); - try { - extractMultiOSPDEmbeddedFiles("", fileSpec, extractor); - } catch (SAXException e) { - throw new IOExceptionWithCause("file embedded in annotation sax exception", e); - } catch (TikaException e) { - throw new IOExceptionWithCause("file embedded in annotation tika exception", e); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - // TODO: remove once PDFBOX-1143 is fixed: - if (config.getExtractAnnotationText()) { - if (annotation instanceof PDAnnotationLink) { - PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; - if (annotationlink.getAction() != null) { - PDAction action = annotationlink.getAction(); - if (action instanceof PDActionURI) { - PDActionURI uri = (PDActionURI) action; - String link = uri.getURI(); - if (link != null) { - 
handler.startElement("div", "class", "annotation"); - handler.startElement("a", "href", link); - handler.endElement("a"); - handler.endElement("div"); - } - } - } - } - - if (annotation instanceof PDAnnotationMarkup) { - PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; - String title = annotationMarkup.getTitlePopup(); - String subject = annotationMarkup.getSubject(); - String contents = annotationMarkup.getContents(); - // TODO: maybe also annotationMarkup.getRichContents()? - if (title != null || subject != null || contents != null) { - handler.startElement("div", "class", "annotation"); - - if (title != null) { - handler.startElement("div", "class", "annotationTitle"); - handler.characters(title); - handler.endElement("div"); - } - - if (subject != null) { - handler.startElement("div", "class", "annotationSubject"); - handler.characters(subject); - handler.endElement("div"); - } - - if (contents != null) { - handler.startElement("div", "class", "annotationContents"); - handler.characters(contents); - handler.endElement("div"); - } - - handler.endElement("div"); - } - } - } - } - - handler.endElement("div"); + super.endPage(page); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { @@ -356,7 +164,7 @@ class PDF2XHTML extends PDFTextStripper { } } - private void extractImages(PDResources resources, Set<COSStream> seenThisPage) throws SAXException, IOException { + private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return; } @@ -395,7 +203,7 @@ class PDF2XHTML extends PDFTextStripper { //throw new RuntimeException("EXTEN:" + extension); } - Integer imageNumber = processedInlineImages.get(object.getCOSObject()); + Integer imageNumber = processedInlineImages.get(cosStream); if (imageNumber == null) { imageNumber = inlineImageCounter++; } @@ -406,8 +214,8 @@ 
class PDF2XHTML extends PDFTextStripper { AttributesImpl attr = new AttributesImpl(); attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); attr.addAttribute("", "alt", "alt", "CDATA", fileName); - handler.startElement("img", attr); - handler.endElement("img"); + xhtml.startElement("img", attr); + xhtml.endElement("img"); //Do we only want to process unique COSObject ids? //If so, have we already processed this one? @@ -430,7 +238,7 @@ class PDF2XHTML extends PDFTextStripper { writeToBuffer(image, extension, buffer); extractor.parseEmbedded( new ByteArrayInputStream(buffer.toByteArray()), - new EmbeddedContentHandler(handler), + new EmbeddedContentHandler(xhtml), metadata, false); } catch (IOException e) { handleCatchableIOE(e); @@ -467,20 +275,11 @@ class PDF2XHTML extends PDFTextStripper { out.flush(); } - protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { - EmbeddedDocumentExtractor extractor = - context.get(EmbeddedDocumentExtractor.class); - if (extractor == null) { - extractor = new ParsingEmbeddedDocumentExtractor(context); - } - return extractor; - } - @Override protected void writeParagraphStart() throws IOException { super.writeParagraphStart(); try { - handler.startElement("p"); + xhtml.startElement("p"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to start a paragraph", e); } @@ -490,7 +289,7 @@ class PDF2XHTML extends PDFTextStripper { protected void writeParagraphEnd() throws IOException { super.writeParagraphEnd(); try { - handler.endElement("p"); + xhtml.endElement("p"); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a paragraph", e); } @@ -499,7 +298,7 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeString(String text) throws IOException { try { - handler.characters(text); + xhtml.characters(text); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a string: " + text, e); @@ -509,7 +308,7 @@ class PDF2XHTML extends 
PDFTextStripper { @Override protected void writeCharacters(TextPosition text) throws IOException { try { - handler.characters(text.getUnicode()); + xhtml.characters(text.getUnicode()); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a character: " + text.getUnicode(), e); @@ -519,7 +318,7 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeWordSeparator() throws IOException { try { - handler.characters(getWordSeparator()); + xhtml.characters(getWordSeparator()); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a space character", e); @@ -529,275 +328,12 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeLineSeparator() throws IOException { try { - handler.newline(); + xhtml.newline(); } catch (SAXException e) { throw new IOExceptionWithCause( "Unable to write a newline character", e); } } - private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) - throws IOException, SAXException, TikaException { - PDDocumentNameDictionary namesDictionary = - new PDDocumentNameDictionary( document.getDocumentCatalog() ); - PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); - if (efTree == null) { - return; - } - - Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); - //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. - //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java - //If there is a need we could add a fully recursive search to find a non-null - //Map<String, COSObjectable> that contains the doc info. 
- if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } else { - List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); - if (kids == null) { - return; - } - for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { - embeddedFileNames = node.getNames(); - if (embeddedFileNames != null) { - processEmbeddedDocNames(embeddedFileNames); - } - } - } - } - - private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) - throws IOException, SAXException, TikaException { - if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { - return; - } - - EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); - for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { - PDComplexFileSpecification spec = ent.getValue(); - extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); - } - } - - private void extractMultiOSPDEmbeddedFiles(String defaultName, - PDComplexFileSpecification spec, - EmbeddedDocumentExtractor extractor) throws IOException, - SAXException, TikaException { - - if (spec == null) { - return; - } - //current strategy is to pull all, not just first non-null - extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor); - extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor); - extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor); - extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor); - } - - private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, - EmbeddedDocumentExtractor extractor) - throws SAXException, IOException, TikaException { - - if (file == null) { - //skip silently - return; - } - - fileName = (fileName == null) ? defaultName : fileName; - - // TODO: other metadata? 
- Metadata metadata = new Metadata(); - metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); - metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); - metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); - - if (extractor.shouldParseEmbedded(metadata)) { - TikaInputStream stream = null; - try { - stream = TikaInputStream.get(file.createInputStream()); - extractor.parseEmbedded( - stream, - new EmbeddedContentHandler(handler), - metadata, false); - - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", fileName); - handler.startElement("div", attributes); - handler.endElement("div"); - } finally { - IOUtils.closeQuietly(stream); - } - } - } - - private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, - SAXException { - //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields - //this code derives from Ben's code - PDDocumentCatalog catalog = pdf.getDocumentCatalog(); - - if (catalog == null) - return; - - PDAcroForm form = catalog.getAcroForm(); - if (form == null) - return; - - //if it has xfa, try that. 
- //if it doesn't exist or there's an exception, - //go with traditional AcroForm - PDXFAResource pdxfa = form.getXFA(); - - if (pdxfa != null) { - //if successful, return - XFAExtractor xfaExtractor = new XFAExtractor(); - try (InputStream is = new BufferedInputStream( - new ByteArrayInputStream(pdxfa.getBytes()))) { - xfaExtractor.extract(is, handler, metadata, context); - return; - } catch (XMLStreamException |IOException e) { - //if there was a potentially expected failure in xfa, try the AcroForm - } - } - - @SuppressWarnings("rawtypes") - List fields = form.getFields(); - - if (fields == null) - return; - - @SuppressWarnings("rawtypes") - ListIterator itr = fields.listIterator(); - - if (itr == null) - return; - - handler.startElement("div", "class", "acroform"); - handler.startElement("ol"); - - while (itr.hasNext()) { - Object obj = itr.next(); - if (obj != null && obj instanceof PDField) { - processAcroField((PDField) obj, handler, 0); - } - } - handler.endElement("ol"); - handler.endElement("div"); - } - - private void processAcroField(PDField field, - XHTMLContentHandler handler, final int currentRecursiveDepth) - throws SAXException, IOException { - - if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { - return; - } - addFieldString(field, handler); - if (field instanceof PDNonTerminalField) { - int r = currentRecursiveDepth + 1; - handler.startElement("ol"); - for (PDField child : ((PDNonTerminalField)field).getChildren()) { - processAcroField(child, handler, r); - } - handler.endElement("ol"); - } - } - - private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException { - //Pick partial name to present in content and altName for attribute - //Ignoring FullyQualifiedName for now - String partName = field.getPartialName(); - String altName = field.getAlternateFieldName(); - - StringBuilder sb = new StringBuilder(); - AttributesImpl attrs = new AttributesImpl(); - - if (partName != null) { - sb.append(partName).append(": 
"); - } - if (altName != null) { - attrs.addAttribute("", "altName", "altName", "CDATA", altName); - } - //return early if PDSignature field - if (field instanceof PDSignatureField) { - handleSignature(attrs, (PDSignatureField) field, handler); - return; - } - String value = field.getValueAsString(); - if (value != null && !value.equals("null")) { - sb.append(value); - } - - if (attrs.getLength() > 0 || sb.length() > 0) { - handler.startElement("li", attrs); - handler.characters(sb.toString()); - handler.endElement("li"); - } - } - - private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, - XHTMLContentHandler handler) throws SAXException { - - - PDSignature sig = sigField.getSignature(); - if (sig == null) { - return; - } - Map<String, String> vals = new TreeMap<>(); - vals.put("name", sig.getName()); - vals.put("contactInfo", sig.getContactInfo()); - vals.put("location", sig.getLocation()); - vals.put("reason", sig.getReason()); - - Calendar cal = sig.getSignDate(); - if (cal != null) { - dateFormat.setTimeZone(cal.getTimeZone()); - vals.put("date", dateFormat.format(cal.getTime())); - } - //see if there is any data - int nonNull = 0; - for (String val : vals.keySet()) { - if (val != null && !val.equals("")) { - nonNull++; - } - } - //if there is, process it - if (nonNull > 0) { - handler.startElement("li", parentAttributes); - - AttributesImpl attrs = new AttributesImpl(); - attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); - - handler.startElement("ol", attrs); - for (Map.Entry<String, String> e : vals.entrySet()) { - if (e.getValue() == null || e.getValue().equals("")) { - continue; - } - attrs = new AttributesImpl(); - attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); - handler.startElement("li", attrs); - handler.characters(e.getValue()); - handler.endElement("li"); - } - handler.endElement("ol"); - handler.endElement("li"); - } - } - - private void handleCatchableIOE(IOException e) throws 
IOException { - if (config.isCatchIntermediateIOExceptions()) { - String msg = e.getMessage(); - if (msg == null) { - msg = "IOException, no message"; - } - metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg); - exceptions.add(e); - } else { - throw e; - } - } } http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 4dee7dd..f735f25 100644 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -57,6 +57,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.parser.xmp.JempboxExtractor; import org.apache.tika.sax.XHTMLContentHandler; import org.w3c.dom.Document; @@ -140,9 +141,16 @@ public class PDFParser extends AbstractParser { if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); + } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { + metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); + OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } else { + if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { + metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); + } PDF2XHTML.process(pdfDocument, handler, 
context, metadata, localConfig); } + } } catch (InvalidPasswordException e) { metadata.set("pdf:encrypted", "true"); http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 9baeb37..296b191 100644 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -23,6 +23,7 @@ import java.io.Serializable; import java.util.Locale; import java.util.Properties; +import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.text.PDFTextStripper; /** @@ -44,6 +45,26 @@ import org.apache.pdfbox.text.PDFTextStripper; */ public class PDFParserConfig implements Serializable { + public enum OCR_STRATEGY { + NO_OCR, + OCR_ONLY, + OCR_AND_TEXT_EXTRACTION; + + private static OCR_STRATEGY parse(String s) { + if (s == null) { + return NO_OCR; + } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) { + return NO_OCR; + } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) { + return OCR_ONLY; + } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) { + return OCR_AND_TEXT_EXTRACTION; + } + //default -- no ocr + return NO_OCR; + } + } + private static final long serialVersionUID = 6492570218190936986L; // True if we let PDFBox "guess" where spaces should go: @@ -80,6 +101,12 @@ public class PDFParserConfig implements Serializable { //content from elsewhere in the document. 
private boolean ifXFAExtractOnlyXFA = false; + private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR; + + private int ocrDPI = 200; + private ImageType ocrImageType = ImageType.GRAY; + private String ocrImageFormatName = "png"; + private AccessChecker accessChecker; //The PDFParser can throw IOExceptions if there is a problem @@ -123,36 +150,45 @@ public class PDFParserConfig implements Serializable { } } setEnableAutoSpace( - getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); + getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); setSuppressDuplicateOverlappingText( - getProp(props.getProperty("suppressDuplicateOverlappingText"), + getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"), getSuppressDuplicateOverlappingText())); setExtractAnnotationText( - getProp(props.getProperty("extractAnnotationText"), + getBooleanProp(props.getProperty("extractAnnotationText"), getExtractAnnotationText())); setSortByPosition( - getProp(props.getProperty("sortByPosition"), + getBooleanProp(props.getProperty("sortByPosition"), getSortByPosition())); setExtractAcroFormContent( - getProp(props.getProperty("extractAcroFormContent"), + getBooleanProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); setExtractInlineImages( - getProp(props.getProperty("extractInlineImages"), + getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages())); setExtractUniqueInlineImagesOnly( - getProp(props.getProperty("extractUniqueInlineImagesOnly"), + getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly())); setIfXFAExtractOnlyXFA( - getProp(props.getProperty("ifXFAExtractOnlyXFA"), + getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"), getIfXFAExtractOnlyXFA())); setCatchIntermediateIOExceptions( - getProp(props.getProperty("catchIntermediateIOExceptions"), + getBooleanProp(props.getProperty("catchIntermediateIOExceptions"), 
isCatchIntermediateIOExceptions())); - boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); - boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); + setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy"))); + + setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI())); + + setOCRImageFormatName(props.getProperty("ocrImageFormatName")); + + setOCRImageType(parseImageType(props.getProperty("ocrImageType"))); + + + boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false); + boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true); if (checkExtractAccessPermission == false) { //silently ignore the crazy configuration of checkExtractAccessPermission = false, @@ -408,7 +444,23 @@ public class PDFParserConfig implements Serializable { isCatchIntermediateIOExceptions = catchIntermediateIOExceptions; } - private boolean getProp(String p, boolean defaultMissing) { + /** + * Which strategy to use for OCR + * @param ocrStrategy + */ + public void setOCRStrategy(OCR_STRATEGY ocrStrategy) { + this.ocrStrategy = ocrStrategy; + } + + /** + * + * @return strategy to use for OCR + */ + public OCR_STRATEGY getOCRStrategy() { + return ocrStrategy; + } + + private boolean getBooleanProp(String p, boolean defaultMissing) { if (p == null) { return defaultMissing; } @@ -420,83 +472,143 @@ public class PDFParserConfig implements Serializable { return defaultMissing; } } + //throws NumberFormatException if there's a non-null unparseable + //string passed in + private int getIntProp(String p, int defaultMissing) { + if (p == null) { + return defaultMissing; + } - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime - * result - + ((averageCharTolerance == null) ? 
0 : averageCharTolerance - .hashCode()); - result = prime * result + (enableAutoSpace ? 1231 : 1237); - result = prime * result + (extractAcroFormContent ? 1231 : 1237); - result = prime * result + (extractAnnotationText ? 1231 : 1237); - result = prime * result + (extractInlineImages ? 1231 : 1237); - result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 1237); - result = prime * result + (sortByPosition ? 1231 : 1237); - result = prime - * result - + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode()); - result = prime * result - + (suppressDuplicateOverlappingText ? 1231 : 1237); - result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237); - return result; + return Integer.parseInt(p); } - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - PDFParserConfig other = (PDFParserConfig) obj; - if (averageCharTolerance == null) { - if (other.averageCharTolerance != null) - return false; - } else if (!averageCharTolerance.equals(other.averageCharTolerance)) - return false; - if (enableAutoSpace != other.enableAutoSpace) - return false; - if (extractAcroFormContent != other.extractAcroFormContent) - return false; - if (extractAnnotationText != other.extractAnnotationText) - return false; - if (extractInlineImages != other.extractInlineImages) - return false; - if (extractUniqueInlineImagesOnly != other.extractUniqueInlineImagesOnly) - return false; - if (sortByPosition != other.sortByPosition) - return false; - if (spacingTolerance == null) { - if (other.spacingTolerance != null) - return false; - } else if (!spacingTolerance.equals(other.spacingTolerance)) - return false; - if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText) - return false; - if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA) - return false; + /** + * String representation of the image format used to render + * the page image 
for OCR (examples: png, tiff, jpeg) + * @return + */ + public String getOCRImageFormatName() { + return ocrImageFormatName; + } - return true; + /** + * @see #getOCRImageFormatName() + * + * @param ocrImageFormatName name of image format used to render + * page image + */ + public void setOCRImageFormatName(String ocrImageFormatName) { + this.ocrImageFormatName = ocrImageFormatName; + } + + /** + * Image type used to render the page image for OCR. + * @see #setOCRImageType(ImageType) + * @return image type + */ + public ImageType getOCRImageType() { + return ocrImageType; + } + + /** + * Image type used to render the page image for OCR. + * @param ocrImageType + */ + public void setOCRImageType(ImageType ocrImageType) { + this.ocrImageType = ocrImageType; + } + + /** + * Dots per inch used to render the page image for OCR + * @return dots per inch + */ + public int getOCRDPI() { + return ocrDPI; + } + + /** + * Dots per inch used to render the page image for OCR + * @param ocrDPI + */ + public void setOCRDPI(int ocrDPI) { + this.ocrDPI = ocrDPI; + } + + private ImageType parseImageType(String ocrImageType) { + for (ImageType t : ImageType.values()) { + if (ocrImageType.equalsIgnoreCase(t.toString())) { + return t; + } + } + return null; } @Override - public String toString() { - return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace - + ", suppressDuplicateOverlappingText=" - + suppressDuplicateOverlappingText + ", extractAnnotationText=" - + extractAnnotationText + ", sortByPosition=" + sortByPosition - + ", extractAcroFormContent=" + extractAcroFormContent - + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA - + ", extractInlineImages=" + extractInlineImages - + ", extractUniqueInlineImagesOnly=" - + extractUniqueInlineImagesOnly + ", averageCharTolerance=" - + averageCharTolerance + ", spacingTolerance=" - + spacingTolerance + "]"; + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof PDFParserConfig)) return false; + + 
PDFParserConfig config = (PDFParserConfig) o; + + if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false; + if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) return false; + if (getExtractAnnotationText() != config.getExtractAnnotationText()) return false; + if (getSortByPosition() != config.getSortByPosition()) return false; + if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) return false; + if (getExtractInlineImages() != config.getExtractInlineImages()) return false; + if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false; + if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false; + if (getOCRDPI() != config.getOCRDPI()) return false; + if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false; + if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false; + if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false; + if (!getOCRStrategy().equals(config.getOCRStrategy())) return false; + if (getOCRImageType() != config.getOCRImageType()) return false; + if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false; + return getAccessChecker().equals(config.getAccessChecker()); + } + @Override + public int hashCode() { + int result = (getEnableAutoSpace() ? 1 : 0); + result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0); + result = 31 * result + (getExtractAnnotationText() ? 1 : 0); + result = 31 * result + (getSortByPosition() ? 1 : 0); + result = 31 * result + (getExtractAcroFormContent() ? 1 : 0); + result = 31 * result + (getExtractInlineImages() ? 1 : 0); + result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0); + result = 31 * result + getAverageCharTolerance().hashCode(); + result = 31 * result + getSpacingTolerance().hashCode(); + result = 31 * result + (getIfXFAExtractOnlyXFA() ? 
1 : 0); + result = 31 * result + ocrStrategy.hashCode(); + result = 31 * result + getOCRDPI(); + result = 31 * result + getOCRImageType().hashCode(); + result = 31 * result + getOCRImageFormatName().hashCode(); + result = 31 * result + getAccessChecker().hashCode(); + result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0); + return result; + } + @Override + public String toString() { + return "PDFParserConfig{" + + "enableAutoSpace=" + enableAutoSpace + + ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText + + ", extractAnnotationText=" + extractAnnotationText + + ", sortByPosition=" + sortByPosition + + ", extractAcroFormContent=" + extractAcroFormContent + + ", extractInlineImages=" + extractInlineImages + + ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly + + ", averageCharTolerance=" + averageCharTolerance + + ", spacingTolerance=" + spacingTolerance + + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + + ", ocrStrategy=" + ocrStrategy + + ", ocrDPI=" + ocrDPI + + ", ocrImageType=" + ocrImageType + + ", ocrImageFormatName='" + ocrImageFormatName + '\'' + + ", accessChecker=" + accessChecker + + ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions + + '}'; + } } http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties index 9b404a3..319e693 100644 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties @@ -23,4 
+23,12 @@ extractUniqueInlineImagesOnly true checkExtractAccessPermission false allowExtractionForAccessibility true ifXFAExtractOnlyXFA false -catchIntermediateIOExceptions true \ No newline at end of file +catchIntermediateIOExceptions true +#options: no_ocr, ocr_only, ocr_and_text_extraction +ocrStrategy no_ocr +#dots per inch for the ocr rendering of the page image +ocrDPI 200 +#if you request tif, make sure you have imageio jars on your classpath! +ocrImageFormatName png +#options: argb, binary, gray, rgb +ocrImageType gray http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index bd2e5ad..b7582f2 100644 --- a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -50,6 +50,8 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.parser.ocr.TesseractOCRConfig; +import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; @@ -69,6 +71,15 @@ public class PDFParserTest extends TikaTest { public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); public static final MediaType TYPE_DOC = MediaType.application("msword"); public static Level 
PDFBOX_LOG_LEVEL = Level.INFO; + private static Boolean hasTesseract = null; + + public static boolean canRunOCR() { + if (hasTesseract != null) { + return hasTesseract; + } + hasTesseract = new TesseractOCRParser().hasTesseract(new TesseractOCRConfig()); + return hasTesseract; + } @BeforeClass public static void setup() { @@ -1158,6 +1169,32 @@ public class PDFParserTest extends TikaTest { assertEquals(0, m.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length); assertNotContained("1309.61", content); } + + @Test + public void testEmbeddedDocsWithOCR() throws Exception { + if (! canRunOCR()) { return; } + + for (PDFParserConfig.OCR_STRATEGY strategy : PDFParserConfig.OCR_STRATEGY.values()) { + PDFParserConfig config = new PDFParserConfig(); + config.setOCRStrategy(strategy); + ParseContext context = new ParseContext(); + context.set(PDFParserConfig.class, config); + context.set(Parser.class, new AutoDetectParser()); + //make sure everything works with regular xml _and_ with recursive + XMLResult xmlResult = getXML("testPDFEmbeddingAndEmbedded.docx", context); + assertContains("pdf_haystack", xmlResult.xml); + assertContains("Haystack", xmlResult.xml); + assertContains("Needle", xmlResult.xml); + if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) { + assertContains("<div class=\"ocr\">pdf_haystack", xmlResult.xml); + } else { + assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml); + } + assertEquals(4, getRecursiveJson("testPDFEmbeddingAndEmbedded.docx", context).size()); + } + + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
