This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 4dbaa08c4 TIKA-4012 -- improve extraction of embedded docs in PDFs by looking beyond names tree and annotations (#1079) 4dbaa08c4 is described below commit 4dbaa08c41ed3bc24e930598076deb50d502d08b Author: Tim Allison <talli...@apache.org> AuthorDate: Thu Apr 13 15:03:12 2023 -0400 TIKA-4012 -- improve extraction of embedded docs in PDFs by looking beyond names tree and annotations (#1079) * TIKA-4012 -- improve embedded document extraction in PDFs --- CHANGES.txt | 2 + .../main/java/org/apache/tika/metadata/PDF.java | 3 + .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 235 +++++++++++++-------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 1 - 4 files changed, 153 insertions(+), 88 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 4f424d520..c437ddaf1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -8,6 +8,8 @@ Release 2.7.1 - ??? * Add extraction of rendition layout value and version from Epub (TIKA-4013). + * Improve embedded file extraction from PDFs (TIKA-4012). + * Update to PDFBox 2.0.28 (TIKA-4016). * Users may now avoid the ZeroByteFileException via a diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index 1b96231c4..2b21e2590 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -175,4 +175,7 @@ public interface PDF { * Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant. */ Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX + "num3DAnnotations"); + + Property ASSOCIATED_FILE_RELATIONSHIP = Property.internalText(PDF_PREFIX + + "associatedFileRelationship"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 4f2b2c864..f48bf204a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -35,7 +35,6 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; -import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.ListIterator; @@ -48,12 +47,12 @@ import javax.xml.stream.XMLStreamException; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; -import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.common.COSObjectable; @@ -148,6 +147,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { private static final String NULL_STRING = "null"; private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml"); private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml"); + + private static final COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship"); final List<IOException> exceptions = new ArrayList<>(); final PDDocument pdDocument; final XHTMLContentHandler xhtml; @@ -169,6 +170,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { private final Set<String> triggers = new TreeSet<>(); private final Set<String> actionTypes = new TreeSet<>(); + + //these are files that we extract as part of Annotations + //We don't want to extract them twice when we go through the + //full DOM looking for /Type = /EmbeddedFile + private final Set<COSBase> extractedFiles = new HashSet<>(); //zero-based pageIndex int pageIndex = 0; int startPage = -1; @@ -333,52 +339,69 @@ class AbstractPDF2XHTML extends PDFTextStripper { private void extractEmbeddedDocuments(PDDocument document) throws IOException, SAXException, TikaException { - PDDocumentNameDictionary namesDictionary = - new PDDocumentNameDictionary(document.getDocumentCatalog()); - PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); - if (efTree == null) { + //See 14.13.10 for the 2.0 spec. Associated files can show up in lots of places...even + // streams. + // It would be great to get more context from the /AF info, but we risk missing files + //if we don't look everywhere. With the current method, we're at least getting all + //filespecs at the cost of losing context (to what was this file attached: doc, page, + // stream, etc?). + + //find all Filespecs TIKA-4012 + List<COSObject> objs = document.getDocument().getObjectsByType(COSName.FILESPEC); + Set<COSBase> seen = new HashSet<>(); + for (COSObject obj : objs) { + processDoc("", "", createFileSpecification(obj.getObject()), new AttributesImpl()); + seen.add(obj.getObject()); + } + + //now go through the embedded files names tree to get those rare cases where + //a file (instead of a filespec) is attached directly to the names tree + //or where the filespec is a direct object + + if (document.getDocumentCatalog() == null) { return; } - - Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>(); - int depth = 0; - //recursively find embedded files - extractFilesfromEFTree(efTree, embeddedFileNames, depth); - processEmbeddedDocNames(embeddedFileNames); - - } - - private void extractFilesfromEFTree(PDNameTreeNode efTree, - Map<String, PDComplexFileSpecification> embeddedFileNames, - int depth) throws IOException { - if (depth > MAX_RECURSION_DEPTH) { - throw new IOException("Hit max recursion depth"); + if (document.getDocumentCatalog().getNames() == null) { + return; } - Map<String, PDComplexFileSpecification> names = null; - try { - names = efTree.getNames(); - } catch (IOException e) { - //LOG? + if (document.getDocumentCatalog().getNames().getEmbeddedFiles() == null) { + return; } - if (names != null) { - for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) { - embeddedFileNames.put(e.getKey(), e.getValue()); + //use a list instead of a name-based map in case there are key collisions + //that could hide attachments + List<NameSpecTuple> specs = new ArrayList<>(); + extractFilesfromEFTree( + document.getDocumentCatalog().getNames().getEmbeddedFiles(), specs, 0); + //this avoids duplication with the above /FileSpec searching, but also in the case + //where the same underlying file has different names in the EFTree + for (NameSpecTuple nameSpecTuple : specs) { + if (seen.contains(nameSpecTuple.getSpec().getCOSObject())) { + continue; } + processDoc(nameSpecTuple.getName(), "", nameSpecTuple.getSpec(), new AttributesImpl()); + seen.add(nameSpecTuple.getSpec().getCOSObject()); } + } - List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); - if (kids == null) { + private void processDocOnAction(String name, String annotationType, PDFileSpecification spec, + AttributesImpl attributes) + throws TikaException, SAXException, IOException { + if (spec == null) { return; - } else { - for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { - extractFilesfromEFTree(node, embeddedFileNames, depth + 1); - } } + processDoc(name, annotationType, spec, attributes); + extractedFiles.add(spec.getCOSObject()); } private void processDoc(String name, String annotationType, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException { + if (spec == null) { + return; + } + if (extractedFiles.contains(spec.getCOSObject())) { + return; + } if (spec instanceof PDSimpleFileSpecification) { //((PDSimpleFileSpecification)spec).getFile(); attributes.addAttribute("", "class", "class", "CDATA", "linked"); @@ -394,17 +417,6 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } - private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) - throws IOException, SAXException, TikaException { - if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { - return; - } - - for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { - processDoc(ent.getKey(), "", ent.getValue(), new AttributesImpl()); - } - } - private void extractMultiOSPDEmbeddedFiles(String displayName, String annotationType, PDComplexFileSpecification spec, @@ -414,42 +426,42 @@ class AbstractPDF2XHTML extends PDFTextStripper { if (spec == null) { return; } + //current strategy is to pull all, not just first non-null - extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFile(), - spec.getFileDescription(), spec.getEmbeddedFile(), attributes); - extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFileMac(), - spec.getFileDescription(), spec.getEmbeddedFileMac(), attributes); - extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFileDos(), - spec.getFileDescription(), spec.getEmbeddedFileDos(), attributes); - extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), - spec.getFileUnix(), - spec.getFileDescription(), spec.getEmbeddedFileUnix(), attributes); + extractPDEmbeddedFile(displayName, annotationType, spec, + spec.getFile(), spec.getEmbeddedFile(), attributes); + extractPDEmbeddedFile(displayName, annotationType, spec, + spec.getFileMac(), spec.getEmbeddedFileMac(), attributes); + extractPDEmbeddedFile(displayName, annotationType, spec, + spec.getFileDos(), spec.getEmbeddedFileDos(), attributes); + extractPDEmbeddedFile(displayName, annotationType, spec, + spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes); //Check for /Thumb (thumbnail image); // /CI (collection item) adobe specific, can have /adobe:DisplayName and a summary } private void extractPDEmbeddedFile(String displayName, - String annotationType, String unicodeFileName, + String annotationType, + PDComplexFileSpecification spec, String fileName, - String description, PDEmbeddedFile file, + PDEmbeddedFile pdEmbeddedFile, AttributesImpl attributes) - throws SAXException, IOException, TikaException { + throws SAXException, IOException { - if (file == null) { + if (pdEmbeddedFile == null) { //skip silently return; } - fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName; + fileName = (fileName == null || "".equals(fileName.trim())) ? spec.getFileUnicode() : fileName; fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName; // TODO: other metadata? Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName); - embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); //if the stream is missing a size, -1 is returned - long sz = file.getSize(); + long sz = pdEmbeddedFile.getSize(); if (sz > -1) { embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz)); } @@ -459,18 +471,25 @@ class AbstractPDF2XHTML extends PDFTextStripper { if (!StringUtils.isBlank(annotationType)) { embeddedMetadata.set(PDF.EMBEDDED_FILE_ANNOTATION_TYPE, annotationType); } - if (!StringUtils.isBlank(file.getSubtype())) { - embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, file.getSubtype()); + if (!StringUtils.isBlank(pdEmbeddedFile.getSubtype())) { + embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, pdEmbeddedFile.getSubtype()); + } + if (!StringUtils.isBlank(spec.getFileDescription())) { + embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, spec.getFileDescription()); } - if (!StringUtils.isBlank(description)) { - embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, description); + String afRelationship = spec.getCOSObject().getNameAsString(AF_RELATIONSHIP); + if (StringUtils.isBlank(afRelationship)) { + afRelationship = spec.getCOSObject().getString(AF_RELATIONSHIP); + } + if (!StringUtils.isBlank(afRelationship)) { + embeddedMetadata.set(PDF.ASSOCIATED_FILE_RELATIONSHIP, afRelationship); } if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { return; } TikaInputStream stream = null; try { - stream = TikaInputStream.get(file.createInputStream()); + stream = TikaInputStream.get(pdEmbeddedFile.createInputStream()); } catch (IOException e) { //store this exception in the parent's metadata EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); @@ -699,11 +718,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { } if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; - if (fann.getFile() instanceof PDComplexFileSpecification) { - handlePDComplexFileSpec(fann.getAttachmentName(), - "annotationFileAttachment", - (PDComplexFileSpecification) fann.getFile()); - } + String subtype = "annotationFileAttachment"; + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "source", "source", "CDATA", subtype); + processDocOnAction("", subtype, fann.getFile(), + attributes); } else if (annotation instanceof PDAnnotationWidget) { handleWidget((PDAnnotationWidget) annotation); } else { @@ -717,8 +736,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { num3DAnnotations++; } for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) { - PDComplexFileSpecification cfs = new PDComplexFileSpecification(fileSpec); - handlePDComplexFileSpec(cfs.getFilename(), annotationSubtype, cfs); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "source", "source", "CDATA", annotationSubtype); + processDocOnAction("", annotationSubtype, + createFileSpecification(fileSpec), + attributes); } } // TODO: remove once PDFBOX-1143 is fixed: @@ -821,20 +843,32 @@ class AbstractPDF2XHTML extends PDFTextStripper { return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH); } - private void handlePDComplexFileSpec(String attachmentName, String annotationType, - PDComplexFileSpecification fileSpec) throws IOException { + private void extractFilesfromEFTree(PDNameTreeNode efTree, + List<NameSpecTuple> embeddedFileNames, + int depth) throws IOException { + if (depth > MAX_RECURSION_DEPTH) { + throw new IOException("Hit max recursion depth"); + } + Map<String, PDComplexFileSpecification> names = null; try { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "source", "source", "CDATA", annotationType); - extractMultiOSPDEmbeddedFiles(attachmentName, annotationType, fileSpec, attributes); - } catch (SAXException e) { - throw new IOException("file embedded in annotation sax exception", e); - } catch (TikaException e) { - throw new IOException("file embedded in annotation tika exception", e); + names = efTree.getNames(); } catch (IOException e) { - handleCatchableIOE(e); + //LOG? + } + if (names != null) { + for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) { + embeddedFileNames.add(new NameSpecTuple(e.getKey(), e.getValue())); + } } + List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); + if (kids == null) { + return; + } else { + for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { + extractFilesfromEFTree(node, embeddedFileNames, depth + 1); + } + } } @@ -908,17 +942,17 @@ class AbstractPDF2XHTML extends PDFTextStripper { addNonNullAttribute("trigger", actionTrigger.name(), attributes); if (action instanceof PDActionImportData) { - processDoc("", "", ((PDActionImportData) action).getFile(), attributes); + processDocOnAction("", "", ((PDActionImportData) action).getFile(), attributes); } else if (action instanceof PDActionLaunch) { PDActionLaunch pdActionLaunch = (PDActionLaunch) action; addNonNullAttribute("id", pdActionLaunch.getF(), attributes); addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes); addNonNullAttribute("operation", pdActionLaunch.getO(), attributes); addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes); - processDoc(pdActionLaunch.getF(), "", pdActionLaunch.getFile(), attributes); + processDocOnAction(pdActionLaunch.getF(), "", pdActionLaunch.getFile(), attributes); } else if (action instanceof PDActionRemoteGoTo) { PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action; - processDoc("", "", remoteGoTo.getFile(), attributes); + processDocOnAction("", "", remoteGoTo.getFile(), attributes); } else if (action instanceof PDActionJavaScript) { PDActionJavaScript jsAction = (PDActionJavaScript) action; Metadata m = new Metadata(); @@ -1323,6 +1357,33 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } + private PDFileSpecification createFileSpecification(COSBase cosBase) { + try { + return PDFileSpecification.createFS(cosBase); + } catch (IOException e) { + //swallow for now + } + return null; + } + + private static class NameSpecTuple { + private final String name; + private final PDComplexFileSpecification spec; + + public NameSpecTuple(String name, PDComplexFileSpecification spec) { + this.name = name; + this.spec = spec; + } + + public String getName() { + return name; + } + + public PDComplexFileSpecification getSpec() { + return spec; + } + } + enum ActionTrigger { AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS, ANNOTATION_CURSOR_EXIT, ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, ANNOTATION_MOUSE_RELEASED, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 7df87a171..ffa05f393 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1433,5 +1433,4 @@ public class PDFParserTest extends TikaTest { metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString()); return metadata; }*/ - }