[tika] branch main updated: TIKA-4012 -- improve extraction of embedded docs in PDFs by looking beyond names tree and annotations (#1079)

tallison Thu, 13 Apr 2023 12:03:42 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new 4dbaa08c4 TIKA-4012 -- improve extraction of embedded docs in PDFs by looking beyond names tree and annotations (#1079)
4dbaa08c4 is described below

commit 4dbaa08c41ed3bc24e930598076deb50d502d08b
Author: Tim Allison <talli...@apache.org>
AuthorDate: Thu Apr 13 15:03:12 2023 -0400

    TIKA-4012 -- improve extraction of embedded docs in PDFs by looking beyond names tree and annotations (#1079)
    
    * TIKA-4012 -- improve embedded document extraction in PDFs
---
 CHANGES.txt                                        |   2 +
 .../main/java/org/apache/tika/metadata/PDF.java    |   3 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 235 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   1 -
 4 files changed, 153 insertions(+), 88 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4f424d520..c437ddaf1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,8 @@ Release 2.7.1 - ???
 
    * Add extraction of rendition layout value and version from Epub 
(TIKA-4013).
 
+   * Improve embedded file extraction from PDFs (TIKA-4012).
+
    * Update to PDFBox 2.0.28 (TIKA-4016).
 
    * Users may now avoid the ZeroByteFileException via a
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java 
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 1b96231c4..2b21e2590 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -175,4 +175,7 @@ public interface PDF {
      * Number of 3D annotations a PDF contains.  This makes {@link PDF#HAS_3D} 
redundant.
      */
     Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX + 
"num3DAnnotations");
+
+    Property ASSOCIATED_FILE_RELATIONSHIP = Property.internalText(PDF_PREFIX +
+            "associatedFileRelationship");
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 4f2b2c864..f48bf204a 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -35,7 +35,6 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -48,12 +47,12 @@ import javax.xml.stream.XMLStreamException;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
@@ -148,6 +147,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private static final String NULL_STRING = "null";
     private static final MediaType XFA_MEDIA_TYPE = 
MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = 
MediaType.application("rdf+xml");
+
+    private static final COSName AF_RELATIONSHIP = 
COSName.getPDFName("AFRelationship");
     final List<IOException> exceptions = new ArrayList<>();
     final PDDocument pdDocument;
     final XHTMLContentHandler xhtml;
@@ -169,6 +170,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private final Set<String> triggers = new TreeSet<>();
 
     private final Set<String> actionTypes = new TreeSet<>();
+
+    //these are files that we extract as part of Annotations
+    //We don't want to extract them twice when we go through the
+    //full DOM looking for /Type = /EmbeddedFile
+    private final Set<COSBase> extractedFiles = new HashSet<>();
     //zero-based pageIndex
     int pageIndex = 0;
     int startPage = -1;
@@ -333,52 +339,69 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
-        PDDocumentNameDictionary namesDictionary =
-                new PDDocumentNameDictionary(document.getDocumentCatalog());
-        PDEmbeddedFilesNameTreeNode efTree = 
namesDictionary.getEmbeddedFiles();
-        if (efTree == null) {
+        //See 14.13.10 for the 2.0 spec.  Associated files can show up in lots of places...even
+        // streams.
+        // It would be great to get more context from the /AF info, but we risk missing files
+        //if we don't look everywhere.  With the current method, we're at least getting all
+        //filespecs at the cost of losing context (to what was this file attached: doc, page,
+        // stream, etc?).
+
+        //find all Filespecs TIKA-4012
+        List<COSObject> objs = 
document.getDocument().getObjectsByType(COSName.FILESPEC);
+        Set<COSBase> seen = new HashSet<>();
+        for (COSObject obj : objs) {
+            processDoc("", "", createFileSpecification(obj.getObject()), new 
AttributesImpl());
+            seen.add(obj.getObject());
+        }
+
+        //now go through the embedded files names tree to get those rare cases 
where
+        //a file (instead of a filespec) is attached directly to the names tree
+        //or where the filespec is a direct object
+
+        if (document.getDocumentCatalog() == null) {
             return;
         }
-
-        Map<String, PDComplexFileSpecification> embeddedFileNames = new 
HashMap<>();
-        int depth = 0;
-        //recursively find embedded files
-        extractFilesfromEFTree(efTree, embeddedFileNames, depth);
-        processEmbeddedDocNames(embeddedFileNames);
-
-    }
-
-    private void extractFilesfromEFTree(PDNameTreeNode efTree,
-                                        Map<String, 
PDComplexFileSpecification> embeddedFileNames,
-                                        int depth) throws IOException {
-        if (depth > MAX_RECURSION_DEPTH) {
-            throw new IOException("Hit max recursion depth");
+        if (document.getDocumentCatalog().getNames() == null) {
+            return;
         }
-        Map<String, PDComplexFileSpecification> names = null;
-        try {
-            names = efTree.getNames();
-        } catch (IOException e) {
-            //LOG?
+        if (document.getDocumentCatalog().getNames().getEmbeddedFiles() == 
null) {
+            return;
         }
-        if (names != null) {
-            for (Map.Entry<String, PDComplexFileSpecification> e : 
names.entrySet()) {
-                embeddedFileNames.put(e.getKey(), e.getValue());
+        //use a list instead of a name-based map in case there are key 
collisions
+        //that could hide attachments
+        List<NameSpecTuple> specs = new ArrayList<>();
+        extractFilesfromEFTree(
+                document.getDocumentCatalog().getNames().getEmbeddedFiles(), 
specs, 0);
+        //this avoids duplication with the above /FileSpec searching, but also 
in the case
+        //where the same underlying file has different names in the EFTree
+        for (NameSpecTuple nameSpecTuple : specs) {
+            if (seen.contains(nameSpecTuple.getSpec().getCOSObject())) {
+                continue;
             }
+            processDoc(nameSpecTuple.getName(), "", nameSpecTuple.getSpec(), 
new AttributesImpl());
+            seen.add(nameSpecTuple.getSpec().getCOSObject());
         }
+    }
 
-        List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
-        if (kids == null) {
+    private void processDocOnAction(String name, String annotationType, 
PDFileSpecification spec,
+                            AttributesImpl attributes)
+            throws TikaException, SAXException, IOException {
+        if (spec == null) {
             return;
-        } else {
-            for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                extractFilesfromEFTree(node, embeddedFileNames, depth + 1);
-            }
         }
+        processDoc(name, annotationType, spec, attributes);
+        extractedFiles.add(spec.getCOSObject());
     }
 
     private void processDoc(String name, String annotationType, 
PDFileSpecification spec,
                             AttributesImpl attributes)
             throws TikaException, SAXException, IOException {
+        if (spec == null) {
+            return;
+        }
+        if (extractedFiles.contains(spec.getCOSObject())) {
+            return;
+        }
         if (spec instanceof PDSimpleFileSpecification) {
             //((PDSimpleFileSpecification)spec).getFile();
             attributes.addAttribute("", "class", "class", "CDATA", "linked");
@@ -394,17 +417,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
-    private void processEmbeddedDocNames(Map<String, 
PDComplexFileSpecification> embeddedFileNames)
-            throws IOException, SAXException, TikaException {
-        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
-            return;
-        }
-
-        for (Map.Entry<String, PDComplexFileSpecification> ent : 
embeddedFileNames.entrySet()) {
-            processDoc(ent.getKey(), "", ent.getValue(), new AttributesImpl());
-        }
-    }
-
     private void extractMultiOSPDEmbeddedFiles(String displayName,
                                                String annotationType,
                                                PDComplexFileSpecification spec,
@@ -414,42 +426,42 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (spec == null) {
             return;
         }
+
         //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(), spec.getFile(),
-                spec.getFileDescription(), spec.getEmbeddedFile(), attributes);
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(), spec.getFileMac(),
-                spec.getFileDescription(), spec.getEmbeddedFileMac(), 
attributes);
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(), spec.getFileDos(),
-                spec.getFileDescription(), spec.getEmbeddedFileDos(), 
attributes);
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(),
-                spec.getFileUnix(),
-                spec.getFileDescription(), spec.getEmbeddedFileUnix(), 
attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFile(), spec.getEmbeddedFile(), attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
 
         //Check for /Thumb (thumbnail image);
         // /CI (collection item) adobe specific, can have /adobe:DisplayName 
and a summary
     }
 
     private void extractPDEmbeddedFile(String displayName,
-                                       String annotationType, String 
unicodeFileName,
+                                       String annotationType,
+                                       PDComplexFileSpecification spec,
                                        String fileName,
-                                       String description, PDEmbeddedFile file,
+                                       PDEmbeddedFile pdEmbeddedFile,
                                        AttributesImpl attributes)
-            throws SAXException, IOException, TikaException {
+            throws SAXException, IOException {
 
-        if (file == null) {
+        if (pdEmbeddedFile == null) {
             //skip silently
             return;
         }
 
-        fileName = (fileName == null || "".equals(fileName.trim())) ? 
unicodeFileName : fileName;
+        fileName = (fileName == null || "".equals(fileName.trim())) ? 
spec.getFileUnicode() : fileName;
         fileName = (fileName == null || "".equals(fileName.trim())) ? 
displayName : fileName;
 
         // TODO: other metadata?
         Metadata embeddedMetadata = new Metadata();
         embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
-        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
         //if the stream is missing a size, -1 is returned
-        long sz = file.getSize();
+        long sz = pdEmbeddedFile.getSize();
         if (sz > -1) {
             embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz));
         }
@@ -459,18 +471,25 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (!StringUtils.isBlank(annotationType)) {
             embeddedMetadata.set(PDF.EMBEDDED_FILE_ANNOTATION_TYPE, 
annotationType);
         }
-        if (!StringUtils.isBlank(file.getSubtype())) {
-            embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, file.getSubtype());
+        if (!StringUtils.isBlank(pdEmbeddedFile.getSubtype())) {
+            embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, 
pdEmbeddedFile.getSubtype());
+        }
+        if (!StringUtils.isBlank(spec.getFileDescription())) {
+            embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, 
spec.getFileDescription());
         }
-        if (!StringUtils.isBlank(description)) {
-            embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, description);
+        String afRelationship = 
spec.getCOSObject().getNameAsString(AF_RELATIONSHIP);
+        if (StringUtils.isBlank(afRelationship)) {
+            afRelationship = spec.getCOSObject().getString(AF_RELATIONSHIP);
+        }
+        if (!StringUtils.isBlank(afRelationship)) {
+            embeddedMetadata.set(PDF.ASSOCIATED_FILE_RELATIONSHIP, 
afRelationship);
         }
         if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             return;
         }
         TikaInputStream stream = null;
         try {
-            stream = TikaInputStream.get(file.createInputStream());
+            stream = TikaInputStream.get(pdEmbeddedFile.createInputStream());
         } catch (IOException e) {
             //store this exception in the parent's metadata
             EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
@@ -699,11 +718,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                 }
                 if (annotation instanceof PDAnnotationFileAttachment) {
                     PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
-                    if (fann.getFile() instanceof PDComplexFileSpecification) {
-                        handlePDComplexFileSpec(fann.getAttachmentName(),
-                                "annotationFileAttachment",
-                                (PDComplexFileSpecification) fann.getFile());
-                    }
+                    String subtype = "annotationFileAttachment";
+                    AttributesImpl attributes = new AttributesImpl();
+                    attributes.addAttribute("", "source", "source", "CDATA", 
subtype);
+                    processDocOnAction("", subtype, fann.getFile(),
+                                attributes);
                 } else if (annotation instanceof PDAnnotationWidget) {
                     handleWidget((PDAnnotationWidget) annotation);
                 } else {
@@ -717,8 +736,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                         num3DAnnotations++;
                     }
                     for (COSDictionary fileSpec : 
findFileSpecs(annotation.getCOSObject())) {
-                        PDComplexFileSpecification cfs = new 
PDComplexFileSpecification(fileSpec);
-                        handlePDComplexFileSpec(cfs.getFilename(), 
annotationSubtype, cfs);
+                        AttributesImpl attributes = new AttributesImpl();
+                        attributes.addAttribute("", "source", "source", 
"CDATA", annotationSubtype);
+                        processDocOnAction("", annotationSubtype,
+                                createFileSpecification(fileSpec),
+                                attributes);
                     }
                 }
                 // TODO: remove once PDFBOX-1143 is fixed:
@@ -821,20 +843,32 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH);
     }
 
-    private void handlePDComplexFileSpec(String attachmentName, String 
annotationType,
-                                         PDComplexFileSpecification fileSpec) 
throws IOException {
+    private void extractFilesfromEFTree(PDNameTreeNode efTree,
+                                        List<NameSpecTuple> embeddedFileNames,
+                                        int depth) throws IOException {
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth");
+        }
+        Map<String, PDComplexFileSpecification> names = null;
         try {
-            AttributesImpl attributes = new AttributesImpl();
-            attributes.addAttribute("", "source", "source", "CDATA", 
annotationType);
-            extractMultiOSPDEmbeddedFiles(attachmentName, annotationType, 
fileSpec, attributes);
-        } catch (SAXException e) {
-            throw new IOException("file embedded in annotation sax exception", 
e);
-        } catch (TikaException e) {
-            throw new IOException("file embedded in annotation tika 
exception", e);
+            names = efTree.getNames();
         } catch (IOException e) {
-            handleCatchableIOE(e);
+            //LOG?
+        }
+        if (names != null) {
+            for (Map.Entry<String, PDComplexFileSpecification> e : 
names.entrySet()) {
+                embeddedFileNames.add(new NameSpecTuple(e.getKey(), 
e.getValue()));
+            }
         }
 
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
+        if (kids == null) {
+            return;
+        } else {
+            for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
+                extractFilesfromEFTree(node, embeddedFileNames, depth + 1);
+            }
+        }
     }
 
 
@@ -908,17 +942,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         addNonNullAttribute("trigger", actionTrigger.name(), attributes);
 
         if (action instanceof PDActionImportData) {
-            processDoc("", "", ((PDActionImportData) action).getFile(), 
attributes);
+            processDocOnAction("", "", ((PDActionImportData) 
action).getFile(), attributes);
         } else if (action instanceof PDActionLaunch) {
             PDActionLaunch pdActionLaunch = (PDActionLaunch) action;
             addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
             addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), 
attributes);
             addNonNullAttribute("operation", pdActionLaunch.getO(), 
attributes);
             addNonNullAttribute("parameters", pdActionLaunch.getP(), 
attributes);
-            processDoc(pdActionLaunch.getF(), "", pdActionLaunch.getFile(), 
attributes);
+            processDocOnAction(pdActionLaunch.getF(), "", 
pdActionLaunch.getFile(), attributes);
         } else if (action instanceof PDActionRemoteGoTo) {
             PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action;
-            processDoc("", "", remoteGoTo.getFile(), attributes);
+            processDocOnAction("", "", remoteGoTo.getFile(), attributes);
         } else if (action instanceof PDActionJavaScript) {
             PDActionJavaScript jsAction = (PDActionJavaScript) action;
             Metadata m = new Metadata();
@@ -1323,6 +1357,33 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private PDFileSpecification createFileSpecification(COSBase cosBase) {
+        try {
+            return PDFileSpecification.createFS(cosBase);
+        } catch (IOException e) {
+            //swallow for now
+        }
+        return null;
+    }
+
+    private static class NameSpecTuple {
+        private final String name;
+        private final PDComplexFileSpecification spec;
+
+        public NameSpecTuple(String name, PDComplexFileSpecification spec) {
+            this.name = name;
+            this.spec = spec;
+        }
+
+        public String getName() {
+            return name;
+        }
+
+        public PDComplexFileSpecification getSpec() {
+            return spec;
+        }
+    }
+
     enum ActionTrigger {
         AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS, 
ANNOTATION_CURSOR_EXIT,
         ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, 
ANNOTATION_MOUSE_RELEASED,
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 7df87a171..ffa05f393 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1433,5 +1433,4 @@ public class PDFParserTest extends TikaTest {
         metadata.set(TikaCoreProperties.TIKA_CONTENT, 
contentHandler.toString());
         return metadata;
     }*/
-
 }

[tika] branch main updated: TIKA-4012 -- improve extraction of embedded docs in PDFs by looking beyond names tree and annotations (#1079)

Reply via email to