[tika] 01/01: TIKA-4012 -- improve document extraction in PDFs

tallison Tue, 11 Apr 2023 13:14:37 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4012
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 884b1c92a41b409ec7a2c7e7714f3a0669f40708
Author: tallison <talli...@apache.org>
AuthorDate: Tue Apr 11 16:14:11 2023 -0400

    TIKA-4012 -- improve document extraction in PDFs
---
 CHANGES.txt                                        |   2 +
 .../main/java/org/apache/tika/metadata/PDF.java    |   3 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 155 ++++++++++-----------
 3 files changed, 78 insertions(+), 82 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 36c880824..f20c6c596 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,8 @@ Release 2.7.1 - ???
 
    * Add extraction of rendition layout value and version from Epub 
(TIKA-4013).
 
+   * Improve embedded file extraction from PDFs (TIKA-4012).
+
    * Users may now avoid the ZeroByteFileException via a
      setting on the AutoDetectParserConfig (TIKA-3976).
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java 
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 1b96231c4..2b21e2590 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -175,4 +175,7 @@ public interface PDF {
      * Number of 3D annotations a PDF contains.  This makes {@link PDF#HAS_3D} 
redundant.
      */
     Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX + 
"num3DAnnotations");
+
+    Property ASSOCIATED_FILE_RELATIONSHIP = Property.internalText(PDF_PREFIX +
+            "associatedFileRelationship");
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 4f2b2c864..e68dd97d6 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -35,7 +35,6 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -48,17 +47,16 @@ import javax.xml.stream.XMLStreamException;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
 import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
-import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 import 
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
@@ -148,6 +146,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private static final String NULL_STRING = "null";
     private static final MediaType XFA_MEDIA_TYPE = 
MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = 
MediaType.application("rdf+xml");
+
+    private static final COSName AF_RELATIONSHIP = 
COSName.getPDFName("AFRelationship");
     final List<IOException> exceptions = new ArrayList<>();
     final PDDocument pdDocument;
     final XHTMLContentHandler xhtml;
@@ -169,6 +169,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private final Set<String> triggers = new TreeSet<>();
 
     private final Set<String> actionTypes = new TreeSet<>();
+
+    //these are files that we extract as part of Annotations
+    //We don't want to extract them twice when we go through the
+    //full DOM looking for /Type = /EmbeddedFile
+    private final Set<COSBase> extractedFiles = new HashSet<>();
     //zero-based pageIndex
     int pageIndex = 0;
     int startPage = -1;
@@ -333,52 +338,37 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
-        PDDocumentNameDictionary namesDictionary =
-                new PDDocumentNameDictionary(document.getDocumentCatalog());
-        PDEmbeddedFilesNameTreeNode efTree = 
namesDictionary.getEmbeddedFiles();
-        if (efTree == null) {
-            return;
-        }
-
-        Map<String, PDComplexFileSpecification> embeddedFileNames = new 
HashMap<>();
-        int depth = 0;
-        //recursively find embedded files
-        extractFilesfromEFTree(efTree, embeddedFileNames, depth);
-        processEmbeddedDocNames(embeddedFileNames);
+        //See 14.13.10 of the 2.0 spec.  Associated files can show up in lots of places...even
+        // streams.
+        // It would be great to get more context from the /AF info, but we risk missing files
+        //if we don't look everywhere.  With the current method, we're at least getting all
+        //filespecs at the cost of losing context (to what was this file attached?).
 
-    }
-
-    private void extractFilesfromEFTree(PDNameTreeNode efTree,
-                                        Map<String, 
PDComplexFileSpecification> embeddedFileNames,
-                                        int depth) throws IOException {
-        if (depth > MAX_RECURSION_DEPTH) {
-            throw new IOException("Hit max recursion depth");
-        }
-        Map<String, PDComplexFileSpecification> names = null;
-        try {
-            names = efTree.getNames();
-        } catch (IOException e) {
-            //LOG?
-        }
-        if (names != null) {
-            for (Map.Entry<String, PDComplexFileSpecification> e : 
names.entrySet()) {
-                embeddedFileNames.put(e.getKey(), e.getValue());
-            }
+        List<COSObject> objs = 
document.getDocument().getObjectsByType(COSName.FILESPEC);
+        for (COSObject obj : objs) {
+            processDoc("", "", createFileSpecification(obj.getObject()), new 
AttributesImpl());
         }
+    }
 
-        List<PDNameTreeNode<PDComplexFileSpecification>> kids = 
efTree.getKids();
-        if (kids == null) {
+    private void processDocOnAction(String name, String annotationType, 
PDFileSpecification spec,
+                            AttributesImpl attributes)
+            throws TikaException, SAXException, IOException {
+        if (spec == null) {
             return;
-        } else {
-            for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                extractFilesfromEFTree(node, embeddedFileNames, depth + 1);
-            }
         }
+        processDoc(name, annotationType, spec, attributes);
+        extractedFiles.add(spec.getCOSObject());
     }
 
     private void processDoc(String name, String annotationType, 
PDFileSpecification spec,
                             AttributesImpl attributes)
             throws TikaException, SAXException, IOException {
+        if (spec == null) {
+            return;
+        }
+        if (extractedFiles.contains(spec.getCOSObject())) {
+            return;
+        }
         if (spec instanceof PDSimpleFileSpecification) {
             //((PDSimpleFileSpecification)spec).getFile();
             attributes.addAttribute("", "class", "class", "CDATA", "linked");
@@ -394,17 +384,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
-    private void processEmbeddedDocNames(Map<String, 
PDComplexFileSpecification> embeddedFileNames)
-            throws IOException, SAXException, TikaException {
-        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
-            return;
-        }
-
-        for (Map.Entry<String, PDComplexFileSpecification> ent : 
embeddedFileNames.entrySet()) {
-            processDoc(ent.getKey(), "", ent.getValue(), new AttributesImpl());
-        }
-    }
-
     private void extractMultiOSPDEmbeddedFiles(String displayName,
                                                String annotationType,
                                                PDComplexFileSpecification spec,
@@ -414,42 +393,43 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (spec == null) {
             return;
         }
+
+
         //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(), spec.getFile(),
-                spec.getFileDescription(), spec.getEmbeddedFile(), attributes);
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(), spec.getFileMac(),
-                spec.getFileDescription(), spec.getEmbeddedFileMac(), 
attributes);
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(), spec.getFileDos(),
-                spec.getFileDescription(), spec.getEmbeddedFileDos(), 
attributes);
-        extractPDEmbeddedFile(displayName, annotationType, 
spec.getFileUnicode(),
-                spec.getFileUnix(),
-                spec.getFileDescription(), spec.getEmbeddedFileUnix(), 
attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFile(), spec.getEmbeddedFile(), attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
+        extractPDEmbeddedFile(displayName, annotationType, spec,
+                spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
 
         //Check for /Thumb (thumbnail image);
         // /CI (collection item) adobe specific, can have /adobe:DisplayName 
and a summary
     }
 
     private void extractPDEmbeddedFile(String displayName,
-                                       String annotationType, String 
unicodeFileName,
+                                       String annotationType,
+                                       PDComplexFileSpecification spec,
                                        String fileName,
-                                       String description, PDEmbeddedFile file,
+                                       PDEmbeddedFile pdEmbeddedFile,
                                        AttributesImpl attributes)
-            throws SAXException, IOException, TikaException {
+            throws SAXException, IOException {
 
-        if (file == null) {
+        if (pdEmbeddedFile == null) {
             //skip silently
             return;
         }
 
-        fileName = (fileName == null || "".equals(fileName.trim())) ? 
unicodeFileName : fileName;
+        fileName = (fileName == null || "".equals(fileName.trim())) ? 
spec.getFileUnicode() : fileName;
         fileName = (fileName == null || "".equals(fileName.trim())) ? 
displayName : fileName;
 
         // TODO: other metadata?
         Metadata embeddedMetadata = new Metadata();
         embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
-        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
         //if the stream is missing a size, -1 is returned
-        long sz = file.getSize();
+        long sz = pdEmbeddedFile.getSize();
         if (sz > -1) {
             embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz));
         }
@@ -459,18 +439,22 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (!StringUtils.isBlank(annotationType)) {
             embeddedMetadata.set(PDF.EMBEDDED_FILE_ANNOTATION_TYPE, 
annotationType);
         }
-        if (!StringUtils.isBlank(file.getSubtype())) {
-            embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, file.getSubtype());
+        if (!StringUtils.isBlank(pdEmbeddedFile.getSubtype())) {
+            embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, 
pdEmbeddedFile.getSubtype());
+        }
+        if (!StringUtils.isBlank(spec.getFileDescription())) {
+            embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, 
spec.getFileDescription());
         }
-        if (!StringUtils.isBlank(description)) {
-            embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, description);
+        if 
(!StringUtils.isBlank(spec.getCOSObject().getString(AF_RELATIONSHIP))) {
+            embeddedMetadata.set(PDF.ASSOCIATED_FILE_RELATIONSHIP,
+                    spec.getCOSObject().getString(AF_RELATIONSHIP));
         }
         if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             return;
         }
         TikaInputStream stream = null;
         try {
-            stream = TikaInputStream.get(file.createInputStream());
+            stream = TikaInputStream.get(pdEmbeddedFile.createInputStream());
         } catch (IOException e) {
             //store this exception in the parent's metadata
             EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
@@ -699,11 +683,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                 }
                 if (annotation instanceof PDAnnotationFileAttachment) {
                     PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
-                    if (fann.getFile() instanceof PDComplexFileSpecification) {
-                        handlePDComplexFileSpec(fann.getAttachmentName(),
-                                "annotationFileAttachment",
-                                (PDComplexFileSpecification) fann.getFile());
-                    }
+                    processDocOnAction("", "annotationFileAttachment", 
fann.getFile(),
+                                new AttributesImpl());
                 } else if (annotation instanceof PDAnnotationWidget) {
                     handleWidget((PDAnnotationWidget) annotation);
                 } else {
@@ -717,8 +698,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                         num3DAnnotations++;
                     }
                     for (COSDictionary fileSpec : 
findFileSpecs(annotation.getCOSObject())) {
-                        PDComplexFileSpecification cfs = new 
PDComplexFileSpecification(fileSpec);
-                        handlePDComplexFileSpec(cfs.getFilename(), 
annotationSubtype, cfs);
+                        processDocOnAction("", annotationSubtype,
+                                createFileSpecification(fileSpec),
+                                new AttributesImpl());
                     }
                 }
                 // TODO: remove once PDFBOX-1143 is fixed:
@@ -908,17 +890,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         addNonNullAttribute("trigger", actionTrigger.name(), attributes);
 
         if (action instanceof PDActionImportData) {
-            processDoc("", "", ((PDActionImportData) action).getFile(), 
attributes);
+            processDocOnAction("", "", ((PDActionImportData) 
action).getFile(), attributes);
         } else if (action instanceof PDActionLaunch) {
             PDActionLaunch pdActionLaunch = (PDActionLaunch) action;
             addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
             addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), 
attributes);
             addNonNullAttribute("operation", pdActionLaunch.getO(), 
attributes);
             addNonNullAttribute("parameters", pdActionLaunch.getP(), 
attributes);
-            processDoc(pdActionLaunch.getF(), "", pdActionLaunch.getFile(), 
attributes);
+            processDocOnAction(pdActionLaunch.getF(), "", 
pdActionLaunch.getFile(), attributes);
         } else if (action instanceof PDActionRemoteGoTo) {
             PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action;
-            processDoc("", "", remoteGoTo.getFile(), attributes);
+            processDocOnAction("", "", remoteGoTo.getFile(), attributes);
         } else if (action instanceof PDActionJavaScript) {
             PDActionJavaScript jsAction = (PDActionJavaScript) action;
             Metadata m = new Metadata();
@@ -1323,6 +1305,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private PDFileSpecification createFileSpecification(COSBase cosBase) {
+        try {
+            return PDFileSpecification.createFS(cosBase);
+        } catch (IOException e) {
+            //swallow for now
+        }
+        return null;
+    }
+
     enum ActionTrigger {
         AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS, 
ANNOTATION_CURSOR_EXIT,
         ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, 
ANNOTATION_MOUSE_RELEASED,

[tika] 01/01: TIKA-4012 -- improve document extraction in PDFs

Reply via email to