This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit fd474b6541cb397e9a1db4965b1725b1d9b5e241
Author: tballison <[email protected]>
AuthorDate: Wed Oct 19 12:18:29 2022 -0400

    TIKA-3886 -- extract annotationtype for embedded files in PDFs
---
 .../main/java/org/apache/tika/metadata/PDF.java    | 13 +++++++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 40 +++++++++++++++-------
 2 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index bdecb3a9b..1400804c5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -139,6 +139,18 @@ public interface PDF {
     Property EMBEDDED_FILE_DESCRIPTION = Property.externalText(PDF_PREFIX +
             "embeddedFileDescription");
 
+    /**
+     * If the file came from an annotation and there was a type
+     */
+    Property EMBEDDED_FILE_ANNOTATION_TYPE = Property.internalText(PDF_PREFIX +
+            "embeddedFileAnnotationType");
+
+    /**
+     *     literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF
+     *     alleges is the embedded file's mime type
+     */
+    Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX +
+            "embeddedFileSubtype");
     /**
      * If the PDF has an annotation of type 3D
      */
@@ -147,4 +159,5 @@ public interface PDF {
     Property ANNOTATION_TYPES = Property.internalTextBag(PDF_PREFIX + "annotationTypes");
 
     Property ANNOTATION_SUBTYPES = Property.internalTextBag(PDF_PREFIX + "annotationSubtypes");
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index bd9b99c6c..63331b2fa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -370,9 +370,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
-    private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes)
+    private void processDoc(String name, String annotationType, PDFileSpecification spec,
+                            AttributesImpl attributes)
             throws TikaException, SAXException, IOException {
         if (spec instanceof PDSimpleFileSpecification) {
+            //((PDSimpleFileSpecification)spec).getFile();
             attributes.addAttribute("", "class", "class", "CDATA", "linked");
             attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
             xhtml.startElement("div", attributes);
@@ -381,7 +383,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             if (attributes.getIndex("source") < 0) {
                 attributes.addAttribute("", "source", "source", "CDATA", "attachment");
             }
-            extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification) spec, attributes);
+            extractMultiOSPDEmbeddedFiles(name, annotationType, (PDComplexFileSpecification) spec,
+                    attributes);
         }
     }
 
@@ -392,11 +395,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
 
         for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
-            processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
+            processDoc(ent.getKey(), "", ent.getValue(), new AttributesImpl());
         }
     }
 
-    private void extractMultiOSPDEmbeddedFiles(String displayName, PDComplexFileSpecification spec,
+    private void extractMultiOSPDEmbeddedFiles(String displayName,
+                                               String annotationType,
+                                               PDComplexFileSpecification spec,
                                                AttributesImpl attributes)
             throws IOException, SAXException, TikaException {
 
@@ -404,20 +409,23 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             return;
         }
         //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(),
+        extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFile(),
                 spec.getFileDescription(), spec.getEmbeddedFile(), attributes);
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(),
+        extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFileMac(),
                 spec.getFileDescription(), spec.getEmbeddedFileMac(), attributes);
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(),
+        extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFileDos(),
                 spec.getFileDescription(), spec.getEmbeddedFileDos(), attributes);
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(),
+        extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(),
+                spec.getFileUnix(),
                 spec.getFileDescription(), spec.getEmbeddedFileUnix(), attributes);
 
         //Check for /Thumb (thumbnail image);
         // /CI (collection item) adobe specific, can have /adobe:DisplayName and a summary
     }
 
-    private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName,
+    private void extractPDEmbeddedFile(String displayName,
+                                       String annotationType, String unicodeFileName,
+                                       String fileName,
                                        String description, PDEmbeddedFile file,
                                        AttributesImpl attributes)
             throws SAXException, IOException, TikaException {
@@ -438,6 +446,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
         embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+        if (!StringUtils.isBlank(annotationType)) {
+            embeddedMetadata.set(PDF.EMBEDDED_FILE_ANNOTATION_TYPE, annotationType);
+        }
+        if (!StringUtils.isBlank(file.getSubtype())) {
+            embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, file.getSubtype());
+        }
         if (!StringUtils.isBlank(description)) {
             embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, description);
         }
@@ -800,7 +814,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         try {
             AttributesImpl attributes = new AttributesImpl();
             attributes.addAttribute("", "source", "source", "CDATA", annotationType);
-            extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec, attributes);
+            extractMultiOSPDEmbeddedFiles(attachmentName, annotationType, fileSpec, attributes);
         } catch (SAXException e) {
             throw new IOException("file embedded in annotation sax exception", e);
         } catch (TikaException e) {
@@ -874,17 +888,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         addNonNullAttribute("trigger", actionTrigger.name(), attributes);
 
         if (action instanceof PDActionImportData) {
-            processDoc("", ((PDActionImportData) action).getFile(), attributes);
+            processDoc("", "", ((PDActionImportData) action).getFile(), attributes);
         } else if (action instanceof PDActionLaunch) {
             PDActionLaunch pdActionLaunch = (PDActionLaunch) action;
             addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
             addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
             addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
             addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
-            processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
+            processDoc(pdActionLaunch.getF(), "", pdActionLaunch.getFile(), attributes);
         } else if (action instanceof PDActionRemoteGoTo) {
             PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action;
-            processDoc("", remoteGoTo.getFile(), attributes);
+            processDoc("", "", remoteGoTo.getFile(), attributes);
         } else if (action instanceof PDActionJavaScript) {
             PDActionJavaScript jsAction = (PDActionJavaScript) action;
             Metadata m = new Metadata();

Reply via email to