This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5062690cb18be20a6bde5b5e5e55755586c79ee2
Author: tballison <[email protected]>
AuthorDate: Wed Oct 19 13:07:57 2022 -0400

    TIKA-3886 -- Extract PDF actions and triggers into the file's metadata
---
 CHANGES.txt                                        |  5 ++++
 .../main/java/org/apache/tika/metadata/PDF.java    | 10 ++++++++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 28 ++++++++++++++++++++--
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 5370bc65e..8c7cfeb85 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,10 @@
 Release 2.5.1 - ???
 
+   * Fix bug in OpenSearch emitter that prevented upserts on
+     documents with embedded files (TIKA-3882).
+
+   * Extract PDF actions and triggers into the file's metadata (TIKA-3886).
+
    * Add a tika-async-cli module (TIKA-3885).
 
    * Fetch keys sent via headers to tika server are now URL decoded (TIKA-3864).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 1400804c5..e683f321d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -82,9 +82,19 @@ public interface PDF {
     /**
      * This specifies where an action or destination would be found/triggered
      * in the document: on document open, before close, etc.
+     *
+     * This is included in the embedded document (js only for now?), not the container PDF.
      */
     Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX + "actionTrigger");
 
+    /**
+     * This is a list of all action or destination triggers contained
+     * within a given PDF.
+     */
+    Property ACTION_TRIGGERS = Property.internalTextBag(PDF_PREFIX + "actionTriggers");
+
+    Property ACTION_TYPES = Property.internalTextBag(PDF_PREFIX + "actionTypes");
+
     Property CHARACTERS_PER_PAGE = Property.internalIntegerSequence(PDF_PREFIX + "charsPerPage");
 
     Property UNMAPPED_UNICODE_CHARS_PER_PAGE =
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 63331b2fa..58daefb72 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -165,6 +165,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private final Set<String> fontNames = new TreeSet<>();
     private final Set<String> annotationTypes = new TreeSet<>();
     private final Set<String> annotationSubtypes = new TreeSet<>();
+
+    private final Set<String> triggers = new TreeSet<>();
+
+    private final Set<String> actionTypes = new TreeSet<>();
     //zero-based pageIndex
     int pageIndex = 0;
     int startPage = -1;
@@ -442,7 +446,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         Metadata embeddedMetadata = new Metadata();
         embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
         embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
-        embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        //if the stream is missing a size, -1 is returned
+        long sz = file.getSize();
+        if (sz > -1) {
+            embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz));
+        }
         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
         embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
@@ -880,8 +888,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (action == null || !config.isExtractActions()) {
             return;
         }
+        triggers.add(actionTrigger.name());
+        String actionOrDestString = "destination";
+        if (action instanceof PDAction) {
+            actionOrDestString = "action";
+            String actionType = ((PDAction)action).getType();
+            if (! StringUtils.isBlank(actionType)) {
+                actionTypes.add(actionType);
+            }
+        }
         AttributesImpl attributes = new AttributesImpl();
-        String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
 
         addNonNullAttribute("class", actionOrDestString, attributes);
         addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
@@ -972,6 +988,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             for (String annotationSubtype : annotationSubtypes) {
                 metadata.add(PDF.ANNOTATION_SUBTYPES, annotationSubtype);
             }
+
+            for (String trigger : triggers) {
+                metadata.add(PDF.ACTION_TRIGGERS, trigger);
+            }
+
+            for (String actionType : actionTypes) {
+                metadata.add(PDF.ACTION_TYPES, actionType);
+            }
             xhtml.endDocument();
         } catch (TikaException | SAXException e) {
             throw new IOException("Unable to end a document", e);

Reply via email to