This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5062690cb18be20a6bde5b5e5e55755586c79ee2 Author: tballison <[email protected]> AuthorDate: Wed Oct 19 13:07:57 2022 -0400 TIKA-3886 -- Extract PDF actions and triggers into the file's metadata --- CHANGES.txt | 5 ++++ .../main/java/org/apache/tika/metadata/PDF.java | 10 ++++++++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 28 ++++++++++++++++++++-- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 5370bc65e..8c7cfeb85 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,10 @@ Release 2.5.1 - ??? + * Fix bug in OpenSearch emitter that prevented upserts on + documents with embedded files (TIKA-3882). + + * Extract PDF actions and triggers into the file's metadata (TIKA-3886). + * Add a tika-async-cli module (TIKA-3885). * Fetch keys sent via headers to tika server are now URL decoded (TIKA-3864). diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index 1400804c5..e683f321d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -82,9 +82,19 @@ public interface PDF { /** * This specifies where an action or destination would be found/triggered * in the document: on document open, before close, etc. + * + * This is included in the embedded document (js only for now?), not the container PDF. */ Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX + "actionTrigger"); + /** + * This is a list of all action or destination triggers contained + * within a given PDF. + */ + Property ACTION_TRIGGERS = Property.internalTextBag(PDF_PREFIX + "actionTriggers"); + + Property ACTION_TYPES = Property.internalTextBag(PDF_PREFIX + "actionTypes"); + Property CHARACTERS_PER_PAGE = Property.internalIntegerSequence(PDF_PREFIX + "charsPerPage"); Property UNMAPPED_UNICODE_CHARS_PER_PAGE = diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 63331b2fa..58daefb72 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -165,6 +165,10 @@ class AbstractPDF2XHTML extends PDFTextStripper { private final Set<String> fontNames = new TreeSet<>(); private final Set<String> annotationTypes = new TreeSet<>(); private final Set<String> annotationSubtypes = new TreeSet<>(); + + private final Set<String> triggers = new TreeSet<>(); + + private final Set<String> actionTypes = new TreeSet<>(); //zero-based pageIndex int pageIndex = 0; int startPage = -1; @@ -442,7 +446,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName); embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); - embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); + //if the stream is missing a size, -1 is returned + long sz = file.getSize(); + if (sz > -1) { + embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz)); + } embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); @@ -880,8 +888,16 @@ class AbstractPDF2XHTML extends PDFTextStripper { if (action == null || !config.isExtractActions()) { return; } + triggers.add(actionTrigger.name()); + String actionOrDestString = "destination"; + if (action instanceof PDAction) { + actionOrDestString = "action"; + String actionType = ((PDAction)action).getType(); + if (! StringUtils.isBlank(actionType)) { + actionTypes.add(actionType); + } + } AttributesImpl attributes = new AttributesImpl(); - String actionOrDestString = (action instanceof PDAction) ? "action" : "destination"; addNonNullAttribute("class", actionOrDestString, attributes); addNonNullAttribute("type", action.getClass().getSimpleName(), attributes); @@ -972,6 +988,14 @@ class AbstractPDF2XHTML extends PDFTextStripper { for (String annotationSubtype : annotationSubtypes) { metadata.add(PDF.ANNOTATION_SUBTYPES, annotationSubtype); } + + for (String trigger : triggers) { + metadata.add(PDF.ACTION_TRIGGERS, trigger); + } + + for (String actionType : actionTypes) { + metadata.add(PDF.ACTION_TYPES, actionType); + } xhtml.endDocument(); } catch (TikaException | SAXException e) { throw new IOException("Unable to end a document", e);
