This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit fd474b6541cb397e9a1db4965b1725b1d9b5e241 Author: tballison <[email protected]> AuthorDate: Wed Oct 19 12:18:29 2022 -0400 TIKA-3886 -- extract annotationtype for embedded files in PDFs --- .../main/java/org/apache/tika/metadata/PDF.java | 13 +++++++ .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 40 +++++++++++++++------- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index bdecb3a9b..1400804c5 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -139,6 +139,18 @@ public interface PDF { Property EMBEDDED_FILE_DESCRIPTION = Property.externalText(PDF_PREFIX + "embeddedFileDescription"); + /** + * If the file came from an annotation and there was a type + */ + Property EMBEDDED_FILE_ANNOTATION_TYPE = Property.internalText(PDF_PREFIX + + "embeddedFileAnnotationType"); + + /** + * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF + * alleges is the embedded file's mime type + */ + Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX + + "embeddedFileSubtype"); /** * If the PDF has an annotation of type 3D */ @@ -147,4 +159,5 @@ public interface PDF { Property ANNOTATION_TYPES = Property.internalTextBag(PDF_PREFIX + "annotationTypes"); Property ANNOTATION_SUBTYPES = Property.internalTextBag(PDF_PREFIX + "annotationSubtypes"); + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index bd9b99c6c..63331b2fa 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -370,9 +370,11 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } - private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) + private void processDoc(String name, String annotationType, PDFileSpecification spec, + AttributesImpl attributes) throws TikaException, SAXException, IOException { if (spec instanceof PDSimpleFileSpecification) { + //((PDSimpleFileSpecification)spec).getFile(); attributes.addAttribute("", "class", "class", "CDATA", "linked"); attributes.addAttribute("", "id", "id", "CDATA", spec.getFile()); xhtml.startElement("div", attributes); @@ -381,7 +383,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { if (attributes.getIndex("source") < 0) { attributes.addAttribute("", "source", "source", "CDATA", "attachment"); } - extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification) spec, attributes); + extractMultiOSPDEmbeddedFiles(name, annotationType, (PDComplexFileSpecification) spec, + attributes); } } @@ -392,11 +395,13 @@ class AbstractPDF2XHTML extends PDFTextStripper { } for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { - processDoc(ent.getKey(), ent.getValue(), new AttributesImpl()); + processDoc(ent.getKey(), "", ent.getValue(), new AttributesImpl()); } } - private void extractMultiOSPDEmbeddedFiles(String displayName, PDComplexFileSpecification spec, + private void extractMultiOSPDEmbeddedFiles(String displayName, + String annotationType, + PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException, SAXException, TikaException { @@ -404,20 +409,23 @@ class AbstractPDF2XHTML extends PDFTextStripper { return; } //current strategy is to pull all, not just first non-null - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), + extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFile(), spec.getFileDescription(), spec.getEmbeddedFile(), attributes); - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), + extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFileMac(), spec.getFileDescription(), spec.getEmbeddedFileMac(), attributes); - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), + extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), spec.getFileDos(), spec.getFileDescription(), spec.getEmbeddedFileDos(), attributes); - extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), + extractPDEmbeddedFile(displayName, annotationType, spec.getFileUnicode(), + spec.getFileUnix(), spec.getFileDescription(), spec.getEmbeddedFileUnix(), attributes); //Check for /Thumb (thumbnail image); // /CI (collection item) adobe specific, can have /adobe:DisplayName and a summary } - private void extractPDEmbeddedFile(String displayName, String unicodeFileName, String fileName, + private void extractPDEmbeddedFile(String displayName, + String annotationType, String unicodeFileName, + String fileName, String description, PDEmbeddedFile file, AttributesImpl attributes) throws SAXException, IOException, TikaException { @@ -438,6 +446,12 @@ class AbstractPDF2XHTML extends PDFTextStripper { embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); + if (!StringUtils.isBlank(annotationType)) { + embeddedMetadata.set(PDF.EMBEDDED_FILE_ANNOTATION_TYPE, annotationType); + } + if (!StringUtils.isBlank(file.getSubtype())) { + embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, file.getSubtype()); + } if (!StringUtils.isBlank(description)) { embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, description); } @@ -800,7 +814,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", annotationType); - extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec, attributes); + extractMultiOSPDEmbeddedFiles(attachmentName, annotationType, fileSpec, attributes); } catch (SAXException e) { throw new IOException("file embedded in annotation sax exception", e); } catch (TikaException e) { @@ -874,17 +888,17 @@ class AbstractPDF2XHTML extends PDFTextStripper { addNonNullAttribute("trigger", actionTrigger.name(), attributes); if (action instanceof PDActionImportData) { - processDoc("", ((PDActionImportData) action).getFile(), attributes); + processDoc("", "", ((PDActionImportData) action).getFile(), attributes); } else if (action instanceof PDActionLaunch) { PDActionLaunch pdActionLaunch = (PDActionLaunch) action; addNonNullAttribute("id", pdActionLaunch.getF(), attributes); addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes); addNonNullAttribute("operation", pdActionLaunch.getO(), attributes); addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes); - processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes); + processDoc(pdActionLaunch.getF(), "", pdActionLaunch.getFile(), attributes); } else if (action instanceof PDActionRemoteGoTo) { PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action; - processDoc("", remoteGoTo.getFile(), attributes); + processDoc("", "", remoteGoTo.getFile(), attributes); } else if (action instanceof PDActionJavaScript) { PDActionJavaScript jsAction = (PDActionJavaScript) action; Metadata m = new Metadata();
