This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4082 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4fd6fcd9f53637c1f5e34dc528b6801a27daf37e Author: tballison <talli...@apache.org> AuthorDate: Thu Jun 15 15:35:39 2023 -0400 TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload --- .../java/org/apache/tika/parser/pdf/PDFParser.java | 56 ++++++++++++++++++++- .../apache/tika/parser/pdf/PDFParserConfig.java | 11 ++++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 12 +++++ .../test-documents/testMicrosoftIRMServices.pdf | Bin 0 -> 290327 bytes 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index eb2bd4664..c51a5152d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -34,6 +34,8 @@ import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObject; +import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream; import org.apache.pdfbox.io.RandomAccessRead; @@ -130,6 +132,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia */ private static final long serialVersionUID = -752276948656079347L; private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE); + + private static COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship"); + + private static COSName ENCRYPTED_PAYLOAD = COSName.getPDFName("EncryptedPayload"); private PDFParserConfig defaultConfig = new PDFParserConfig(); public Set<MediaType> getSupportedTypes(ParseContext context) { @@ -188,7 +194,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia if (tstream != null) { tstream.setOpenContainer(pdfDocument); } - + checkEncryptedPayload(pdfDocument, localConfig); boolean hasXFA = hasXFA(pdfDocument, metadata); boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata); extractMetadata(pdfDocument, metadata, context); @@ -238,6 +244,32 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia } } + private void checkEncryptedPayload(PDDocument pdfDocument, PDFParserConfig localConfig) + throws IOException, EncryptedDocumentException { + if (! localConfig.isThrowOnEncryptedPayload()) { + return; + } + List<COSObject> fileSpecs = pdfDocument.getDocument().getObjectsByType(COSName.FILESPEC); + //Do we want to also check that this is a portfolio PDF/contains a "collection"? + for (COSObject obj : fileSpecs) { + if (obj.getObject() instanceof COSDictionary) { + COSBase relationship = obj.getDictionaryObject(AF_RELATIONSHIP); + if (relationship != null && relationship.equals(ENCRYPTED_PAYLOAD)) { + String name = ""; + COSBase uf = obj.getDictionaryObject(COSName.UF); + COSBase f = obj.getDictionaryObject(COSName.F); + if (uf != null && uf instanceof COSString) { + name = ((COSString)uf).getString(); + } else if (f != null && f instanceof COSString) { + name = ((COSString)f).getString(); + } + throw new EncryptedDocumentException("PDF file contains an encrypted " + + "payload: '" + name + "'"); + } + } + } + } + private void scanXRefOffsets(PDFParserConfig localConfig, TikaInputStream tikaInputStream, Metadata metadata, @@ -986,6 +1018,28 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia return defaultConfig.getMaxIncrementalUpdates(); } + /** + * If the file contains an embedded file with a defined 'AssociatedFile' + * value of 'EncryptedPayload', then throw an {@link EncryptedDocumentException}. + *<p> + * Microsoft IRM v2 wraps the encrypted document inside a container PDF. + * See TIKA-4082. + * <p> + * The goal of this is to make the user experience the same for + * traditionally encrypted files and PDFs that are containers + * for `EncryptedPayload`s. + * <p> + * The default value is <code>false</code>. + * + * @param throwOnEncryptedPayload + */ + public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) { + defaultConfig.setThrowOnEncryptedPayload(throwOnEncryptedPayload); + } + + public boolean isThrowOnEncryptedPayload() { + return defaultConfig.isThrowOnEncryptedPayload(); + } /** * This is a no-op. There is no need to initialize multiple fields. * The regular field loading should happen without this. diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index d401b9608..0ee4b274b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -148,6 +148,8 @@ public class PDFParserConfig implements Serializable { int maxIncrementalUpdates = 10; + private boolean throwOnEncryptedPayload = false; + /** * @return whether or not to extract only inline image metadata and not render the images */ @@ -924,6 +926,15 @@ public class PDFParserConfig implements Serializable { userConfigured.add("maxIncrementalUpdates"); } + public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) { + this.throwOnEncryptedPayload = throwOnEncryptedPayload; + userConfigured.add("throwOnEncryptedPayload"); + } + + public boolean isThrowOnEncryptedPayload() { + return throwOnEncryptedPayload; + } + public enum OCR_STRATEGY { AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index ffa05f393..9b499407b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -1402,6 +1403,17 @@ public class PDFParserTest extends TikaTest { //components we're looking for. } + @Test + public void testThrowOnEncryptedPayload() throws Exception { + PDFParserConfig pdfParserConfig = new PDFParserConfig(); + pdfParserConfig.setThrowOnEncryptedPayload(true); + ParseContext parseContext = new ParseContext(); + parseContext.set(PDFParserConfig.class, pdfParserConfig); + assertThrows(EncryptedDocumentException.class, () -> { + getRecursiveMetadata("testMicrosoftIRMServices.pdf", parseContext); + }); + } + /** * TODO -- need to test signature extraction */ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf new file mode 100644 index 000000000..6d827d0db Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf differ