This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4082
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4fd6fcd9f53637c1f5e34dc528b6801a27daf37e
Author: tballison <talli...@apache.org>
AuthorDate: Thu Jun 15 15:35:39 2023 -0400

    TIKA-4082 -- allow users to choose to have the PDFParser throw an 
EncryptedDocumentException for PDFs that have an AssociatedFile relationship of 
EncryptedPayload
---
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  56 ++++++++++++++++++++-
 .../apache/tika/parser/pdf/PDFParserConfig.java    |  11 ++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  12 +++++
 .../test-documents/testMicrosoftIRMServices.pdf    | Bin 0 -> 290327 bytes
 4 files changed, 78 insertions(+), 1 deletion(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index eb2bd4664..c51a5152d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -34,6 +34,8 @@ import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.io.MemoryUsageSetting;
 import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
 import org.apache.pdfbox.io.RandomAccessRead;
@@ -130,6 +132,10 @@ public class PDFParser extends AbstractParser implements 
RenderingParser, Initia
      */
     private static final long serialVersionUID = -752276948656079347L;
     private static final Set<MediaType> SUPPORTED_TYPES = 
Collections.singleton(MEDIA_TYPE);
+
+    //COS names used to detect file specifications that wrap an encrypted payload
+    private static final COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship");
+
+    private static final COSName ENCRYPTED_PAYLOAD = COSName.getPDFName("EncryptedPayload");
     private PDFParserConfig defaultConfig = new PDFParserConfig();
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -188,7 +194,7 @@ public class PDFParser extends AbstractParser implements 
RenderingParser, Initia
             if (tstream != null) {
                 tstream.setOpenContainer(pdfDocument);
             }
-
+            checkEncryptedPayload(pdfDocument, localConfig);
             boolean hasXFA = hasXFA(pdfDocument, metadata);
             boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
             extractMetadata(pdfDocument, metadata, context);
@@ -238,6 +244,32 @@ public class PDFParser extends AbstractParser implements 
RenderingParser, Initia
         }
     }
 
+    /**
+     * Throws an {@link EncryptedDocumentException} when the config enables the check and
+     * the document holds a file specification whose AFRelationship is EncryptedPayload.
+     *
+     * @param pdfDocument document whose FILESPEC objects are inspected
+     * @param localConfig config that switches this check on or off
+     * @throws IOException presumably from collecting the FILESPEC objects -- TODO confirm
+     * @throws EncryptedDocumentException if such a file specification is found; the message
+     *         carries its UF string, else its F string, else an empty name
+     */
+    private void checkEncryptedPayload(PDDocument pdfDocument, PDFParserConfig localConfig)
+            throws IOException, EncryptedDocumentException {
+        if (!localConfig.isThrowOnEncryptedPayload()) {
+            return;
+        }
+        //Do we want to also check that this is a portfolio PDF/contains a "collection"?
+        for (COSObject spec : pdfDocument.getDocument().getObjectsByType(COSName.FILESPEC)) {
+            if (!(spec.getObject() instanceof COSDictionary)) {
+                continue;
+            }
+            COSBase relationship = spec.getDictionaryObject(AF_RELATIONSHIP);
+            if (relationship == null || !relationship.equals(ENCRYPTED_PAYLOAD)) {
+                continue;
+            }
+            //instanceof is false for null, so no separate null checks are needed
+            COSBase ufEntry = spec.getDictionaryObject(COSName.UF);
+            COSBase fEntry = spec.getDictionaryObject(COSName.F);
+            String name = "";
+            if (ufEntry instanceof COSString) {
+                name = ((COSString) ufEntry).getString();
+            } else if (fEntry instanceof COSString) {
+                name = ((COSString) fEntry).getString();
+            }
+            throw new EncryptedDocumentException("PDF file contains an encrypted " +
+                    "payload: '" + name + "'");
+        }
+    }
+
     private void scanXRefOffsets(PDFParserConfig localConfig,
                                  TikaInputStream tikaInputStream,
                                  Metadata metadata,
@@ -986,6 +1018,28 @@ public class PDFParser extends AbstractParser implements 
RenderingParser, Initia
         return defaultConfig.getMaxIncrementalUpdates();
     }
 
+    /**
+     * If the file contains a file specification whose 'AFRelationship'
+     * is 'EncryptedPayload', then throw an {@link EncryptedDocumentException}.
+     * <p>
+     * Microsoft IRM v2 wraps the encrypted document inside a container PDF.
+     * See TIKA-4082.
+     * <p>
+     * The goal of this is to make the user experience the same for
+     * traditionally encrypted files and PDFs that are containers
+     * for {@code EncryptedPayload}s.
+     * <p>
+     * The default value is <code>false</code>.
+     *
+     * @param throwOnEncryptedPayload whether to throw when such a file specification is found
+     */
+    public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+        defaultConfig.setThrowOnEncryptedPayload(throwOnEncryptedPayload);
+    }
+
+    /**
+     * @return the throwOnEncryptedPayload setting of this parser's default config
+     */
+    public boolean isThrowOnEncryptedPayload() {
+        return defaultConfig.isThrowOnEncryptedPayload();
+    }
     /**
      * This is a no-op.  There is no need to initialize multiple fields.
      * The regular field loading should happen without this.
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d401b9608..0ee4b274b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -148,6 +148,8 @@ public class PDFParserConfig implements Serializable {
 
     int maxIncrementalUpdates = 10;
 
+    //if true, PDFParser throws an EncryptedDocumentException on an EncryptedPayload file spec
+    private boolean throwOnEncryptedPayload = false;
+
     /**
      * @return whether or not to extract only inline image metadata and not 
render the images
      */
@@ -924,6 +926,15 @@ public class PDFParserConfig implements Serializable {
         userConfigured.add("maxIncrementalUpdates");
     }
 
+    /**
+     * Stores the throwOnEncryptedPayload flag and adds its name to
+     * <code>userConfigured</code>.
+     *
+     * @param throwOnEncryptedPayload the new value; the field starts out <code>false</code>
+     */
+    public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+        this.throwOnEncryptedPayload = throwOnEncryptedPayload;
+        //NOTE(review): presumably marks the field as explicitly set by the user -- confirm
+        userConfigured.add("throwOnEncryptedPayload");
+    }
+
+    /**
+     * @return the stored throwOnEncryptedPayload flag; <code>false</code> unless it was set
+     */
+    public boolean isThrowOnEncryptedPayload() {
+        return throwOnEncryptedPayload;
+    }
+
     public enum OCR_STRATEGY {
         AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION;
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ffa05f393..9b499407b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -1402,6 +1403,17 @@ public class PDFParserTest extends TikaTest {
         //components we're looking for.
     }
 
+    /**
+     * With throwOnEncryptedPayload enabled in the config passed through the
+     * ParseContext, parsing testMicrosoftIRMServices.pdf must throw an
+     * EncryptedDocumentException.
+     */
+    @Test
+    public void testThrowOnEncryptedPayload() throws Exception {
+        PDFParserConfig config = new PDFParserConfig();
+        config.setThrowOnEncryptedPayload(true);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        assertThrows(EncryptedDocumentException.class,
+                () -> getRecursiveMetadata("testMicrosoftIRMServices.pdf", context));
+    }
+
     /**
      * TODO -- need to test signature extraction
      */
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf
new file mode 100644
index 000000000..6d827d0db
Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf
 differ

Reply via email to